From 24f09c9923be4cee2db52b0e9a64ec89d3f6a696 Mon Sep 17 00:00:00 2001 From: "Attarde, Mahesh" Date: Fri, 26 Sep 2025 03:24:33 -0700 Subject: [PATCH 001/878] [X86][GlobalIsel] support G_IS_FPCLASS --- .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 3 + llvm/test/CodeGen/X86/isel-fpclass.ll | 549 +++++++++++++++--- 2 files changed, 474 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index ee9760f881ae9..807a2a7d1542b 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -410,6 +410,9 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder(G_IS_FPCLASS) + .lower(); + // fp constants getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({s32, s64}) diff --git a/llvm/test/CodeGen/X86/isel-fpclass.ll b/llvm/test/CodeGen/X86/isel-fpclass.ll index 960bbf53a6451..d033d7cb66bc9 100644 --- a/llvm/test/CodeGen/X86/isel-fpclass.ll +++ b/llvm/test/CodeGen/X86/isel-fpclass.ll @@ -3,8 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-SDAGISEL ; RUN: llc < %s -mtriple=i686-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X86-FASTISEL ; RUN: llc < %s -mtriple=x86_64-linux -fast-isel -fast-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-FASTISEL - -; FIXME: We can reuse/delete llvm/test/CodeGen/X86/is_fpclass.ll when all patches are included. +; RUN: llc < %s -mtriple=i686-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X86-GISEL +; RUN: llc < %s -mtriple=x86_64-linux -global-isel -global-isel-abort=1 | FileCheck %s -check-prefixes=X64,X64-GISEL define i1 @isnone_f(float %x) { ; X86-SDAGISEL-LABEL: isnone_f: @@ -23,6 +23,11 @@ define i1 @isnone_f(float %x) { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: isnone_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %eax, %eax +; X86-GISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0) ret i1 %0 @@ -45,6 +50,11 @@ define i1 @isany_f(float %x) { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: isany_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movb $1, %al +; X86-GISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023) ret i1 %0 @@ -62,16 +72,16 @@ define i1 @issignaling_f(float %x) { ; X86-SDAGISEL-NEXT: andb %cl, %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: issignaling_f: -; X64: # %bb.0: -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-NEXT: setl %cl -; X64-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 -; X64-NEXT: setge %al -; X64-NEXT: andb %cl, %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: issignaling_f: +; X64-SDAGISEL: # %bb.0: +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-SDAGISEL-NEXT: setl %cl +; X64-SDAGISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-SDAGISEL-NEXT: setge %al +; X64-SDAGISEL-NEXT: andb %cl, %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: issignaling_f: ; X86-FASTISEL: # %bb.0: @@ -89,6 +99,43 @@ define i1 @issignaling_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; 
X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: issignaling_f: +; X64-FASTISEL: # %bb.0: +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-FASTISEL-NEXT: setl %cl +; X64-FASTISEL-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001 +; X64-FASTISEL-NEXT: setge %al +; X64-FASTISEL-NEXT: andb %cl, %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: issignaling_f: +; X86-GISEL: # %bb.0: +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %dl +; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: andb %dl, %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: issignaling_f: +; X64-GISEL: # %bb.0: +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %dl +; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: andb %dl, %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq %a0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1) ; "snan" ret i1 %a0 } @@ -102,13 +149,13 @@ define i1 @issignaling_f(float %x) { ; X86-SDAGISEL-NEXT: setge %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: isquiet_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-NEXT: setge %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: isquiet_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-SDAGISEL-NEXT: setge %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: isquiet_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -123,6 +170,34 @@ define i1 @issignaling_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: isquiet_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-FASTISEL-NEXT: setge %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: isquiet_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-GISEL-NEXT: setae %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isquiet_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GISEL-NEXT: setae %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2) ; "qnan" ret i1 %0 @@ -137,13 +212,13 @@ define i1 @not_isquiet_f(float %x) { ; X86-SDAGISEL-NEXT: setl %al ; 
X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: not_isquiet_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 -; X64-NEXT: setl %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: not_isquiet_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-SDAGISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-SDAGISEL-NEXT: setl %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: not_isquiet_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -158,6 +233,52 @@ define i1 @not_isquiet_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: not_isquiet_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-FASTISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-FASTISEL-NEXT: setl %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: not_isquiet_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %dl +; X86-GISEL-NEXT: orb %cl, %dl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %cl +; X86-GISEL-NEXT: orb %dl, %cl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %dl +; X86-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: andb %dl, %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_isquiet_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %dl +; X64-GISEL-NEXT: orb %cl, %dl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %cl +; X64-GISEL-NEXT: orb %dl, %cl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %dl +; X64-GISEL-NEXT: cmpl $2143289344, %eax # imm = 0x7FC00000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: andb %dl, %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021) ; ~"qnan" ret i1 %0 @@ -172,13 +293,13 @@ define i1 @isinf_f(float %x) { ; X86-SDAGISEL-NEXT: sete %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: isinf_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: sete %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: isinf_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-SDAGISEL-NEXT: sete %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: isinf_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -193,6 +314,34 @@ define i1 @isinf_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: isinf_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: 
movd %xmm0, %eax +; X64-FASTISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-FASTISEL-NEXT: sete %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: isinf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isinf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" ret i1 %0 @@ -207,13 +356,13 @@ define i1 @not_isinf_f(float %x) { ; X86-SDAGISEL-NEXT: setne %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: not_isinf_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: setne %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: not_isinf_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-SDAGISEL-NEXT: setne %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: not_isinf_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -228,6 +377,40 @@ define i1 @not_isinf_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: not_isinf_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-FASTISEL-NEXT: setne %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: not_isinf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %dl +; X86-GISEL-NEXT: orb %cl, %dl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %al +; X86-GISEL-NEXT: orb %dl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_isinf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %dl +; X64-GISEL-NEXT: orb %cl, %dl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %al +; X64-GISEL-NEXT: orb %dl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507) ; ~0x204 = "~inf" ret i1 %0 @@ -240,12 +423,12 @@ define i1 @is_plus_inf_f(float %x) { ; X86-SDAGISEL-NEXT: sete %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: is_plus_inf_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: sete %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: is_plus_inf_f: +; X64-SDAGISEL: # 
%bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-SDAGISEL-NEXT: sete %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: is_plus_inf_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -258,6 +441,30 @@ define i1 @is_plus_inf_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: is_plus_inf_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-FASTISEL-NEXT: sete %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: is_plus_inf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: is_plus_inf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512) ; 0x200 = "+inf" ret i1 %0 @@ -270,12 +477,12 @@ define i1 @is_minus_inf_f(float %x) { ; X86-SDAGISEL-NEXT: sete %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: is_minus_inf_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X64-NEXT: sete %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: is_minus_inf_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-SDAGISEL-NEXT: sete %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: is_minus_inf_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -288,6 +495,30 @@ define i1 @is_minus_inf_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: is_minus_inf_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-FASTISEL-NEXT: sete %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: is_minus_inf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000 +; X86-GISEL-NEXT: sete %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: is_minus_inf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-GISEL-NEXT: sete %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4) ; "-inf" ret i1 %0 @@ -300,12 +531,12 @@ define i1 @not_is_minus_inf_f(float %x) { ; X86-SDAGISEL-NEXT: setne %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: not_is_minus_inf_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 -; X64-NEXT: setne %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: not_is_minus_inf_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-SDAGISEL-NEXT: setne %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: not_is_minus_inf_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -318,6 +549,52 
@@ define i1 @not_is_minus_inf_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: not_is_minus_inf_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: cmpl $-8388608, %eax # imm = 0xFF800000 +; X64-FASTISEL-NEXT: setne %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: not_is_minus_inf_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: pushl %ebx +; X86-GISEL-NEXT: .cfi_def_cfa_offset 8 +; X86-GISEL-NEXT: .cfi_offset %ebx, -8 +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: movl %eax, %ecx +; X86-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %edx, %edx +; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %bl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %ah +; X86-GISEL-NEXT: orb %dl, %ah +; X86-GISEL-NEXT: orb %bl, %ah +; X86-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %al +; X86-GISEL-NEXT: orb %ah, %al +; X86-GISEL-NEXT: popl %ebx +; X86-GISEL-NEXT: .cfi_def_cfa_offset 4 +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_is_minus_inf_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: movl %eax, %ecx +; X64-GISEL-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %edx, %edx +; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %sil +; X64-GISEL-NEXT: orb %dl, %sil +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %dl +; X64-GISEL-NEXT: cmpl $2139095040, %ecx # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %al +; X64-GISEL-NEXT: orb %dl, %al +; X64-GISEL-NEXT: orb %sil, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019) ; ~"-inf" ret i1 %0 @@ -332,13 +609,13 @@ define i1 @isfinite_f(float %x) { ; X86-SDAGISEL-NEXT: setl %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: isfinite_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: setl %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: isfinite_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-SDAGISEL-NEXT: setl %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: isfinite_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -353,6 +630,34 @@ define i1 @isfinite_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: isfinite_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-FASTISEL-NEXT: setl %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: isfinite_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isfinite_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; 
X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" ret i1 %0 @@ -367,13 +672,13 @@ define i1 @not_isfinite_f(float %x) { ; X86-SDAGISEL-NEXT: setge %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: not_isfinite_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: setge %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: not_isfinite_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-SDAGISEL-NEXT: setge %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: not_isfinite_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -388,6 +693,40 @@ define i1 @not_isfinite_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: not_isfinite_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-FASTISEL-NEXT: setge %al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: not_isfinite_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: sete %dl +; X86-GISEL-NEXT: orb %cl, %dl +; X86-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X86-GISEL-NEXT: seta %al +; X86-GISEL-NEXT: orb %dl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: not_isfinite_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: andl $2147483647, %eax # imm = 0x7FFFFFFF +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: sete %dl +; X64-GISEL-NEXT: orb %cl, %dl +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: seta %al +; X64-GISEL-NEXT: orb %dl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) ; ~0x1f8 = "~finite" ret i1 %0 @@ -400,12 +739,12 @@ define i1 @is_plus_finite_f(float %x) { ; X86-SDAGISEL-NEXT: setb %al ; X86-SDAGISEL-NEXT: retl ; -; X64-LABEL: is_plus_finite_f: -; X64: # %bb.0: # %entry -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 -; X64-NEXT: setb %al -; X64-NEXT: retq +; X64-SDAGISEL-LABEL: is_plus_finite_f: +; X64-SDAGISEL: # %bb.0: # %entry +; X64-SDAGISEL-NEXT: movd %xmm0, %eax +; X64-SDAGISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-SDAGISEL-NEXT: setb %al +; X64-SDAGISEL-NEXT: retq ; ; X86-FASTISEL-LABEL: is_plus_finite_f: ; X86-FASTISEL: # %bb.0: # %entry @@ -418,6 +757,30 @@ define i1 @is_plus_finite_f(float %x) { ; X86-FASTISEL-NEXT: popl %ecx ; X86-FASTISEL-NEXT: .cfi_def_cfa_offset 4 ; X86-FASTISEL-NEXT: retl +; +; X64-FASTISEL-LABEL: is_plus_finite_f: +; X64-FASTISEL: # %bb.0: # %entry +; X64-FASTISEL-NEXT: movd %xmm0, %eax +; X64-FASTISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-FASTISEL-NEXT: setb 
%al +; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: is_plus_finite_f: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %ecx, %ecx +; X86-GISEL-NEXT: cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000 +; X86-GISEL-NEXT: setb %al +; X86-GISEL-NEXT: orb %cl, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: is_plus_finite_f: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %ecx, %ecx +; X64-GISEL-NEXT: movd %xmm0, %eax +; X64-GISEL-NEXT: cmpl $2139095040, %eax # imm = 0x7F800000 +; X64-GISEL-NEXT: setb %al +; X64-GISEL-NEXT: orb %cl, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448) ; 0x1c0 = "+finite" ret i1 %0 @@ -440,6 +803,11 @@ define i1 @isnone_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: xorl %eax, %eax ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: isnone_d: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %eax, %eax +; X86-GISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 0) ret i1 %0 @@ -462,6 +830,11 @@ define i1 @isany_d(double %x) nounwind { ; X86-FASTISEL-NEXT: fstp %st(0) ; X86-FASTISEL-NEXT: movb $1, %al ; X86-FASTISEL-NEXT: retl +; +; X86-GISEL-LABEL: isany_d: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movb $1, %al +; X86-GISEL-NEXT: retl entry: %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1023) ret i1 %0 @@ -491,6 +864,16 @@ define i1 @isnone_f80(x86_fp80 %x) nounwind { ; X64-FASTISEL-NEXT: fstp %st(0) ; X64-FASTISEL-NEXT: xorl %eax, %eax ; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: isnone_f80: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: xorl %eax, %eax +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isnone_f80: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: xorl %eax, %eax +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 0) ret i1 %0 @@ -520,6 +903,16 @@ define i1 @isany_f80(x86_fp80 %x) nounwind { ; X64-FASTISEL-NEXT: fstp %st(0) ; X64-FASTISEL-NEXT: movb $1, %al ; X64-FASTISEL-NEXT: retq +; +; X86-GISEL-LABEL: isany_f80: +; X86-GISEL: # %bb.0: # %entry +; X86-GISEL-NEXT: movb $1, %al +; X86-GISEL-NEXT: retl +; +; X64-GISEL-LABEL: isany_f80: +; X64-GISEL: # %bb.0: # %entry +; X64-GISEL-NEXT: movb $1, %al +; X64-GISEL-NEXT: retq entry: %0 = tail call i1 @llvm.is.fpclass.f80(x86_fp80 %x, i32 1023) ret i1 %0 From 7a83b077187301fcefcf2bf788cc60ff397020fe Mon Sep 17 00:00:00 2001 From: "Attarde, Mahesh" Date: Fri, 26 Sep 2025 04:39:34 -0700 Subject: [PATCH 002/878] fmt --- llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 807a2a7d1542b..77b6ec83cb984 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -410,8 +410,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, getActionDefinitionsBuilder(G_SEXT_INREG).lower(); - getActionDefinitionsBuilder(G_IS_FPCLASS) - .lower(); + getActionDefinitionsBuilder(G_IS_FPCLASS).lower(); // fp constants getActionDefinitionsBuilder(G_FCONSTANT) From d1b5607dc113016b74d0a58e95fed00ea9ad7950 Mon Sep 17 00:00:00 2001 From: lbonn Date: Sat, 27 Sep 2025 16:49:54 +0200 Subject: [PATCH 003/878] [libc++][ranges] Fix `ranges::join_view` segmented iterator trait (#158347) The outer iterator needs to move to the next segment when calling __compose. 
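For illustration (a sketch, not part of the patch itself), the affected
scenario is a plain `ranges::find` over a joined range of ranges, the same
shape the new tests below exercise:

```c++
#include <algorithm>
#include <cassert>
#include <ranges>
#include <vector>

int main() {
  // Joining a range of ranges routes ranges::find through libc++'s
  // segmented-iterator fast path, and therefore through __compose.
  std::vector<std::vector<int>> data = {{}, {}, {3, 5}, {}, {}};
  auto joined = std::views::join(data);
  // 4 is absent, so the search must return the end iterator.
  assert(std::ranges::find(joined, 4) == joined.end());
}
```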
Without this change, `find_segment_if` would never reach the end of the
join_view, which caused erroneous results when calling `ranges::find` on a
join_view of bidirectional ranges.

Other specializations using the segmented iterator trait were likely to be
affected as well.

Fixes #158279
Fixes #93180
---
 libcxx/include/__ranges/join_view.h           |   9 +-
 .../alg.find/ranges.find.pass.cpp             | 121 ++++++++++++------
 2 files changed, 89 insertions(+), 41 deletions(-)

diff --git a/libcxx/include/__ranges/join_view.h b/libcxx/include/__ranges/join_view.h
index 327b349f476a7..364f056d8d2cf 100644
--- a/libcxx/include/__ranges/join_view.h
+++ b/libcxx/include/__ranges/join_view.h
@@ -410,8 +410,13 @@ struct __segmented_iterator_traits<_JoinViewIterator> {

   static constexpr _LIBCPP_HIDE_FROM_ABI _JoinViewIterator
   __compose(__segment_iterator __seg_iter, __local_iterator __local_iter) {
-    return _JoinViewIterator(
-        std::move(__seg_iter).__get_data(), std::move(__seg_iter).__get_iter(), std::move(__local_iter));
+    auto&& __parent = std::move(__seg_iter).__get_data();
+    auto&& __outer  = std::move(__seg_iter).__get_iter();
+    if (__local_iter == ranges::end(*__outer)) {
+      ++__outer;
+      return _JoinViewIterator(*__parent, __outer);
+    }
+    return _JoinViewIterator(__parent, __outer, std::move(__local_iter));
   }
 };

diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp
index 5b4abc45b6f4f..3303b4a76f467 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp
@@ -272,57 +272,100 @@ class Comparable {
   friend bool operator==(const Comparable& lhs, long long rhs) { return comparable_data[lhs.index_] == rhs; }
 };

-void test_deque() {
-  { // empty deque
-    std::deque<int> data;
-    assert(std::ranges::find(data, 4) == data.end());
-    assert(std::ranges::find(data.begin(), data.end(), 4) == data.end());
-  }
-
-  { // single element - match
-    std::deque<int> data = {4};
-    assert(std::ranges::find(data, 4) == data.begin());
-    assert(std::ranges::find(data.begin(), data.end(), 4) == data.begin());
-  }
-
-  { // single element - no match
-    std::deque<int> data = {3};
-    assert(std::ranges::find(data, 4) == data.end());
-    assert(std::ranges::find(data.begin(), data.end(), 4) == data.end());
-  }
-
-  // many elements
-  for (auto size : {2, 3, 1023, 1024, 1025, 2047, 2048, 2049}) {
-    { // last element match
+void test_segmented_iterator_types() {
+  // Test the optimized find algorithm for types that implement the segment iterator trait
+  // deque
+  {
+    { // empty deque
       std::deque<int> data;
-      data.resize(size);
-      std::fill(data.begin(), data.end(), 3);
-      data[size - 1] = 4;
-      assert(std::ranges::find(data, 4) == data.end() - 1);
-      assert(std::ranges::find(data.begin(), data.end(), 4) == data.end() - 1);
+      assert(std::ranges::find(data, 4) == data.end());
+      assert(std::ranges::find(data.begin(), data.end(), 4) == data.end());
     }

-    { // second-last element match
-      std::deque<int> data;
-      data.resize(size);
-      std::fill(data.begin(), data.end(), 3);
-      data[size - 2] = 4;
-      assert(std::ranges::find(data, 4) == data.end() - 2);
-      assert(std::ranges::find(data.begin(), data.end(), 4) == data.end() - 2);
+    { // single element - match
+      std::deque<int> data = {4};
+      assert(std::ranges::find(data, 4) == data.begin());
+      assert(std::ranges::find(data.begin(), data.end(), 4) == data.begin());
     }

-    { // no match
-      std::deque<int> data;
-      data.resize(size);
-      
std::fill(data.begin(), data.end(), 3); + { // single element - no match + std::deque data = {3}; assert(std::ranges::find(data, 4) == data.end()); assert(std::ranges::find(data.begin(), data.end(), 4) == data.end()); } + + // many elements + for (auto size : {2, 3, 1023, 1024, 1025, 2047, 2048, 2049}) { + { // last element match + std::deque data; + data.resize(size); + std::fill(data.begin(), data.end(), 3); + data[size - 1] = 4; + assert(std::ranges::find(data, 4) == data.end() - 1); + assert(std::ranges::find(data.begin(), data.end(), 4) == data.end() - 1); + } + + { // second-last element match + std::deque data; + data.resize(size); + std::fill(data.begin(), data.end(), 3); + data[size - 2] = 4; + assert(std::ranges::find(data, 4) == data.end() - 2); + assert(std::ranges::find(data.begin(), data.end(), 4) == data.end() - 2); + } + + { // no match + std::deque data; + data.resize(size); + std::fill(data.begin(), data.end(), 3); + assert(std::ranges::find(data, 4) == data.end()); + assert(std::ranges::find(data.begin(), data.end(), 4) == data.end()); + } + } + } + // join_view ranges adaptor + { + { // single element - match + int data[1][1] = {{4}}; + auto joined = std::views::join(data); + assert(std::ranges::find(joined, 4) == std::ranges::begin(joined)); + } + { // single element - no match + // (reproducer for https://llvm.org/PR158279, where the iterator would never reach the end sentinel) + int data[1][1] = {{3}}; + auto joined = std::views::join(data); + assert(std::ranges::find(joined, 4) == std::ranges::end(joined)); + } + { // several sub-arrays of size 1 - match + int data[3][1] = {{0}, {4}, {0}}; + auto joined = std::views::join(data); + assert(std::ranges::find(joined, 4) == std::next(std::ranges::begin(joined))); + } + { // several sub-arrays of size 2 - match in second element of an array + int data[3][2] = {{0, 0}, {0, 4}, {0, 0}}; + auto joined = std::views::join(data); + assert(std::ranges::find(joined, 4) == std::ranges::next(std::ranges::begin(joined), 3)); + } + { // vector of empty vectors + std::vector> data = {{}, {}}; + auto joined = std::views::join(data); + assert(std::ranges::find(joined, 4) == std::ranges::end(joined)); + } + { // vector of variably sized vectors - match + std::vector> data = {{}, {}, {3, 4}, {}, {}}; + auto joined = std::views::join(data); + assert(std::ranges::find(joined, 4) == std::ranges::next(std::ranges::begin(joined))); + } + { // vector of variably sized vectors - no match + std::vector> data = {{}, {}, {3, 5}, {}, {}}; + auto joined = std::views::join(data); + assert(std::ranges::find(joined, 4) == std::ranges::end(joined)); + } } } int main(int, char**) { - test_deque(); + test_segmented_iterator_types(); test(); static_assert(test()); From 371b3cae08c69b4991a4693ae69eeccab244bb52 Mon Sep 17 00:00:00 2001 From: Corbin Robeck Date: Sat, 27 Sep 2025 11:12:03 -0400 Subject: [PATCH 004/878] [mlir] Fix typo in StandalonePasses.td doc string (#161009) --- mlir/examples/standalone/include/Standalone/StandalonePasses.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/examples/standalone/include/Standalone/StandalonePasses.td b/mlir/examples/standalone/include/Standalone/StandalonePasses.td index 4cb2be02e4a20..d5aad34f2f457 100644 --- a/mlir/examples/standalone/include/Standalone/StandalonePasses.td +++ b/mlir/examples/standalone/include/Standalone/StandalonePasses.td @@ -1,4 +1,4 @@ -//===- StandalonePsss.td - Standalone dialect passes -------*- tablegen -*-===// +//===- StandalonePasses.td - Standalone dialect 
passes -------*- tablegen -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From 9bf51b2b19064644ca447cde425dccf2bd971722 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sat, 27 Sep 2025 16:50:48 +0100
Subject: [PATCH 005/878] [ARM] Generate build-attributes more correctly in
 the presence of intrinsic declarations. (#160749)

This code doesn't work very well, but this makes it work when intrinsic
definitions are present. It now discounts function declarations from the set
of attributes it looks at.

The code would have worked better before
0ab5b5b8581d9f2951575f7245824e6e4fc57dec when module-level attributes could
provide the information used to construct build-attributes.
---
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp         | 19 +++++++++++--------
 .../CodeGen/ARM/build-attributes-fn-attr3.ll  |  3 +++
 .../CodeGen/ARM/build-attributes-fn-attr4.ll  |  3 +++
 .../CodeGen/ARM/build-attributes-fn-attr5.ll  |  3 +++
 .../CodeGen/ARM/build-attributes-fn-attr6.ll  |  3 +++
 5 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 1c42f44765abf..2381effb1b6d3 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -610,20 +610,23 @@ void ARMAsmPrinter::emitEndOfAsmFile(Module &M) {
 // to appear in the .ARM.attributes section in ELF.
 // Instead of subclassing the MCELFStreamer, we do the work here.

-  // Returns true if all functions have the same function attribute value.
-  // It also returns true when the module has no functions.
+// Returns true if all function definitions have the same function attribute
+// value. It also returns true when the module has no functions.
 static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr,
                                                StringRef Value) {
-  return !any_of(M, [&](const Function &F) {
-    return F.getFnAttribute(Attr).getValueAsString() != Value;
-  });
+  return !any_of(M, [&](const Function &F) {
+    if (F.isDeclaration())
+      return false;
+    return F.getFnAttribute(Attr).getValueAsString() != Value;
+  });
 }

-// Returns true if all functions have the same denormal mode.
+// Returns true if all function definitions have the same denormal mode.
 // It also returns true when the module has no functions.
-static bool checkDenormalAttributeConsistency(const Module &M, - StringRef Attr, +static bool checkDenormalAttributeConsistency(const Module &M, StringRef Attr, DenormalMode Value) { return !any_of(M, [&](const Function &F) { + if (F.isDeclaration()) + return false; StringRef AttrVal = F.getFnAttribute(Attr).getValueAsString(); return parseDenormalFPAttribute(AttrVal) != Value; }); diff --git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll index 7f70c44c78f9c..27d1dc20bd815 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll @@ -11,7 +11,10 @@ define i32 @foo() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "no-trapping-math"="true" "denormal-fp-math"="ieee"} diff --git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll index c99cb27adf155..9c8dd8d95c61c 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll @@ -10,7 +10,10 @@ define i32 @foo1() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="positive-zero,positive-zero" } diff --git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll index ba1e7d7ce55c1..cda3ea0fc6d18 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll @@ -10,7 +10,10 @@ define i32 @foo1() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="preserve-sign,preserve-sign"} diff --git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll index 1cd68aed1e051..59d0a40198392 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll @@ -11,6 +11,7 @@ define i32 @foo1() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } @@ -19,5 +20,7 @@ entry: ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="preserve-sign,preserve-sign"} attributes #1 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="positive-zero,positive-zero"} From 60912f96549e97c5a628418f33f5cd5825944cdd Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 27 Sep 2025 09:04:52 -0700 Subject: [PATCH 006/878] [ADT] Use structured bindings in CoalescingBitVector.h (NFC) (#160976) --- llvm/include/llvm/ADT/CoalescingBitVector.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/ADT/CoalescingBitVector.h b/llvm/include/llvm/ADT/CoalescingBitVector.h index 4940bc1c2c18b..b126fc699ad87 100644 --- a/llvm/include/llvm/ADT/CoalescingBitVector.h +++ b/llvm/include/llvm/ADT/CoalescingBitVector.h @@ -194,10 +194,7 @@ template class CoalescingBitVector { // Delete the overlapping intervals. 
Split up intervals that only partially // intersect an overlap. - for (IntervalT Overlap : Overlaps) { - IndexT OlapStart, OlapStop; - std::tie(OlapStart, OlapStop) = Overlap; - + for (auto [OlapStart, OlapStop] : Overlaps) { auto It = Intervals.find(OlapStart); IndexT CurrStart = It.start(); IndexT CurrStop = It.stop(); @@ -420,10 +417,7 @@ template class CoalescingBitVector { const SmallVectorImpl &Overlaps, SmallVectorImpl &NonOverlappingParts) { IndexT NextUncoveredBit = Start; - for (IntervalT Overlap : Overlaps) { - IndexT OlapStart, OlapStop; - std::tie(OlapStart, OlapStop) = Overlap; - + for (auto [OlapStart, OlapStop] : Overlaps) { // [Start;Stop] and [OlapStart;OlapStop] overlap iff OlapStart <= Stop // and Start <= OlapStop. bool DoesOverlap = OlapStart <= Stop && Start <= OlapStop; From 54beb58ae6a6e93a9da2e1cf219e2f37df535084 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 27 Sep 2025 09:05:00 -0700 Subject: [PATCH 007/878] [ADT] Add derived() to DenseMapBase (NFC) (#160977) This patch adds derived() to obtain the CRTP derived class, following conventions in other classes. This makes forwarder functions a little more readable. --- llvm/include/llvm/ADT/DenseMap.h | 35 +++++++++++++------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 5f716751751c4..78d5b04f487fb 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -448,6 +448,11 @@ class DenseMapBase : public DebugEpochBase { static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); } private: + DerivedT &derived() { return *static_cast(this); } + const DerivedT &derived() const { + return *static_cast(this); + } + template std::pair lookupOrInsertIntoBucket(KeyArgT &&Key, Ts &&...Args) { @@ -477,39 +482,27 @@ class DenseMapBase : public DebugEpochBase { return const_iterator::makeIterator(TheBucket, buckets(), *this); } - unsigned getNumEntries() const { - return static_cast(this)->getNumEntries(); - } + unsigned getNumEntries() const { return derived().getNumEntries(); } - void setNumEntries(unsigned Num) { - static_cast(this)->setNumEntries(Num); - } + void setNumEntries(unsigned Num) { derived().setNumEntries(Num); } void incrementNumEntries() { setNumEntries(getNumEntries() + 1); } void decrementNumEntries() { setNumEntries(getNumEntries() - 1); } - unsigned getNumTombstones() const { - return static_cast(this)->getNumTombstones(); - } + unsigned getNumTombstones() const { return derived().getNumTombstones(); } - void setNumTombstones(unsigned Num) { - static_cast(this)->setNumTombstones(Num); - } + void setNumTombstones(unsigned Num) { derived().setNumTombstones(Num); } void incrementNumTombstones() { setNumTombstones(getNumTombstones() + 1); } void decrementNumTombstones() { setNumTombstones(getNumTombstones() - 1); } - const BucketT *getBuckets() const { - return static_cast(this)->getBuckets(); - } + const BucketT *getBuckets() const { return derived().getBuckets(); } - BucketT *getBuckets() { return static_cast(this)->getBuckets(); } + BucketT *getBuckets() { return derived().getBuckets(); } - unsigned getNumBuckets() const { - return static_cast(this)->getNumBuckets(); - } + unsigned getNumBuckets() const { return derived().getNumBuckets(); } BucketT *getBucketsEnd() { return getBuckets() + getNumBuckets(); } @@ -525,9 +518,9 @@ class DenseMapBase : public DebugEpochBase { return llvm::make_range(getBuckets(), getBucketsEnd()); } - void grow(unsigned AtLeast) 
{ static_cast(this)->grow(AtLeast); } + void grow(unsigned AtLeast) { derived().grow(AtLeast); } - void shrink_and_clear() { static_cast(this)->shrink_and_clear(); } + void shrink_and_clear() { derived().shrink_and_clear(); } template BucketT *findBucketForInsertion(const LookupKeyT &Lookup, From 3163fcfa453dce61aa06da05272a660b18407623 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 27 Sep 2025 09:05:08 -0700 Subject: [PATCH 008/878] [ADT] Add [[nodiscard]] to set/map classes (NFC) (#160978) This patch adds [[nodiscard]] to user-facing functions in set/map classes if they are: - const and return non-void values (e.g., size()), or - non-const and have no side effects (e.g., find()). --- llvm/include/llvm/ADT/DenseMap.h | 81 +++++++++++++++++------------ llvm/include/llvm/ADT/DenseSet.h | 38 +++++++++----- llvm/include/llvm/ADT/MapVector.h | 54 +++++++++++-------- llvm/include/llvm/ADT/SetVector.h | 50 ++++++------------ llvm/include/llvm/ADT/SmallPtrSet.h | 24 ++++----- llvm/include/llvm/ADT/SmallSet.h | 18 ++++--- llvm/unittests/ADT/APIntTest.cpp | 5 +- 7 files changed, 144 insertions(+), 126 deletions(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index 78d5b04f487fb..bcf3e9676a7b5 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -75,37 +75,39 @@ class DenseMapBase : public DebugEpochBase { using const_iterator = DenseMapIterator; - inline iterator begin() { + [[nodiscard]] inline iterator begin() { return iterator::makeBegin(buckets(), empty(), *this); } - inline iterator end() { return iterator::makeEnd(buckets(), *this); } - inline const_iterator begin() const { + [[nodiscard]] inline iterator end() { + return iterator::makeEnd(buckets(), *this); + } + [[nodiscard]] inline const_iterator begin() const { return const_iterator::makeBegin(buckets(), empty(), *this); } - inline const_iterator end() const { + [[nodiscard]] inline const_iterator end() const { return const_iterator::makeEnd(buckets(), *this); } // Return an iterator to iterate over keys in the map. - inline auto keys() { + [[nodiscard]] inline auto keys() { return map_range(*this, [](const BucketT &P) { return P.getFirst(); }); } // Return an iterator to iterate over values in the map. - inline auto values() { + [[nodiscard]] inline auto values() { return map_range(*this, [](const BucketT &P) { return P.getSecond(); }); } - inline auto keys() const { + [[nodiscard]] inline auto keys() const { return map_range(*this, [](const BucketT &P) { return P.getFirst(); }); } - inline auto values() const { + [[nodiscard]] inline auto values() const { return map_range(*this, [](const BucketT &P) { return P.getSecond(); }); } [[nodiscard]] bool empty() const { return getNumEntries() == 0; } - unsigned size() const { return getNumEntries(); } + [[nodiscard]] unsigned size() const { return getNumEntries(); } /// Grow the densemap so that it can contain at least \p NumEntries items /// before resizing again. @@ -153,30 +155,35 @@ class DenseMapBase : public DebugEpochBase { } /// Return true if the specified key is in the map, false otherwise. - bool contains(const_arg_type_t Val) const { + [[nodiscard]] bool contains(const_arg_type_t Val) const { return doFind(Val) != nullptr; } /// Return 1 if the specified key is in the map, 0 otherwise. - size_type count(const_arg_type_t Val) const { + [[nodiscard]] size_type count(const_arg_type_t Val) const { return contains(Val) ? 
1 : 0; } - iterator find(const_arg_type_t Val) { return find_as(Val); } - const_iterator find(const_arg_type_t Val) const { return find_as(Val); } + [[nodiscard]] iterator find(const_arg_type_t Val) { + return find_as(Val); + } + [[nodiscard]] const_iterator find(const_arg_type_t Val) const { + return find_as(Val); + } /// Alternate version of find() which allows a different, and possibly /// less expensive, key type. /// The DenseMapInfo is responsible for supplying methods /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key /// type used. - template iterator find_as(const LookupKeyT &Val) { + template + [[nodiscard]] iterator find_as(const LookupKeyT &Val) { if (BucketT *Bucket = doFind(Val)) return makeIterator(Bucket); return end(); } template - const_iterator find_as(const LookupKeyT &Val) const { + [[nodiscard]] const_iterator find_as(const LookupKeyT &Val) const { if (const BucketT *Bucket = doFind(Val)) return makeConstIterator(Bucket); return end(); @@ -184,7 +191,7 @@ class DenseMapBase : public DebugEpochBase { /// lookup - Return the entry for the specified key, or a default /// constructed value if no such entry exists. - ValueT lookup(const_arg_type_t Val) const { + [[nodiscard]] ValueT lookup(const_arg_type_t Val) const { if (const BucketT *Bucket = doFind(Val)) return Bucket->getSecond(); return ValueT(); @@ -194,7 +201,8 @@ class DenseMapBase : public DebugEpochBase { // useful, because `lookup` cannot be used with non-default-constructible // values. template > - ValueT lookup_or(const_arg_type_t Val, U &&Default) const { + [[nodiscard]] ValueT lookup_or(const_arg_type_t Val, + U &&Default) const { if (const BucketT *Bucket = doFind(Val)) return Bucket->getSecond(); return Default; @@ -202,7 +210,7 @@ class DenseMapBase : public DebugEpochBase { /// at - Return the entry for the specified key, or abort if no such /// entry exists. - const ValueT &at(const_arg_type_t Val) const { + [[nodiscard]] const ValueT &at(const_arg_type_t Val) const { auto Iter = this->find(std::move(Val)); assert(Iter != this->end() && "DenseMap::at failed due to a missing key"); return Iter->second; @@ -330,14 +338,16 @@ class DenseMapBase : public DebugEpochBase { /// isPointerIntoBucketsArray - Return true if the specified pointer points /// somewhere into the DenseMap's array of buckets (i.e. either to a key or /// value in the DenseMap). - bool isPointerIntoBucketsArray(const void *Ptr) const { + [[nodiscard]] bool isPointerIntoBucketsArray(const void *Ptr) const { return Ptr >= getBuckets() && Ptr < getBucketsEnd(); } /// getPointerIntoBucketsArray() - Return an opaque pointer into the buckets /// array. In conjunction with the previous method, this can be used to /// determine whether an insertion caused the DenseMap to reallocate. - const void *getPointerIntoBucketsArray() const { return getBuckets(); } + [[nodiscard]] const void *getPointerIntoBucketsArray() const { + return getBuckets(); + } protected: DenseMapBase() = default; @@ -649,7 +659,9 @@ class DenseMapBase : public DebugEpochBase { /// This is just the raw memory used by DenseMap. /// If entries are pointers to objects, the size of the referenced objects /// are not included. - size_t getMemorySize() const { return getNumBuckets() * sizeof(BucketT); } + [[nodiscard]] size_t getMemorySize() const { + return getNumBuckets() * sizeof(BucketT); + } }; /// Equality comparison for DenseMap. 
@@ -660,9 +672,9 @@ class DenseMapBase : public DebugEpochBase { /// complexity is linear, worst case is O(N^2) (if every hash collides). template -bool operator==( - const DenseMapBase &LHS, - const DenseMapBase &RHS) { +[[nodiscard]] bool +operator==(const DenseMapBase &LHS, + const DenseMapBase &RHS) { if (LHS.size() != RHS.size()) return false; @@ -680,9 +692,9 @@ bool operator==( /// Equivalent to !(LHS == RHS). See operator== for performance notes. template -bool operator!=( - const DenseMapBase &LHS, - const DenseMapBase &RHS) { +[[nodiscard]] bool +operator!=(const DenseMapBase &LHS, + const DenseMapBase &RHS) { return !(LHS == RHS); } @@ -1220,15 +1232,15 @@ class DenseMapIterator : DebugEpochBase::HandleBase { const DenseMapIterator &I) : DebugEpochBase::HandleBase(I), Ptr(I.Ptr), End(I.End) {} - reference operator*() const { + [[nodiscard]] reference operator*() const { assert(isHandleInSync() && "invalid iterator access!"); assert(Ptr != End && "dereferencing end() iterator"); return *Ptr; } - pointer operator->() const { return &operator*(); } + [[nodiscard]] pointer operator->() const { return &operator*(); } - friend bool operator==(const DenseMapIterator &LHS, - const DenseMapIterator &RHS) { + [[nodiscard]] friend bool operator==(const DenseMapIterator &LHS, + const DenseMapIterator &RHS) { assert((!LHS.getEpochAddress() || LHS.isHandleInSync()) && "handle not in sync!"); assert((!RHS.getEpochAddress() || RHS.isHandleInSync()) && @@ -1238,8 +1250,8 @@ class DenseMapIterator : DebugEpochBase::HandleBase { return LHS.Ptr == RHS.Ptr; } - friend bool operator!=(const DenseMapIterator &LHS, - const DenseMapIterator &RHS) { + [[nodiscard]] friend bool operator!=(const DenseMapIterator &LHS, + const DenseMapIterator &RHS) { return !(LHS == RHS); } @@ -1277,7 +1289,8 @@ class DenseMapIterator : DebugEpochBase::HandleBase { }; template -inline size_t capacity_in_bytes(const DenseMap &X) { +[[nodiscard]] inline size_t +capacity_in_bytes(const DenseMap &X) { return X.getMemorySize(); } diff --git a/llvm/include/llvm/ADT/DenseSet.h b/llvm/include/llvm/ADT/DenseSet.h index 60ad9b2eb7762..eec800d07b6df 100644 --- a/llvm/include/llvm/ADT/DenseSet.h +++ b/llvm/include/llvm/ADT/DenseSet.h @@ -83,9 +83,9 @@ class DenseSetImpl { DenseSetImpl(llvm::from_range_t, Range &&R) : DenseSetImpl(adl_begin(R), adl_end(R)) {} - bool empty() const { return TheMap.empty(); } - size_type size() const { return TheMap.size(); } - size_t getMemorySize() const { return TheMap.getMemorySize(); } + [[nodiscard]] bool empty() const { return TheMap.empty(); } + [[nodiscard]] size_type size() const { return TheMap.size(); } + [[nodiscard]] size_t getMemorySize() const { return TheMap.getMemorySize(); } /// Grow the DenseSet so that it has at least Size buckets. Will not shrink /// the Size of the set. 
@@ -154,14 +154,20 @@ class DenseSetImpl { using iterator = DenseSetIterator; using const_iterator = DenseSetIterator; - iterator begin() { return iterator(TheMap.begin()); } - iterator end() { return iterator(TheMap.end()); } + [[nodiscard]] iterator begin() { return iterator(TheMap.begin()); } + [[nodiscard]] iterator end() { return iterator(TheMap.end()); } - const_iterator begin() const { return const_iterator(TheMap.begin()); } - const_iterator end() const { return const_iterator(TheMap.end()); } + [[nodiscard]] const_iterator begin() const { + return const_iterator(TheMap.begin()); + } + [[nodiscard]] const_iterator end() const { + return const_iterator(TheMap.end()); + } - iterator find(const_arg_type_t V) { return iterator(TheMap.find(V)); } - const_iterator find(const_arg_type_t V) const { + [[nodiscard]] iterator find(const_arg_type_t V) { + return iterator(TheMap.find(V)); + } + [[nodiscard]] const_iterator find(const_arg_type_t V) const { return const_iterator(TheMap.find(V)); } @@ -180,10 +186,12 @@ class DenseSetImpl { /// The DenseMapInfo is responsible for supplying methods /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key type /// used. - template iterator find_as(const LookupKeyT &Val) { + template + [[nodiscard]] iterator find_as(const LookupKeyT &Val) { return iterator(TheMap.find_as(Val)); } template + [[nodiscard]] const_iterator find_as(const LookupKeyT &Val) const { return const_iterator(TheMap.find_as(Val)); } @@ -229,8 +237,9 @@ class DenseSetImpl { /// Equivalent to N calls to RHS.count. Amortized complexity is linear, worst /// case is O(N^2) (if every hash collides). template -bool operator==(const DenseSetImpl &LHS, - const DenseSetImpl &RHS) { +[[nodiscard]] bool +operator==(const DenseSetImpl &LHS, + const DenseSetImpl &RHS) { if (LHS.size() != RHS.size()) return false; @@ -245,8 +254,9 @@ bool operator==(const DenseSetImpl &LHS, /// /// Equivalent to !(LHS == RHS). See operator== for performance notes. template -bool operator!=(const DenseSetImpl &LHS, - const DenseSetImpl &RHS) { +[[nodiscard]] bool +operator!=(const DenseSetImpl &LHS, + const DenseSetImpl &RHS) { return !(LHS == RHS); } diff --git a/llvm/include/llvm/ADT/MapVector.h b/llvm/include/llvm/ADT/MapVector.h index 4a50126ff5aad..82f2c4977e01d 100644 --- a/llvm/include/llvm/ADT/MapVector.h +++ b/llvm/include/llvm/ADT/MapVector.h @@ -45,15 +45,15 @@ class MapVector { using const_reverse_iterator = typename VectorType::const_reverse_iterator; /// Clear the MapVector and return the underlying vector. - VectorType takeVector() { + [[nodiscard]] VectorType takeVector() { Map.clear(); return std::move(Vector); } /// Returns an array reference of the underlying vector. - ArrayRef getArrayRef() const { return Vector; } + [[nodiscard]] ArrayRef getArrayRef() const { return Vector; } - size_type size() const { return Vector.size(); } + [[nodiscard]] size_type size() const { return Vector.size(); } /// Grow the MapVector so that it can contain at least \p NumEntries items /// before resizing again. 
@@ -62,24 +62,28 @@ class MapVector { Vector.reserve(NumEntries); } - iterator begin() { return Vector.begin(); } - const_iterator begin() const { return Vector.begin(); } - iterator end() { return Vector.end(); } - const_iterator end() const { return Vector.end(); } + [[nodiscard]] iterator begin() { return Vector.begin(); } + [[nodiscard]] const_iterator begin() const { return Vector.begin(); } + [[nodiscard]] iterator end() { return Vector.end(); } + [[nodiscard]] const_iterator end() const { return Vector.end(); } - reverse_iterator rbegin() { return Vector.rbegin(); } - const_reverse_iterator rbegin() const { return Vector.rbegin(); } - reverse_iterator rend() { return Vector.rend(); } - const_reverse_iterator rend() const { return Vector.rend(); } - - bool empty() const { - return Vector.empty(); + [[nodiscard]] reverse_iterator rbegin() { return Vector.rbegin(); } + [[nodiscard]] const_reverse_iterator rbegin() const { + return Vector.rbegin(); } + [[nodiscard]] reverse_iterator rend() { return Vector.rend(); } + [[nodiscard]] const_reverse_iterator rend() const { return Vector.rend(); } + + [[nodiscard]] bool empty() const { return Vector.empty(); } - std::pair &front() { return Vector.front(); } - const std::pair &front() const { return Vector.front(); } - std::pair &back() { return Vector.back(); } - const std::pair &back() const { return Vector.back(); } + [[nodiscard]] std::pair &front() { return Vector.front(); } + [[nodiscard]] const std::pair &front() const { + return Vector.front(); + } + [[nodiscard]] std::pair &back() { return Vector.back(); } + [[nodiscard]] const std::pair &back() const { + return Vector.back(); + } void clear() { Map.clear(); @@ -96,7 +100,7 @@ class MapVector { } // Returns a copy of the value. Only allowed if ValueT is copyable. - ValueT lookup(const KeyT &Key) const { + [[nodiscard]] ValueT lookup(const KeyT &Key) const { static_assert(std::is_copy_constructible_v, "Cannot call lookup() if ValueT is not copyable."); typename MapType::const_iterator Pos = Map.find(Key); @@ -134,17 +138,21 @@ class MapVector { return Ret; } - bool contains(const KeyT &Key) const { return Map.find(Key) != Map.end(); } + [[nodiscard]] bool contains(const KeyT &Key) const { + return Map.find(Key) != Map.end(); + } - size_type count(const KeyT &Key) const { return contains(Key) ? 1 : 0; } + [[nodiscard]] size_type count(const KeyT &Key) const { + return contains(Key) ? 1 : 0; + } - iterator find(const KeyT &Key) { + [[nodiscard]] iterator find(const KeyT &Key) { typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end()? Vector.end() : (Vector.begin() + Pos->second); } - const_iterator find(const KeyT &Key) const { + [[nodiscard]] const_iterator find(const KeyT &Key) const { typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end()? Vector.end() : (Vector.begin() + Pos->second); diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h index 5f6db9a78a003..c129f3a695b9e 100644 --- a/llvm/include/llvm/ADT/SetVector.h +++ b/llvm/include/llvm/ADT/SetVector.h @@ -87,72 +87,54 @@ class SetVector { SetVector(llvm::from_range_t, Range &&R) : SetVector(adl_begin(R), adl_end(R)) {} - ArrayRef getArrayRef() const { return vector_; } + [[nodiscard]] ArrayRef getArrayRef() const { return vector_; } /// Clear the SetVector and return the underlying vector. - Vector takeVector() { + [[nodiscard]] Vector takeVector() { set_.clear(); return std::move(vector_); } /// Determine if the SetVector is empty or not. 
- bool empty() const { - return vector_.empty(); - } + [[nodiscard]] bool empty() const { return vector_.empty(); } /// Determine the number of elements in the SetVector. - size_type size() const { - return vector_.size(); - } + [[nodiscard]] size_type size() const { return vector_.size(); } /// Get an iterator to the beginning of the SetVector. - iterator begin() { - return vector_.begin(); - } + [[nodiscard]] iterator begin() { return vector_.begin(); } /// Get a const_iterator to the beginning of the SetVector. - const_iterator begin() const { - return vector_.begin(); - } + [[nodiscard]] const_iterator begin() const { return vector_.begin(); } /// Get an iterator to the end of the SetVector. - iterator end() { - return vector_.end(); - } + [[nodiscard]] iterator end() { return vector_.end(); } /// Get a const_iterator to the end of the SetVector. - const_iterator end() const { - return vector_.end(); - } + [[nodiscard]] const_iterator end() const { return vector_.end(); } /// Get an reverse_iterator to the end of the SetVector. - reverse_iterator rbegin() { - return vector_.rbegin(); - } + [[nodiscard]] reverse_iterator rbegin() { return vector_.rbegin(); } /// Get a const_reverse_iterator to the end of the SetVector. - const_reverse_iterator rbegin() const { + [[nodiscard]] const_reverse_iterator rbegin() const { return vector_.rbegin(); } /// Get a reverse_iterator to the beginning of the SetVector. - reverse_iterator rend() { - return vector_.rend(); - } + [[nodiscard]] reverse_iterator rend() { return vector_.rend(); } /// Get a const_reverse_iterator to the beginning of the SetVector. - const_reverse_iterator rend() const { - return vector_.rend(); - } + [[nodiscard]] const_reverse_iterator rend() const { return vector_.rend(); } /// Return the first element of the SetVector. - const value_type &front() const { + [[nodiscard]] const value_type &front() const { assert(!empty() && "Cannot call front() on empty SetVector!"); return vector_.front(); } /// Return the last element of the SetVector. - const value_type &back() const { + [[nodiscard]] const value_type &back() const { assert(!empty() && "Cannot call back() on empty SetVector!"); return vector_.back(); } @@ -299,11 +281,11 @@ class SetVector { return Ret; } - bool operator==(const SetVector &that) const { + [[nodiscard]] bool operator==(const SetVector &that) const { return vector_ == that.vector_; } - bool operator!=(const SetVector &that) const { + [[nodiscard]] bool operator!=(const SetVector &that) const { return vector_ != that.vector_; } diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h index 665ecd03a58f8..e24cd6415b687 100644 --- a/llvm/include/llvm/ADT/SmallPtrSet.h +++ b/llvm/include/llvm/ADT/SmallPtrSet.h @@ -96,8 +96,8 @@ class SmallPtrSetImplBase : public DebugEpochBase { SmallPtrSetImplBase &operator=(const SmallPtrSetImplBase &) = delete; [[nodiscard]] bool empty() const { return size() == 0; } - size_type size() const { return NumEntries; } - size_type capacity() const { return CurArraySize; } + [[nodiscard]] size_type size() const { return NumEntries; } + [[nodiscard]] size_type capacity() const { return CurArraySize; } void clear() { incrementEpoch(); @@ -344,7 +344,7 @@ class SmallPtrSetIterator : public SmallPtrSetIteratorImpl { // Most methods are provided by the base class. 
- const PtrTy operator*() const { + [[nodiscard]] const PtrTy operator*() const { return PtrTraits::getFromVoidPointer(dereference()); } @@ -452,13 +452,13 @@ template class SmallPtrSetImpl : public SmallPtrSetImplBase { } /// count - Return 1 if the specified pointer is in the set, 0 otherwise. - size_type count(ConstPtrType Ptr) const { + [[nodiscard]] size_type count(ConstPtrType Ptr) const { return contains_imp(ConstPtrTraits::getAsVoidPointer(Ptr)); } - iterator find(ConstPtrType Ptr) const { + [[nodiscard]] iterator find(ConstPtrType Ptr) const { return makeIterator(find_imp(ConstPtrTraits::getAsVoidPointer(Ptr))); } - bool contains(ConstPtrType Ptr) const { + [[nodiscard]] bool contains(ConstPtrType Ptr) const { return contains_imp(ConstPtrTraits::getAsVoidPointer(Ptr)); } @@ -475,12 +475,12 @@ template class SmallPtrSetImpl : public SmallPtrSetImplBase { insert(adl_begin(R), adl_end(R)); } - iterator begin() const { + [[nodiscard]] iterator begin() const { if (shouldReverseIterate()) return makeIterator(EndPointer() - 1); return makeIterator(CurArray); } - iterator end() const { return makeIterator(EndPointer()); } + [[nodiscard]] iterator end() const { return makeIterator(EndPointer()); } private: /// Create an iterator that dereferences to same place as the given pointer. @@ -496,8 +496,8 @@ template class SmallPtrSetImpl : public SmallPtrSetImplBase { /// Iterates over elements of LHS confirming that each value from LHS is also in /// RHS, and that no additional values are in RHS. template -bool operator==(const SmallPtrSetImpl &LHS, - const SmallPtrSetImpl &RHS) { +[[nodiscard]] bool operator==(const SmallPtrSetImpl &LHS, + const SmallPtrSetImpl &RHS) { if (LHS.size() != RHS.size()) return false; @@ -512,8 +512,8 @@ bool operator==(const SmallPtrSetImpl &LHS, /// /// Equivalent to !(LHS == RHS). template -bool operator!=(const SmallPtrSetImpl &LHS, - const SmallPtrSetImpl &RHS) { +[[nodiscard]] bool operator!=(const SmallPtrSetImpl &LHS, + const SmallPtrSetImpl &RHS) { return !(LHS == RHS); } diff --git a/llvm/include/llvm/ADT/SmallSet.h b/llvm/include/llvm/ADT/SmallSet.h index 0e90293352630..3ca833f15eed3 100644 --- a/llvm/include/llvm/ADT/SmallSet.h +++ b/llvm/include/llvm/ADT/SmallSet.h @@ -167,12 +167,14 @@ class SmallSet { [[nodiscard]] bool empty() const { return Vector.empty() && Set.empty(); } - size_type size() const { + [[nodiscard]] size_type size() const { return isSmall() ? Vector.size() : Set.size(); } /// count - Return 1 if the element is in the set, 0 otherwise. - size_type count(const T &V) const { return contains(V) ? 1 : 0; } + [[nodiscard]] size_type count(const T &V) const { + return contains(V) ? 1 : 0; + } /// insert - Insert an element into the set if it isn't already there. /// Returns a pair. The first value of it is an iterator to the inserted @@ -210,20 +212,20 @@ class SmallSet { Set.clear(); } - const_iterator begin() const { + [[nodiscard]] const_iterator begin() const { if (isSmall()) return {Vector.begin()}; return {Set.begin()}; } - const_iterator end() const { + [[nodiscard]] const_iterator end() const { if (isSmall()) return {Vector.end()}; return {Set.end()}; } /// Check if the SmallSet contains the given element. - bool contains(const T &V) const { + [[nodiscard]] bool contains(const T &V) const { if (isSmall()) return vfind(V) != Vector.end(); return Set.find(V) != Set.end(); @@ -279,7 +281,8 @@ class SmallSet : public SmallPtrSet {}; /// For large-set mode amortized complexity is linear, worst case is O(N^2) (if /// every hash collides). 
template
-bool operator==(const SmallSet &LHS, const SmallSet &RHS) {
+[[nodiscard]] bool operator==(const SmallSet &LHS,
+                              const SmallSet &RHS) {
   if (LHS.size() != RHS.size())
     return false;
@@ -291,7 +294,8 @@ bool operator==(const SmallSet &LHS, const SmallSet &RHS) {
 ///
 /// Equivalent to !(LHS == RHS). See operator== for performance notes.
 template
-bool operator!=(const SmallSet &LHS, const SmallSet &RHS) {
+[[nodiscard]] bool operator!=(const SmallSet &LHS,
+                              const SmallSet &RHS) {
   return !(LHS == RHS);
 }
diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp
index 116693c873f30..ca9f9f17ee112 100644
--- a/llvm/unittests/ADT/APIntTest.cpp
+++ b/llvm/unittests/ADT/APIntTest.cpp
@@ -3718,8 +3718,9 @@ TEST(APIntTest, ScaleBitMask) {
 TEST(APIntTest, DenseMap) {
   DenseMap Map;
   APInt ZeroWidthInt(0, 0, false);
-  Map.insert({ZeroWidthInt, 0});
-  Map.find(ZeroWidthInt);
+  Map.insert({ZeroWidthInt, 123});
+  auto It = Map.find(ZeroWidthInt);
+  EXPECT_EQ(It->second, 123);
 }
 
 TEST(APIntTest, TryExt) {

From 798ccd2e4722b228317e8a30dd3624a0308a927e Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 27 Sep 2025 09:05:16 -0700
Subject: [PATCH 009/878] [Support] Deprecate one form of support::endian::read
 (NFC) (#160979)

This is a follow-up to #156140, which deprecated one form of write.
We have two forms of read:

  template <typename value_type, std::size_t alignment = unaligned>
  [[nodiscard]] inline value_type read(const void *memory, endianness endian)

  template <typename value_type, endianness endian,
            std::size_t alignment = unaligned>
  [[nodiscard]] inline value_type read(const void *memory)

The difference is that endian is a function parameter in the former but a
template parameter in the latter. This patch streamlines the code by
migrating the use of the latter to the former while deprecating the latter.
---
 llvm/include/llvm/Support/Endian.h        | 12 +++++++-----
 llvm/lib/CGData/CodeGenDataReader.cpp     |  4 ++--
 llvm/lib/ProfileData/InstrProfReader.cpp  |  8 ++++----
 llvm/lib/ProfileData/SampleProfReader.cpp |  4 ++--
 llvm/unittests/Support/EndianTest.cpp     | 13 ++++++-------
 5 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h
index 7eb1d7e8dfe7f..6c86feb78053c 100644
--- a/llvm/include/llvm/Support/Endian.h
+++ b/llvm/include/llvm/Support/Endian.h
@@ -66,7 +66,9 @@ template
 }
 
 template
-[[nodiscard]] inline value_type read(const void *memory) {
+[[nodiscard]] LLVM_DEPRECATED("Pass endian as a function argument instead",
+                              "read") inline value_type
+read(const void *memory) {
   return read(memory, endian);
 }
 
@@ -127,7 +129,7 @@ template
                                          uint64_t startBit) {
   assert(startBit < 8);
   if (startBit == 0)
-    return read(memory);
+    return read(memory, endian);
   else {
     // Read two values and compose the result from them.
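// Aside on the deprecation above -- a sketch of the caller-side migration,
// illustrative only (`Buf` is a hypothetical byte buffer; namespaces are
// spelled out for clarity):
//
//   // Before: endianness baked in as a template argument (now deprecated).
//   uint32_t A =
//       llvm::support::endian::read<uint32_t, llvm::endianness::little>(Buf);
//
//   // After: endianness passed as an ordinary function argument.
//   uint32_t B =
//       llvm::support::endian::read<uint32_t>(Buf, llvm::endianness::little);
//
// The call-site updates in CodeGenDataReader.cpp and InstrProfReader.cpp
// below follow exactly this pattern.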
value_type val[2]; @@ -223,8 +225,8 @@ struct packed_endian_specific_integral { explicit packed_endian_specific_integral(value_type val) { *this = val; } value_type value() const { - return endian::read( - (const void*)Value.buffer); + return endian::read((const void *)Value.buffer, + endian); } operator value_type() const { return value(); } @@ -263,7 +265,7 @@ struct packed_endian_specific_integral { explicit ref(void *Ptr) : Ptr(Ptr) {} operator value_type() const { - return endian::read(Ptr); + return endian::read(Ptr, endian); } void operator=(value_type NewValue) { diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index fc59be8df525a..3fd8cfe1a8762 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -169,8 +169,8 @@ bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) { if (DataBuffer.getBufferSize() < sizeof(IndexedCGData::Magic)) return false; - uint64_t Magic = endian::read( - DataBuffer.getBufferStart()); + uint64_t Magic = endian::read(DataBuffer.getBufferStart(), + llvm::endianness::little); // Verify that it's magical. return Magic == IndexedCGData::Magic; } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 886add7131da2..1da92eafa4b4a 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1171,8 +1171,8 @@ bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { if (DataBuffer.getBufferSize() < 8) return false; - uint64_t Magic = endian::read( - DataBuffer.getBufferStart()); + uint64_t Magic = endian::read(DataBuffer.getBufferStart(), + llvm::endianness::little); // Verify that it's magical. return Magic == IndexedInstrProf::Magic; } @@ -1598,8 +1598,8 @@ Error IndexedInstrProfReader::getFunctionBitmap(StringRef FuncName, std::memset(W, 0, sizeof(W)); std::memcpy(W, &BitmapBytes[I], N); I += N; - return support::endian::read(W); + return support::endian::read( + W, llvm::endianness::little); }, Bitmap, Bitmap); assert(I == E); diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 81ae792e70b99..766c0814ca067 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1290,8 +1290,8 @@ SampleProfileReaderExtBinaryBase::readNameTableSec(bool IsMD5, NameTable.reserve(*Size); for (size_t I = 0; I < *Size; ++I) { using namespace support; - uint64_t FID = endian::read( - Data + I * sizeof(uint64_t)); + uint64_t FID = endian::read( + Data + I * sizeof(uint64_t), endianness::little); NameTable.emplace_back(FunctionId(FID)); } if (!ProfileIsCS) diff --git a/llvm/unittests/Support/EndianTest.cpp b/llvm/unittests/Support/EndianTest.cpp index c48b7707b7751..0ee631db74ac1 100644 --- a/llvm/unittests/Support/EndianTest.cpp +++ b/llvm/unittests/Support/EndianTest.cpp @@ -24,16 +24,15 @@ TEST(Endian, Read) { unsigned char littleval[] = {0x00, 0x04, 0x03, 0x02, 0x01}; int32_t BigAsHost = 0x00010203; EXPECT_EQ(BigAsHost, - (endian::read(bigval))); + (endian::read(bigval, llvm::endianness::big))); int32_t LittleAsHost = 0x02030400; - EXPECT_EQ( - LittleAsHost, - (endian::read(littleval))); + EXPECT_EQ(LittleAsHost, (endian::read( + littleval, llvm::endianness::little))); EXPECT_EQ( - (endian::read(bigval + 1)), - (endian::read(littleval + - 1))); + (endian::read(bigval + 1, llvm::endianness::big)), + (endian::read(littleval + 1, + llvm::endianness::little))); } TEST(Endian, 
WriteNext) {

From d70490c6940f0bca4f13be199396701249876685 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 27 Sep 2025 09:05:24 -0700
Subject: [PATCH 010/878] [llvm] Proofread BuildingADistribution.rst (#160980)

---
 llvm/docs/BuildingADistribution.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/docs/BuildingADistribution.rst b/llvm/docs/BuildingADistribution.rst
index 10e571cdea3f9..81ed8b8723a26 100644
--- a/llvm/docs/BuildingADistribution.rst
+++ b/llvm/docs/BuildingADistribution.rst
@@ -13,8 +13,8 @@ combination of its sub-project tools for distribution. This document covers
 useful features of the LLVM build system as well as best practices and general
 information about packaging LLVM.
 
-If you are new to CMake you may find the :doc:`CMake` or :doc:`CMakePrimer`
-documentation useful. Some of the things covered in this document are the inner
+If you are new to CMake, you may find the :doc:`CMake` or :doc:`CMakePrimer`
+documentation useful. This document covers some of the inner
 workings of the builds described in the :doc:`AdvancedBuilds` document.
 
 General Distribution Guidance
@@ -27,7 +27,7 @@ compiler. This is done so that the compiler you distribute benefits from all the
 bug fixes, performance optimizations and general improvements provided by the
 new compiler.
 
-In deciding how to build your distribution there are a few trade-offs that you
+In deciding how to build your distribution, there are a few trade-offs that you
 will need to evaluate. The big two are:
 
 #. Compile time of the distribution against performance of the built compiler
@@ -41,8 +41,8 @@ opportunity for the compiler to optimize.
 
 The guidance for minimizing distribution size is to dynamically link LLVM and
 Clang libraries into the tools to reduce code duplication. This will come at a
-substantial performance penalty to the generated binary both because it reduces
-optimization opportunity, and because dynamic linking requires resolving symbols
+substantial performance penalty to the generated binary, both because it reduces
+optimization opportunities and because dynamic linking requires resolving symbols
 at process launch time, which can be very slow for C++ code.
 
 .. _shared_libs:
 
@@ -76,7 +76,7 @@ LLVM testing tools. Alternatively the ``install-distribution`` target, which is
 recommended for building distributions, only installs specific parts of LLVM as
 specified at configuration time by *LLVM_DISTRIBUTION_COMPONENTS*.
 
-Additionally by default the ``install`` target will install the LLVM testing
+Additionally, by default, the ``install`` target will install the LLVM testing
 tools as the public tools. This can be changed well by setting
 *LLVM_INSTALL_TOOLCHAIN_ONLY* to ``On``.
The LLVM tools are intended for development and testing of LLVM, and should only be included in distributions From 0df3651802d35b26ae857b549de9edf73b67fb98 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 27 Sep 2025 17:43:58 +0100 Subject: [PATCH 011/878] [X86] matchVPMADD52 - only use 512-bit MADD52 on AVX512IFMA targets (#161011) If we have a AVX512 target capable of AVXIFMA but not AVX512IFMA then we must split 512-bit (or larger) types to 256-bits Fixes #160928 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +- .../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 619 +++++++++++------- 2 files changed, 392 insertions(+), 235 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3af673d951f65..efeddd7c9bd4b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4452,11 +4452,12 @@ static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, template SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef Ops, - F Builder, bool CheckBWI = true) { + F Builder, bool CheckBWI = true, + bool AllowAVX512 = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; if ((CheckBWI && Subtarget.useBWIRegs()) || - (!CheckBWI && Subtarget.useAVX512Regs())) { + (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -58076,7 +58077,8 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder, - /*CheckBWI*/ false); + /*CheckBWI*/ false, + /*AllowAVX512*/ Subtarget.hasIFMA()); } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll index aebfc7d483d6f..3ece4beb9c22e 100644 --- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll +++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll @@ -1,25 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX,AVXIFMA ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512-NOIFMA ; 67108863 == (1 << 26) - 1 ; 4503599627370496 == (1 << 52) ; 4503599627370495 == (1 << 52) - 1 define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_combine: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0 -; AVX-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 -; AVX-NEXT: vmovdqa %ymm4, %ymm0 -; AVX-NEXT: vmovdqa %ymm5, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_combine: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 
= [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0 +; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 +; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_combine: ; AVX512: # %bb.0: @@ -29,6 +30,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { ; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_combine: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm4, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 67108863) %y_masked = and <8 x i64> %y, splat (i64 67108863) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -37,19 +51,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { } define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_combine_v2: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623] -; AVX-NEXT: vpand %ymm7, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0 -; AVX-NEXT: vpand %ymm7, %ymm1, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 -; AVX-NEXT: vmovdqa %ymm4, %ymm0 -; AVX-NEXT: vmovdqa %ymm5, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_combine_v2: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623] +; AVXIFMA-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0 +; AVXIFMA-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 +; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_combine_v2: ; AVX512: # %bb.0: @@ -58,6 +72,18 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) ; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_combine_v2: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} 
vpmadd52luq %ymm4, %ymm5, %ymm3 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1 %y_masked = and <8 x i64> %y, splat (i64 3) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -66,32 +92,32 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) } define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_no_combine: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495] -; AVX-NEXT: vpand %ymm6, %ymm0, %ymm7 -; AVX-NEXT: vpand %ymm6, %ymm1, %ymm8 -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm9 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm6 -; AVX-NEXT: vpsrlq $32, %ymm8, %ymm8 -; AVX-NEXT: vpmuludq %ymm3, %ymm8, %ymm8 -; AVX-NEXT: vpsrlq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 -; AVX-NEXT: vpaddq %ymm6, %ymm8, %ymm6 -; AVX-NEXT: vpsllq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpsrlq $32, %ymm7, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm9, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 -; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_no_combine: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495] +; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm8 +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm9 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm6 +; AVXIFMA-NEXT: vpsrlq $32, %ymm8, %ymm8 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm8, %ymm8 +; AVXIFMA-NEXT: vpsrlq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm8, %ymm6 +; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpsrlq $32, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsrlq $32, %ymm9, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_no_combine: ; AVX512: # %bb.0: @@ -108,6 +134,22 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_no_combine: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495] +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm4 +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm4, %zmm4 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm4, %zmm4 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: 
vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 4503599627370495) %y_masked = and <8 x i64> %y, splat (i64 4503599627370495) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -116,27 +158,27 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) } define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_no_combine_v2: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlq $32, %ymm1, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm6, %ymm6 -; AVX-NEXT: vpsrlq $32, %ymm3, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm1, %ymm7 -; AVX-NEXT: vpaddq %ymm6, %ymm7, %ymm6 -; AVX-NEXT: vpsllq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm2, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 -; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_no_combine_v2: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpsrlq $32, %ymm1, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpsrlq $32, %ymm3, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm1, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm7, %ymm6 +; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsrlq $32, %ymm2, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_no_combine_v2: ; AVX512: # %bb.0: @@ -150,6 +192,19 @@ define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_no_combine_v2: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm1, %zmm4 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm4, %zmm3 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %mul = mul <8 x i64> %x, %y %res = add <8 x i64> %mul, %z ret <8 x i64> %res @@ -255,25 +310,25 @@ define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) ; 40-bit and 13-bit, too wide define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_mixed_width_too_wide: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = 
[8191,8191,8191,8191] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052] -; AVX-NEXT: vpshufb %ymm6, %ymm1, %ymm7 -; AVX-NEXT: vpmuludq %ymm3, %ymm7, %ymm7 -; AVX-NEXT: vpsllq $32, %ymm7, %ymm7 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm1, %ymm5, %ymm1 -; AVX-NEXT: vpaddq %ymm7, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_mixed_width_too_wide: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052] +; AVXIFMA-NEXT: vpshufb %ymm6, %ymm1, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm7, %ymm7 +; AVXIFMA-NEXT: vpsllq $32, %ymm7, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpshufb %ymm6, %ymm0, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm1, %ymm5, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm7, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_mixed_width_too_wide: ; AVX512: # %bb.0: @@ -286,6 +341,18 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64 ; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_mixed_width_too_wide: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x40 = and <8 x i64> %x, splat (i64 1099511627775) %y13 = and <8 x i64> %y, splat (i64 8191) %mul = mul <8 x i64> %x40, %y13 @@ -294,19 +361,19 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64 } define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) { -; AVX-LABEL: test_zext32_inputs_not_safe: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpmuludq %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpaddq %ymm4, %ymm2, %ymm0 -; AVX-NEXT: vpaddq %ymm1, %ymm3, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_zext32_inputs_not_safe: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm4 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVXIFMA-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVXIFMA-NEXT: vpmuludq %ymm5, %ymm4, %ymm4 +; AVXIFMA-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVXIFMA-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm2, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm1, %ymm3, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_zext32_inputs_not_safe: ; AVX512: # %bb.0: @@ -315,6 +382,14 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_zext32_inputs_not_safe: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x = zext <8 x i32> %xi32 to <8 x i64> %y = zext <8 x i32> %yi32 to <8 x i64> %mul = mul <8 x i64> %x, %y @@ -323,36 +398,36 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, } define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) nounwind { -; AVX-LABEL: test_1024_combine_split: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $32, %rsp -; AVX-NEXT: vmovdqa 112(%rbp), %ymm8 -; AVX-NEXT: vmovdqa 80(%rbp), %ymm9 -; AVX-NEXT: vmovdqa 48(%rbp), %ymm10 -; AVX-NEXT: vmovdqa 16(%rbp), %ymm11 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm3, %ymm12, %ymm3 -; AVX-NEXT: vpand %ymm2, %ymm12, %ymm2 -; AVX-NEXT: vpand %ymm1, %ymm12, %ymm1 -; AVX-NEXT: vpand %ymm0, %ymm12, %ymm0 -; AVX-NEXT: vpand %ymm7, %ymm12, %ymm7 -; AVX-NEXT: {vex} vpmadd52luq %ymm7, %ymm3, %ymm8 -; AVX-NEXT: vpand %ymm6, %ymm12, %ymm3 -; AVX-NEXT: {vex} vpmadd52luq %ymm3, %ymm2, %ymm9 -; AVX-NEXT: vpand %ymm5, %ymm12, %ymm2 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10 -; AVX-NEXT: vpand %ymm4, %ymm12, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11 -; AVX-NEXT: vmovdqa %ymm11, %ymm0 -; AVX-NEXT: vmovdqa %ymm10, %ymm1 -; AVX-NEXT: vmovdqa %ymm9, %ymm2 -; AVX-NEXT: vmovdqa %ymm8, %ymm3 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_1024_combine_split: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: pushq %rbp +; AVXIFMA-NEXT: movq %rsp, %rbp +; AVXIFMA-NEXT: andq $-32, %rsp +; AVXIFMA-NEXT: subq $32, %rsp +; AVXIFMA-NEXT: vmovdqa 112(%rbp), %ymm8 +; AVXIFMA-NEXT: vmovdqa 80(%rbp), %ymm9 +; AVXIFMA-NEXT: vmovdqa 48(%rbp), %ymm10 +; AVXIFMA-NEXT: vmovdqa 16(%rbp), %ymm11 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm3, %ymm12, %ymm3 +; AVXIFMA-NEXT: vpand %ymm2, %ymm12, %ymm2 +; AVXIFMA-NEXT: vpand %ymm1, %ymm12, %ymm1 +; AVXIFMA-NEXT: vpand %ymm0, %ymm12, %ymm0 +; AVXIFMA-NEXT: vpand %ymm7, %ymm12, %ymm7 +; AVXIFMA-NEXT: {vex} 
vpmadd52luq %ymm7, %ymm3, %ymm8 +; AVXIFMA-NEXT: vpand %ymm6, %ymm12, %ymm3 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm2, %ymm9 +; AVXIFMA-NEXT: vpand %ymm5, %ymm12, %ymm2 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10 +; AVXIFMA-NEXT: vpand %ymm4, %ymm12, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11 +; AVXIFMA-NEXT: vmovdqa %ymm11, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm10, %ymm1 +; AVXIFMA-NEXT: vmovdqa %ymm9, %ymm2 +; AVXIFMA-NEXT: vmovdqa %ymm8, %ymm3 +; AVXIFMA-NEXT: movq %rbp, %rsp +; AVXIFMA-NEXT: popq %rbp +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_1024_combine_split: ; AVX512: # %bb.0: @@ -366,6 +441,27 @@ define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_1024_combine_split: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm2, %zmm2 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm6 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm7 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm6, %ymm7, %ymm8 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm5, %ymm6 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm4, %ymm6 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm1, %ymm5 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm1 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <16 x i64> %x, splat (i64 67108863) %y_masked = and <16 x i64> %y, splat (i64 67108863) %mul = mul <16 x i64> %x_masked, %y_masked @@ -388,13 +484,13 @@ define <1 x i64> @test_not_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) { } define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { -; AVX-LABEL: test_v3i64: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v3i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: retq ; ; AVX512-NOVL-LABEL: test_v3i64: ; AVX512-NOVL: # %bb.0: @@ -410,6 +506,13 @@ define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { ; AVX512VL-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v3i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <3 x i64> %x, splat (i64 67108863) %y_masked = and <3 x i64> %x, splat (i64 67108863) %mul = mul <3 x i64> %x_masked, %y_masked @@ -418,35 +521,35 @@ define <3 x i64> 
@test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { } define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { -; AVX-LABEL: test_v5i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %r9, %xmm4 -; AVX-NEXT: vpand %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 -; AVX-NEXT: vpsllq $33, %xmm4, %xmm4 -; AVX-NEXT: vpmuludq %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpaddq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 -; AVX-NEXT: vmovdqa %ymm2, (%rdi) -; AVX-NEXT: vmovq %xmm1, 32(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v5i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVXIFMA-NEXT: vmovq %rcx, %xmm3 +; AVXIFMA-NEXT: vmovq %r9, %xmm4 +; AVXIFMA-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVXIFMA-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVXIFMA-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 +; AVXIFMA-NEXT: vpsllq $33, %xmm4, %xmm4 +; AVXIFMA-NEXT: vpmuludq %xmm3, %xmm3, %xmm3 +; AVXIFMA-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVXIFMA-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 +; AVXIFMA-NEXT: vmovdqa %ymm2, (%rdi) +; AVXIFMA-NEXT: vmovq %xmm1, 32(%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v5i64: ; AVX512: # %bb.0: @@ -454,6 +557,13 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { ; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v5i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <5 x i64> %x, splat (i64 67108863) %y_masked = and <5 x i64> %x, splat (i64 67108863) %mul = mul <5 x i64> %x_masked, %y_masked @@ -462,30 +572,30 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { } define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { -; AVX-LABEL: test_v6i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, 
%xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm1 -; AVX-NEXT: vmovq %r9, %xmm0 -; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpmuldq %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rdi) -; AVX-NEXT: vmovdqa %ymm1, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v6i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm1 +; AVXIFMA-NEXT: vmovq %r9, %xmm0 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVXIFMA-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVXIFMA-NEXT: vpmuldq %xmm0, %xmm0, %xmm0 +; AVXIFMA-NEXT: vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVXIFMA-NEXT: vmovdqa %xmm0, 32(%rdi) +; AVXIFMA-NEXT: vmovdqa %ymm1, (%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v6i64: ; AVX512: # %bb.0: @@ -493,6 +603,13 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { ; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v6i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <6 x i64> %x, splat (i64 67108863) %y_masked = and <6 x i64> %x, splat (i64 67108863) %mul = mul <6 x i64> %x_masked, %y_masked @@ -501,43 +618,43 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { } define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) { -; AVX-LABEL: test_v9i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovq %r9, %xmm1 -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF -; AVX-NEXT: vmovq %rcx, %xmm5 
-; AVX-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero -; AVX-NEXT: vpand %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpsrlq $32, %xmm5, %xmm6 -; AVX-NEXT: vpmuludq %xmm6, %xmm5, %xmm6 -; AVX-NEXT: vpsllq $33, %xmm6, %xmm6 -; AVX-NEXT: vpmuludq %xmm5, %xmm5, %xmm5 -; AVX-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vpaddq %xmm6, %xmm2, %xmm2 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm4 -; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm1, %ymm3 -; AVX-NEXT: vmovdqa %ymm3, 32(%rdi) -; AVX-NEXT: vmovdqa %ymm4, (%rdi) -; AVX-NEXT: vmovq %xmm2, 64(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v9i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovq %r9, %xmm1 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVXIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVXIFMA-NEXT: vmovq %rcx, %xmm5 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVXIFMA-NEXT: vpand %xmm5, %xmm6, %xmm5 +; AVXIFMA-NEXT: vpsrlq $32, %xmm5, %xmm6 +; AVXIFMA-NEXT: vpmuludq %xmm6, %xmm5, %xmm6 +; AVXIFMA-NEXT: vpsllq $33, %xmm6, %xmm6 +; AVXIFMA-NEXT: vpmuludq %xmm5, %xmm5, %xmm5 +; AVXIFMA-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVXIFMA-NEXT: vpaddq %xmm6, %xmm2, %xmm2 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm4 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm1, %ymm3 +; AVXIFMA-NEXT: vmovdqa %ymm3, 32(%rdi) +; AVXIFMA-NEXT: vmovdqa %ymm4, (%rdi) +; AVXIFMA-NEXT: vmovq %xmm2, 64(%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v9i64: ; AVX512: # %bb.0: @@ -572,6 +689,44 @@ define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) { ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v9i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: movq %rdi, %rax +; AVX512-NOIFMA-NEXT: vmovq %r8, %xmm0 +; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm1 +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NOIFMA-NEXT: vmovq %rdx, %xmm1 +; AVX512-NOIFMA-NEXT: vmovq %rsi, %xmm2 +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NOIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NOIFMA-NEXT: vmovq %r9, %xmm1 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NOIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm2 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpand %xmm2, 
%xmm3, %xmm2 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %xmm2, %xmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %xmm3, %xmm2, %xmm3 +; AVX512-NOIFMA-NEXT: vpsllq $33, %xmm3, %xmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %xmm2, %xmm2, %xmm2 +; AVX512-NOIFMA-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512-NOIFMA-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm2, %ymm3 +; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: vmovq %xmm1, 64(%rdi) +; AVX512-NOIFMA-NEXT: vmovdqa64 %zmm0, (%rdi) +; AVX512-NOIFMA-NEXT: vzeroupper +; AVX512-NOIFMA-NEXT: retq %x_masked = and <9 x i64> %x, splat (i64 67108863) %y_masked = and <9 x i64> %x, splat (i64 67108863) %mul = mul <9 x i64> %x_masked, %y_masked From 3834c5428dcdd8807331b2ff522115dba2f89aae Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sat, 27 Sep 2025 14:54:33 -0400 Subject: [PATCH 012/878] [MLIR][Python] add unchecked gettors (#160954) Some of the current gettors require passing locations (i.e., there be an active location) because they're using the "checked" APIs. This PR adds "unchecked" gettors which only require an active context. --- mlir/lib/Bindings/Python/DialectLLVM.cpp | 46 ++++++--- mlir/lib/Bindings/Python/IRAttributes.cpp | 12 +++ mlir/lib/Bindings/Python/IRTypes.cpp | 116 +++++++++++++++++++++- mlir/test/python/ir/builtin_types.py | 11 +- 4 files changed, 162 insertions(+), 23 deletions(-) diff --git a/mlir/lib/Bindings/Python/DialectLLVM.cpp b/mlir/lib/Bindings/Python/DialectLLVM.cpp index 55b9331270cdc..38de4a0e329a0 100644 --- a/mlir/lib/Bindings/Python/DialectLLVM.cpp +++ b/mlir/lib/Bindings/Python/DialectLLVM.cpp @@ -33,21 +33,37 @@ static void populateDialectLLVMSubmodule(const nanobind::module_ &m) { auto llvmStructType = mlir_type_subclass(m, "StructType", mlirTypeIsALLVMStructType); - llvmStructType.def_classmethod( - "get_literal", - [](const nb::object &cls, const std::vector &elements, - bool packed, MlirLocation loc) { - CollectDiagnosticsToStringScope scope(mlirLocationGetContext(loc)); - - MlirType type = mlirLLVMStructTypeLiteralGetChecked( - loc, elements.size(), elements.data(), packed); - if (mlirTypeIsNull(type)) { - throw nb::value_error(scope.takeMessage().c_str()); - } - return cls(type); - }, - "cls"_a, "elements"_a, nb::kw_only(), "packed"_a = false, - "loc"_a = nb::none()); + llvmStructType + .def_classmethod( + "get_literal", + [](const nb::object &cls, const std::vector &elements, + bool packed, MlirLocation loc) { + CollectDiagnosticsToStringScope scope(mlirLocationGetContext(loc)); + + MlirType type = mlirLLVMStructTypeLiteralGetChecked( + loc, elements.size(), elements.data(), packed); + if (mlirTypeIsNull(type)) { + throw nb::value_error(scope.takeMessage().c_str()); + } + return cls(type); + }, + "cls"_a, "elements"_a, nb::kw_only(), "packed"_a = false, + "loc"_a = nb::none()) + .def_classmethod( + "get_literal_unchecked", + [](const nb::object &cls, const std::vector &elements, + bool packed, MlirContext context) { + CollectDiagnosticsToStringScope scope(context); + + MlirType type = mlirLLVMStructTypeLiteralGet( + context, elements.size(), elements.data(), packed); + if (mlirTypeIsNull(type)) { + throw nb::value_error(scope.takeMessage().c_str()); + } + return cls(type); + }, + "cls"_a, "elements"_a, nb::kw_only(), "packed"_a = false, 
+ "context"_a = nb::none()); llvmStructType.def_classmethod( "get_identified", diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp index c77653f97e6dd..045c0fbf4630f 100644 --- a/mlir/lib/Bindings/Python/IRAttributes.cpp +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -575,6 +575,18 @@ class PyFloatAttribute : public PyConcreteAttribute { }, nb::arg("type"), nb::arg("value"), nb::arg("loc") = nb::none(), "Gets an uniqued float point attribute associated to a type"); + c.def_static( + "get_unchecked", + [](PyType &type, double value, DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirAttribute attr = + mlirFloatAttrDoubleGet(context.get()->get(), type, value); + if (mlirAttributeIsNull(attr)) + throw MLIRError("Invalid attribute", errors.take()); + return PyFloatAttribute(type.getContext(), attr); + }, + nb::arg("type"), nb::arg("value"), nb::arg("context") = nb::none(), + "Gets an uniqued float point attribute associated to a type"); c.def_static( "get_f32", [](double value, DefaultingPyMlirContext context) { diff --git a/mlir/lib/Bindings/Python/IRTypes.cpp b/mlir/lib/Bindings/Python/IRTypes.cpp index 07dc00521833f..3488d92250b45 100644 --- a/mlir/lib/Bindings/Python/IRTypes.cpp +++ b/mlir/lib/Bindings/Python/IRTypes.cpp @@ -639,11 +639,16 @@ class PyVectorType : public PyConcreteType { using PyConcreteType::PyConcreteType; static void bindDerived(ClassTy &c) { - c.def_static("get", &PyVectorType::get, nb::arg("shape"), + c.def_static("get", &PyVectorType::getChecked, nb::arg("shape"), nb::arg("element_type"), nb::kw_only(), nb::arg("scalable") = nb::none(), nb::arg("scalable_dims") = nb::none(), nb::arg("loc") = nb::none(), "Create a vector type") + .def_static("get_unchecked", &PyVectorType::get, nb::arg("shape"), + nb::arg("element_type"), nb::kw_only(), + nb::arg("scalable") = nb::none(), + nb::arg("scalable_dims") = nb::none(), + nb::arg("context") = nb::none(), "Create a vector type") .def_prop_ro( "scalable", [](MlirType self) { return mlirVectorTypeIsScalable(self); }) @@ -658,10 +663,11 @@ class PyVectorType : public PyConcreteType { } private: - static PyVectorType get(std::vector shape, PyType &elementType, - std::optional scalable, - std::optional> scalableDims, - DefaultingPyLocation loc) { + static PyVectorType + getChecked(std::vector shape, PyType &elementType, + std::optional scalable, + std::optional> scalableDims, + DefaultingPyLocation loc) { if (scalable && scalableDims) { throw nb::value_error("'scalable' and 'scalable_dims' kwargs " "are mutually exclusive."); @@ -696,6 +702,42 @@ class PyVectorType : public PyConcreteType { throw MLIRError("Invalid type", errors.take()); return PyVectorType(elementType.getContext(), type); } + + static PyVectorType get(std::vector shape, PyType &elementType, + std::optional scalable, + std::optional> scalableDims, + DefaultingPyMlirContext context) { + if (scalable && scalableDims) { + throw nb::value_error("'scalable' and 'scalable_dims' kwargs " + "are mutually exclusive."); + } + + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirType type; + if (scalable) { + if (scalable->size() != shape.size()) + throw nb::value_error("Expected len(scalable) == len(shape)."); + + SmallVector scalableDimFlags = llvm::to_vector(llvm::map_range( + *scalable, [](const nb::handle &h) { return nb::cast(h); })); + type = mlirVectorTypeGetScalable(shape.size(), shape.data(), + scalableDimFlags.data(), elementType); + } else if (scalableDims) { 
+      SmallVector<bool> scalableDimFlags(shape.size(), false);
+      for (int64_t dim : *scalableDims) {
+        if (static_cast<size_t>(dim) >= scalableDimFlags.size() || dim < 0)
+          throw nb::value_error("Scalable dimension index out of bounds.");
+        scalableDimFlags[dim] = true;
+      }
+      type = mlirVectorTypeGetScalable(shape.size(), shape.data(),
+                                       scalableDimFlags.data(), elementType);
+    } else {
+      type = mlirVectorTypeGet(shape.size(), shape.data(), elementType);
+    }
+    if (mlirTypeIsNull(type))
+      throw MLIRError("Invalid type", errors.take());
+    return PyVectorType(elementType.getContext(), type);
+  }
 };
 
 /// Ranked Tensor Type subclass - RankedTensorType.
@@ -724,6 +766,22 @@ class PyRankedTensorType
           nb::arg("shape"), nb::arg("element_type"),
           nb::arg("encoding") = nb::none(), nb::arg("loc") = nb::none(),
           "Create a ranked tensor type");
+    c.def_static(
+        "get_unchecked",
+        [](std::vector<int64_t> shape, PyType &elementType,
+           std::optional<PyAttribute> &encodingAttr,
+           DefaultingPyMlirContext context) {
+          PyMlirContext::ErrorCapture errors(context->getRef());
+          MlirType t = mlirRankedTensorTypeGet(
+              shape.size(), shape.data(), elementType,
+              encodingAttr ? encodingAttr->get() : mlirAttributeGetNull());
+          if (mlirTypeIsNull(t))
+            throw MLIRError("Invalid type", errors.take());
+          return PyRankedTensorType(elementType.getContext(), t);
+        },
+        nb::arg("shape"), nb::arg("element_type"),
+        nb::arg("encoding") = nb::none(), nb::arg("context") = nb::none(),
+        "Create a ranked tensor type");
     c.def_prop_ro(
         "encoding",
         [](PyRankedTensorType &self)
@@ -758,6 +816,17 @@ class PyUnrankedTensorType
         },
         nb::arg("element_type"), nb::arg("loc") = nb::none(),
         "Create a unranked tensor type");
+    c.def_static(
+        "get_unchecked",
+        [](PyType &elementType, DefaultingPyMlirContext context) {
+          PyMlirContext::ErrorCapture errors(context->getRef());
+          MlirType t = mlirUnrankedTensorTypeGet(elementType);
+          if (mlirTypeIsNull(t))
+            throw MLIRError("Invalid type", errors.take());
+          return PyUnrankedTensorType(elementType.getContext(), t);
+        },
+        nb::arg("element_type"), nb::arg("context") = nb::none(),
+        "Create a unranked tensor type");
   }
 };
 
@@ -790,6 +859,27 @@ class PyMemRefType : public PyConcreteType<PyMemRefType> {
             nb::arg("shape"), nb::arg("element_type"),
             nb::arg("layout") = nb::none(), nb::arg("memory_space") = nb::none(),
             nb::arg("loc") = nb::none(), "Create a memref type")
+        .def_static(
+            "get_unchecked",
+            [](std::vector<int64_t> shape, PyType &elementType,
+               PyAttribute *layout, PyAttribute *memorySpace,
+               DefaultingPyMlirContext context) {
+              PyMlirContext::ErrorCapture errors(context->getRef());
+              MlirAttribute layoutAttr =
+                  layout ? *layout : mlirAttributeGetNull();
+              MlirAttribute memSpaceAttr =
+                  memorySpace ?
*memorySpace : mlirAttributeGetNull(); + MlirType t = + mlirMemRefTypeGet(elementType, shape.size(), shape.data(), + layoutAttr, memSpaceAttr); + if (mlirTypeIsNull(t)) + throw MLIRError("Invalid type", errors.take()); + return PyMemRefType(elementType.getContext(), t); + }, + nb::arg("shape"), nb::arg("element_type"), + nb::arg("layout") = nb::none(), + nb::arg("memory_space") = nb::none(), + nb::arg("context") = nb::none(), "Create a memref type") .def_prop_ro( "layout", [](PyMemRefType &self) -> nb::typed { @@ -858,6 +948,22 @@ class PyUnrankedMemRefType }, nb::arg("element_type"), nb::arg("memory_space").none(), nb::arg("loc") = nb::none(), "Create a unranked memref type") + .def_static( + "get_unchecked", + [](PyType &elementType, PyAttribute *memorySpace, + DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirAttribute memSpaceAttr = {}; + if (memorySpace) + memSpaceAttr = *memorySpace; + + MlirType t = mlirUnrankedMemRefTypeGet(elementType, memSpaceAttr); + if (mlirTypeIsNull(t)) + throw MLIRError("Invalid type", errors.take()); + return PyUnrankedMemRefType(elementType.getContext(), t); + }, + nb::arg("element_type"), nb::arg("memory_space").none(), + nb::arg("context") = nb::none(), "Create a unranked memref type") .def_prop_ro( "memory_space", [](PyUnrankedMemRefType &self) diff --git a/mlir/test/python/ir/builtin_types.py b/mlir/test/python/ir/builtin_types.py index b42bfd9bc6587..54863253fc770 100644 --- a/mlir/test/python/ir/builtin_types.py +++ b/mlir/test/python/ir/builtin_types.py @@ -371,11 +371,16 @@ def testAbstractShapedType(): # CHECK-LABEL: TEST: testVectorType @run def testVectorType(): + shape = [2, 3] + with Context(): + f32 = F32Type.get() + # CHECK: unchecked vector type: vector<2x3xf32> + print("unchecked vector type:", VectorType.get_unchecked(shape, f32)) + with Context(), Location.unknown(): f32 = F32Type.get() - shape = [2, 3] - # CHECK: vector type: vector<2x3xf32> - print("vector type:", VectorType.get(shape, f32)) + # CHECK: checked vector type: vector<2x3xf32> + print("checked vector type:", VectorType.get(shape, f32)) none = NoneType.get() try: From 17a66ea68d787e0938fdcc9535ca80029c88a210 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 27 Sep 2025 12:16:14 -0700 Subject: [PATCH 013/878] [clang-format][NFC] Rename a unit test --- clang/unittests/Format/TokenAnnotatorTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 4c43a963632a6..899cc47f8213f 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -4159,7 +4159,7 @@ TEST_F(TokenAnnotatorTest, LineCommentTrailingBackslash) { EXPECT_TOKEN(Tokens[1], tok::comment, TT_LineComment); } -TEST_F(TokenAnnotatorTest, KeywordedFunctionLikeMacro) { +TEST_F(TokenAnnotatorTest, QtProperty) { auto Style = getLLVMStyle(); Style.AllowBreakBeforeQtProperty = true; From 8460dbb450813358859ee20cfa54edfe575372fd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 27 Sep 2025 20:48:42 +0100 Subject: [PATCH 014/878] [VPlan] Mark VPInstruction::Broadcast as not reading/writing memory. This enables additional DCE/CSE opportunities and ensures that we don't end up with multiple redundant users of a VPInstruction using EVL. It fixes a verifier error in the added test_3_inductions test. 
--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + .../LoopVectorize/RISCV/induction-costs.ll | 75 +++++++++++++++++++ .../X86/epilog-vectorization-inductions.ll | 14 ++-- .../LoopVectorize/X86/induction-step.ll | 14 ++-- .../LoopVectorize/blend-in-header.ll | 4 - .../LoopVectorize/expand-scev-after-invoke.ll | 4 +- .../LoopVectorize/float-induction.ll | 42 +++++------ .../LoopVectorize/induction-step.ll | 8 +- .../Transforms/LoopVectorize/induction.ll | 16 ++-- .../nested-loops-scev-expansion.ll | 10 +-- 10 files changed, 118 insertions(+), 70 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 8892f9b098349..cf5e6bf0ac418 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1214,6 +1214,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { case Instruction::Select: case Instruction::PHI: case VPInstruction::AnyOf: + case VPInstruction::Broadcast: case VPInstruction::BuildStructVector: case VPInstruction::BuildVector: case VPInstruction::CalculateTripCountMinusVF: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll index 83619250b4ad4..f4fbb10ac5b54 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll @@ -119,7 +119,81 @@ exit: ret void } +define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 { +; CHECK-LABEL: define void @test_3_inductions( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP2:%.*]] = mul [[TMP1]], splat (i32 2) +; CHECK-NEXT: [[INDUCTION:%.*]] = add splat (i32 1), [[TMP2]] +; CHECK-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP2]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 2, [[TMP3]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or [[VEC_IND2]], [[VEC_IND]] +; CHECK-NEXT: [[TMP6:%.*]] = sext [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], [[TMP6]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0( [[TMP7]], align 8 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], 
[[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND2]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV_0:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[IV_0_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_OR:%.*]] = or i32 [[IV_2]], [[IV_0]] +; CHECK-NEXT: [[IV_OR_EXT:%.*]] = sext i32 [[IV_OR]] to i64 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_OR_EXT]] +; CHECK-NEXT: store ptr [[GEP_SRC]], ptr [[DST]], align 8 +; CHECK-NEXT: [[IV_0_NEXT]] = add i32 [[IV_0]], 2 +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 +; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 2 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.0 = phi i32 [ 1, %entry ], [ %iv.0.next, %loop ] + %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ] + %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ] + %iv.or = or i32 %iv.2, %iv.0 + %iv.or.ext = sext i32 %iv.or to i64 + %gep.src = getelementptr i8, ptr %src, i64 %iv.or.ext + store ptr %gep.src, ptr %dst, align 8 + %iv.0.next = add i32 %iv.0, 2 + %iv.1.next = add i64 %iv.1, 1 + %iv.2.next = add i32 %iv.2, 2 + %ec = icmp eq i64 %iv.1, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } +attributes #1 = { "target-cpu"="sifive-p670" } ;. ; CHECK: [[META0]] = !{[[META1:![0-9]+]]} ; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} @@ -131,4 +205,5 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } ; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]], [[META8]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index f33c8d9e9ae9b..ed288d2f99a0b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -145,11 +145,9 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[IND_END:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i16> splat (i16 16), [[TMP2]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i16> , [[TMP2]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i16> zeroinitializer, [[TMP11]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -158,10 +156,10 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i16> [[VEC_IND]], [[TMP1]] ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <16 x i16> [[STEP_ADD]], [[TMP1]] ; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <16 x i16> [[STEP_ADD_2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i16> [[VEC_IND]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[STEP_ADD_2]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[STEP_ADD_3]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i16> [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[STEP_ADD]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[STEP_ADD_2]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[STEP_ADD_3]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 16 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32 diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll index 4b4103e9806b9..61f07eff768c1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll @@ -17,9 +17,7 @@ define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]] ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> , 
[[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -89,19 +87,17 @@ define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP3]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4 ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP6]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll index 85f72d283a0e4..6f262109f95be 100644 --- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll +++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll @@ -111,8 +111,6 @@ define i64 @invar_cond(i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -164,8 +162,6 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll 
b/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll index ff550da1ae0e1..4af9f4a13b62b 100644 --- a/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll +++ b/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll @@ -18,9 +18,7 @@ define void @test(ptr %dst) personality ptr null { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = mul i32 160, [[STEP]] ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> splat (i32 4), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP2]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 2b15aae628274..901f67ee676ee 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -85,17 +85,15 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 ; VEC4_INTERL2-NEXT: [[FPINC_INS:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[FPINC_INS]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[FPINC_INS]], -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], splat (float 4.000000e+00) +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT3]], [[TMP7]] ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -332,17 +330,15 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: 
[[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] -; VEC4_INTERL2-NEXT: [[MUL:%.*]] = fmul reassoc <4 x float> [[DOTSPLATINSERT2]], -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[MUL]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul reassoc <4 x float> [[BROADCAST_SPLAT]], splat (float 4.000000e+00) +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fmul reassoc <4 x float> [[BROADCAST_SPLAT]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT3]], [[TMP7]] ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -834,22 +830,20 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483640 ; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 ; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] -; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLATINSERT2]], -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; VEC4_INTERL2-NEXT: [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], splat (float 4.000000e+00) +; 
VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]] +; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT7]], [[TMP19]] ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -860,8 +854,8 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4 -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST]] -; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST]] +; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT]] ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND]], splat (float -5.000000e-01) ; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], splat (float -2.500000e+00) ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll index b3cb3a77467ee..362de0e0bba7a 100644 --- a/llvm/test/Transforms/LoopVectorize/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll @@ -337,8 +337,6 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT1:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <8 x i16> , [[DOTSPLAT1]] @@ -350,7 +348,7 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = add <8 
x i16> [[VEC_IND]], [[DOTSPLAT1]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[TMP5]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP3]], 8 @@ -362,11 +360,11 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[ADD]] = add i16 [[IV_2]], [[O_1]] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_DST]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 261c336b329fa..60c844c3f6415 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -6211,12 +6211,10 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 +; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] -; UNROLL-NEXT: [[TMP15:%.*]] = shl <2 x i32> [[BROADCAST_SPLATINSERT]], -; UNROLL-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[TMP16:%.*]] = shl <2 x i32> [[DOTSPLAT]], splat (i32 1) ; UNROLL-NEXT: [[TMP17:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: @@ -6293,9 +6291,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = mul <2 x i32> splat (i32 2), [[BROADCAST_SPLAT]] -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 -; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = mul <2 x i32> , [[DOTSPLAT]] +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = mul <2 x i32> , [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> zeroinitializer, [[TMP18]] ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: @@ -6365,12 +6361,10 @@ 
define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 +; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] -; INTERLEAVE-NEXT: [[TMP15:%.*]] = shl <4 x i32> [[BROADCAST_SPLATINSERT]], -; INTERLEAVE-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 -; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl <4 x i32> [[DOTSPLAT]], splat (i32 2) ; INTERLEAVE-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[DOTSPLAT]], ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll index 6cdd154f0e00e..8525b3aa5d349 100644 --- a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll @@ -240,11 +240,9 @@ define void @pr52024(ptr %dst, i16 %N) { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = mul i16 24, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i16> splat (i16 2), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i16> poison, i16 [[REM_TRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT3]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i16> poison, i16 [[REM_TRUNC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT5]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = mul <2 x i16> , [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP7:%.*]] = mul <2 x i16> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> zeroinitializer, [[TMP7]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -252,8 +250,8 @@ define void @pr52024(ptr %dst, i16 %N) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i16> [[VEC_IND]], [[TMP6]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 8, [[INDEX]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i16> [[VEC_IND]], [[BROADCAST_SPLAT4]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i16> [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i32 [[OFFSET_IDX]] From 
08c057e6110b0d1ff6ae60886571fd09b50cfaa7 Mon Sep 17 00:00:00 2001
From: Baranov Victor
Date: Sat, 27 Sep 2025 23:39:35 +0300
Subject: [PATCH 015/878] [GitHub][docker] Fix 'FromAsCasing' violation in CI dockerfile (#161017)

Fixes https://docs.docker.com/reference/build-checks/from-as-casing/,
which also gives a warning when building the container from the command
line:
```
=> WARN: FromAsCasing: 'as' and 'FROM' keywords' casing do not match (line 1)
=> WARN: FromAsCasing: 'as' and 'FROM' keywords' casing do not match (line 4)
=> WARN: FromAsCasing: 'as' and 'FROM' keywords' casing do not match (line 40)
=> WARN: FromAsCasing: 'as' and 'FROM' keywords' casing do not match (line 100)
```
---
 .github/workflows/containers/github-action-ci/Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile
index 892fc9005de85..1d3f5f9c35d7f 100644
--- a/.github/workflows/containers/github-action-ci/Dockerfile
+++ b/.github/workflows/containers/github-action-ci/Dockerfile
@@ -1,7 +1,7 @@
-FROM docker.io/library/ubuntu:24.04 as base
+FROM docker.io/library/ubuntu:24.04 AS base
 ENV LLVM_SYSROOT=/opt/llvm
 
-FROM base as stage1-toolchain
+FROM base AS stage1-toolchain
 ENV LLVM_VERSION=21.1.1
 
 RUN apt-get update && \
@@ -37,7 +37,7 @@ RUN cmake -B ./build -G Ninja ./llvm \
 
 RUN ninja -C ./build stage2-clang-bolt stage2-install-distribution && ninja -C ./build install-distribution
 
-FROM base as ci-container
+FROM base AS ci-container
 
 COPY --from=stage1-toolchain $LLVM_SYSROOT $LLVM_SYSROOT
 
@@ -97,7 +97,7 @@ RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
 USER gha
 WORKDIR /home/gha
 
-FROM ci-container as ci-container-agent
+FROM ci-container AS ci-container-agent
 
 ENV GITHUB_RUNNER_VERSION=2.328.0

From 41a2dfc0d77d9ad977d1d36358f979abb3a0928f Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 27 Sep 2025 21:43:30 +0100
Subject: [PATCH 016/878] [VPlan] Allow multiple users of (broadcast %evl).

CSE may replace multiple redundant broadcasts of EVL with a single
broadcast, which may have more than one user. Adjust the verifier to
allow this.

Fixes a crash when building llvm-test-suite with EVL:
https://lab.llvm.org/buildbot/#/builders/210/builds/3303
---
 .../Transforms/Vectorize/VPlanVerifier.cpp    |  3 +-
 .../LoopVectorize/RISCV/induction-costs.ll    | 92 +++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 92caa0b4e51d5..013ea2e883534 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -199,7 +199,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
       // EVLIVIncrement is only used by EVLIV & BranchOnCount.
       // Having more than two users is unexpected.
using namespace llvm::VPlanPatternMatch; - if ((I->getNumUsers() != 1) && + if (I->getOpcode() != VPInstruction::Broadcast && + I->getNumUsers() != 1 && (I->getNumUsers() != 2 || none_of(I->users(), match_fn(m_BranchOnCount(m_Specific(I), m_VPValue()))))) { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll index f4fbb10ac5b54..4d97a659e94e9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll @@ -192,6 +192,97 @@ exit: ret void } +define void @redundant_iv_trunc_for_cse(ptr noalias %src, ptr noalias %dst, i64 %n) #0 { +; CHECK-LABEL: define void @redundant_iv_trunc_for_cse( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP2:%.*]] = mul [[TMP1]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP2]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND1:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP4]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq [[VP_OP_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl [[VEC_IND1]], splat (i32 16) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[VEC_IND]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc [[PREDPHI]] to +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0( [[TMP7]], ptr align 1 [[TMP8]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT2]] = add [[VEC_IND1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load 
i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[TRUNC_IV_2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-NEXT: [[SHL_IV:%.*]] = shl i32 [[TRUNC_IV_2]], 16 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[SHL_IV]], %[[THEN]] ], [ [[TRUNC_IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[TRUNC_P:%.*]] = trunc i32 [[P]] to i8 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 [[TRUNC_P]], ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src + %c.0 = icmp eq i32 %l, 0 + %trunc.iv = trunc i64 %iv to i32 + br i1 %c.0, label %then, label %loop.latch + +then: + %trunc.iv.2 = trunc i64 %iv to i32 + %shl.iv = shl i32 %trunc.iv.2, 16 + br label %loop.latch + +loop.latch: + %p = phi i32 [ %shl.iv, %then ], [ %trunc.iv, %loop.header ] + %trunc.p = trunc i32 %p to i8 + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %trunc.p, ptr %gep.dst, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + + + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } attributes #1 = { "target-cpu"="sifive-p670" } ;. @@ -206,4 +297,5 @@ attributes #1 = { "target-cpu"="sifive-p670" } ; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]]} ; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]], [[META8]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META7]], [[META8]]} ;. 
From 378b6d51de97ce220c042a0823d047a546c82bf6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 27 Sep 2025 23:09:35 +0100 Subject: [PATCH 017/878] [AMDGPU] fcanonicalize.bf16.ll - regenerate test checks (#161026) --- .../test/CodeGen/AMDGPU/fcanonicalize.bf16.ll | 1845 +++++++++-------- 1 file changed, 950 insertions(+), 895 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll index a4cdb0387df9a..d747fb7cce7dc 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll @@ -15,67 +15,67 @@ declare <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat>) #0 declare <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 -; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16: -; GFX1250: %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat undef) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: global_store_b16 v[0:1], v0, off -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) store bfloat %canonicalized, ptr addrspace(1) poison ret void } -; GFX1250-LABEL: s_test_canonicalize_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e64 v0, s2, s2 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 
zeroext %val.arg) #1 { +; GFX1250-LABEL: s_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e64 v0, s2, s2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %val = bitcast i16 %val.arg to bfloat %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_build_vector_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat %hi) #1 { +; GFX1250-LABEL: v_test_canonicalize_build_vector_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %ins0 = insertelement <2 x bfloat> poison, bfloat %lo, i32 0 %ins1 = insertelement <2 x bfloat> %ins0, bfloat %hi, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %ins1) @@ -83,22 +83,22 @@ define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat } -; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fabs = call bfloat 
@llvm.fabs.bf16(bfloat %val) %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs) @@ -107,22 +107,22 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %o } -; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) %val.fabs.fneg = fneg bfloat %val.fabs @@ -131,22 +131,22 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace( ret void } -; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fneg = fneg bfloat %val %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) @@ -154,22 +154,22 @@ define amdgpu_kernel void 
@v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %o ret void } -; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 { +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fneg = fneg bfloat %val %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) @@ -177,22 +177,22 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr ad ret void } -; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 { +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %val = load bfloat, ptr addrspace(1) %out %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) %val.fabs.fneg = fneg bfloat %val.fabs @@ -201,217 +201,231 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(p ret void } +define 
amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 { ; GFX1250-LABEL: test_fold_canonicalize_p0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 { +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_n0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm -; GFX1250-NEXT: .Lfunc_end10: + define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -0.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_p1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_p1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 1.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_n1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm -; GFX1250-NEXT: .Lfunc_end12: + define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -1.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_literal_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; 
GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_literal_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 16.0) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void 
@test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C00) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -1 to bfloat)) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -2 to bfloat)) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16: +; GFX1250: ; 
%bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C01) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7DFF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFDFF) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFC01) store bfloat %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid %val = load <2 x bfloat>, ptr addrspace(1) %gep @@ -419,27 +433,28 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid %val = load <2 x bfloat>, ptr addrspace(1) %gep @@ 
-448,27 +463,28 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid %val = load <2 x bfloat>, ptr addrspace(1) %gep @@ -478,27 +494,28 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspac store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: global_store_b32 v2, v0, 
s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid %val = load <2 x bfloat>, ptr addrspace(1) %gep @@ -507,781 +524,819 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: s_test_canonicalize_var_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_and_b32 s3, s2, 0xffff0000 -; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1250-NEXT: v_max_num_f32_e64 v0, s3, s3 -; GFX1250-NEXT: v_max_num_f32_e64 v1, s2, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v1, v0 -; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { +; GFX1250-LABEL: s_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: v_max_num_f32_e64 v0, s3, s3 +; GFX1250-NEXT: v_max_num_f32_e64 v1, s2, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v1, v0 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %val = bitcast i32 %val.arg to <2 x bfloat> %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> zeroinitializer) store 
<2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -0.0, bfloat -0.0>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 1.0, bfloat 1.0>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_n1_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -1.0, bfloat -1.0>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 16.0, bfloat 16.0>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64
s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64
s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C00, bfloat 0xR7C00>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> bitcast (i32 -1 to <2 x bfloat>)) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat bitcast (i16 -2 to bfloat), bfloat bitcast (i16 -2 to bfloat)>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C01, bfloat 0xR7C01>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32
v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7DFF, bfloat 0xR7DFF>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFDFF, bfloat 0xRFDFF>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFC01, bfloat 0xRFC01>) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_var_v3bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) |
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> %val) ret <3 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_var_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <4 x bfloat> @v_test_canonicalize_var_v4bf16(<4 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %val) ret <4 x bfloat> %canonicalized } -; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) store <2 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_reg_undef_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; 
GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_reg_undef_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 0 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_undef_reg_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_undef_reg_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_undef_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_undef_lo_imm_hi_v2bf16() #1 { +; GFX1250-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0x3f80 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_imm_lo_undef_hi_v2bf16() #1 { +; GFX1250-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 
v0, 0x3f80 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 0 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_undef_lo_k_hi_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0x41800000 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_undef_lo_k_hi_v2bf16() #1 { +; GFX1250-LABEL: v_test_canonicalize_undef_lo_k_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x41800000 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_k_lo_undef_hi_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0x4180 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_k_lo_undef_hi_v2bf16() #1 { +; GFX1250-LABEL: v_test_canonicalize_k_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x4180 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 0 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_reg_k_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x4000 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_reg_k_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_k_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <2 x bfloat> poison, bfloat %val, i32 0 %vec1 = insertelement <2 x bfloat> %vec0, bfloat 2.0, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_k_reg_v2bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x4000 -; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_k_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <2 x bfloat> poison, bfloat 2.0, i32 0 %vec1 = insertelement <2 x bfloat> %vec0, bfloat %val, i32 1 %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) ret <2 x bfloat> %canonicalized } -; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v1, v0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b64 v0, v[0:1], s[0:1] -; GFX1250-NEXT: s_endpgm + define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) store <4 x bfloat> %canonicalized, ptr addrspace(1) %out ret void } -; GFX1250-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <4 x bfloat> @v_test_canonicalize_reg_undef_undef_undef_v4bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec = insertelement <4 x bfloat> poison, bfloat %val, i32 0 %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec) ret <4 x bfloat> %canonicalized } -; GFX1250-LABEL: 
v_test_canonicalize_reg_reg_undef_undef_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 -; GFX1250-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <4 x bfloat> @v_test_canonicalize_reg_reg_undef_undef_v4bf16(bfloat %val0, bfloat %val1) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 1 %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec1) ret <4 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <4 x bfloat> @v_test_canonicalize_reg_undef_reg_reg_v4bf16(bfloat %val0, bfloat %val1, bfloat %val2) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 2 %vec2 = insertelement <4 x bfloat> %vec1, bfloat %val2, i32 3 %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec2) ret <4 x bfloat> %canonicalized } -; GFX1250-LABEL: 
v_test_canonicalize_var_v6bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 -; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v0, v0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <6 x bfloat> @v_test_canonicalize_var_v6bf16(<6 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat> %val) ret <6 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_var_v8bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v6, v6, v6 -; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v0, v0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 -; GFX1250-NEXT: v_max_num_f32_e32 v3, v3, v3 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v7 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: 
v_cvt_pk_bf16_f32 v1, v1, v6 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v4 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <8 x bfloat> @v_test_canonicalize_var_v8bf16(<8 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v3, v3, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat> %val) ret <8 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_var_v12bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 -; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 -; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1250-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v5, v5, v5 -; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v8, v8, v8 -; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v10, v10, v10 -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v11, v11, v11 -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 -; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v11 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v9 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v7 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v6 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <12 x bfloat> @v_test_canonicalize_var_v12bf16(<12 x bfloat> %val) #1 { +; 
GFX1250-LABEL: v_test_canonicalize_var_v12bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v8, v8, v8 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v11 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat> %val) ret <12 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_var_v16bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 -; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 -; GFX1250-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 -; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 -; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX1250-NEXT: v_max_num_f32_e32 v6, v6, v6 -; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v5, v5, v5 -; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v12, v12, v12 -; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v14, v14, v14 -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v15, v15, v15 -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 -; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, 
v0, v15 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v14 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v13 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v11 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v10 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v9 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v8 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <16 x bfloat> @v_test_canonicalize_var_v16bf16(<16 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v5, 16, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v12, v12, v12 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v15, v15, v15 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v15 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v13 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v11 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v10 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> %val) ret <16 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_var_v32bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; GFX1250-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 -; GFX1250-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 -; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 -; GFX1250-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 -; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_lshlrev_b32 v15, 16, v15 -; GFX1250-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 -; GFX1250-NEXT: v_dual_lshlrev_b32 v14, 16, v14 :: v_dual_lshlrev_b32 v13, 16, v13 -; GFX1250-NEXT: v_max_num_f32_e32 v18, v18, 
v18 -; GFX1250-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 -; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v12 :: v_dual_lshlrev_b32 v11, 16, v11 -; GFX1250-NEXT: v_max_num_f32_e32 v20, v20, v20 -; GFX1250-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 -; GFX1250-NEXT: v_dual_lshlrev_b32 v10, 16, v10 :: v_dual_lshlrev_b32 v9, 16, v9 -; GFX1250-NEXT: v_max_num_f32_e32 v22, v22, v22 -; GFX1250-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 -; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v8 :: v_dual_lshlrev_b32 v7, 16, v7 -; GFX1250-NEXT: v_max_num_f32_e32 v24, v24, v24 -; GFX1250-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 -; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX1250-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1250-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 -; GFX1250-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 -; GFX1250-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 -; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX1250-NEXT: v_and_b32_e32 v30, 0xffff0000, v1 -; GFX1250-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v14, v14, v14 -; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v19, v19, v19 -; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v11, v11, v11 -; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v10, v10, v10 -; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v23, v23, v23 -; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v7, v7, v7 -; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v6, v6, v6 -; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v5, v5, v5 -; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v28, v28, v28 -; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v30, v30, v30 -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v31, v31, v31 -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 -; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v31 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v30 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v29 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v28 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v27 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v26 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v25 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v24 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v23 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v22 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v21 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v20 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v19 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v18 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v17 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v16 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <32 x bfloat> @v_test_canonicalize_var_v32bf16(<32 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX1250-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; GFX1250-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX1250-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX1250-NEXT: v_dual_lshlrev_b32 v14, 16, v14 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX1250-NEXT: v_max_num_f32_e32 v18, v18, v18 +; GFX1250-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v12 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX1250-NEXT: v_max_num_f32_e32 v20, v20, v20 +; GFX1250-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GFX1250-NEXT: v_dual_lshlrev_b32 v10, 16, v10 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX1250-NEXT: v_max_num_f32_e32 v22, v22, v22 +; GFX1250-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v8 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX1250-NEXT: v_max_num_f32_e32 v24, v24, v24 +; GFX1250-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v30, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v19, v19, v19 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v23, v23, v23 +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v7, v7, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v28, v28, v28 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v30, v30, v30 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v31, v31, v31 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v31 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v30 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v29 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v28 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v27 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v26 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v25 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v24 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v23 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v22 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v21 +; 
GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v20 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v19 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v18 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v17 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v16 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat> %val) ret <32 x bfloat> %canonicalized } -; GFX1250-LABEL: v_test_canonicalize_var_v64bf16: -; GFX1250: %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: scratch_load_b32 v31, off, s32 -; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v0 -; GFX1250-NEXT: v_and_b32_e32 v38, 0xffff0000, v24 -; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v23 -; GFX1250-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX1250-NEXT: v_and_b32_e32 v80, 0xffff0000, v6 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v1 -; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1250-NEXT: v_max_num_f32_e32 v81, v81, v81 -; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v2 -; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1250-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 -; GFX1250-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX1250-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 -; GFX1250-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX1250-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 -; GFX1250-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX1250-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 -; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v82, v82, v82 -; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v83, v83, v83 -; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v24, v24, v24 -; GFX1250-NEXT: v_max_num_f32_e32 v39, v39, v39 -; GFX1250-NEXT: v_dual_max_num_f32 v23, v23, v23 :: v_dual_max_num_f32 v48, v48, v48 -; GFX1250-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 -; GFX1250-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX1250-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 -; GFX1250-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX1250-NEXT: v_and_b32_e32 v37, 0xffff0000, v25 -; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v22, 16, v22 -; GFX1250-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 -; GFX1250-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX1250-NEXT: v_and_b32_e32 v50, 0xffff0000, v20 -; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX1250-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 -; GFX1250-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX1250-NEXT: v_and_b32_e32 v52, 0xffff0000, v18 -; GFX1250-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX1250-NEXT: v_and_b32_e32 v53, 0xffff0000, v17 -; GFX1250-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX1250-NEXT: v_and_b32_e32 v54, 0xffff0000, v16 -; GFX1250-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX1250-NEXT: v_and_b32_e32 v55, 0xffff0000, v15 -; GFX1250-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX1250-NEXT: v_and_b32_e32 v64, 0xffff0000, v14 -; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1250-NEXT: v_and_b32_e32 v65, 0xffff0000, v13 -; GFX1250-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX1250-NEXT: v_and_b32_e32 v66, 0xffff0000, v12 -; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX1250-NEXT: v_and_b32_e32 v67, 0xffff0000, v11 -; GFX1250-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX1250-NEXT: v_and_b32_e32 v68, 0xffff0000, v10 -; GFX1250-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX1250-NEXT: v_and_b32_e32 v69, 0xffff0000, v9 -; 
GFX1250-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX1250-NEXT: v_and_b32_e32 v70, 0xffff0000, v8 -; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX1250-NEXT: v_and_b32_e32 v71, 0xffff0000, v7 -; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v81 -; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v5 -; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v82 -; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v4 -; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v83 -; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v3 -; GFX1250-NEXT: v_dual_max_num_f32 v32, v32, v32 :: v_dual_lshlrev_b32 v3, 16, v3 -; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v36, v36, v36 -; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v37, v37, v37 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v23, v23, v39 -; GFX1250-NEXT: v_dual_max_num_f32 v30, v30, v30 :: v_dual_max_num_f32 v33, v33, v33 -; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v34, v34, v34 -; GFX1250-NEXT: v_dual_max_num_f32 v28, v28, v28 :: v_dual_max_num_f32 v35, v35, v35 -; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v38, v38, v38 -; GFX1250-NEXT: v_dual_max_num_f32 v22, v22, v22 :: v_dual_max_num_f32 v49, v49, v49 -; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v50, v50, v50 -; GFX1250-NEXT: v_dual_max_num_f32 v20, v20, v20 :: v_dual_max_num_f32 v51, v51, v51 -; GFX1250-NEXT: v_dual_max_num_f32 v19, v19, v19 :: v_dual_max_num_f32 v52, v52, v52 -; GFX1250-NEXT: v_dual_max_num_f32 v18, v18, v18 :: v_dual_max_num_f32 v53, v53, v53 -; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v54, v54, v54 -; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_max_num_f32 v55, v55, v55 -; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_max_num_f32 v64, v64, v64 -; GFX1250-NEXT: v_dual_max_num_f32 v14, v14, v14 :: v_dual_max_num_f32 v65, v65, v65 -; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v66, v66, v66 -; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v67, v67, v67 -; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v68, v68, v68 -; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v69, v69, v69 -; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v70, v70, v70 -; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v71, v71, v71 -; GFX1250-NEXT: v_dual_max_num_f32 v80, v80, v80 :: v_dual_max_num_f32 v81, v81, v81 -; GFX1250-NEXT: v_dual_max_num_f32 v82, v82, v82 :: v_dual_max_num_f32 v83, v83, v83 -; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 -; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v6, v6, v6 -; GFX1250-NEXT: v_max_num_f32_e32 v7, v7, v7 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v26, v26, v36 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v83 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v82 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v81 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v80 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v71 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v70 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v69 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v68 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v67 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v66 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, 
v65 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v64 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v55 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v16, v16, v54 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v17, v17, v53 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v18, v18, v52 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v19, v19, v51 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v20, v20, v50 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v21, v21, v49 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v22, v22, v48 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v24, v24, v38 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v25, v25, v37 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v27, v27, v35 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v28, v28, v34 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v29, v29, v33 -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v30, v30, v32 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v31 -; GFX1250-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_max_num_f32_e32 v36, v39, v39 -; GFX1250-NEXT: v_max_num_f32_e32 v31, v31, v31 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cvt_pk_bf16_f32 v31, v31, v36 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] + define <64 x bfloat> @v_test_canonicalize_var_v64bf16(<64 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v64bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v38, 0xffff0000, v24 +; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v23 +; GFX1250-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX1250-NEXT: v_and_b32_e32 v80, 0xffff0000, v6 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v81, v81, v81 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX1250-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX1250-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX1250-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX1250-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX1250-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX1250-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v82, v82, v82 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v24, v24, v24 +; GFX1250-NEXT: v_max_num_f32_e32 v39, v39, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v23, v23, v23 :: v_dual_max_num_f32 v48, v48, v48 +; GFX1250-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; GFX1250-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX1250-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX1250-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX1250-NEXT: v_and_b32_e32 v37, 0xffff0000, v25 +; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v22, 16, v22 +; GFX1250-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 +; GFX1250-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX1250-NEXT: v_and_b32_e32 v50, 0xffff0000, v20 +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX1250-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 +; GFX1250-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX1250-NEXT: v_and_b32_e32 v52, 0xffff0000, v18 +; GFX1250-NEXT: 
v_lshlrev_b32_e32 v18, 16, v18 +; GFX1250-NEXT: v_and_b32_e32 v53, 0xffff0000, v17 +; GFX1250-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX1250-NEXT: v_and_b32_e32 v54, 0xffff0000, v16 +; GFX1250-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX1250-NEXT: v_and_b32_e32 v55, 0xffff0000, v15 +; GFX1250-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v64, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v65, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v66, 0xffff0000, v12 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX1250-NEXT: v_and_b32_e32 v67, 0xffff0000, v11 +; GFX1250-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v68, 0xffff0000, v10 +; GFX1250-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1250-NEXT: v_and_b32_e32 v69, 0xffff0000, v9 +; GFX1250-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v70, 0xffff0000, v8 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX1250-NEXT: v_and_b32_e32 v71, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v81 +; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v82 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v4 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v83 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v32, v32, v32 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v36, v36, v36 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v37, v37, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v23, v23, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v30, v30, v30 :: v_dual_max_num_f32 v33, v33, v33 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v34, v34, v34 +; GFX1250-NEXT: v_dual_max_num_f32 v28, v28, v28 :: v_dual_max_num_f32 v35, v35, v35 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v38, v38, v38 +; GFX1250-NEXT: v_dual_max_num_f32 v22, v22, v22 :: v_dual_max_num_f32 v49, v49, v49 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v50, v50, v50 +; GFX1250-NEXT: v_dual_max_num_f32 v20, v20, v20 :: v_dual_max_num_f32 v51, v51, v51 +; GFX1250-NEXT: v_dual_max_num_f32 v19, v19, v19 :: v_dual_max_num_f32 v52, v52, v52 +; GFX1250-NEXT: v_dual_max_num_f32 v18, v18, v18 :: v_dual_max_num_f32 v53, v53, v53 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v54, v54, v54 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_max_num_f32 v55, v55, v55 +; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_max_num_f32 v64, v64, v64 +; GFX1250-NEXT: v_dual_max_num_f32 v14, v14, v14 :: v_dual_max_num_f32 v65, v65, v65 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v66, v66, v66 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v67, v67, v67 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v68, v68, v68 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v69, v69, v69 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v70, v70, v70 +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v71, v71, v71 +; GFX1250-NEXT: v_dual_max_num_f32 v80, v80, v80 :: v_dual_max_num_f32 v81, v81, v81 +; GFX1250-NEXT: 
v_dual_max_num_f32 v82, v82, v82 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_max_num_f32_e32 v7, v7, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v26, v26, v36 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v83 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v82 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v81 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v80 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v71 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v70 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v69 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v68 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v67 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v66 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v65 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v64 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v55 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v16, v16, v54 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v17, v17, v53 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v18, v18, v52 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v19, v19, v51 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v20, v20, v50 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v21, v21, v49 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v22, v22, v48 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v24, v24, v38 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v25, v25, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v27, v27, v35 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v28, v28, v34 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v29, v29, v33 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v30, v30, v32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v31 +; GFX1250-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_max_num_f32_e32 v36, v39, v39 +; GFX1250-NEXT: v_max_num_f32_e32 v31, v31, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v31, v31, v36 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %canonicalized = call <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat> %val) ret <64 x bfloat> %canonicalized } From 4a4573a4adeab93111c3d58bff16ad24123ccc5f Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Sat, 27 Sep 2025 21:59:53 -0300 Subject: [PATCH 018/878] [clang] simplify placeholder type deduction for constant template parameters (#160439) This makes the deduction for dependent types operate in more similar ways to the non-dependent one, such as when matching template template parameters, making errors in those generate similar diagnostics to the non-dependent ones. This also removes some superfluous implicit casts, simplifying the resulting AST a little bit. --- clang/docs/ReleaseNotes.rst | 3 +- clang/lib/Sema/SemaTemplate.cpp | 67 +++++++++---------- clang/lib/Sema/SemaTemplateDeduction.cpp | 32 ++++----- .../SemaTemplate/temp_arg_template_p0522.cpp | 10 +-- 4 files changed, 51 insertions(+), 61 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 977396e249622..6cfe42b5dc9f8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -299,7 +299,8 @@ Improvements to Clang's diagnostics "format specifies type 'unsigned int' but the argument has type 'int', which differs in signedness [-Wformat-signedness]" "signedness of format specifier 'u' is incompatible with 'c' [-Wformat-signedness]" and the API-visible diagnostic id will be appropriate. 
-
+- Clang now produces better diagnostics for template template parameter matching
+  involving 'auto' template parameters.
 - Fixed false positives in ``-Waddress-of-packed-member`` diagnostics when
   potential misaligned members get processed before they can get discarded.
   (#GH144729)
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index f051a246f954f..91e4dc8a2d039 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -7068,22 +7068,8 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType,
 
   // If the parameter type somehow involves auto, deduce the type now.
   DeducedType *DeducedT = ParamType->getContainedDeducedType();
-  if (getLangOpts().CPlusPlus17 && DeducedT && !DeducedT->isDeduced()) {
-    // During template argument deduction, we allow 'decltype(auto)' to
-    // match an arbitrary dependent argument.
-    // FIXME: The language rules don't say what happens in this case.
-    // FIXME: We get an opaque dependent type out of decltype(auto) if the
-    // expression is merely instantiation-dependent; is this enough?
-    if (DeductionArg->isTypeDependent()) {
-      auto *AT = dyn_cast<AutoType>(DeducedT);
-      if (AT && AT->isDecltypeAuto()) {
-        SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false);
-        CanonicalConverted = TemplateArgument(
-            Context.getCanonicalTemplateArgument(SugaredConverted));
-        return Arg;
-      }
-    }
-
+  bool IsDeduced = DeducedT && !DeducedT->isDeduced();
+  if (IsDeduced) {
     // When checking a deduced template argument, deduce from its type even if
     // the type is dependent, in order to check the types of non-type template
     // arguments line up properly in partial ordering.
@@ -7112,17 +7098,21 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType,
                                    // along with the other associated constraints after
                                    // checking the template argument list.
                                    /*IgnoreConstraints=*/true);
-      if (Result == TemplateDeductionResult::AlreadyDiagnosed) {
-        return ExprError();
-      } else if (Result != TemplateDeductionResult::Success) {
-        if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
-          Diag(Arg->getExprLoc(),
-               diag::err_non_type_template_parm_type_deduction_failure)
-              << Param->getDeclName() << NTTP->getType() << Arg->getType()
-              << Arg->getSourceRange();
+      if (Result != TemplateDeductionResult::Success) {
+        ParamType = TSI->getType();
+        if (StrictCheck || !DeductionArg->isTypeDependent()) {
+          if (Result == TemplateDeductionResult::AlreadyDiagnosed)
+            return ExprError();
+          if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(Param))
+            Diag(Arg->getExprLoc(),
+                 diag::err_non_type_template_parm_type_deduction_failure)
+                << Param->getDeclName() << NTTP->getType() << Arg->getType()
+                << Arg->getSourceRange();
+          NoteTemplateParameterLocation(*Param);
+          return ExprError();
         }
-        NoteTemplateParameterLocation(*Param);
-        return ExprError();
+        ParamType = SubstAutoTypeDependent(ParamType);
+        assert(!ParamType.isNull() && "substituting DependentTy can't fail");
       }
     }
     // CheckNonTypeTemplateParameterType will produce a diagnostic if there's
@@ -7144,14 +7134,16 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType,
   // type-dependent, there's nothing we can check now.
   if (ParamType->isDependentType() || DeductionArg->isTypeDependent()) {
     // Force the argument to the type of the parameter to maintain invariants.
-    ExprResult E = ImpCastExprToType(
-        DeductionArg, ParamType.getNonLValueExprType(Context), CK_Dependent,
-        ParamType->isLValueReferenceType()   ? VK_LValue
-        : ParamType->isRValueReferenceType() ? VK_XValue
-                                             : VK_PRValue);
-    if (E.isInvalid())
-      return ExprError();
-    setDeductionArg(E.get());
+    if (!IsDeduced) {
+      ExprResult E = ImpCastExprToType(
+          DeductionArg, ParamType.getNonLValueExprType(Context), CK_Dependent,
+          ParamType->isLValueReferenceType()   ? VK_LValue
+          : ParamType->isRValueReferenceType() ? VK_XValue
+                                               : VK_PRValue);
+      if (E.isInvalid())
+        return ExprError();
+      setDeductionArg(E.get());
+    }
     SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false);
     CanonicalConverted = TemplateArgument(
         Context.getCanonicalTemplateArgument(SugaredConverted));
     return Arg;
   }
@@ -8555,6 +8547,7 @@ static SourceRange findTemplateParameter(unsigned Depth, TypeLoc TL) {
 static bool CheckNonTypeTemplatePartialSpecializationArgs(
     Sema &S, SourceLocation TemplateNameLoc, NonTypeTemplateParmDecl *Param,
     const TemplateArgument *Args, unsigned NumArgs, bool IsDefaultArgument) {
+  bool HasError = false;
   for (unsigned I = 0; I != NumArgs; ++I) {
     if (Args[I].getKind() == TemplateArgument::Pack) {
       if (CheckNonTypeTemplatePartialSpecializationArgs(
@@ -8569,6 +8562,10 @@ static bool CheckNonTypeTemplatePartialSpecializationArgs(
       continue;
 
     Expr *ArgExpr = Args[I].getAsExpr();
+    if (ArgExpr->containsErrors()) {
+      HasError = true;
+      continue;
+    }
 
     // We can have a pack expansion of any of the bullets below.
     if (PackExpansionExpr *Expansion = dyn_cast<PackExpansionExpr>(ArgExpr))
@@ -8638,7 +8635,7 @@ static bool CheckNonTypeTemplatePartialSpecializationArgs(
     }
   }
 
-  return false;
+  return HasError;
 }
 
 bool Sema::CheckTemplatePartialSpecializationArgs(
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 962fa4da75946..f6ee7452c2f9a 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -5262,18 +5262,6 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
   SmallVector<DeducedTemplateArgument, 1> Deduced;
   Deduced.resize(1);
 
-  // If deduction failed, don't diagnose if the initializer is dependent; it
-  // might acquire a matching type in the instantiation.
-  auto DeductionFailed = [&](TemplateDeductionResult TDK) {
-    if (Init->isTypeDependent()) {
-      Result =
-          SubstituteDeducedTypeTransform(*this, DependentResult).Apply(Type);
-      assert(!Result.isNull() && "substituting DependentTy can't fail");
-      return TemplateDeductionResult::Success;
-    }
-    return TDK;
-  };
-
   SmallVector<OriginalCallArg, 4> OriginalCallArgs;
 
   QualType DeducedType;
@@ -5323,9 +5311,9 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
         Diag(Info.getLocation(), diag::err_auto_inconsistent_deduction)
             << Info.FirstArg << Info.SecondArg << DeducedFromInitRange
             << Init->getSourceRange();
-        return DeductionFailed(TemplateDeductionResult::AlreadyDiagnosed);
+        return TemplateDeductionResult::AlreadyDiagnosed;
       }
-      return DeductionFailed(TDK);
+      return TDK;
     }
 
   if (DeducedFromInitRange.isInvalid() &&
@@ -5347,12 +5335,12 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
             OriginalCallArgs, /*Decomposed=*/false, /*ArgIdx=*/0, /*TDF=*/0,
             FailedTSC);
         TDK != TemplateDeductionResult::Success)
-      return DeductionFailed(TDK);
+      return TDK;
   }
 
   // Could be null if somehow 'auto' appears in a non-deduced context.
   if (Deduced[0].getKind() != TemplateArgument::Type)
-    return DeductionFailed(TemplateDeductionResult::Incomplete);
+    return TemplateDeductionResult::Incomplete;
   DeducedType = Deduced[0].getAsType();
 
   if (InitList) {
@@ -5366,7 +5354,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
     if (!Context.hasSameType(DeducedType, Result)) {
       Info.FirstArg = Result;
       Info.SecondArg = DeducedType;
-      return DeductionFailed(TemplateDeductionResult::Inconsistent);
+      return TemplateDeductionResult::Inconsistent;
     }
     DeducedType = Context.getCommonSugaredType(Result, DeducedType);
   }
@@ -5390,7 +5378,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result,
             CheckOriginalCallArgDeduction(*this, Info, OriginalArg, DeducedA);
         TDK != TemplateDeductionResult::Success) {
       Result = QualType();
-      return DeductionFailed(TDK);
+      return TDK;
     }
   }
 
@@ -5412,13 +5400,17 @@ TypeSourceInfo *Sema::SubstAutoTypeSourceInfo(TypeSourceInfo *TypeWithAuto,
 }
 
 QualType Sema::SubstAutoTypeDependent(QualType TypeWithAuto) {
-  return SubstituteDeducedTypeTransform(*this, DependentAuto{false})
+  return SubstituteDeducedTypeTransform(
+             *this,
+             DependentAuto{/*IsPack=*/isa<PackExpansionType>(TypeWithAuto)})
       .TransformType(TypeWithAuto);
 }
 
 TypeSourceInfo *
 Sema::SubstAutoTypeSourceInfoDependent(TypeSourceInfo *TypeWithAuto) {
-  return SubstituteDeducedTypeTransform(*this, DependentAuto{false})
+  return SubstituteDeducedTypeTransform(
+             *this, DependentAuto{/*IsPack=*/isa<PackExpansionType>(
+                        TypeWithAuto->getType())})
      .TransformType(TypeWithAuto);
 }
diff --git a/clang/test/SemaTemplate/temp_arg_template_p0522.cpp b/clang/test/SemaTemplate/temp_arg_template_p0522.cpp
index d8a81bb363112..60d98a653ff02 100644
--- a/clang/test/SemaTemplate/temp_arg_template_p0522.cpp
+++ b/clang/test/SemaTemplate/temp_arg_template_p0522.cpp
@@ -83,11 +83,11 @@ namespace DependentType {
 namespace Auto {
   template<template<int> typename T> struct TInt {}; // #TInt
   template<template<int*> typename T> struct TIntPtr {}; // #TIntPtr
-  template<template<auto> typename T> struct TAuto {};
+  template<template<auto> typename T> struct TAuto {}; // #TAuto
   template<template<auto*> typename T> struct TAutoPtr {};
-  template<template<decltype(auto)> typename T> struct TDecltypeAuto {};
+  template<template<decltype(auto)> typename T> struct TDecltypeAuto {}; // #TDecltypeAuto
   template<auto> struct Auto;
-  template<auto*> struct AutoPtr; // #AutoPtr
+  template<auto*> struct AutoPtr;
   template<decltype(auto)> struct DecltypeAuto;
   template<int> struct Int;
   template<int*> struct IntPtr;
@@ -108,7 +108,7 @@ namespace Auto {
   TIntPtr<IntPtr> ipip;
 
   TAuto<Auto> aa;
-  TAuto<AutoPtr> aap; // expected-error@#AutoPtr {{could not match 'auto *' against 'auto'}}
+  TAuto<AutoPtr> aap; // expected-error@#TAuto {{non-type template parameter '' with type 'auto *' has incompatible initializer of type 'auto'}} // expected-note@-1 {{different template parameters}}
   TAuto<Int> ai; // FIXME: ill-formed (?)
   TAuto<IntPtr> aip; // FIXME: ill-formed (?)
@@ -130,7 +130,7 @@ namespace Auto {
   // parameters (such as 'user-defined-type &') that are not valid 'auto'
   // parameters.
   TDecltypeAuto<DecltypeAuto> daa;
-  TDecltypeAuto<AutoPtr> daap; // expected-error@#AutoPtr {{could not match 'auto *' against 'decltype(auto)'}}
+  TDecltypeAuto<AutoPtr> daap; // expected-error@#TDecltypeAuto {{non-type template parameter '' with type 'auto *' has incompatible initializer of type 'decltype(auto)'}} // expected-note@-1 {{different template parameters}}
 
   int n;

From 078e99ef017cac3899e5dbc2ed917f173c9eedad Mon Sep 17 00:00:00 2001
From: Matheus Izvekov
Date: Sat, 27 Sep 2025 22:30:06 -0300
Subject: [PATCH 019/878] [clang] fix transformation of subst constant template parameter nodes (#161029)

This simplifies those transforms a lot, removing a bunch of workarounds
which were introducing problems.

The transforms become independent of the template instantiator, so they
are moved to TreeTransform instead.

Fixes #131342

This PR was already reviewed and approved at
https://github.com/llvm/llvm-project/pull/160777, but I accidentally
merged that into another PR, instead of main.
---
 clang/docs/ReleaseNotes.rst                   |   1 +
 clang/include/clang/AST/ExprCXX.h             |   2 +-
 clang/include/clang/AST/TypeBase.h            |   4 +-
 clang/include/clang/Sema/Sema.h               |  17 ++
 clang/lib/AST/ExprCXX.cpp                     |   4 +-
 clang/lib/AST/StmtProfile.cpp                 |   3 +-
 clang/lib/Sema/SemaTemplate.cpp               |  34 +++
 clang/lib/Sema/SemaTemplateInstantiate.cpp    | 203 ++----------------
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  |  23 +-
 clang/lib/Sema/TreeTransform.h                |  72 +++++--
 clang/test/SemaCXX/ctad.cpp                   |   7 +
 clang/test/SemaCXX/cxx20-ctad-type-alias.cpp  |   2 +-
 .../SemaTemplate/temp_arg_nontype_cxx2c.cpp   |  11 +
 13 files changed, 168 insertions(+), 215 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6cfe42b5dc9f8..98c889c08b329 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -361,6 +361,7 @@ Bug Fixes in This Version
   first parameter. (#GH113323).
 - Fixed a crash with incompatible pointer to integer conversions in designated
   initializers involving string literals. (#GH154046)
+- Fix crash on CTAD for alias template. (#GH131342)
 - Clang now emits a frontend error when a function marked with the `flatten`
   attribute calls another function that requires target features not enabled in
   the caller. This prevents a fatal error in the backend.
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index 9fedb230ce397..5f16bac94d5e6 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -4714,7 +4714,7 @@ class SubstNonTypeTemplateParmExpr : public Expr {
   // sugared: it doesn't need to be resugared later.
   bool getFinal() const { return Final; }
 
-  NamedDecl *getParameter() const;
+  NonTypeTemplateParmDecl *getParameter() const;
 
   bool isReferenceParameter() const { return AssociatedDeclAndRef.getInt(); }
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h
index b02d9c7499fe5..e0d00b82f2b76 100644
--- a/clang/include/clang/AST/TypeBase.h
+++ b/clang/include/clang/AST/TypeBase.h
@@ -3495,7 +3495,9 @@ class AdjustedType : public Type, public llvm::FoldingSetNode {
 
   AdjustedType(TypeClass TC, QualType OriginalTy, QualType AdjustedTy,
                QualType CanonicalPtr)
-      : Type(TC, CanonicalPtr, OriginalTy->getDependence()),
+      : Type(TC, CanonicalPtr,
+             AdjustedTy->getDependence() |
+                 (OriginalTy->getDependence() & ~TypeDependence::Dependent)),
         OriginalTy(OriginalTy), AdjustedTy(AdjustedTy) {}
 
 public:
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 5edfc29d93781..2bd6be2a32cd5 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11714,6 +11714,23 @@ class Sema final : public SemaBase {
                              const TemplateArgumentListInfo *TemplateArgs,
                              bool IsAddressOfOperand);
 
+  UnsignedOrNone getPackIndex(TemplateArgument Pack) const {
+    return Pack.pack_size() - 1 - *ArgPackSubstIndex;
+  }
+
+  TemplateArgument
+  getPackSubstitutedTemplateArgument(TemplateArgument Arg) const {
+    Arg = Arg.pack_elements()[*ArgPackSubstIndex];
+    if (Arg.isPackExpansion())
+      Arg = Arg.getPackExpansionPattern();
+    return Arg;
+  }
+
+  ExprResult BuildSubstNonTypeTemplateParmExpr(
+      Decl *AssociatedDecl, const NonTypeTemplateParmDecl *NTTP,
+      SourceLocation loc, TemplateArgument Replacement,
+      UnsignedOrNone PackIndex, bool Final);
+
   /// Form a template name from a name that is syntactically required to name a
   /// template, either due to use of the 'template' keyword or because a name in
   /// this syntactic context is assumed to name a template (C++
diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp
index 97ae4a07f32aa..95de6a82a5270 100644
--- a/clang/lib/AST/ExprCXX.cpp
+++ b/clang/lib/AST/ExprCXX.cpp
@@ -1725,8 +1725,8 @@ SizeOfPackExpr *SizeOfPackExpr::CreateDeserialized(ASTContext &Context,
   return new (Storage) SizeOfPackExpr(EmptyShell(), NumPartialArgs);
 }
 
-NamedDecl *SubstNonTypeTemplateParmExpr::getParameter() const {
-  return cast<NamedDecl>(
+NonTypeTemplateParmDecl *SubstNonTypeTemplateParmExpr::getParameter() const {
+  return cast<NonTypeTemplateParmDecl>(
       getReplacedTemplateParameterList(getAssociatedDecl())->asArray()[Index]);
 }
 
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 37c4d43ec0b2f..8b3af94e1a8bc 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -1353,7 +1353,8 @@ void StmtProfiler::VisitExpr(const Expr *S) {
 }
 
 void StmtProfiler::VisitConstantExpr(const ConstantExpr *S) {
-  VisitExpr(S);
+  // Profile exactly as the sub-expression.
+  Visit(S->getSubExpr());
 }
 
 void StmtProfiler::VisitDeclRefExpr(const DeclRefExpr *S) {
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 91e4dc8a2d039..3ebbb30ae483e 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -775,6 +775,40 @@ Sema::BuildDependentDeclRefExpr(const CXXScopeSpec &SS,
                                 TemplateArgs);
 }
 
+ExprResult Sema::BuildSubstNonTypeTemplateParmExpr(
+    Decl *AssociatedDecl, const NonTypeTemplateParmDecl *NTTP,
+    SourceLocation Loc, TemplateArgument Arg, UnsignedOrNone PackIndex,
+    bool Final) {
+  // The template argument itself might be an expression, in which case we just
+  // return that expression. This happens when substituting into an alias
+  // template.
+  Expr *Replacement;
+  bool refParam = true;
+  if (Arg.getKind() == TemplateArgument::Expression) {
+    Replacement = Arg.getAsExpr();
+    refParam = Replacement->isLValue();
+    if (refParam && Replacement->getType()->isRecordType()) {
+      QualType ParamType =
+          NTTP->isExpandedParameterPack()
+              ? NTTP->getExpansionType(*SemaRef.ArgPackSubstIndex)
+              : NTTP->getType();
+      if (const auto *PET = dyn_cast<PackExpansionType>(ParamType))
+        ParamType = PET->getPattern();
+      refParam = ParamType->isReferenceType();
+    }
+  } else {
+    ExprResult result =
+        SemaRef.BuildExpressionFromNonTypeTemplateArgument(Arg, Loc);
+    if (result.isInvalid())
+      return ExprError();
+    Replacement = result.get();
+    refParam = Arg.getNonTypeTemplateArgumentType()->isReferenceType();
+  }
+  return new (SemaRef.Context) SubstNonTypeTemplateParmExpr(
+      Replacement->getType(), Replacement->getValueKind(), Loc, Replacement,
+      AssociatedDecl, NTTP->getIndex(), PackIndex, refParam, Final);
+}
+
 bool Sema::DiagnoseUninstantiableTemplate(SourceLocation PointOfInstantiation,
                                           NamedDecl *Instantiation,
                                           bool InstantiatedFromMember,
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index a72c95d6d77cf..1ff94d7ae397f 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1373,16 +1373,6 @@ std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const {
   return std::nullopt;
 }
 
-static TemplateArgument
-getPackSubstitutedTemplateArgument(Sema &S, TemplateArgument Arg) {
-  assert(S.ArgPackSubstIndex);
-  assert(*S.ArgPackSubstIndex < Arg.pack_size());
-  Arg = Arg.pack_begin()[*S.ArgPackSubstIndex];
-  if (Arg.isPackExpansion())
-    Arg = Arg.getPackExpansionPattern();
-  return Arg;
-}
-
 //===----------------------------------------------------------------------===/
 // Template Instantiation for Types
 //===----------------------------------------------------------------------===/
@@ -1449,13 +1439,6 @@ namespace {
       return TemplateArgs.getNewDepth(Depth);
     }
 
-    UnsignedOrNone getPackIndex(TemplateArgument Pack) {
-      UnsignedOrNone Index = getSema().ArgPackSubstIndex;
-      if (!Index)
-        return std::nullopt;
-      return Pack.pack_size() - 1 - *Index;
-    }
-
     bool TryExpandParameterPacks(SourceLocation EllipsisLoc,
                                  SourceRange PatternRange,
                                  ArrayRef<UnexpandedParameterPack> Unexpanded,
@@ -1537,7 +1520,7 @@ namespace {
       if (TA.getKind() != TemplateArgument::Pack)
        return TA;
      if (SemaRef.ArgPackSubstIndex)
-        return getPackSubstitutedTemplateArgument(SemaRef, TA);
+        return SemaRef.getPackSubstitutedTemplateArgument(TA);
      assert(TA.pack_size() == 1 && TA.pack_begin()->isPackExpansion() &&
             "unexpected pack arguments in template rewrite");
      TemplateArgument Arg = *TA.pack_begin();
@@ -1643,10 +1626,6 @@ namespace {
     ExprResult TransformTemplateParmRefExpr(DeclRefExpr *E,
                                             NonTypeTemplateParmDecl *D);
 
ExprResult TransformSubstNonTypeTemplateParmPackExpr( - SubstNonTypeTemplateParmPackExpr *E); - ExprResult TransformSubstNonTypeTemplateParmExpr( - SubstNonTypeTemplateParmExpr *E); /// Rebuild a DeclRefExpr for a VarDecl reference. ExprResult RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc); @@ -1933,12 +1912,6 @@ namespace { SmallVectorImpl &PTypes, SmallVectorImpl &TransParams, Sema::ExtParameterInfoBuilder &PInfos); - - private: - ExprResult - transformNonTypeTemplateParmRef(Decl *AssociatedDecl, const NamedDecl *parm, - SourceLocation loc, TemplateArgument arg, - UnsignedOrNone PackIndex, bool Final); }; } @@ -1975,7 +1948,7 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) { if (TTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } TemplateName Template = Arg.getAsTemplate(); @@ -2079,7 +2052,7 @@ TemplateInstantiator::TransformFirstQualifierInScope(NamedDecl *D, if (!getSema().ArgPackSubstIndex) return nullptr; - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } QualType T = Arg.getAsType(); @@ -2165,8 +2138,8 @@ TemplateName TemplateInstantiator::TransformTemplateName( Arg, AssociatedDecl, TTP->getIndex(), Final); } - PackIndex = getPackIndex(Arg); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + PackIndex = SemaRef.getPackIndex(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } TemplateName Template = Arg.getAsTemplate(); @@ -2183,10 +2156,10 @@ TemplateName TemplateInstantiator::TransformTemplateName( TemplateArgument Pack = SubstPack->getArgumentPack(); TemplateName Template = - getPackSubstitutedTemplateArgument(getSema(), Pack).getAsTemplate(); + SemaRef.getPackSubstitutedTemplateArgument(Pack).getAsTemplate(); return getSema().Context.getSubstTemplateTemplateParm( Template, SubstPack->getAssociatedDecl(), SubstPack->getIndex(), - getPackIndex(Pack), SubstPack->getFinal()); + SemaRef.getPackIndex(Pack), SubstPack->getFinal()); } return inherited::TransformTemplateName( @@ -2252,11 +2225,11 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, ExprType, TargetType->isReferenceType() ? VK_LValue : VK_PRValue, E->getLocation(), Arg, AssociatedDecl, NTTP->getPosition(), Final); } - PackIndex = getPackIndex(Arg); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + PackIndex = SemaRef.getPackIndex(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } - return transformNonTypeTemplateParmRef(AssociatedDecl, NTTP, E->getLocation(), - Arg, PackIndex, Final); + return SemaRef.BuildSubstNonTypeTemplateParmExpr( + AssociatedDecl, NTTP, E->getLocation(), Arg, PackIndex, Final); } const AnnotateAttr * @@ -2344,144 +2317,6 @@ TemplateInstantiator::TransformOpenACCRoutineDeclAttr( "applies to a Function Decl (and a few places for VarDecl)"); } -ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( - Decl *AssociatedDecl, const NamedDecl *parm, SourceLocation loc, - TemplateArgument arg, UnsignedOrNone PackIndex, bool Final) { - ExprResult result; - - // Determine the substituted parameter type. We can usually infer this from - // the template argument, but not always. 
- auto SubstParamType = [&] { - if (const auto *NTTP = dyn_cast(parm)) { - QualType T; - if (NTTP->isExpandedParameterPack()) - T = NTTP->getExpansionType(*SemaRef.ArgPackSubstIndex); - else - T = NTTP->getType(); - if (parm->isParameterPack() && isa(T)) - T = cast(T)->getPattern(); - return SemaRef.SubstType(T, TemplateArgs, loc, parm->getDeclName()); - } - return SemaRef.SubstType(arg.getAsExpr()->getType(), TemplateArgs, loc, - parm->getDeclName()); - }; - - bool refParam = false; - - // The template argument itself might be an expression, in which case we just - // return that expression. This happens when substituting into an alias - // template. - if (arg.getKind() == TemplateArgument::Expression) { - Expr *argExpr = arg.getAsExpr(); - result = argExpr; - if (argExpr->isLValue()) { - if (argExpr->getType()->isRecordType()) { - // Check whether the parameter was actually a reference. - QualType paramType = SubstParamType(); - if (paramType.isNull()) - return ExprError(); - refParam = paramType->isReferenceType(); - } else { - refParam = true; - } - } - } else if (arg.getKind() == TemplateArgument::Declaration || - arg.getKind() == TemplateArgument::NullPtr) { - if (arg.getKind() == TemplateArgument::Declaration) { - ValueDecl *VD = arg.getAsDecl(); - - // Find the instantiation of the template argument. This is - // required for nested templates. - VD = cast_or_null( - getSema().FindInstantiatedDecl(loc, VD, TemplateArgs)); - if (!VD) - return ExprError(); - } - - QualType paramType = arg.getNonTypeTemplateArgumentType(); - assert(!paramType.isNull() && "type substitution failed for param type"); - assert(!paramType->isDependentType() && "param type still dependent"); - result = SemaRef.BuildExpressionFromDeclTemplateArgument(arg, paramType, loc); - refParam = paramType->isReferenceType(); - } else { - QualType paramType = arg.getNonTypeTemplateArgumentType(); - result = SemaRef.BuildExpressionFromNonTypeTemplateArgument(arg, loc); - refParam = paramType->isReferenceType(); - assert(result.isInvalid() || - SemaRef.Context.hasSameType(result.get()->getType(), - paramType.getNonReferenceType())); - } - - if (result.isInvalid()) - return ExprError(); - - Expr *resultExpr = result.get(); - return new (SemaRef.Context) SubstNonTypeTemplateParmExpr( - resultExpr->getType(), resultExpr->getValueKind(), loc, resultExpr, - AssociatedDecl, - clang::getDepthAndIndex(const_cast(parm)).second, PackIndex, - refParam, Final); -} - -ExprResult -TemplateInstantiator::TransformSubstNonTypeTemplateParmPackExpr( - SubstNonTypeTemplateParmPackExpr *E) { - if (!getSema().ArgPackSubstIndex) { - // We aren't expanding the parameter pack, so just return ourselves. 
- return E; - } - - TemplateArgument Pack = E->getArgumentPack(); - TemplateArgument Arg = getPackSubstitutedTemplateArgument(getSema(), Pack); - return transformNonTypeTemplateParmRef( - E->getAssociatedDecl(), E->getParameterPack(), - E->getParameterPackLocation(), Arg, getPackIndex(Pack), E->getFinal()); -} - -ExprResult -TemplateInstantiator::TransformSubstNonTypeTemplateParmExpr( - SubstNonTypeTemplateParmExpr *E) { - ExprResult SubstReplacement = E->getReplacement(); - if (!isa(SubstReplacement.get())) - SubstReplacement = TransformExpr(E->getReplacement()); - if (SubstReplacement.isInvalid()) - return true; - QualType SubstType = TransformType(E->getParameterType(getSema().Context)); - if (SubstType.isNull()) - return true; - // The type may have been previously dependent and not now, which means we - // might have to implicit cast the argument to the new type, for example: - // template - // concept C = sizeof(U) == 4; - // void foo() requires C<2, 'a'> { } - // When normalizing foo(), we first form the normalized constraints of C: - // AtomicExpr(sizeof(U) == 4, - // U=SubstNonTypeTemplateParmExpr(Param=U, - // Expr=DeclRef(U), - // Type=decltype(T))) - // Then we substitute T = 2, U = 'a' into the parameter mapping, and need to - // produce: - // AtomicExpr(sizeof(U) == 4, - // U=SubstNonTypeTemplateParmExpr(Param=U, - // Expr=ImpCast( - // decltype(2), - // SubstNTTPE(Param=U, Expr='a', - // Type=char)), - // Type=decltype(2))) - // The call to CheckTemplateArgument here produces the ImpCast. - TemplateArgument SugaredConverted, CanonicalConverted; - if (SemaRef - .CheckTemplateArgument(E->getParameter(), SubstType, - SubstReplacement.get(), SugaredConverted, - CanonicalConverted, - /*StrictCheck=*/false, Sema::CTAK_Specified) - .isInvalid()) - return true; - return transformNonTypeTemplateParmRef( - E->getAssociatedDecl(), E->getParameter(), E->getExprLoc(), - SugaredConverted, E->getPackIndex(), E->getFinal()); -} - ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc) { DeclarationNameInfo NameInfo(PD->getDeclName(), Loc); @@ -2701,8 +2536,8 @@ TemplateInstantiator::TransformTemplateTypeParmType(TypeLocBuilder &TLB, } // PackIndex starts from last element. 
- PackIndex = getPackIndex(Arg); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + PackIndex = SemaRef.getPackIndex(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } assert(Arg.getKind() == TemplateArgument::Type && @@ -2749,20 +2584,20 @@ QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( } TemplateArgument Pack = T->getArgumentPack(); - TemplateArgument Arg = getPackSubstitutedTemplateArgument(getSema(), Pack); + TemplateArgument Arg = SemaRef.getPackSubstitutedTemplateArgument(Pack); return BuildSubstTemplateTypeParmType( TLB, SuppressObjCLifetime, T->getFinal(), NewReplaced, T->getIndex(), - getPackIndex(Pack), Arg, TL.getNameLoc()); + SemaRef.getPackIndex(Pack), Arg, TL.getNameLoc()); } QualType TemplateInstantiator::TransformSubstBuiltinTemplatePackType( TypeLocBuilder &TLB, SubstBuiltinTemplatePackTypeLoc TL) { if (!getSema().ArgPackSubstIndex) return TreeTransform::TransformSubstBuiltinTemplatePackType(TLB, TL); - auto &Sema = getSema(); - TemplateArgument Result = getPackSubstitutedTemplateArgument( - Sema, TL.getTypePtr()->getArgumentPack()); - TLB.pushTrivial(Sema.getASTContext(), Result.getAsType(), TL.getBeginLoc()); + TemplateArgument Result = SemaRef.getPackSubstitutedTemplateArgument( + TL.getTypePtr()->getArgumentPack()); + TLB.pushTrivial(SemaRef.getASTContext(), Result.getAsType(), + TL.getBeginLoc()); return Result.getAsType(); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index adac3dff5b2b4..e2dc70360506e 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -3742,7 +3742,7 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( ExpandedParams.reserve(D->getNumExpansionTemplateParameters()); for (unsigned I = 0, N = D->getNumExpansionTemplateParameters(); I != N; ++I) { - LocalInstantiationScope Scope(SemaRef); + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); TemplateParameterList *Expansion = SubstTemplateParams(D->getExpansionTemplateParameters(I)); if (!Expansion) @@ -3774,7 +3774,7 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( if (Expand) { for (unsigned I = 0; I != *NumExpansions; ++I) { Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I); - LocalInstantiationScope Scope(SemaRef); + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); TemplateParameterList *Expansion = SubstTemplateParams(TempParams); if (!Expansion) return nullptr; @@ -3785,21 +3785,18 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( // expanded parameter pack is the original expansion type, but callers // will end up using the expanded parameter pack types for type-checking. IsExpandedParameterPack = true; - InstParams = TempParams; - } else { - // We cannot fully expand the pack expansion now, so just substitute - // into the pattern. - Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, std::nullopt); - - LocalInstantiationScope Scope(SemaRef); - InstParams = SubstTemplateParams(TempParams); - if (!InstParams) - return nullptr; } + + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, std::nullopt); + + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); + InstParams = SubstTemplateParams(TempParams); + if (!InstParams) + return nullptr; } else { // Perform the actual substitution of template parameters within a new, // local instantiation scope. 
- LocalInstantiationScope Scope(SemaRef); + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); InstParams = SubstTemplateParams(TempParams); if (!InstParams) return nullptr; diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 242ffb09af006..021407842aa6d 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -16289,20 +16289,68 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { IndexExpr.get(), ExpandedExprs, FullySubstituted); } -template -ExprResult -TreeTransform::TransformSubstNonTypeTemplateParmPackExpr( - SubstNonTypeTemplateParmPackExpr *E) { - // Default behavior is to do nothing with this transformation. - return E; +template +ExprResult TreeTransform::TransformSubstNonTypeTemplateParmPackExpr( + SubstNonTypeTemplateParmPackExpr *E) { + if (!getSema().ArgPackSubstIndex) + // We aren't expanding the parameter pack, so just return ourselves. + return E; + + TemplateArgument Pack = E->getArgumentPack(); + TemplateArgument Arg = SemaRef.getPackSubstitutedTemplateArgument(Pack); + return SemaRef.BuildSubstNonTypeTemplateParmExpr( + E->getAssociatedDecl(), E->getParameterPack(), + E->getParameterPackLocation(), Arg, SemaRef.getPackIndex(Pack), + E->getFinal()); } -template -ExprResult -TreeTransform::TransformSubstNonTypeTemplateParmExpr( - SubstNonTypeTemplateParmExpr *E) { - // Default behavior is to do nothing with this transformation. - return E; +template +ExprResult TreeTransform::TransformSubstNonTypeTemplateParmExpr( + SubstNonTypeTemplateParmExpr *E) { + Expr *OrigReplacement = E->getReplacement()->IgnoreImplicitAsWritten(); + ExprResult Replacement = getDerived().TransformExpr(OrigReplacement); + if (Replacement.isInvalid()) + return true; + + Decl *AssociatedDecl = + getDerived().TransformDecl(E->getNameLoc(), E->getAssociatedDecl()); + if (!AssociatedDecl) + return true; + + if (Replacement.get() == OrigReplacement && + AssociatedDecl == E->getAssociatedDecl()) + return E; + + // If the replacement expression did not change, and the parameter type + // did not change, we can skip the semantic action because it would + // produce the same result anyway. + auto *Param = cast( + getReplacedTemplateParameterList(AssociatedDecl) + ->asArray()[E->getIndex()]); + if (QualType ParamType = Param->getType(); + !SemaRef.Context.hasSameType(ParamType, E->getParameter()->getType()) || + Replacement.get() != OrigReplacement) { + + // When transforming the replacement expression previously, all Sema + // specific annotations, such as implicit casts, are discarded. Calling the + // corresponding sema action is necessary to recover those. Otherwise, + // equivalency of the result would be lost. + TemplateArgument SugaredConverted, CanonicalConverted; + Replacement = SemaRef.CheckTemplateArgument( + Param, ParamType, Replacement.get(), SugaredConverted, + CanonicalConverted, + /*StrictCheck=*/false, Sema::CTAK_Specified); + if (Replacement.isInvalid()) + return true; + } else { + // Otherwise, the same expression would have been produced. 
+    Replacement = E->getReplacement();
+  }
+
+  return new (SemaRef.Context) SubstNonTypeTemplateParmExpr(
+      Replacement.get()->getType(), Replacement.get()->getValueKind(),
+      E->getNameLoc(), Replacement.get(), AssociatedDecl, E->getIndex(),
+      E->getPackIndex(), E->isReferenceParameter(), E->getFinal());
 }
 
 template <typename Derived>
diff --git a/clang/test/SemaCXX/ctad.cpp b/clang/test/SemaCXX/ctad.cpp
index 8380b564bdcdd..7de7f50337e8c 100644
--- a/clang/test/SemaCXX/ctad.cpp
+++ b/clang/test/SemaCXX/ctad.cpp
@@ -190,3 +190,10 @@ namespace GH136624 {
   foo::Alias t = 0;
   // expected-error@-1 {{no viable conversion from 'int' to 'GH136624::A' (aka 'A')}}
 } // namespace GH136624
+
+namespace GH131342 {
+  template constexpr int val{0};
+  template struct A { A(T) {} };
+  template using AA = A>;
+  AA a{0};
+} // namespace GH131342
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 1f4d44218ad1f..2f1817d0ca7eb 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -113,7 +113,7 @@ using Bar = Foo; // expected-note {{candidate template ignored: co
   // expected-note {{implicit deduction guide declared as 'template requires __is_deducible(test9::Bar, test9::Foo) Bar(test9::Foo) -> test9::Foo'}} \
   // expected-note {{implicit deduction guide declared as 'template requires __is_deducible(test9::Bar, test9::Foo) Bar(const X (&)[sizeof(X)]) -> test9::Foo'}} \
   // expected-note {{candidate template ignored: constraints not satisfied [with X = int]}} \
-  // expected-note {{cannot deduce template arguments for 'test9::Bar' from 'test9::Foo'}}
+  // expected-note {{cannot deduce template arguments for 'test9::Bar' from 'test9::Foo'}}
 
 Bar s = {{1}}; // expected-error {{no viable constructor or deduction guide }}
 
diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
index e74c031eba4c1..c4ac36e263bc8 100644
--- a/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
+++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx2c.cpp
@@ -123,3 +123,14 @@ Set sf;
 // expected-note@#C {{evaluated to false}}
 
 } // namespace GH84052
+
+namespace error_on_type_instantiation {
+  int f(int) = delete;
+  // expected-note@-1 {{candidate function has been explicitly deleted}}
+  template struct X {};
+  // expected-error@-1 {{call to deleted function 'f'}}
+  template void g() { X x; }
+  // expected-note@-1 {{while substituting prior template arguments into non-type template parameter [with T = int]}}
+  template void g();
+  // expected-note@-1 {{in instantiation of function template specialization}}
+}

From ebfee327df69e6cfeaa4c5300e6abd19476b8bfe Mon Sep 17 00:00:00 2001
From: Hongyu Chen
Date: Sun, 28 Sep 2025 10:14:30 +0800
Subject: [PATCH 020/878] [SDAG] Constant fold frexp in signed way (#161015)

Fixes #160981
The exponent of a floating-point number is signed. This patch
prevents treating it as unsigned.
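
For illustration, here is a minimal reproducer (a reduced form of the
pr160981 test added below; the function name is illustrative). frexp of
the largest subnormal float folds to the signed exponent -126, which
the unsigned getConstant() path mishandled and getSignedConstant() now
emits correctly:

  define { float, i32 } @fold_subnormal() {
    ; bitcast (i32 8388607) is the largest subnormal float; its frexp
    ; result is mantissa ~0.99999988 with signed exponent -126
    %r = call { float, i32 } @llvm.frexp.f32.i32(float bitcast (i32 8388607 to float))
    ret { float, i32 } %r
  }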
---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  4 ++--
 llvm/test/CodeGen/X86/llvm.frexp.ll            | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 7aa293af963e6..8fc7eabf90ea8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -11161,8 +11161,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
       APFloat FrexpMant =
           frexp(C->getValueAPF(), FrexpExp, APFloat::rmNearestTiesToEven);
       SDValue Result0 = getConstantFP(FrexpMant, DL, VTList.VTs[0]);
-      SDValue Result1 =
-          getConstant(FrexpMant.isFinite() ? FrexpExp : 0, DL, VTList.VTs[1]);
+      SDValue Result1 = getSignedConstant(FrexpMant.isFinite() ? FrexpExp : 0,
+                                          DL, VTList.VTs[1]);
       return getNode(ISD::MERGE_VALUES, DL, VTList, {Result0, Result1}, Flags);
     }
 
diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll
index 83840dd85c533..e3a1b1b83b2e3 100644
--- a/llvm/test/CodeGen/X86/llvm.frexp.ll
+++ b/llvm/test/CodeGen/X86/llvm.frexp.ll
@@ -582,6 +582,22 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) nounwind {
   ret i32 %result.0
 }
 
+define { float, i32 } @pr160981() {
+; X64-LABEL: pr160981:
+; X64:       # %bb.0:
+; X64-NEXT:    movss {{.*#+}} xmm0 = [9.9999988E-1,0.0E+0,0.0E+0,0.0E+0]
+; X64-NEXT:    movl $-126, %eax
+; X64-NEXT:    retq
+;
+; WIN32-LABEL: pr160981:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    flds __real@3f7ffffe
+; WIN32-NEXT:    movl $-126, %eax
+; WIN32-NEXT:    retl
+  %ret = call { float, i32 } @llvm.frexp.f32.i32(float bitcast (i32 8388607 to float))
+  ret { float, i32 } %ret
+}
+
 ; FIXME: Widen vector result
 ; define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) nounwind {
 ;   %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a)

From 9630b321a50a3712ca092a53a4a4c7bea94b3af2 Mon Sep 17 00:00:00 2001
From: Davide Mor <39653004+Tazdevil971@users.noreply.github.com>
Date: Sun, 28 Sep 2025 04:49:40 +0200
Subject: [PATCH 021/878] [MIPS][float] Fixed SingleFloat codegen on N32/N64
 targets (#140575)

This patch makes the combination of single-float and the N32/N64 ABIs
work properly. Previously, when both options were enabled, the compiler
chose an incorrect ABI and in some cases even generated wrong
instructions.

Floating-point behavior on MIPS is controlled through 3 flags:
soft-float, single-float, fp64. This makes things complicated because
fp64 indicates the presence of 64-bit floating-point registers, but
cannot be easily disabled (the mips3 feature requires it, yet mips3
CPUs with only 32-bit floating point exist). Also, a missing fp64 does
not actually disable 64-bit floating-point operations, because certain
MIPS1/2 CPUs support 64-bit floating point with 32-bit registers, hence
the single-float option.

Most likely single-float was originally only intended for the latter
case, which is why it did not work properly on 64-bit targets.

So this patch does the following:
- Make single-float a "master disable": even if fp64 is enabled, this
completely disables generation of 64-bit floating-point operations,
making single-float available on targets which hard-require fp64.
- Add proper calling conventions for N32/N64 single-float combinations.
- Fix up codegen to not generate certain 64-bit floating-point
operations; apparently not assigning a register class to f64 values is
not enough to prevent them from showing up (see the sketch below).
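
  A hypothetical reduced example of that last point (names are
  illustrative, mirroring the int-to-float-conversion.ll update below):
  with +single-float on a mips64 target, an i64-to-float conversion
  must not use the 64-bit dmtc1/cvt.s.l sequence and is instead
  expanded to a libcall:

    define float @conv(i64 %a) {
      %r = sitofp i64 %a to float   ; now lowers to a __floatdisf call
      ret float %r
    }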
- Add tests for the new calling conventions and codegen. --- llvm/lib/Target/Mips/MipsCallingConv.td | 26 +- llvm/lib/Target/Mips/MipsISelLowering.cpp | 20 +- llvm/lib/Target/Mips/MipsRegisterInfo.cpp | 38 ++- llvm/lib/Target/Mips/MipsSEISelLowering.cpp | 11 + .../arguments-hard-single-float-varargs.ll | 148 ++++++++++++ .../Mips/cconv/arguments-hard-single-float.ll | 224 ++++++++++++++++++ .../Mips/cconv/arguments-hard-single-fp128.ll | 42 ++++ .../Mips/cconv/callee-saved-singlefloat.ll | 111 +++++++++ .../Mips/cconv/return-hard-single-float.ll | 43 ++++ .../Mips/cconv/return-hard-single-fp128.ll | 24 ++ .../Mips/inlineasm-constraints-singlefloat.ll | 68 ++++++ .../CodeGen/Mips/int-to-float-conversion.ll | 52 ++-- 12 files changed, 766 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float-varargs.ll create mode 100644 llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float.ll create mode 100644 llvm/test/CodeGen/Mips/cconv/arguments-hard-single-fp128.ll create mode 100644 llvm/test/CodeGen/Mips/cconv/callee-saved-singlefloat.ll create mode 100644 llvm/test/CodeGen/Mips/cconv/return-hard-single-float.ll create mode 100644 llvm/test/CodeGen/Mips/cconv/return-hard-single-fp128.ll create mode 100644 llvm/test/CodeGen/Mips/inlineasm-constraints-singlefloat.ll diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 3501f9fbfd2e7..748162525b091 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -186,7 +186,8 @@ def RetCC_MipsN : CallingConv<[ // // f128 should only occur for the N64 ABI where long double is 128-bit. On // N32, long double is equivalent to double. - CCIfType<[i64], CCIfOrigArgWasF128>>, + CCIfSubtargetNot<"isSingleFloat()", + CCIfType<[i64], CCIfOrigArgWasF128>>>, // Aggregate returns are positioned at the lowest address in the slot for // both little and big-endian targets. When passing in registers, this @@ -316,9 +317,10 @@ def CC_Mips_FixedArg : CallingConv<[ // // f128 should only occur for the N64 ABI where long double is 128-bit. On // N32, long double is equivalent to double. - CCIfType<[i64], - CCIfSubtargetNot<"useSoftFloat()", - CCIfOrigArgWasF128>>>, + CCIfType<[i64], + CCIfSubtargetNot<"isSingleFloat()", + CCIfSubtargetNot<"useSoftFloat()", + CCIfOrigArgWasF128>>>>, CCIfCC<"CallingConv::Fast", CCDelegateTo>, @@ -342,8 +344,8 @@ def CC_Mips : CallingConv<[ // Callee-saved register lists. 
//===----------------------------------------------------------------------===// -def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP, - (sequence "S%u", 7, 0))>; +def CSR_O32_SingleFloat : CalleeSavedRegs<(add(sequence "F%u", 31, 20), RA, FP, + (sequence "S%u", 7, 0))>; def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP, (sequence "S%u", 7, 0))> { @@ -357,13 +359,19 @@ def CSR_O32_FP64 : CalleeSavedRegs<(add (decimate (sequence "D%u_64", 30, 20), 2), RA, FP, (sequence "S%u", 7, 0))>; -def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64, - D30_64, RA_64, FP_64, GP_64, - (sequence "S%u_64", 7, 0))>; +def CSR_N32 : CalleeSavedRegs<(add(decimate(sequence "D%u_64", 30, 20), 2), + RA_64, FP_64, GP_64, (sequence "S%u_64", 7, 0))>; + +def CSR_N32_SingleFloat + : CalleeSavedRegs<(add(decimate(sequence "F%u", 30, 20), 2), RA_64, FP_64, + GP_64, (sequence "S%u_64", 7, 0))>; def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64, GP_64, (sequence "S%u_64", 7, 0))>; +def CSR_N64_SingleFloat : CalleeSavedRegs<(add(sequence "F%u", 31, 24), RA_64, + FP_64, GP_64, (sequence "S%u_64", 7, 0))>; + def CSR_Mips16RetHelper : CalleeSavedRegs<(add V0, V1, FP, (sequence "A%u", 3, 0), (sequence "S%u", 7, 0), diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 1491300e37d3e..b05de49d8332a 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4265,10 +4265,16 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { return std::make_pair(0U, nullptr); if (Prefix == "$f") { // Parse $f0-$f31. - // If the size of FP registers is 64-bit or Reg is an even number, select - // the 64-bit register class. Otherwise, select the 32-bit register class. - if (VT == MVT::Other) - VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32; + // If the targets is single float only, always select 32-bit registers, + // otherwise if the size of FP registers is 64-bit or Reg is an even number, + // select the 64-bit register class. Otherwise, select the 32-bit register + // class. + if (VT == MVT::Other) { + if (Subtarget.isSingleFloat()) + VT = MVT::f32; + else + VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? 
MVT::f64 : MVT::f32; + } RC = getRegClassFor(VT); @@ -4308,10 +4314,12 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); } - if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat()) || + (VT == MVT::f64 && Subtarget.isSingleFloat())) && !Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR32RegClass); - if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat()) || + (VT == MVT::f64 && Subtarget.isSingleFloat())) && Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index 6f8d6764e77b8..6ca587b1ba4d5 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -89,14 +89,25 @@ MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { : CSR_Interrupt_32_SaveList; } - if (Subtarget.isSingleFloat()) - return CSR_SingleFloatOnly_SaveList; + // N64 ABI + if (Subtarget.isABI_N64()) { + if (Subtarget.isSingleFloat()) + return CSR_N64_SingleFloat_SaveList; - if (Subtarget.isABI_N64()) return CSR_N64_SaveList; + } + + // N32 ABI + if (Subtarget.isABI_N32()) { + if (Subtarget.isSingleFloat()) + return CSR_N32_SingleFloat_SaveList; - if (Subtarget.isABI_N32()) return CSR_N32_SaveList; + } + + // O32 ABI + if (Subtarget.isSingleFloat()) + return CSR_O32_SingleFloat_SaveList; if (Subtarget.isFP64bit()) return CSR_O32_FP64_SaveList; @@ -111,14 +122,25 @@ const uint32_t * MipsRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const { const MipsSubtarget &Subtarget = MF.getSubtarget(); - if (Subtarget.isSingleFloat()) - return CSR_SingleFloatOnly_RegMask; + // N64 ABI + if (Subtarget.isABI_N64()) { + if (Subtarget.isSingleFloat()) + return CSR_N64_SingleFloat_RegMask; - if (Subtarget.isABI_N64()) return CSR_N64_RegMask; + } + + // N32 ABI + if (Subtarget.isABI_N32()) { + if (Subtarget.isSingleFloat()) + return CSR_N32_SingleFloat_RegMask; - if (Subtarget.isABI_N32()) return CSR_N32_RegMask; + } + + // O32 ABI + if (Subtarget.isSingleFloat()) + return CSR_O32_SingleFloat_RegMask; if (Subtarget.isFP64bit()) return CSR_O32_FP64_RegMask; diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 71a70d9c2dd46..19917f3650bb5 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/CodeGenTypes/MachineValueType.h" @@ -211,6 +212,16 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, } } + // Targets with 64bits integer registers, but no 64bit floating point register + // do not support conversion between them + if (Subtarget.isGP64bit() && Subtarget.isSingleFloat() && + !Subtarget.useSoftFloat()) { + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Expand); + 
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); + } + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); setOperationAction(ISD::MULHS, MVT::i32, Custom); diff --git a/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float-varargs.ll b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float-varargs.ll new file mode 100644 index 0000000000000..8cbc879310f61 --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float-varargs.ll @@ -0,0 +1,148 @@ +; RUN: llc -mtriple=mips -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,O32 %s +; RUN: llc -mtriple=mipsel -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,O32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,N32,NEW,NEWBE %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,N32,NEW,NEWLE %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,N64,NEW,NEWBE %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,N64,NEW,NEWLE %s + +@floats = global [11 x float] zeroinitializer +@doubles = global [11 x double] zeroinitializer + +define void @double_args(double %a, ...) + nounwind { +entry: + %0 = getelementptr [11 x double], ptr @doubles, i32 0, i32 1 + store volatile double %a, ptr %0 + + %ap = alloca ptr + call void @llvm.va_start(ptr %ap) + %b = va_arg ptr %ap, double + %1 = getelementptr [11 x double], ptr @doubles, i32 0, i32 2 + store volatile double %b, ptr %1 + call void @llvm.va_end(ptr %ap) + ret void +} + +; ALL-LABEL: double_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) + +; O32 forbids using floating point registers for the non-variable portion. +; N32/N64 allow it. +; O32-DAG: sw $4, 8([[R2]]) +; O32-DAG: sw $5, 12([[R2]]) +; NEW-DAG: sd $4, 8([[R2]]) + +; The varargs portion is dumped to stack +; O32-DAG: sw $6, 16($sp) +; O32-DAG: sw $7, 20($sp) +; NEW-DAG: sd $5, 8($sp) +; NEW-DAG: sd $6, 16($sp) +; NEW-DAG: sd $7, 24($sp) +; NEW-DAG: sd $8, 32($sp) +; NEW-DAG: sd $9, 40($sp) +; NEW-DAG: sd $10, 48($sp) +; NEW-DAG: sd $11, 56($sp) + +; Get the varargs pointer +; O32 has 4 bytes padding, 4 bytes for the varargs pointer, and 8 bytes reserved +; for arguments 1 and 2. +; N32/N64 has 8 bytes for the varargs pointer, and no reserved area. +; O32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 16 +; O32-DAG: sw [[VAPTR]], 4($sp) +; N32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N32-DAG: sw [[VAPTR]], 4($sp) +; N64-DAG: daddiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N64-DAG: sd [[VAPTR]], 0($sp) + +; Increment the pointer then get the varargs arg +; LLVM will rebind the load to the stack pointer instead of the varargs pointer +; during lowering. This is fine and doesn't change the behaviour. 
+; O32-DAG: addiu [[VAPTR]], [[VAPTR]], 8 +; N32-DAG: addiu [[VAPTR]], [[VAPTR]], 8 +; N64-DAG: daddiu [[VAPTR]], [[VAPTR]], 8 +; O32-DAG: lw [[R3:\$[0-9]+]], 16($sp) +; O32-DAG: lw [[R4:\$[0-9]+]], 20($sp) +; O32-DAG: sw [[R3]], 16([[R2]]) +; O32-DAG: sw [[R4]], 20([[R2]]) +; NEW-DAG: ld [[R3:\$[0-9]+]], 8($sp) +; NEW-DAG: sd [[R3]], 16([[R2]]) + +define void @float_args(float %a, ...) nounwind { +entry: + %0 = getelementptr [11 x float], ptr @floats, i32 0, i32 1 + store volatile float %a, ptr %0 + + %ap = alloca ptr + call void @llvm.va_start(ptr %ap) + %b = va_arg ptr %ap, float + %1 = getelementptr [11 x float], ptr @floats, i32 0, i32 2 + store volatile float %b, ptr %1 + call void @llvm.va_end(ptr %ap) + ret void +} + +; ALL-LABEL: float_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) + +; The first four arguments are the same in O32/N32/N64. +; The non-variable portion should be unaffected. +; O32-DAG: mtc1 $4, $f0 +; O32-DAG: swc1 $f0, 4([[R2]]) +; NEW-DAG: swc1 $f12, 4([[R2]]) + +; The varargs portion is dumped to stack +; O32-DAG: sw $5, 12($sp) +; O32-DAG: sw $6, 16($sp) +; O32-DAG: sw $7, 20($sp) +; NEW-DAG: sd $5, 8($sp) +; NEW-DAG: sd $6, 16($sp) +; NEW-DAG: sd $7, 24($sp) +; NEW-DAG: sd $8, 32($sp) +; NEW-DAG: sd $9, 40($sp) +; NEW-DAG: sd $10, 48($sp) +; NEW-DAG: sd $11, 56($sp) + +; Get the varargs pointer +; O32 has 4 bytes padding, 4 bytes for the varargs pointer, and should have 8 +; bytes reserved for arguments 1 and 2 (the first float arg) but as discussed in +; arguments-float.ll, GCC doesn't agree with MD00305 and treats floats as 4 +; bytes so we only have 12 bytes total. +; N32/N64 has 8 bytes for the varargs pointer, and no reserved area. +; O32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 12 +; O32-DAG: sw [[VAPTR]], 4($sp) +; N32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N32-DAG: sw [[VAPTR]], 4($sp) +; N64-DAG: daddiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N64-DAG: sd [[VAPTR]], 0($sp) + +; Increment the pointer then get the varargs arg +; LLVM will rebind the load to the stack pointer instead of the varargs pointer +; during lowering. This is fine and doesn't change the behaviour. +; Also, in big-endian mode the offset must be increased by 4 to retrieve the +; correct half of the argument slot. 
+; +; O32-DAG: addiu [[VAPTR]], [[VAPTR]], 4 +; N32-DAG: addiu [[VAPTR]], [[VAPTR]], 8 +; N64-DAG: daddiu [[VAPTR]], [[VAPTR]], 8 +; O32-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 12($sp) +; NEWLE-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 8($sp) +; NEWBE-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 12($sp) +; ALL-DAG: swc1 [[FTMP1]], 8([[R2]]) + +declare void @llvm.va_start(ptr) +declare void @llvm.va_copy(ptr, ptr) +declare void @llvm.va_end(ptr) diff --git a/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float.ll b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float.ll new file mode 100644 index 0000000000000..6b7ad03c8e1c2 --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float.ll @@ -0,0 +1,224 @@ +; RUN: llc -mtriple=mips -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,O32 %s +; RUN: llc -mtriple=mipsel -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,O32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,NEW %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,NEW %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,NEW %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,NEW %s + +@bytes = global [11 x i8] zeroinitializer +@dwords = global [11 x i64] zeroinitializer +@floats = global [11 x float] zeroinitializer +@doubles = global [11 x double] zeroinitializer + +define void @double_args(double %a, double %b, double %c, double %d, double %e, + double %f, double %g, double %h, double %i) nounwind { +entry: + %0 = getelementptr [11 x double], ptr @doubles, i32 0, i32 1 + store volatile double %a, ptr %0 + %1 = getelementptr [11 x double], ptr @doubles, i32 0, i32 2 + store volatile double %b, ptr %1 + %2 = getelementptr [11 x double], ptr @doubles, i32 0, i32 3 + store volatile double %c, ptr %2 + %3 = getelementptr [11 x double], ptr @doubles, i32 0, i32 4 + store volatile double %d, ptr %3 + %4 = getelementptr [11 x double], ptr @doubles, i32 0, i32 5 + store volatile double %e, ptr %4 + %5 = getelementptr [11 x double], ptr @doubles, i32 0, i32 6 + store volatile double %f, ptr %5 + %6 = getelementptr [11 x double], ptr @doubles, i32 0, i32 7 + store volatile double %g, ptr %6 + %7 = getelementptr [11 x double], ptr @doubles, i32 0, i32 8 + store volatile double %h, ptr %7 + %8 = getelementptr [11 x double], ptr @doubles, i32 0, i32 9 + store volatile double %i, ptr %8 + ret void +} + +; ALL-LABEL: double_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) + +; The first four arguments are the same in O32/N32/N64. +; The first argument is floating point but single-float is enabled so floating +; point registers are not used. 
+; O32-DAG: sw $4, 8([[R2]]) +; O32-DAG: sw $5, 12([[R2]]) +; NEW-DAG: sd $4, 8([[R2]]) + +; O32-DAG: sw $6, 16([[R2]]) +; O32-DAG: sw $7, 20([[R2]]) +; NEW-DAG: sd $5, 16([[R2]]) + +; O32 has run out of argument registers and starts using the stack +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 16($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 20($sp) +; O32-DAG: sw [[R3]], 24([[R2]]) +; O32-DAG: sw [[R4]], 28([[R2]]) +; NEW-DAG: sd $6, 24([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 24($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 28($sp) +; O32-DAG: sw [[R3]], 32([[R2]]) +; O32-DAG: sw [[R4]], 36([[R2]]) +; NEW-DAG: sd $7, 32([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 32($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 36($sp) +; O32-DAG: sw [[R3]], 40([[R2]]) +; O32-DAG: sw [[R4]], 44([[R2]]) +; NEW-DAG: sd $8, 40([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 40($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 44($sp) +; O32-DAG: sw [[R3]], 48([[R2]]) +; O32-DAG: sw [[R4]], 52([[R2]]) +; NEW-DAG: sd $9, 48([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 48($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 52($sp) +; O32-DAG: sw [[R3]], 56([[R2]]) +; O32-DAG: sw [[R4]], 60([[R2]]) +; NEW-DAG: sd $10, 56([[R2]]) + +; N32/N64 have run out of registers and starts using the stack too +; O32-DAG: lw [[R3:\$[0-9]+]], 56($sp) +; O32-DAG: lw [[R4:\$[0-9]+]], 60($sp) +; O32-DAG: sw [[R3]], 64([[R2]]) +; O32-DAG: sw [[R4]], 68([[R2]]) +; NEW-DAG: ld [[R3:\$[0-9]+]], 0($sp) +; NEW-DAG: sd $11, 64([[R2]]) + +define void @float_args(float %a, float %b, float %c, float %d, float %e, + float %f, float %g, float %h, float %i) nounwind { +entry: + %0 = getelementptr [11 x float], ptr @floats, i32 0, i32 1 + store volatile float %a, ptr %0 + %1 = getelementptr [11 x float], ptr @floats, i32 0, i32 2 + store volatile float %b, ptr %1 + %2 = getelementptr [11 x float], ptr @floats, i32 0, i32 3 + store volatile float %c, ptr %2 + %3 = getelementptr [11 x float], ptr @floats, i32 0, i32 4 + store volatile float %d, ptr %3 + %4 = getelementptr [11 x float], ptr @floats, i32 0, i32 5 + store volatile float %e, ptr %4 + %5 = getelementptr [11 x float], ptr @floats, i32 0, i32 6 + store volatile float %f, ptr %5 + %6 = getelementptr [11 x float], ptr @floats, i32 0, i32 7 + store volatile float %g, ptr %6 + %7 = getelementptr [11 x float], ptr @floats, i32 0, i32 8 + store volatile float %h, ptr %7 + %8 = getelementptr [11 x float], ptr @floats, i32 0, i32 9 + store volatile float %i, ptr %8 + ret void +} + +; ALL-LABEL: float_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) +; SYM64-DAG: daddiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) + +; The first argument is floating point so floating point registers are used. +; The first argument is the same for O32/N32/N64 but the second argument differs +; by register +; ALL-DAG: swc1 $f12, 4([[R1]]) +; O32-DAG: swc1 $f14, 8([[R1]]) +; NEW-DAG: swc1 $f13, 8([[R1]]) + +; O32 has run out of argument registers and (in theory) starts using the stack +; I've yet to find a reference in the documentation about this but GCC uses up +; the remaining two argument slots in the GPR's first. We'll do the same for +; compatibility. 
+; O32-DAG: mtc1 $6, $f0 +; O32-DAG: swc1 $f0, 12([[R1]]) +; NEW-DAG: swc1 $f14, 12([[R1]]) +; O32-DAG: mtc1 $7, $f0 +; O32-DAG: swc1 $f0, 16([[R1]]) +; NEW-DAG: swc1 $f15, 16([[R1]]) + +; O32 is definitely out of registers now and switches to the stack. +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 16($sp) +; O32-DAG: swc1 [[F1]], 20([[R1]]) +; NEW-DAG: swc1 $f16, 20([[R1]]) +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 20($sp) +; O32-DAG: swc1 [[F1]], 24([[R1]]) +; NEW-DAG: swc1 $f17, 24([[R1]]) +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 24($sp) +; O32-DAG: swc1 [[F1]], 28([[R1]]) +; NEW-DAG: swc1 $f18, 28([[R1]]) +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 28($sp) +; O32-DAG: swc1 [[F1]], 32([[R1]]) +; NEW-DAG: swc1 $f19, 32([[R1]]) + +; N32/N64 have run out of registers and start using the stack too +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 32($sp) +; O32-DAG: swc1 [[F1]], 36([[R1]]) +; NEW-DAG: lwc1 [[F1:\$f[0-9]+]], 0($sp) +; NEW-DAG: swc1 [[F1]], 36([[R1]]) + + +define void @double_arg2(i8 %a, double %b) nounwind { +entry: + %0 = getelementptr [11 x i8], ptr @bytes, i32 0, i32 1 + store volatile i8 %a, ptr %0 + %1 = getelementptr [11 x double], ptr @doubles, i32 0, i32 1 + store volatile double %b, ptr %1 + ret void +} + +; ALL-LABEL: double_arg2: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM64-DAG: daddiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) + +; The first four arguments are the same in O32/N32/N64. +; The first argument isn't floating point so floating point registers are not +; used. +; The second slot is insufficiently aligned for double on O32 so it is skipped. +; Also, double occupies two slots on O32 and only one for N32/N64. +; ALL-DAG: sb $4, 1([[R1]]) +; O32-DAG: sw $6, 8([[R2]]) +; O32-DAG: sw $7, 12([[R2]]) +; NEW-DAG: sd $5, 8([[R2]]) + +define void @float_arg2(i8 %a, float %b) nounwind { +entry: + %0 = getelementptr [11 x i8], ptr @bytes, i32 0, i32 1 + store volatile i8 %a, ptr %0 + %1 = getelementptr [11 x float], ptr @floats, i32 0, i32 1 + store volatile float %b, ptr %1 + ret void +} + +; ALL-LABEL: float_arg2: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM64-DAG: daddiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) + +; The first argument is the same in O32/N32/N64. +; ALL-DAG: sb $4, 1([[R1]]) + +; The first argument isn't floating point so floating point registers are not +; used in O32, but N32/N64 will still use them. +; MD00305 and GCC disagree on this one. MD00305 says that floats are treated +; as 8-byte aligned and occupy two slots on O32. GCC is treating them as 4-byte +; aligned and occupying one slot. We'll use GCC's definition. 
+; O32-DAG: mtc1 $5, $f0 +; O32-DAG: swc1 $f0, 4([[R2]]) +; NEW-DAG: swc1 $f13, 4([[R2]]) diff --git a/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-fp128.ll b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-fp128.ll new file mode 100644 index 0000000000000..9268e37b02fb5 --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-fp128.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s + +@ldoubles = global [11 x fp128] zeroinitializer + +define void @ldouble_args(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e) nounwind { +entry: + %0 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 1 + store volatile fp128 %a, ptr %0 + %1 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 2 + store volatile fp128 %b, ptr %1 + %2 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 3 + store volatile fp128 %c, ptr %2 + %3 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 4 + store volatile fp128 %d, ptr %3 + %4 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 5 + store volatile fp128 %e, ptr %4 + ret void +} + +; ALL-LABEL: ldouble_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(ldoubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(ldoubles) + +; The first four arguments are the same in N32/N64. 
+; ALL-DAG: sd $5, 24([[R2]]) +; ALL-DAG: sd $4, 16([[R2]]) +; ALL-DAG: sd $7, 40([[R2]]) +; ALL-DAG: sd $6, 32([[R2]]) +; ALL-DAG: sd $9, 56([[R2]]) +; ALL-DAG: sd $8, 48([[R2]]) +; ALL-DAG: sd $11, 72([[R2]]) +; ALL-DAG: sd $10, 64([[R2]]) diff --git a/llvm/test/CodeGen/Mips/cconv/callee-saved-singlefloat.ll b/llvm/test/CodeGen/Mips/cconv/callee-saved-singlefloat.ll new file mode 100644 index 0000000000000..5bf1f2c2d60da --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/callee-saved-singlefloat.ll @@ -0,0 +1,111 @@ +; RUN: llc -mtriple=mips -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,O32 %s +; RUN: llc -mtriple=mipsel -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,O32 %s + +; RUN: llc -mtriple=mips64 -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N32 %s +; RUN: llc -mtriple=mips64el -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N32 %s +; RUN: llc -mtriple=mips64 -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N32-INV %s +; RUN: llc -mtriple=mips64el -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N32-INV %s + +; RUN: llc -mtriple=mips64 -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N64 %s +; RUN: llc -mtriple=mips64el -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N64 %s +; RUN: llc -mtriple=mips64 -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N64-INV %s +; RUN: llc -mtriple=mips64el -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N64-INV %s + +define void @fpu_clobber() nounwind { +entry: + call void asm "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f12},~{$f13},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"() + ret void +} + +; ALL-LABEL: fpu_clobber: +; ALL-INV-NOT: swc1 $f0, +; ALL-INV-NOT: swc1 $f1, +; ALL-INV-NOT: swc1 $f2, +; ALL-INV-NOT: swc1 $f3, +; ALL-INV-NOT: swc1 $f4, +; ALL-INV-NOT: swc1 $f5, +; ALL-INV-NOT: swc1 $f6, +; ALL-INV-NOT: swc1 $f7, +; ALL-INV-NOT: swc1 $f8, +; ALL-INV-NOT: swc1 $f9, +; ALL-INV-NOT: swc1 $f10, +; ALL-INV-NOT: swc1 $f11, +; ALL-INV-NOT: swc1 $f12, +; ALL-INV-NOT: swc1 $f13, +; ALL-INV-NOT: swc1 $f14, +; ALL-INV-NOT: swc1 $f15, +; ALL-INV-NOT: swc1 $f16, +; ALL-INV-NOT: swc1 $f17, +; ALL-INV-NOT: swc1 $f18, +; ALL-INV-NOT: swc1 $f19, + +; O32: addiu $sp, $sp, -48 +; O32-DAG: swc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp) +; O32-DAG: swc1 [[F21:\$f21]], [[OFF21:[0-9]+]]($sp) +; O32-DAG: swc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp) +; O32-DAG: swc1 [[F23:\$f23]], [[OFF23:[0-9]+]]($sp) +; O32-DAG: swc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp) +; O32-DAG: swc1 [[F25:\$f25]], [[OFF25:[0-9]+]]($sp) +; O32-DAG: swc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp) +; O32-DAG: swc1 [[F27:\$f27]], [[OFF27:[0-9]+]]($sp) +; O32-DAG: swc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp) +; O32-DAG: swc1 [[F29:\$f29]], [[OFF29:[0-9]+]]($sp) +; O32-DAG: swc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp) +; O32-DAG: swc1 [[F31:\$f31]], [[OFF31:[0-9]+]]($sp) +; O32-DAG: lwc1 [[F20]], [[OFF20]]($sp) +; O32-DAG: lwc1 [[F21]], [[OFF21]]($sp) +; O32-DAG: lwc1 [[F22]], [[OFF22]]($sp) +; O32-DAG: lwc1 [[F23]], [[OFF23]]($sp) +; O32-DAG: lwc1 [[F24]], [[OFF24]]($sp) +; O32-DAG: lwc1 [[F25]], [[OFF25]]($sp) +; O32-DAG: lwc1 [[F26]], [[OFF26]]($sp) +; O32-DAG: lwc1 [[F27]], [[OFF27]]($sp) 
+; O32-DAG: lwc1 [[F28]], [[OFF28]]($sp) +; O32-DAG: lwc1 [[F29]], [[OFF29]]($sp) +; O32-DAG: lwc1 [[F30]], [[OFF30]]($sp) +; O32-DAG: lwc1 [[F31]], [[OFF31]]($sp) +; O32: addiu $sp, $sp, 48 + +; N32: addiu $sp, $sp, -32 +; N32-DAG: swc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f21, +; N32-DAG: swc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f23, +; N32-DAG: swc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f25, +; N32-DAG: swc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f27, +; N32-DAG: swc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f29, +; N32-DAG: swc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f31, +; N32-DAG: lwc1 [[F20]], [[OFF20]]($sp) +; N32-DAG: lwc1 [[F22]], [[OFF22]]($sp) +; N32-DAG: lwc1 [[F24]], [[OFF24]]($sp) +; N32-DAG: lwc1 [[F26]], [[OFF26]]($sp) +; N32-DAG: lwc1 [[F28]], [[OFF28]]($sp) +; N32-DAG: lwc1 [[F30]], [[OFF30]]($sp) +; N32: addiu $sp, $sp, 32 + +; N64: addiu $sp, $sp, -32 +; N64-INV-NOT: swc1 $f20, +; N64-INV-NOT: swc1 $f21, +; N64-INV-NOT: swc1 $f22, +; N64-INV-NOT: swc1 $f23, +; N64-DAG: swc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp) +; N64-DAG: swc1 [[F25:\$f25]], [[OFF25:[0-9]+]]($sp) +; N64-DAG: swc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp) +; N64-DAG: swc1 [[F27:\$f27]], [[OFF27:[0-9]+]]($sp) +; N64-DAG: swc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp) +; N64-DAG: swc1 [[F29:\$f29]], [[OFF29:[0-9]+]]($sp) +; N64-DAG: swc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp) +; N64-DAG: swc1 [[F31:\$f31]], [[OFF31:[0-9]+]]($sp) +; N64-DAG: lwc1 [[F24]], [[OFF24]]($sp) +; N64-DAG: lwc1 [[F25]], [[OFF25]]($sp) +; N64-DAG: lwc1 [[F26]], [[OFF26]]($sp) +; N64-DAG: lwc1 [[F27]], [[OFF27]]($sp) +; N64-DAG: lwc1 [[F28]], [[OFF28]]($sp) +; N64-DAG: lwc1 [[F29]], [[OFF29]]($sp) +; N64-DAG: lwc1 [[F30]], [[OFF30]]($sp) +; N64-DAG: lwc1 [[F31]], [[OFF31]]($sp) +; N64: addiu $sp, $sp, 32 \ No newline at end of file diff --git a/llvm/test/CodeGen/Mips/cconv/return-hard-single-float.ll b/llvm/test/CodeGen/Mips/cconv/return-hard-single-float.ll new file mode 100644 index 0000000000000..1abf08d8200fb --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/return-hard-single-float.ll @@ -0,0 +1,43 @@ +; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,O32 %s +; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,O32 %s + +; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N32 %s +; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N32 %s + +; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N64 %s +; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N64 %s + +@float = global float zeroinitializer +@double = global double zeroinitializer + +define float @retfloat() nounwind { +entry: + %0 = load volatile float, ptr @float + ret float %0 +} + +; ALL-LABEL: retfloat: +; O32-DAG: lui [[R1:\$[0-9]+]], %hi(float) +; O32-DAG: lwc1 $f0, %lo(float)([[R1]]) +; N32-DAG: lui [[R1:\$[0-9]+]], %hi(float) +; N32-DAG: lwc1 $f0, %lo(float)([[R1]]) +; N64-DAG: lwc1 $f0, %lo(float)([[R1:\$[0-9+]]]) + 
+define double @retdouble() nounwind { +entry: + %0 = load volatile double, ptr @double + ret double %0 +} + +; ALL-LABEL: retdouble: +; O32-DAG: lw $2, %lo(double)([[R1:\$[0-9]+]]) +; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], %lo(double) +; O32-DAG: lw $3, 4([[R2]]) +; N32-DAG: ld $2, %lo(double)([[R1:\$[0-9]+]]) +; N64-DAG: ld $2, %lo(double)([[R1:\$[0-9]+]]) diff --git a/llvm/test/CodeGen/Mips/cconv/return-hard-single-fp128.ll b/llvm/test/CodeGen/Mips/cconv/return-hard-single-fp128.ll new file mode 100644 index 0000000000000..e4d04146ecc2f --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/return-hard-single-fp128.ll @@ -0,0 +1,24 @@ +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s + +@fp128 = global fp128 zeroinitializer + +define fp128 @retldouble() nounwind { +entry: + %0 = load volatile fp128, ptr @fp128 + ret fp128 %0 +} + +; ALL-LABEL: retldouble: +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(fp128) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(fp128) + +; ALL-DAG: ld $2, %lo(fp128)([[R2]]) +; ALL-DAG: ld $3, 8([[R2]]) diff --git a/llvm/test/CodeGen/Mips/inlineasm-constraints-singlefloat.ll b/llvm/test/CodeGen/Mips/inlineasm-constraints-singlefloat.ll new file mode 100644 index 0000000000000..ddebddcdab260 --- /dev/null +++ b/llvm/test/CodeGen/Mips/inlineasm-constraints-singlefloat.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=mips -mattr=+single-float < %s | FileCheck %s --check-prefix=MIPS32 +; RUN: llc -mtriple=mips64 -mattr=+single-float < %s | FileCheck %s --check-prefix=MIPS64 + +define void @read_double(ptr %0) { +; MIPS32-LABEL: read_double: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 4($4) +; MIPS32-NEXT: lw $3, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_double: +; MIPS64: # %bb.0: +; MIPS64-NEXT: ld $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load double, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(double %2) + ret void +} + +define void @read_float(ptr %0) { +; MIPS32-LABEL: read_float: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lwc1 $f0, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_float: +; MIPS64: # %bb.0: +; MIPS64-NEXT: lwc1 $f0, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load float, ptr %0, align 8 + tail call void asm sideeffect "", "f"(float %2) + ret void +} + +; Test that a proper register class is assigned to clobbers in single-float mode +define float @explicit_float_register_clobber(ptr %0) { +; MIPS32-LABEL: explicit_float_register_clobber: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lwc1 $f1, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mov.s $f0, $f1 +; +; MIPS64-LABEL: explicit_float_register_clobber: +; 
MIPS64: # %bb.0: +; MIPS64-NEXT: lwc1 $f1, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mov.s $f0, $f1 + %2 = load float, ptr %0, align 8 + tail call void asm sideeffect "", "~{$f0}"() + ret float %2 +} diff --git a/llvm/test/CodeGen/Mips/int-to-float-conversion.ll b/llvm/test/CodeGen/Mips/int-to-float-conversion.ll index 84bc6a253595a..1c8ad9ad07e15 100644 --- a/llvm/test/CodeGen/Mips/int-to-float-conversion.ll +++ b/llvm/test/CodeGen/Mips/int-to-float-conversion.ll @@ -1,13 +1,24 @@ -; RUN: llc -mtriple=mipsel < %s | FileCheck %s -check-prefix=32 -; RUN: llc -mtriple=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=64 -; RUN: llc -mtriple=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=64 +; RUN: llc -mtriple=mipsel < %s | FileCheck %s -check-prefixes=ALL,32,32DF +; RUN: llc -mtriple=mipsel -mattr=+single-float < %s | FileCheck %s -check-prefixes=ALL,32,32SF + +; RUN: llc -mtriple=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefixes=ALL,64,64DF +; RUN: llc -mtriple=mips64el -mcpu=mips4 -mattr=+single-float < %s \ +; RUN: | FileCheck %s -check-prefixes=ALL,64,64SF + +; RUN: llc -mtriple=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefixes=ALL,64,64DF +; RUN: llc -mtriple=mips64el -mcpu=mips64 -mattr=+single-float < %s \ +; RUN: | FileCheck %s -check-prefixes=ALL,64,64SF + +; Test various combinations of 32/64bit GP registers and single/double floating point support. @i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4 @i3 = common global ptr null, align 4 -; 32-LABEL: test_float_int_: -; 32: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 32: cvt.s.w $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_float_int_: +; 32: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 32: cvt.s.w $f{{[0-9]+}}, $f[[R0]] +; 64: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64: cvt.s.w $f{{[0-9]+}}, $f[[R0]] define float @test_float_int_(i32 %a) { entry: @@ -15,12 +26,13 @@ entry: ret float %conv } -; 32-LABEL: test_double_int_: -; 32: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 32: cvt.d.w $f{{[0-9]+}}, $f[[R0]] -; 64-LABEL: test_double_int_: -; 64: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 64: cvt.d.w $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_double_int_: +; 32DF: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 32DF: cvt.d.w $f{{[0-9]+}}, $f[[R0]] +; 32SF: jal __floatsidf +; 64DF: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64DF: cvt.d.w $f{{[0-9]+}}, $f[[R0]] +; 64SF: jal __floatsidf define double @test_double_int_(i32 %a) { entry: @@ -28,9 +40,11 @@ entry: ret double %conv } -; 64-LABEL: test_float_LL_: -; 64: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 64: cvt.s.l $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_float_LL_: +; 32: jal __floatdisf +; 64DF: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64DF: cvt.s.l $f{{[0-9]+}}, $f[[R0]] +; 64SF: jal __floatdisf define float @test_float_LL_(i64 %a) { entry: @@ -38,9 +52,11 @@ entry: ret float %conv } -; 64-LABEL: test_double_LL_: -; 64: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 64: cvt.d.l $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_double_LL_: +; 32: jal __floatdidf +; 64DF: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64DF: cvt.d.l $f{{[0-9]+}}, $f[[R0]] +; 64SF: jal __floatdidf define double @test_double_LL_(i64 %a) { entry: From 9a01561760a2785fbf6d66f86ba2361d36a648f7 Mon Sep 17 00:00:00 2001 From: Ruoyu Zhong Date: Sun, 28 Sep 2025 11:12:36 +0800 Subject: [PATCH 022/878] [clang-format] Fix qualifier ordering for lines after PP directives (#160731) Lines appearing after preprocessor conditional blocks (like `#endif`) were not having their qualifiers reordered by `QualifierOrder`, while 
lines inside the conditional blocks were processed correctly. The issue was that tokens on lines following preprocessor directives have `MustBreakBefore` = `true`. The qualifier alignment logic was breaking immediately upon encountering any token with `MustBreakBefore` = `true`, preventing analysis of the entire line. The fix allows processing to continue when `MustBreakBefore` = `true` on the first token of a line, since this is expected behavior (the token legitimately starts a new line). Only tokens with `MustBreakBefore` = `true` that appear mid-line will cause the analysis loop to break. Fixes https://github.com/llvm/llvm-project/issues/160487. --- clang/lib/Format/QualifierAlignmentFixer.cpp | 2 +- clang/unittests/Format/QualifierFixerTest.cpp | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/QualifierAlignmentFixer.cpp b/clang/lib/Format/QualifierAlignmentFixer.cpp index 441a37a4902b7..043d957611b19 100644 --- a/clang/lib/Format/QualifierAlignmentFixer.cpp +++ b/clang/lib/Format/QualifierAlignmentFixer.cpp @@ -571,7 +571,7 @@ void LeftRightQualifierAlignmentFixer::fixQualifierAlignment( for (const auto *Tok = First; Tok && Tok != Last && Tok->Next; Tok = Tok->Next) { - if (Tok->MustBreakBefore) + if (Tok->MustBreakBefore && Tok != First) break; if (Tok->is(tok::comment)) continue; diff --git a/clang/unittests/Format/QualifierFixerTest.cpp b/clang/unittests/Format/QualifierFixerTest.cpp index f42f2e307f713..58e64ff368946 100644 --- a/clang/unittests/Format/QualifierFixerTest.cpp +++ b/clang/unittests/Format/QualifierFixerTest.cpp @@ -1195,6 +1195,41 @@ TEST_F(QualifierFixerTest, QualifiersBrokenUpByPPDirectives) { Style); } +TEST_F(QualifierFixerTest, QualifierOrderingAfterPreprocessorDirectives) { + auto Style = getLLVMStyle(); + Style.QualifierAlignment = FormatStyle::QAS_Custom; + Style.QualifierOrder = {"static", "inline", "const", "type"}; + + verifyFormat("#if 1\n" + "void foo(const int par);\n" + "const int var1;\n" + "#endif\n" + "\n" + "const int var2;\n" + "const int var3;", + "#if 1\n" + "void foo(int const par);\n" + "int const var1;\n" + "#endif\n" + "\n" + "int const var2;\n" + "int const var3;", + Style); + verifyFormat("#if defined(FOO)\n" + "static const int x = 1;\n" + "#else\n" + "static const int x = 2;\n" + "#endif\n" + "static const int y = 3;", + "#if defined(FOO)\n" + "const static int x = 1;\n" + "#else\n" + "const static int x = 2;\n" + "#endif\n" + "const static int y = 3;", + Style); +} + TEST_F(QualifierFixerTest, UnsignedQualifier) { FormatStyle Style = getLLVMStyle(); From 63f1c03feea17c8236774f3b5dc811d5cb3a2364 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 27 Sep 2025 20:38:26 -0700 Subject: [PATCH 023/878] [Driver] Make -fvectorize and -fslp-vectorize override -O group options (#161032) `clang -fno-slp-vectorize -O2` incorrectly enabled CC1 -vectorize-slp. Make -fvectorize and -fslp-vectorize properly override -O, following the convention. 
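
For illustration, a hypothetical invocation (the flag spellings match the
driver tests updated below):

  $ clang -### -c -O2 -fno-slp-vectorize a.c
  # Before this change the generated CC1 line still carried -vectorize-slp;
  # with it, the rightmost -fno-slp-vectorize wins and the flag is omitted.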
Fix #160633 --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 ++++-------- clang/test/Driver/Ofast.c | 2 +- clang/test/Driver/clang_f_opts.c | 4 ++-- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 4902c2f3c0cbe..cce4f6487c0bd 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -3346,20 +3346,16 @@ bool tools::shouldEnableVectorizerAtOLevel(const ArgList &Args, bool isSlpVec) { void tools::handleVectorizeLoopsArgs(const ArgList &Args, ArgStringList &CmdArgs) { bool EnableVec = shouldEnableVectorizerAtOLevel(Args, false); - OptSpecifier vectorizeAliasOption = - EnableVec ? options::OPT_O_Group : options::OPT_fvectorize; - if (Args.hasFlag(options::OPT_fvectorize, vectorizeAliasOption, - options::OPT_fno_vectorize, EnableVec)) + if (Args.hasFlag(options::OPT_fvectorize, options::OPT_fno_vectorize, + EnableVec)) CmdArgs.push_back("-vectorize-loops"); } void tools::handleVectorizeSLPArgs(const ArgList &Args, ArgStringList &CmdArgs) { bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true); - OptSpecifier SLPVectAliasOption = - EnableSLPVec ? options::OPT_O_Group : options::OPT_fslp_vectorize; - if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption, - options::OPT_fno_slp_vectorize, EnableSLPVec)) + if (Args.hasFlag(options::OPT_fslp_vectorize, options::OPT_fno_slp_vectorize, + EnableSLPVec)) CmdArgs.push_back("-vectorize-slp"); } diff --git a/clang/test/Driver/Ofast.c b/clang/test/Driver/Ofast.c index 612478cc89558..e04ce036638f9 100644 --- a/clang/test/Driver/Ofast.c +++ b/clang/test/Driver/Ofast.c @@ -2,7 +2,7 @@ // RUN: %clang -c -O2 -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s // RUN: %clang -c -fno-fast-math -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s // RUN: %clang -c -fno-strict-aliasing -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s -// RUN: %clang -c -fno-vectorize -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s +// RUN: %clang -c -fno-vectorize -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-VECTORIZE %s // RUN: %clang -c -Ofast -O2 -### -Werror %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 \ // RUN: %if target={{.*-windows-msvc.*}} %{ --check-prefix=CHECK-OFAST-O2-ALIASING-MSVC %} \ // RUN: %else %{ --check-prefix=CHECK-OFAST-O2-ALIASING %} %s diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index bdeb747aa66a3..94b983f14e3ef 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -156,7 +156,7 @@ // RUN: %clang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s // RUN: %clang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s // RUN: %clang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s -// RUN: %clang -### -S -fno-vectorize -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s +// RUN: %clang -### -S -fno-vectorize -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-VECTORIZE %s // RUN: %clang -### -S -O1 -fvectorize %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s // RUN: %clang -### -S -Ofast %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s // RUN: %clang -### -S %s 2>&1 | FileCheck -check-prefix=CHECK-NO-VECTORIZE %s @@ -179,7 +179,7 @@ // RUN: %clang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-SLP-VECTORIZE %s // RUN: %clang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-SLP-VECTORIZE %s // 
RUN: %clang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-SLP-VECTORIZE %s -// RUN: %clang -### -S -fno-slp-vectorize -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-SLP-VECTORIZE %s +// RUN: %clang -### -S -fno-slp-vectorize -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-SLP-VECTORIZE %s // RUN: %clang -### -S -O1 -fslp-vectorize %s 2>&1 | FileCheck -check-prefix=CHECK-SLP-VECTORIZE %s // RUN: %clang -### -S -Ofast %s 2>&1 | FileCheck -check-prefix=CHECK-SLP-VECTORIZE %s // RUN: %clang -### -S %s 2>&1 | FileCheck -check-prefix=CHECK-NO-SLP-VECTORIZE %s From 4edda3d78c26b9d928d115b2059d0c719eec237b Mon Sep 17 00:00:00 2001 From: owenca Date: Sat, 27 Sep 2025 21:29:25 -0700 Subject: [PATCH 024/878] [clang-format] Fix bugs in annotating arrows and square brackets (#160973) Fixes #160518 --- clang/lib/Format/TokenAnnotator.cpp | 5 ---- clang/lib/Format/UnwrappedLineParser.cpp | 25 ++++++++++--------- clang/unittests/Format/TokenAnnotatorTest.cpp | 14 +++++++++++ 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 6a8286da73442..67066a104d738 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -833,11 +833,6 @@ class AnnotatingParser { if (Parent && Parent->is(TT_PointerOrReference)) Parent->overwriteFixedType(TT_BinaryOperator); } - // An arrow after an ObjC method expression is not a lambda arrow. - if (CurrentToken->is(TT_ObjCMethodExpr) && CurrentToken->Next && - CurrentToken->Next->is(TT_LambdaArrow)) { - CurrentToken->Next->overwriteFixedType(TT_Unknown); - } Left->MatchingParen = CurrentToken; CurrentToken->MatchingParen = Left; // FirstObjCSelectorName is set when a colon is found. This does diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 2c9766c9b7bc0..6948b3de1e408 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2268,7 +2268,7 @@ bool UnwrappedLineParser::tryToParseLambda() { if (!tryToParseLambdaIntroducer()) return false; - bool SeenArrow = false; + FormatToken *Arrow = nullptr; bool InTemplateParameterList = false; while (FormatTok->isNot(tok::l_brace)) { @@ -2343,17 +2343,13 @@ bool UnwrappedLineParser::tryToParseLambda() { case tok::ellipsis: case tok::kw_true: case tok::kw_false: - if (SeenArrow || InTemplateParameterList) { + if (Arrow || InTemplateParameterList) { nextToken(); break; } return true; case tok::arrow: - // This might or might not actually be a lambda arrow (this could be an - // ObjC method invocation followed by a dereferencing arrow). We might - // reset this back to TT_Unknown in TokenAnnotator. - FormatTok->setFinalizedType(TT_LambdaArrow); - SeenArrow = true; + Arrow = FormatTok; nextToken(); break; case tok::kw_requires: { @@ -2375,6 +2371,9 @@ bool UnwrappedLineParser::tryToParseLambda() { FormatTok->setFinalizedType(TT_LambdaLBrace); LSquare.setFinalizedType(TT_LambdaLSquare); + if (Arrow) + Arrow->setFinalizedType(TT_LambdaArrow); + NestedLambdas.push_back(Line->SeenDecltypeAuto); parseChildBlock(); assert(!NestedLambdas.empty()); @@ -2388,11 +2387,6 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() { const FormatToken *LeftSquare = FormatTok; nextToken(); if (Previous) { - if (Previous->Tok.getIdentifierInfo() && - !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield, - tok::kw_co_return)) { - return false; - } if (Previous->closesScope()) { // Not a potential C-style cast. 
if (Previous->isNot(tok::r_paren)) @@ -2402,6 +2396,13 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() { // and `int (*)()`. if (!BeforeRParen || !BeforeRParen->isOneOf(tok::greater, tok::r_paren)) return false; + } else if (Previous->is(tok::star)) { + Previous = Previous->getPreviousNonComment(); + } + if (Previous && Previous->Tok.getIdentifierInfo() && + !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield, + tok::kw_co_return)) { + return false; } } if (LeftSquare->isCppStructuredBinding(IsCpp)) diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 899cc47f8213f..4a8f27f656f1d 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2237,6 +2237,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsLambdas) { ASSERT_EQ(Tokens.size(), 21u) << Tokens; EXPECT_TOKEN(Tokens[11], tok::l_square, TT_LambdaLSquare); EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_LambdaLBrace); + + Tokens = annotate("SomeFunction({[]() -> int *[] { return {}; }});"); + ASSERT_EQ(Tokens.size(), 22u) << Tokens; + EXPECT_TOKEN(Tokens[3], tok::l_square, TT_LambdaLSquare); + EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_LambdaDefinitionLParen); + EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare); } TEST_F(TokenAnnotatorTest, UnderstandsFunctionAnnotations) { @@ -4159,6 +4165,14 @@ TEST_F(TokenAnnotatorTest, LineCommentTrailingBackslash) { EXPECT_TOKEN(Tokens[1], tok::comment, TT_LineComment); } +TEST_F(TokenAnnotatorTest, ArrowAfterSubscript) { + auto Tokens = + annotate("return (getStructType()->getElements())[eIdx]->getName();"); + ASSERT_EQ(Tokens.size(), 19u) << Tokens; + // Not TT_LambdaArrow. + EXPECT_TOKEN(Tokens[13], tok::arrow, TT_Unknown); +} + TEST_F(TokenAnnotatorTest, QtProperty) { auto Style = getLLVMStyle(); Style.AllowBreakBeforeQtProperty = true; From c7504872927486bb70c26aa71f795b1331cbbd38 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sun, 28 Sep 2025 07:21:40 +0200 Subject: [PATCH 025/878] [clang][bytecode] Diagnose volatile writes (#160350) --- clang/lib/AST/ByteCode/Compiler.cpp | 18 +++++--- clang/lib/AST/ByteCode/Compiler.h | 1 + clang/lib/AST/ByteCode/Context.cpp | 15 ++++++- clang/lib/AST/ByteCode/Interp.cpp | 2 + clang/lib/AST/ByteCode/Interp.h | 15 ++++++- clang/lib/AST/ByteCode/InterpFrame.cpp | 2 + clang/lib/AST/ByteCode/Opcodes.td | 1 + clang/test/AST/ByteCode/cxx23.cpp | 61 ++++++++++++++++++++++++-- clang/test/AST/ByteCode/invalid.cpp | 2 +- 9 files changed, 102 insertions(+), 15 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index b4da99957ee88..0b7b6cd64dd97 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2934,8 +2934,9 @@ bool Compiler::VisitMaterializeTemporaryExpr( // For everyhing else, use local variables. 
if (SubExprT) { bool IsConst = SubExpr->getType().isConstQualified(); - unsigned LocalIndex = - allocateLocalPrimitive(E, *SubExprT, IsConst, E->getExtendingDecl()); + bool IsVolatile = SubExpr->getType().isVolatileQualified(); + unsigned LocalIndex = allocateLocalPrimitive( + E, *SubExprT, IsConst, IsVolatile, E->getExtendingDecl()); if (!this->visit(SubExpr)) return false; if (!this->emitSetLocal(*SubExprT, LocalIndex, E)) @@ -4452,6 +4453,9 @@ bool Compiler::visitAssignment(const Expr *LHS, const Expr *RHS, if (!this->visit(LHS)) return false; + if (LHS->getType().isVolatileQualified()) + return this->emitInvalidStore(LHS->getType().getTypePtr(), E); + // We don't support assignments in C. if (!Ctx.getLangOpts().CPlusPlus && !this->emitInvalid(E)) return false; @@ -4560,13 +4564,14 @@ bool Compiler::emitConst(const APSInt &Value, const Expr *E) { template unsigned Compiler::allocateLocalPrimitive( - DeclTy &&Src, PrimType Ty, bool IsConst, const ValueDecl *ExtendingDecl, - ScopeKind SC, bool IsConstexprUnknown) { + DeclTy &&Src, PrimType Ty, bool IsConst, bool IsVolatile, + const ValueDecl *ExtendingDecl, ScopeKind SC, bool IsConstexprUnknown) { // FIXME: There are cases where Src.is() is wrong, e.g. // (int){12} in C. Consider using Expr::isTemporaryObject() instead // or isa(). Descriptor *D = P.createDescriptor(Src, Ty, nullptr, Descriptor::InlineDescMD, - IsConst, isa(Src)); + IsConst, isa(Src), + /*IsMutable=*/false, IsVolatile); D->IsConstexprUnknown = IsConstexprUnknown; Scope::Local Local = this->createLocal(D); if (auto *VD = dyn_cast_if_present(Src.dyn_cast())) @@ -4874,7 +4879,8 @@ Compiler::visitVarDecl(const VarDecl *VD, const Expr *Init, if (VarT) { unsigned Offset = this->allocateLocalPrimitive( - VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block, + VD, *VarT, VD->getType().isConstQualified(), + VD->getType().isVolatileQualified(), nullptr, ScopeKind::Block, IsConstexprUnknown); if (Init) { // If this is a toplevel declaration, create a scope for the diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 09599b3547888..5c46f75af4da3 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -327,6 +327,7 @@ class Compiler : public ConstStmtVisitor, bool>, /// Creates a local primitive value. unsigned allocateLocalPrimitive(DeclTy &&Decl, PrimType Ty, bool IsConst, + bool IsVolatile = false, const ValueDecl *ExtendingDecl = nullptr, ScopeKind SC = ScopeKind::Block, bool IsConstexprUnknown = false); diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index 306f95c479d0f..683e916391337 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -567,9 +567,15 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FuncDecl) { // Assign descriptors to all parameters. // Composite objects are lowered to pointers. 
for (const ParmVarDecl *PD : FuncDecl->parameters()) { + bool IsConst = PD->getType().isConstQualified(); + bool IsVolatile = PD->getType().isVolatileQualified(); + OptPrimType T = classify(PD->getType()); PrimType PT = T.value_or(PT_Ptr); - Descriptor *Desc = P->createDescriptor(PD, PT); + Descriptor *Desc = P->createDescriptor(PD, PT, nullptr, std::nullopt, + IsConst, /*IsTemporary=*/false, + /*IsMutable=*/false, IsVolatile); + ParamDescriptors.insert({ParamOffset, {PT, Desc}}); ParamOffsets.push_back(ParamOffset); ParamOffset += align(primSize(PT)); @@ -595,9 +601,14 @@ const Function *Context::getOrCreateObjCBlock(const BlockExpr *E) { // Assign descriptors to all parameters. // Composite objects are lowered to pointers. for (const ParmVarDecl *PD : BD->parameters()) { + bool IsConst = PD->getType().isConstQualified(); + bool IsVolatile = PD->getType().isVolatileQualified(); + OptPrimType T = classify(PD->getType()); PrimType PT = T.value_or(PT_Ptr); - Descriptor *Desc = P->createDescriptor(PD, PT); + Descriptor *Desc = P->createDescriptor(PD, PT, nullptr, std::nullopt, + IsConst, /*IsTemporary=*/false, + /*IsMutable=*/false, IsVolatile); ParamDescriptors.insert({ParamOffset, {PT, Desc}}); ParamOffsets.push_back(ParamOffset); ParamOffset += align(primSize(PT)); diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 8aaefc70e506e..21af3d6ac7f90 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -889,6 +889,8 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return false; if (!CheckConst(S, OpPC, Ptr)) return false; + if (!CheckVolatile(S, OpPC, Ptr, AK_Assign)) + return false; if (!S.inConstantContext() && isConstexprUnknown(Ptr)) return false; return true; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 7867a0669b472..bb0c4580b14a9 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1730,9 +1730,8 @@ inline bool GetPtrLocal(InterpState &S, CodePtr OpPC, uint32_t I) { } inline bool GetPtrParam(InterpState &S, CodePtr OpPC, uint32_t I) { - if (S.checkingPotentialConstantExpression()) { + if (S.Current->isBottomFrame()) return false; - } S.Stk.push(S.Current->getParamPointer(I)); return true; } @@ -3344,6 +3343,18 @@ inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind, return false; } +inline bool InvalidStore(InterpState &S, CodePtr OpPC, const Type *T) { + if (S.getLangOpts().CPlusPlus) { + QualType VolatileType = QualType(T, 0).withVolatile(); + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_access_volatile_type) + << AK_Assign << VolatileType; + } else { + S.FFDiag(S.Current->getSource(OpPC)); + } + return false; +} + inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR, bool InitializerFailed) { assert(DR); diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index a3db0d7a29cfa..039acb5d72b2c 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -231,6 +231,8 @@ Pointer InterpFrame::getParamPointer(unsigned Off) { if (auto Pt = Params.find(Off); Pt != Params.end()) return Pointer(reinterpret_cast(Pt->second.get())); + assert(!isBottomFrame()); + // Allocate memory to store the parameter and the block metadata. 
const auto &Desc = Func->getParamDescriptor(Off); size_t BlockSize = sizeof(Block) + Desc.second->getAllocSize(); diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 7af2df5318106..532c4448e6f40 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -797,6 +797,7 @@ def SideEffect : Opcode {} def InvalidCast : Opcode { let Args = [ArgCastKind, ArgBool]; } +def InvalidStore : Opcode { let Args = [ArgTypePtr]; } def CheckPseudoDtor : Opcode {} def InvalidDeclRef : Opcode { diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp index 72c751d627a44..ce0a4777ffa9b 100644 --- a/clang/test/AST/ByteCode/cxx23.cpp +++ b/clang/test/AST/ByteCode/cxx23.cpp @@ -1,8 +1,8 @@ // UNSUPPORTED: target={{.*}}-zos{{.*}} -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref,ref20,all,all20 %s -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref,ref23,all,all23 %s -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=expected23,all,all23 %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=ref,ref20,all,all20 %s +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=ref,ref23,all,all23 %s +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -Wno-deprecated-volatile -verify=expected23,all,all23 %s -fexperimental-new-constant-interpreter #define assert_active(F) if (!__builtin_is_within_lifetime(&F)) (1/0); @@ -393,6 +393,59 @@ namespace UnionMemberCallDiags { static_assert(g()); // all-error {{not an integral constant expression}} \ // all-note {{in call to}} } +#endif + +namespace VolatileWrites { + constexpr void test1() {// all20-error {{never produces a constant expression}} + int k; + volatile int &m = k; + m = 10; // all20-note {{assignment to volatile-qualified type 'volatile int'}} + } + constexpr void test2() { // all20-error {{never produces a constant expression}} + volatile int k = 12; + k = 13; // all20-note {{assignment to volatile-qualified type 'volatile int'}} + } + + constexpr void test3() { // all20-error {{never produces a constant expression}} + volatile int k = 12; // all20-note {{volatile object declared here}} + + *((int *)&k) = 13; // all20-note {{assignment to volatile object 'k' is not allowed in a constant expression}} + } + + constexpr void test4() { // all20-error {{never produces a constant expression}} + int k = 12; + + *((volatile int *)&k) = 13; // all20-note {{assignment to volatile-qualified type 'volatile int' is not allowed in a constant expression}} + } + +#if __cplusplus >= 202302L + struct S { + volatile int k; + }; + constexpr int test5() { + S s; + s.k = 12; // all-note {{assignment to volatile-qualified type 'volatile int' is not}} + + return 0; + } + static_assert(test5() == 0); // all-error{{not an integral constant expression}} \ + // all-note {{in call to}} #endif + + constexpr bool test6(volatile int k) { // ref20-error {{never produces a constant expression}} + k = 14; // ref20-note {{assignment to volatile-qualified type 'volatile int' is not}} \ + // all-note {{assignment to 
volatile-qualified type 'volatile int' is not}}
+ return true;
+ }
+ static_assert(test6(5)); // all-error {{not an integral constant expression}} \
+ // all-note {{in call to}}
+
+ constexpr bool test7(volatile int k) { // all-note {{declared here}}
+ *((int *)&k) = 13; // all-note {{assignment to volatile object 'k' is not allowed in a constant expression}}
+ return true;
+ }
+ static_assert(test7(12)); // all-error {{not an integral constant expression}} \
+ // all-note {{in call to}}
+}
diff --git a/clang/test/AST/ByteCode/invalid.cpp b/clang/test/AST/ByteCode/invalid.cpp
index affb40eada870..00db27419e36b 100644
--- a/clang/test/AST/ByteCode/invalid.cpp
+++ b/clang/test/AST/ByteCode/invalid.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref,both %s
+// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref,both %s

 namespace Throw {

From 8ce3b8b518badcf5c6df8e0723fb93fa57cb2456 Mon Sep 17 00:00:00 2001
From: paperchalice
Date: Sun, 28 Sep 2025 13:50:20 +0800
Subject: [PATCH 026/878] [ARM] Remove `UnsafeFPMath` uses (#151275)

Try to remove `UnsafeFPMath` uses in the ARM backend. These global flags
block some improvements like
https://discourse.llvm.org/t/rfc-honor-pragmas-with-ffp-contract-fast/80797.
Remove them incrementally.
---
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 19 +-
 llvm/test/CodeGen/ARM/build-attributes.ll | 319 ----------------------
 2 files changed, 17 insertions(+), 321 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 2381effb1b6d3..1f773e2a7e0fc 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -632,6 +632,19 @@ static bool checkDenormalAttributeConsistency(const Module &M, StringRef Attr,
 });
 }

+// Returns true if the module's functions do not all use the same denormal mode.
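+// (Sketch of the check below: the first function's raw denormal-fp-math
+// mode is taken as the reference, and any later function definition that
+// reports a different mode makes the module inconsistent; declarations
+// are skipped.)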
+static bool checkDenormalAttributeInconsistency(const Module &M) { + auto F = M.functions().begin(); + auto E = M.functions().end(); + if (F == E) + return false; + DenormalMode Value = F->getDenormalModeRaw(); + ++F; + return std::any_of(F, E, [&](const Function &F) { + return !F.isDeclaration() && F.getDenormalModeRaw() != Value; + }); +} + void ARMAsmPrinter::emitAttributes() { MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); ARMTargetStreamer &ATS = static_cast(TS); @@ -698,7 +711,9 @@ void ARMAsmPrinter::emitAttributes() { DenormalMode::getPositiveZero())) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::PositiveZero); - else if (!TM.Options.UnsafeFPMath) + else if (checkDenormalAttributeInconsistency(*MMI->getModule()) || + checkDenormalAttributeConsistency( + *MMI->getModule(), "denormal-fp-math", DenormalMode::getIEEE())) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::IEEEDenormals); else { @@ -733,7 +748,7 @@ void ARMAsmPrinter::emitAttributes() { TM.Options.NoTrappingFPMath) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Not_Allowed); - else if (!TM.Options.UnsafeFPMath) { + else { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Allowed); // If the user has permitted this code to choose the IEEE 754 diff --git a/llvm/test/CodeGen/ARM/build-attributes.ll b/llvm/test/CodeGen/ARM/build-attributes.ll index 68844aed03630..306a4a31b79fa 100644 --- a/llvm/test/CodeGen/ARM/build-attributes.ll +++ b/llvm/test/CodeGen/ARM/build-attributes.ll @@ -3,23 +3,16 @@ ; RUN: llc < %s -mtriple=thumbv5-linux-gnueabi -mcpu=xscale -mattr=+strict-align | FileCheck %s --check-prefix=XSCALE ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6 -; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6-FAST ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6M -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST ; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6M -; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align | FileCheck %s --check-prefix=ARM1156T2F-S -; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=ARM1156T2F-S-FAST ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi | FileCheck %s --check-prefix=V7M -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math 
-fp-contract=fast | FileCheck %s --check-prefix=V7M-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=V7 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V7-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V8-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi | FileCheck %s --check-prefix=Vt8 ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING @@ -31,35 +24,24 @@ ; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi | FileCheck %s --check-prefix=V8MMAINLINE ; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi -mattr=+dsp | FileCheck %s --check-prefix=V8MMAINLINE_DSP ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-neon,-d32 | FileCheck %s --check-prefix=CORTEX-A5-NONEON ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A5-NOFPU -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-SOFT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A8-HARD -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-HARD-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 
-float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-SOFT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-HARD-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A12-NOFPU -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15 -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A15-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 | FileCheck %s --check-prefix=CORTEX-A17-DEFAULT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A17-NOFPU -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-no-trapping-fp-math | FileCheck %s --check-prefix=NO-TRAPPING-MATH ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -denormal-fp-math=ieee | FileCheck %s --check-prefix=DENORMAL-IEEE @@ -74,37 +56,26 @@ ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0 -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0-FAST ; 
RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus | FileCheck %s --check-prefix=CORTEX-M0PLUS -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0PLUS-FAST ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 | FileCheck %s --check-prefix=CORTEX-M1 -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M1-FAST ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align | FileCheck %s --check-prefix=SC000 -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC000-FAST ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=CORTEX-M3 -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M3-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 | FileCheck %s --check-prefix=SC300 -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC300-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-SOFT-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-HARD-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2sp | FileCheck %s 
--check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SOFT -; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-NOFPU-FAST ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-fp64 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SINGLE -; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-fp64 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-FAST ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7-DOUBLE ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m23 | FileCheck %s --check-prefix=CORTEX-M23 ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CORTEX-M33 -; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M33-FAST ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m35p | FileCheck %s --check-prefix=CORTEX-M35P @@ -113,49 +84,34 @@ ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4 | FileCheck %s --check-prefix=CORTEX-R4 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4f | FileCheck %s --check-prefix=CORTEX-R4F ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5 -; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R5-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7 -; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 | FileCheck %s --check-prefix=CORTEX-R8 -; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R8-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 | FileCheck %s --check-prefix=CORTEX-A32 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s 
--check-prefix=CORTEX-A32-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=CORTEX-A35 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A35-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A53-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A57-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=CORTEX-A72 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a73 | FileCheck %s --check-prefix=CORTEX-A73 ; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 | FileCheck %s --check-prefix=EXYNOS-M3 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 | FileCheck %s --check-prefix=EXYNOS-M4 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 | FileCheck %s --check-prefix=EXYNOS-M5 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 
-enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING -; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A-FAST ; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=CORTEX-A7-CHECK -; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-CHECK-FAST ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2sp,-vfp3,-vfp4,-neon,-fp16 | FileCheck %s --check-prefix=CORTEX-A7-NOFPU -; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2sp,-vfp3,-vfp4,-neon,-fp16 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING -; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-FPUV4-FAST ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,,-d32,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=pic | FileCheck %s --check-prefix=RELOC-PIC ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=static | FileCheck %s --check-prefix=RELOC-OTHER @@ -278,15 +234,6 @@ ; V6-NOT: .eabi_attribute 28 ; V6: .eabi_attribute 38, 1 -; V6-FAST-NOT: .eabi_attribute 19 -;; Despite the V6 CPU having no FPU by default, we chose to flush to -;; positive zero here. There's no hardware support doing this, but the -;; fast maths software library might. -; V6-FAST-NOT: .eabi_attribute 20 -; V6-FAST-NOT: .eabi_attribute 21 -; V6-FAST-NOT: .eabi_attribute 22 -; V6-FAST: .eabi_attribute 23, 1 - ;; We emit 6, 12 for both v6-M and v6S-M, technically this is incorrect for ;; V6-M, however we don't model the OS extension so this is fine. ; V6M: .eabi_attribute 6, 12 @@ -312,14 +259,6 @@ ; V6M-NOT: .eabi_attribute 28 ; V6M: .eabi_attribute 38, 1 -; V6M-FAST-NOT: .eabi_attribute 19 -;; Despite the V6M CPU having no FPU by default, we chose to flush to -;; positive zero here. There's no hardware support doing this, but the -;; fast maths software library might. -; V6M-FAST-NOT: .eabi_attribute 20 -; V6M-FAST-NOT: .eabi_attribute 21 -; V6M-FAST-NOT: .eabi_attribute 22 -; V6M-FAST: .eabi_attribute 23, 1 ; ARM1156T2F-S: .cpu arm1156t2f-s ; ARM1156T2F-S: .eabi_attribute 6, 8 @@ -342,14 +281,6 @@ ; ARM1156T2F-S-NOT: .eabi_attribute 28 ; ARM1156T2F-S: .eabi_attribute 38, 1 -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 19 -;; V6 cores default to flush to positive zero (value 0). 
Note that value 2 is also equally -;; valid for this core, it's an implementation defined question as to which of 0 and 2 you -;; select. LLVM historically picks 0. -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 20 -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 21 -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 22 -; ARM1156T2F-S-FAST: .eabi_attribute 23, 1 ; V7M: .eabi_attribute 6, 10 ; V7M: .eabi_attribute 7, 77 @@ -374,15 +305,6 @@ ; V7M-NOT: .eabi_attribute 28 ; V7M: .eabi_attribute 38, 1 -; V7M-FAST-NOT: .eabi_attribute 19 -;; Despite the V7M CPU having no FPU by default, we chose to flush -;; preserving sign. This matches what the hardware would do in the -;; architecture revision were to exist on the current target. -; V7M-FAST: .eabi_attribute 20, 2 -; V7M-FAST-NOT: .eabi_attribute 21 -; V7M-FAST-NOT: .eabi_attribute 22 -; V7M-FAST: .eabi_attribute 23, 1 - ; V7: .syntax unified ; V7: .eabi_attribute 6, 10 ; V7-NOT: .eabi_attribute 27 @@ -401,13 +323,6 @@ ; V7-NOT: .eabi_attribute 28 ; V7: .eabi_attribute 38, 1 -; V7-FAST-NOT: .eabi_attribute 19 -;; The default CPU does have an FPU and it must be VFPv3 or better, so it flushes -;; denormals to zero preserving the sign. -; V7-FAST: .eabi_attribute 20, 2 -; V7-FAST-NOT: .eabi_attribute 21 -; V7-FAST-NOT: .eabi_attribute 22 -; V7-FAST: .eabi_attribute 23, 1 ; V7VE: .syntax unified ; V7VE: .eabi_attribute 6, 10 @ Tag_CPU_arch @@ -435,12 +350,6 @@ ; V8-NOT: .eabi_attribute 22 ; V8: .eabi_attribute 23, 3 -; V8-FAST-NOT: .eabi_attribute 19 -;; The default does have an FPU, and for V8-A, it flushes preserving sign. -; V8-FAST: .eabi_attribute 20, 2 -; V8-FAST-NOT: .eabi_attribute 21 -; V8-FAST-NOT: .eabi_attribute 22 -; V8-FAST: .eabi_attribute 23, 1 ; Vt8: .syntax unified ; Vt8: .eabi_attribute 6, 14 @@ -552,15 +461,11 @@ ;; We default to IEEE 754 compliance ; CORTEX-A7-CHECK: .eabi_attribute 20, 1 ;; The A7 has VFPv3 support by default, so flush preserving sign. -; CORTEX-A7-CHECK-FAST: .eabi_attribute 20, 2 ; CORTEX-A7-NOFPU: .eabi_attribute 20, 1 ;; Despite there being no FPU, we chose to flush to zero preserving ;; sign. This matches what the hardware would do for this architecture ;; revision. -; CORTEX-A7-NOFPU-FAST: .eabi_attribute 20, 2 ; CORTEX-A7-FPUV4: .eabi_attribute 20, 1 -;; The VFPv4 FPU flushes preserving sign. -; CORTEX-A7-FPUV4-FAST: .eabi_attribute 20, 2 ; Tag_ABI_FP_exceptions ; CORTEX-A7-CHECK: .eabi_attribute 21, 1 @@ -610,13 +515,6 @@ ; CORTEX-A5-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A5-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 19 -;; The A5 defaults to a VFPv4 FPU, so it flushed preserving the sign when -ffast-math -;; is given. -; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 23, 1 ; CORTEX-A5-NONEON: .cpu cortex-a5 ; CORTEX-A5-NONEON: .eabi_attribute 6, 10 @@ -634,13 +532,6 @@ ; CORTEX-A5-NONEON: .eabi_attribute 24, 1 ; CORTEX-A5-NONEON: .eabi_attribute 25, 1 -; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 19 -;; The A5 defaults to a VFPv4 FPU, so it flushed preserving sign when -ffast-math -;; is given. 
-; CORTEX-A5-NONEON-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-NONEON-FAST: .eabi_attribute 23, 1 ; CORTEX-A5-NOFPU: .cpu cortex-a5 ; CORTEX-A5-NOFPU: .eabi_attribute 6, 10 @@ -659,14 +550,9 @@ ; CORTEX-A5-NOFPU: .eabi_attribute 24, 1 ; CORTEX-A5-NOFPU: .eabi_attribute 25, 1 -; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 19 ;; Despite there being no FPU, we chose to flush to zero preserving ;; sign. This matches what the hardware would do for this architecture ;; revision. -; CORTEX-A5-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-NOFPU-FAST: .eabi_attribute 23, 1 ; CORTEX-A8-SOFT: .cpu cortex-a8 ; CORTEX-A8-SOFT: .eabi_attribute 6, 10 @@ -712,15 +598,6 @@ ; CORTEX-A9-SOFT-NOT: .eabi_attribute 28 ; CORTEX-A9-SOFT: .eabi_attribute 38, 1 -; CORTEX-A8-SOFT-FAST-NOT: .eabi_attribute 19 -; CORTEX-A9-SOFT-FAST-NOT: .eabi_attribute 19 -;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A8-SOFT-FAST: .eabi_attribute 20, 2 -; CORTEX-A9-SOFT-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-SOFT-FAST: .eabi_attribute 23, 1 ; CORTEX-A8-HARD: .cpu cortex-a8 ; CORTEX-A8-HARD: .eabi_attribute 6, 10 @@ -766,21 +643,6 @@ ; CORTEX-A9-HARD: .eabi_attribute 28, 1 ; CORTEX-A9-HARD: .eabi_attribute 38, 1 -; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 19 -;; The A8 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A8-HARD-FAST: .eabi_attribute 20, 2 -; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-A8-HARD-FAST: .eabi_attribute 23, 1 - -; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 19 -;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A9-HARD-FAST: .eabi_attribute 20, 2 -; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-A9-HARD-FAST: .eabi_attribute 23, 1 ; CORTEX-A12-DEFAULT: .cpu cortex-a12 ; CORTEX-A12-DEFAULT: .eabi_attribute 6, 10 @@ -800,13 +662,6 @@ ; CORTEX-A12-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A12-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A12-DEFAULT-FAST-NOT: .eabi_attribute 19 -;; The A12 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A12-DEFAULT-FAST: .eabi_attribute 20, 2 -; CORTEX-A12-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-A12-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-A12-HARD-FAST: .eabi_attribute 23, 1 ; CORTEX-A12-NOFPU: .cpu cortex-a12 ; CORTEX-A12-NOFPU: .eabi_attribute 6, 10 @@ -826,14 +681,6 @@ ; CORTEX-A12-NOFPU: .eabi_attribute 24, 1 ; CORTEX-A12-NOFPU: .eabi_attribute 25, 1 -; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. 
-; CORTEX-A12-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-A12-NOFPU-FAST: .eabi_attribute 23, 1 ; CORTEX-A15: .cpu cortex-a15 ; CORTEX-A15: .eabi_attribute 6, 10 @@ -857,13 +704,6 @@ ; CORTEX-A15-NOT: .eabi_attribute 28 ; CORTEX-A15: .eabi_attribute 38, 1 -; CORTEX-A15-FAST-NOT: .eabi_attribute 19 -;; The A15 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A15-FAST: .eabi_attribute 20, 2 -; CORTEX-A15-FAST-NOT: .eabi_attribute 21 -; CORTEX-A15-FAST-NOT: .eabi_attribute 22 -; CORTEX-A15-FAST: .eabi_attribute 23, 1 ; CORTEX-A17-DEFAULT: .cpu cortex-a17 ; CORTEX-A17-DEFAULT: .eabi_attribute 6, 10 @@ -883,13 +723,6 @@ ; CORTEX-A17-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A17-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A17-FAST-NOT: .eabi_attribute 19 -;; The A17 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A17-FAST: .eabi_attribute 20, 2 -; CORTEX-A17-FAST-NOT: .eabi_attribute 21 -; CORTEX-A17-FAST-NOT: .eabi_attribute 22 -; CORTEX-A17-FAST: .eabi_attribute 23, 1 ; CORTEX-A17-NOFPU: .cpu cortex-a17 ; CORTEX-A17-NOFPU: .eabi_attribute 6, 10 @@ -910,13 +743,6 @@ ; CORTEX-A17-NOFPU: .eabi_attribute 25, 1 ; CORTEX-A17-NOFPU-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; CORTEX-A17-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-A17-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-A17-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-A17-NOFPU-FAST: .eabi_attribute 23, 1 ; Test flags -enable-no-trapping-fp-math and -denormal-fp-math: ; NO-TRAPPING-MATH: .eabi_attribute 21, 0 @@ -946,16 +772,6 @@ ; CORTEX-M0-NOT: .eabi_attribute 28 ; CORTEX-M0: .eabi_attribute 38, 1 -; CORTEX-M0-FAST-NOT: .eabi_attribute 19 -;; Despite the M0 CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. -; CORTEX-M0-FAST-NOT: .eabi_attribute 20 -; CORTEX-M0-FAST-NOT: .eabi_attribute 21 -; CORTEX-M0-FAST-NOT: .eabi_attribute 22 -; CORTEX-M0-FAST: .eabi_attribute 23, 1 ; CORTEX-M0PLUS: .cpu cortex-m0plus ; CORTEX-M0PLUS: .eabi_attribute 6, 12 @@ -978,16 +794,6 @@ ; CORTEX-M0PLUS-NOT: .eabi_attribute 28 ; CORTEX-M0PLUS: .eabi_attribute 38, 1 -; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 19 -;; Despite the M0+ CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. -; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 20 -; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 21 -; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 22 -; CORTEX-M0PLUS-FAST: .eabi_attribute 23, 1 ; CORTEX-M1: .cpu cortex-m1 ; CORTEX-M1: .eabi_attribute 6, 12 @@ -1010,16 +816,6 @@ ; CORTEX-M1-NOT: .eabi_attribute 28 ; CORTEX-M1: .eabi_attribute 38, 1 -; CORTEX-M1-FAST-NOT: .eabi_attribute 19 -;; Despite the M1 CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. 
There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. -; CORTEX-M1-FAST-NOT: .eabi_attribute 20 -; CORTEX-M1-FAST-NOT: .eabi_attribute 21 -; CORTEX-M1-FAST-NOT: .eabi_attribute 22 -; CORTEX-M1-FAST: .eabi_attribute 23, 1 ; SC000: .cpu sc000 ; SC000: .eabi_attribute 6, 12 @@ -1041,16 +837,6 @@ ; SC000-NOT: .eabi_attribute 28 ; SC000: .eabi_attribute 38, 1 -; SC000-FAST-NOT: .eabi_attribute 19 -;; Despite the SC000 CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. -; SC000-FAST-NOT: .eabi_attribute 20 -; SC000-FAST-NOT: .eabi_attribute 21 -; SC000-FAST-NOT: .eabi_attribute 22 -; SC000-FAST: .eabi_attribute 23, 1 ; CORTEX-M3: .cpu cortex-m3 ; CORTEX-M3: .eabi_attribute 6, 10 @@ -1073,14 +859,6 @@ ; CORTEX-M3-NOT: .eabi_attribute 28 ; CORTEX-M3: .eabi_attribute 38, 1 -; CORTEX-M3-FAST-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; CORTEX-M3-FAST: .eabi_attribute 20, 2 -; CORTEX-M3-FAST-NOT: .eabi_attribute 21 -; CORTEX-M3-FAST-NOT: .eabi_attribute 22 -; CORTEX-M3-FAST: .eabi_attribute 23, 1 ; SC300: .cpu sc300 ; SC300: .eabi_attribute 6, 10 @@ -1103,14 +881,6 @@ ; SC300-NOT: .eabi_attribute 28 ; SC300: .eabi_attribute 38, 1 -; SC300-FAST-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; SC300-FAST: .eabi_attribute 20, 2 -; SC300-FAST-NOT: .eabi_attribute 21 -; SC300-FAST-NOT: .eabi_attribute 22 -; SC300-FAST: .eabi_attribute 23, 1 ; CORTEX-M4-SOFT: .cpu cortex-m4 ; CORTEX-M4-SOFT: .eabi_attribute 6, 13 @@ -1134,13 +904,6 @@ ; CORTEX-M4-SOFT-NOT: .eabi_attribute 28 ; CORTEX-M4-SOFT: .eabi_attribute 38, 1 -; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 19 -;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-M4-SOFT-FAST: .eabi_attribute 20, 2 -; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 21 -; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 22 -; CORTEX-M4-SOFT-FAST: .eabi_attribute 23, 1 ; CORTEX-M4-HARD: .cpu cortex-m4 ; CORTEX-M4-HARD: .eabi_attribute 6, 13 @@ -1164,13 +927,6 @@ ; CORTEX-M4-HARD: .eabi_attribute 28, 1 ; CORTEX-M4-HARD: .eabi_attribute 38, 1 -; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 19 -;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-M4-HARD-FAST: .eabi_attribute 20, 2 -; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-M4-HARD-FAST: .eabi_attribute 23, 1 ; CORTEX-M7: .cpu cortex-m7 ; CORTEX-M7: .eabi_attribute 6, 13 @@ -1197,16 +953,6 @@ ; CORTEX-M7: .eabi_attribute 38, 1 ; CORTEX-M7: .eabi_attribute 14, 0 -; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 19 -;; The M7 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-M7-FAST: .eabi_attribute 20, 2 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. 
-; CORTEX-M7-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-M7-NOFPU-FAST: .eabi_attribute 23, 1 ; CORTEX-R4: .cpu cortex-r4 ; CORTEX-R4: .eabi_attribute 6, 10 @@ -1273,12 +1019,6 @@ ; CORTEX-R5-NOT: .eabi_attribute 28 ; CORTEX-R5: .eabi_attribute 38, 1 -; CORTEX-R5-FAST-NOT: .eabi_attribute 19 -;; The R5 has the VFPv3 FP unit, which always flushes preserving sign. -; CORTEX-R5-FAST: .eabi_attribute 20, 2 -; CORTEX-R5-FAST-NOT: .eabi_attribute 21 -; CORTEX-R5-FAST-NOT: .eabi_attribute 22 -; CORTEX-R5-FAST: .eabi_attribute 23, 1 ; CORTEX-R7: .cpu cortex-r7 ; CORTEX-R7: .eabi_attribute 6, 10 @@ -1301,12 +1041,6 @@ ; CORTEX-R7-NOT: .eabi_attribute 28 ; CORTEX-R7: .eabi_attribute 38, 1 -; CORTEX-R7-FAST-NOT: .eabi_attribute 19 -;; The R7 has the VFPv3 FP unit, which always flushes preserving sign. -; CORTEX-R7-FAST: .eabi_attribute 20, 2 -; CORTEX-R7-FAST-NOT: .eabi_attribute 21 -; CORTEX-R7-FAST-NOT: .eabi_attribute 22 -; CORTEX-R7-FAST: .eabi_attribute 23, 1 ; CORTEX-R8: .cpu cortex-r8 ; CORTEX-R8: .eabi_attribute 6, 10 @@ -1329,12 +1063,6 @@ ; CORTEX-R8-NOT: .eabi_attribute 28 ; CORTEX-R8: .eabi_attribute 38, 1 -; CORTEX-R8-FAST-NOT: .eabi_attribute 19 -;; The R8 has the VFPv3 FP unit, which always flushes preserving sign. -; CORTEX-R8-FAST: .eabi_attribute 20, 2 -; CORTEX-R8-FAST-NOT: .eabi_attribute 21 -; CORTEX-R8-FAST-NOT: .eabi_attribute 22 -; CORTEX-R8-FAST: .eabi_attribute 23, 1 ; CORTEX-A32: .cpu cortex-a32 ; CORTEX-A32: .eabi_attribute 6, 14 @@ -1359,12 +1087,6 @@ ; CORTEX-A32-NOT: .eabi_attribute 28 ; CORTEX-A32: .eabi_attribute 38, 1 -; CORTEX-A32-FAST-NOT: .eabi_attribute 19 -;; The A32 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A32-FAST: .eabi_attribute 20, 2 -; CORTEX-A32-FAST-NOT: .eabi_attribute 21 -; CORTEX-A32-FAST-NOT: .eabi_attribute 22 -; CORTEX-A32-FAST: .eabi_attribute 23, 1 ; CORTEX-M23: .cpu cortex-m23 ; CORTEX-M23: .eabi_attribute 6, 16 @@ -1430,11 +1152,6 @@ ; CORTEX-M35P: .eabi_attribute 38, 1 ; CORTEX-M35P: .eabi_attribute 14, 0 -; CORTEX-M33-FAST-NOT: .eabi_attribute 19 -; CORTEX-M33-FAST: .eabi_attribute 20, 2 -; CORTEX-M33-FAST-NOT: .eabi_attribute 21 -; CORTEX-M33-FAST-NOT: .eabi_attribute 22 -; CORTEX-M33-FAST: .eabi_attribute 23, 1 ; CORTEX-A35: .cpu cortex-a35 ; CORTEX-A35: .eabi_attribute 6, 14 @@ -1459,12 +1176,6 @@ ; CORTEX-A35-NOT: .eabi_attribute 28 ; CORTEX-A35: .eabi_attribute 38, 1 -; CORTEX-A35-FAST-NOT: .eabi_attribute 19 -;; The A35 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A35-FAST: .eabi_attribute 20, 2 -; CORTEX-A35-FAST-NOT: .eabi_attribute 21 -; CORTEX-A35-FAST-NOT: .eabi_attribute 22 -; CORTEX-A35-FAST: .eabi_attribute 23, 1 ; CORTEX-A53: .cpu cortex-a53 ; CORTEX-A53: .eabi_attribute 6, 14 @@ -1489,12 +1200,6 @@ ; CORTEX-A53-NOT: .eabi_attribute 28 ; CORTEX-A53: .eabi_attribute 38, 1 -; CORTEX-A53-FAST-NOT: .eabi_attribute 19 -;; The A53 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A53-FAST: .eabi_attribute 20, 2 -; CORTEX-A53-FAST-NOT: .eabi_attribute 21 -; CORTEX-A53-FAST-NOT: .eabi_attribute 22 -; CORTEX-A53-FAST: .eabi_attribute 23, 1 ; CORTEX-A57: .cpu cortex-a57 ; CORTEX-A57: .eabi_attribute 6, 14 @@ -1519,12 +1224,6 @@ ; CORTEX-A57-NOT: .eabi_attribute 28 ; CORTEX-A57: .eabi_attribute 38, 1 -; CORTEX-A57-FAST-NOT: .eabi_attribute 19 -;; The A57 has the ARMv8 FP unit, which always flushes preserving sign. 
-; CORTEX-A57-FAST: .eabi_attribute 20, 2 -; CORTEX-A57-FAST-NOT: .eabi_attribute 21 -; CORTEX-A57-FAST-NOT: .eabi_attribute 22 -; CORTEX-A57-FAST: .eabi_attribute 23, 1 ; CORTEX-A72: .cpu cortex-a72 ; CORTEX-A72: .eabi_attribute 6, 14 @@ -1549,12 +1248,6 @@ ; CORTEX-A72-NOT: .eabi_attribute 28 ; CORTEX-A72: .eabi_attribute 38, 1 -; CORTEX-A72-FAST-NOT: .eabi_attribute 19 -;; The A72 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A72-FAST: .eabi_attribute 20, 2 -; CORTEX-A72-FAST-NOT: .eabi_attribute 21 -; CORTEX-A72-FAST-NOT: .eabi_attribute 22 -; CORTEX-A72-FAST: .eabi_attribute 23, 1 ; CORTEX-A73: .cpu cortex-a73 ; CORTEX-A73: .eabi_attribute 6, 14 @@ -1580,12 +1273,6 @@ ; CORTEX-A73: .eabi_attribute 38, 1 ; CORTEX-A73: .eabi_attribute 14, 0 -; EXYNOS-FAST-NOT: .eabi_attribute 19 -;; The Exynos processors have the ARMv8 FP unit, which always flushes preserving sign. -; EXYNOS-FAST: .eabi_attribute 20, 2 -; EXYNOS-FAST-NOT: .eabi_attribute 21 -; EXYNOS-FAST-NOT: .eabi_attribute 22 -; EXYNOS-FAST: .eabi_attribute 23, 1 ; EXYNOS-M3: .cpu exynos-m3 ; EXYNOS-M3: .eabi_attribute 6, 14 @@ -1684,12 +1371,6 @@ ; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 28 ; GENERIC-ARMV8_1-A: .eabi_attribute 38, 1 -; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 19 -;; GENERIC-ARMV8_1-A has the ARMv8 FP unit, which always flushes preserving sign. -; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 20, 2 -; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 21 -; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 22 -; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 23, 1 ; RELOC-PIC: .eabi_attribute 15, 1 ; RELOC-PIC: .eabi_attribute 16, 1 From 5a13c857f9aaa262e846444e0d3db4ee60854f7d Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Sun, 28 Sep 2025 09:08:09 +0300 Subject: [PATCH 027/878] [GitHub][docker] Add python3 venv package to CI container (#161024) I'm trying to make `pr-code-format.yml` job run natively on `ci-ubuntu-24.04` container. As it appears, `ci-ubuntu-24.04` already [has](https://github.com/llvm/llvm-project/blob/41a2dfc0d77d9ad977d1d36358f979abb3a0928f/.github/workflows/containers/github-action-ci/Dockerfile#L35) latest `clang-format`, `python3.12` installed, but `python3.12` needs `venv` to work properly, and Ubuntu asks for `python3-venv` package to be installed to create a venv. --- .github/workflows/containers/github-action-ci/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index 1d3f5f9c35d7f..dc0c9cabc7f01 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -62,6 +62,7 @@ RUN apt-get update && \ # Having a symlink from python to python3 enables code sharing between # the Linux and Windows pipelines. python3-pip \ + python3-venv \ file \ tzdata \ python-is-python3 && \ From f9e7f95b3dfceac8b680ad2f3d5f35543c0d46ae Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Sat, 27 Sep 2025 23:25:51 -0700 Subject: [PATCH 028/878] [clang-tidy] Fix `modernize-use-nullptr` crash on 32-bit Windows (#160023) Fixes #53778. 
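For context, an illustrative reproducer of the kind of input this check rewrites (the file name and invocation below are assumptions, not taken from issue #53778): before this fix, a 32-bit Windows clang-tidy binary could crash while running modernize-use-nullptr over such code; the fix passes the inner matcher to the lambda by const reference instead of by value, after which the check completes and emits its usual fix-it.

// Hypothetical input, e.g.: clang-tidy --checks=-*,modernize-use-nullptr repro.cpp --
#include <cstddef>

int *lookup(bool found) {
  if (!found)
    return NULL; // fix-it: return nullptr;
  static int value = 0;
  return &value;
}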
--- clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp | 2 +- clang-tools-extra/docs/ReleaseNotes.rst | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp index 4dc4baecddd50..4084d713665ea 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp @@ -53,7 +53,7 @@ StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef NameList) { unless(hasImplicitDestinationType( qualType(matchers::matchesAnyListedTypeName(NameList))))); - auto IsOrHasDescendant = [](auto InnerMatcher) { + auto IsOrHasDescendant = [](const auto &InnerMatcher) { return anyOf(InnerMatcher, hasDescendant(InnerMatcher)); }; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 9257dc6b98ba2..c3a6d2f9b2890 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -329,6 +329,11 @@ Changes in existing checks ` check to suggest using designated initializers for aliased aggregate types. +- Improved :doc:`modernize-use-nullptr + ` check by fixing a crash + on Windows when the check was enabled with a 32-bit :program:`clang-tidy` + binary. + - Improved :doc:`modernize-use-std-format ` check to correctly match when the format string is converted to a different type by an implicit From 1ab4113d0e0d30ba923ea070530a0e65910b26cb Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sun, 28 Sep 2025 09:19:15 +0200 Subject: [PATCH 029/878] [libc++] Remove a bunch of unused includes from (#160658) --- libcxx/include/__flat_map/flat_map.h | 2 -- libcxx/include/__flat_map/flat_multimap.h | 5 ----- libcxx/include/__flat_map/key_value_iterator.h | 1 - libcxx/include/__flat_set/flat_multiset.h | 14 -------------- libcxx/include/__flat_set/flat_set.h | 8 -------- libcxx/include/module.modulemap.in | 5 ++++- 6 files changed, 4 insertions(+), 31 deletions(-) diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index 31ba9bc0b91ac..7bb235ba76503 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -29,7 +29,6 @@ #include <__flat_map/key_value_iterator.h> #include <__flat_map/sorted_unique.h> #include <__flat_map/utils.h> -#include <__functional/invoke.h> #include <__functional/is_transparent.h> #include <__functional/operations.h> #include <__fwd/memory.h> @@ -48,7 +47,6 @@ #include <__ranges/container_compatible_range.h> #include <__ranges/drop_view.h> #include <__ranges/from_range.h> -#include <__ranges/ref_view.h> #include <__ranges/size.h> #include <__ranges/subrange.h> #include <__ranges/zip_view.h> diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h index abaacf9e3cda3..96d945405cffe 100644 --- a/libcxx/include/__flat_map/flat_multimap.h +++ b/libcxx/include/__flat_map/flat_multimap.h @@ -22,7 +22,6 @@ #include <__algorithm/upper_bound.h> #include <__assert> #include <__compare/synth_three_way.h> -#include <__concepts/convertible_to.h> #include <__concepts/swappable.h> #include <__config> #include <__cstddef/byte.h> @@ -30,7 +29,6 @@ #include <__flat_map/key_value_iterator.h> #include <__flat_map/sorted_equivalent.h> #include <__flat_map/utils.h> -#include <__functional/invoke.h> #include <__functional/is_transparent.h> #include <__functional/operations.h> #include <__fwd/vector.h> @@ -47,7 +45,6 @@ #include 
<__ranges/container_compatible_range.h> #include <__ranges/drop_view.h> #include <__ranges/from_range.h> -#include <__ranges/ref_view.h> #include <__ranges/size.h> #include <__ranges/subrange.h> #include <__ranges/zip_view.h> @@ -57,14 +54,12 @@ #include <__type_traits/is_allocator.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_same.h> -#include <__type_traits/maybe_const.h> #include <__utility/exception_guard.h> #include <__utility/move.h> #include <__utility/pair.h> #include <__utility/scope_guard.h> #include <__vector/vector.h> #include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__flat_map/key_value_iterator.h b/libcxx/include/__flat_map/key_value_iterator.h index d04a23d1f8606..795651a07937b 100644 --- a/libcxx/include/__flat_map/key_value_iterator.h +++ b/libcxx/include/__flat_map/key_value_iterator.h @@ -20,7 +20,6 @@ #include <__type_traits/conditional.h> #include <__utility/forward.h> #include <__utility/move.h> -#include <__utility/pair.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/__flat_set/flat_multiset.h b/libcxx/include/__flat_set/flat_multiset.h index 65f4161a8c34c..b1a4917659c49 100644 --- a/libcxx/include/__flat_set/flat_multiset.h +++ b/libcxx/include/__flat_set/flat_multiset.h @@ -13,54 +13,40 @@ #include <__algorithm/equal_range.h> #include <__algorithm/lexicographical_compare_three_way.h> #include <__algorithm/lower_bound.h> -#include <__algorithm/min.h> #include <__algorithm/ranges_equal.h> #include <__algorithm/ranges_inplace_merge.h> #include <__algorithm/ranges_is_sorted.h> #include <__algorithm/ranges_sort.h> -#include <__algorithm/ranges_unique.h> #include <__algorithm/remove_if.h> #include <__algorithm/upper_bound.h> #include <__assert> #include <__compare/synth_three_way.h> -#include <__concepts/convertible_to.h> #include <__concepts/swappable.h> #include <__config> -#include <__cstddef/byte.h> -#include <__cstddef/ptrdiff_t.h> -#include <__flat_map/key_value_iterator.h> #include <__flat_map/sorted_equivalent.h> #include <__flat_set/ra_iterator.h> #include <__flat_set/utils.h> -#include <__functional/invoke.h> #include <__functional/is_transparent.h> #include <__functional/operations.h> #include <__fwd/vector.h> #include <__iterator/concepts.h> -#include <__iterator/distance.h> #include <__iterator/iterator_traits.h> #include <__iterator/prev.h> -#include <__iterator/ranges_iterator_traits.h> #include <__iterator/reverse_iterator.h> #include <__memory/allocator_traits.h> #include <__memory/uses_allocator.h> #include <__memory/uses_allocator_construction.h> -#include <__ranges/access.h> #include <__ranges/concepts.h> #include <__ranges/container_compatible_range.h> #include <__ranges/drop_view.h> #include <__ranges/from_range.h> -#include <__ranges/ref_view.h> #include <__ranges/size.h> #include <__ranges/subrange.h> -#include <__ranges/zip_view.h> -#include <__type_traits/conjunction.h> #include <__type_traits/container_traits.h> #include <__type_traits/invoke.h> #include <__type_traits/is_allocator.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_same.h> -#include <__type_traits/maybe_const.h> #include <__utility/as_const.h> #include <__utility/exception_guard.h> #include <__utility/move.h> diff --git a/libcxx/include/__flat_set/flat_set.h b/libcxx/include/__flat_set/flat_set.h index cc788bda544de..5fa1f2d8acb9b 100644 --- a/libcxx/include/__flat_set/flat_set.h 
+++ b/libcxx/include/__flat_set/flat_set.h @@ -12,7 +12,6 @@ #include <__algorithm/lexicographical_compare_three_way.h> #include <__algorithm/lower_bound.h> -#include <__algorithm/min.h> #include <__algorithm/ranges_adjacent_find.h> #include <__algorithm/ranges_equal.h> #include <__algorithm/ranges_inplace_merge.h> @@ -24,20 +23,16 @@ #include <__compare/synth_three_way.h> #include <__concepts/swappable.h> #include <__config> -#include <__cstddef/ptrdiff_t.h> #include <__flat_map/sorted_unique.h> #include <__flat_set/ra_iterator.h> #include <__flat_set/utils.h> -#include <__functional/invoke.h> #include <__functional/is_transparent.h> #include <__functional/operations.h> #include <__fwd/vector.h> #include <__iterator/concepts.h> -#include <__iterator/distance.h> #include <__iterator/iterator_traits.h> #include <__iterator/next.h> #include <__iterator/prev.h> -#include <__iterator/ranges_iterator_traits.h> #include <__iterator/reverse_iterator.h> #include <__memory/allocator_traits.h> #include <__memory/uses_allocator.h> @@ -47,10 +42,7 @@ #include <__ranges/container_compatible_range.h> #include <__ranges/drop_view.h> #include <__ranges/from_range.h> -#include <__ranges/ref_view.h> #include <__ranges/size.h> -#include <__ranges/subrange.h> -#include <__type_traits/conjunction.h> #include <__type_traits/container_traits.h> #include <__type_traits/invoke.h> #include <__type_traits/is_allocator.h> diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index dc1933324ef79..5e96adc1aaa65 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1847,7 +1847,10 @@ module std [system] { module ranges { module access { header "__ranges/access.h" } - module all { header "__ranges/all.h" } + module all { + header "__ranges/all.h" + export std.ranges.ref_view + } module as_rvalue_view { header "__ranges/as_rvalue_view.h" } module chunk_by_view { header "__ranges/chunk_by_view.h" From ddfbfd6b580c6d0ac89fc826f795cb67d051c101 Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Sun, 28 Sep 2025 16:07:27 +0800 Subject: [PATCH 030/878] [NFC][clang] Move simplifyConstraint to TargetInfo.cpp (#154905) Co-authored-by: Andy Kaylor --- clang/include/clang/Basic/TargetInfo.h | 4 ++ clang/lib/Basic/TargetInfo.cpp | 49 ++++++++++++++++++++++ clang/lib/CodeGen/CGStmt.cpp | 58 ++------------------------ 3 files changed, 57 insertions(+), 54 deletions(-) diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index e5c5ada3b0858..ceb16174e13e7 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1259,6 +1259,10 @@ class TargetInfo : public TransferrableTargetInfo, ArrayRef OutputConstraints, unsigned &Index) const; + std::string + simplifyConstraint(StringRef Constraint, + SmallVectorImpl *OutCons = nullptr) const; + // Constraint parm will be left pointing at the last character of // the constraint. In practice, it won't be changed unless the // constraint is longer than one character. 
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 72ee09d209e02..f4d7c1288cc04 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -18,6 +18,7 @@ #include "clang/Basic/LangOptions.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/TargetParser.h" #include @@ -1042,3 +1043,51 @@ void TargetInfo::copyAuxTarget(const TargetInfo *Aux) { auto *Src = static_cast(Aux); *Target = *Src; } + +std::string +TargetInfo::simplifyConstraint(StringRef Constraint, + SmallVectorImpl *OutCons) const { + std::string Result; + + for (const char *I = Constraint.begin(), *E = Constraint.end(); I < E; I++) { + switch (*I) { + default: + Result += convertConstraint(I); + break; + // Ignore these + case '*': + case '?': + case '!': + case '=': // Will see this and the following in mult-alt constraints. + case '+': + break; + case '#': // Ignore the rest of the constraint alternative. + while (I + 1 != E && I[1] != ',') + I++; + break; + case '&': + case '%': + Result += *I; + while (I + 1 != E && I[1] == *I) + I++; + break; + case ',': + Result += "|"; + break; + case 'g': + Result += "imr"; + break; + case '[': { + assert(OutCons && + "Must pass output names to constraints with a symbolic name"); + unsigned Index; + bool ResolveResult = resolveSymbolicName(I, *OutCons, Index); + assert(ResolveResult && "Could not resolve symbolic name"); + (void)ResolveResult; + Result += llvm::utostr(Index); + break; + } + } + } + return Result; +} diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 440100650c43f..e62bc7617da3d 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -2471,56 +2471,6 @@ void CodeGenFunction::EmitSwitchStmt(const SwitchStmt &S) { CaseRangeBlock = SavedCRBlock; } -static std::string -SimplifyConstraint(const char *Constraint, const TargetInfo &Target, - SmallVectorImpl *OutCons=nullptr) { - std::string Result; - - while (*Constraint) { - switch (*Constraint) { - default: - Result += Target.convertConstraint(Constraint); - break; - // Ignore these - case '*': - case '?': - case '!': - case '=': // Will see this and the following in mult-alt constraints. - case '+': - break; - case '#': // Ignore the rest of the constraint alternative. - while (Constraint[1] && Constraint[1] != ',') - Constraint++; - break; - case '&': - case '%': - Result += *Constraint; - while (Constraint[1] && Constraint[1] == *Constraint) - Constraint++; - break; - case ',': - Result += "|"; - break; - case 'g': - Result += "imr"; - break; - case '[': { - assert(OutCons && - "Must pass output names to constraints with a symbolic name"); - unsigned Index; - bool result = Target.resolveSymbolicName(Constraint, *OutCons, Index); - assert(result && "Could not resolve symbolic name"); (void)result; - Result += llvm::utostr(Index); - break; - } - } - - Constraint++; - } - - return Result; -} - /// AddVariableConstraints - Look at AsmExpr and if it is a variable declared /// as using a particular register add that as a constraint that will be used /// in this asm stmt. @@ -2899,8 +2849,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { // Simplify the output constraint. 
std::string OutputConstraint(S.getOutputConstraint(i)); - OutputConstraint = SimplifyConstraint(OutputConstraint.c_str() + 1, - getTarget(), &OutputConstraintInfos); + OutputConstraint = getTarget().simplifyConstraint( + StringRef(OutputConstraint).substr(1), &OutputConstraintInfos); const Expr *OutExpr = S.getOutputExpr(i); OutExpr = OutExpr->IgnoreParenNoopCasts(getContext()); @@ -3062,8 +3012,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { // Simplify the input constraint. std::string InputConstraint(S.getInputConstraint(i)); - InputConstraint = SimplifyConstraint(InputConstraint.c_str(), getTarget(), - &OutputConstraintInfos); + InputConstraint = + getTarget().simplifyConstraint(InputConstraint, &OutputConstraintInfos); InputConstraint = AddVariableConstraints( InputConstraint, *InputExpr->IgnoreParenNoopCasts(getContext()), From 0fc6213aee05b07e670bee5a25a31119c563227e Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sun, 28 Sep 2025 10:02:22 +0100 Subject: [PATCH 031/878] [LV] Clarify nature of legacy CSE (NFC) (#160855) In order to avoid conflating the legacy CSE with the VPlan-based one, rename the legacy CSE and insert a FIXME to clarify the nature of the legacy CSE. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 93a5f22bd0976..96f52076b1837 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2438,8 +2438,9 @@ struct CSEDenseMapInfo { } // end anonymous namespace -///Perform cse of induction variable instructions. -static void cse(BasicBlock *BB) { +/// FIXME: This legacy common-subexpression-elimination routine is scheduled for +/// removal, in favor of the VPlan-based one. +static void legacyCSE(BasicBlock *BB) { // Perform simple cse. SmallDenseMap CSEMap; for (Instruction &In : llvm::make_early_inc_range(*BB)) { @@ -2543,7 +2544,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; // Remove redundant induction instructions. - cse(HeaderBB); + legacyCSE(HeaderBB); } void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { From 0df525bc56651ec5a7cf76cee5bd4127e9df7cf7 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Sun, 28 Sep 2025 17:19:36 +0800 Subject: [PATCH 032/878] [LoongArch] Add patterns to support `[x]vadda.{b/h/w/d}` generation (#160674) This commit adds patterns for lsx and lasx to support generating `[x]vadda.{b/h/w/d}` instructions. Note: for convenience, this commit also sets `ISD::ABS` as legal. As shown in the tests, this brings no change to the results: they are the same as those previously obtained by expanding it. But setting it as legal exposes more vectorization opportunities to IR transformations, which may create more vector optimization chances for later stages and the backend.
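To illustrate the vectorization opportunity mentioned above (this sketch is editorial, not part of the patch; the function name and element types are hypothetical): once a loop like the one below is vectorized, its body becomes the abs + abs + add sequence over vectors, which the new patterns select into a single vadda.w/xvadda.w, matching what the updated adda.ll tests check at the IR level.

// Hypothetical C++ source; after loop vectorization the body lowers to
// llvm.abs + llvm.abs + add on vector types, which now selects [x]vadda.w.
#include <cstdlib>

void sum_abs(int *__restrict dst, const int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    dst[i] = std::abs(a[i]) + std::abs(b[i]);
}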
--- .../LoongArch/LoongArchISelLowering.cpp | 2 + .../LoongArch/LoongArchLASXInstrInfo.td | 16 +++ .../Target/LoongArch/LoongArchLSXInstrInfo.td | 16 +++ llvm/test/CodeGen/LoongArch/lasx/abs.ll | 128 ++++++++++++++++++ .../LoongArch/lasx/ir-instruction/adda.ll | 24 +--- llvm/test/CodeGen/LoongArch/lsx/abs.ll | 128 ++++++++++++++++++ .../LoongArch/lsx/ir-instruction/adda.ll | 24 +--- 7 files changed, 298 insertions(+), 40 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/abs.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/abs.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 94f53d5b85f10..ecd003cae3263 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -340,6 +340,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); @@ -419,6 +420,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index adfe990ba1234..bbc0489620193 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2015,10 +2015,26 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)), sub_128)>; +// abs +def : Pat<(abs v32i8:$xj), (XVMAX_B v32i8:$xj, (XVNEG_B v32i8:$xj))>; +def : Pat<(abs v16i16:$xj), (XVMAX_H v16i16:$xj, (XVNEG_H v16i16:$xj))>; +def : Pat<(abs v8i32:$xj), (XVMAX_W v8i32:$xj, (XVNEG_W v8i32:$xj))>; +def : Pat<(abs v4i64:$xj), (XVMAX_D v4i64:$xj, (XVNEG_D v4i64:$xj))>; + // XVABSD_{B/H/W/D}[U] defm : PatXrXr; defm : PatXrXrU; +// XVADDA_{B/H/W/D} +def : Pat<(add (v32i8 (abs v32i8:$xj)), (v32i8 (abs v32i8:$xk))), + (XVADDA_B v32i8:$xj, v32i8:$xk)>; +def : Pat<(add (v16i16 (abs v16i16:$xj)), (v16i16 (abs v16i16:$xk))), + (XVADDA_H v16i16:$xj, v16i16:$xk)>; +def : Pat<(add (v8i32 (abs v8i32:$xj)), (v8i32 (abs v8i32:$xk))), + (XVADDA_W v8i32:$xj, v8i32:$xk)>; +def : Pat<(add (v4i64 (abs v4i64:$xj)), (v4i64 (abs v4i64:$xk))), + (XVADDA_D v4i64:$xj, v4i64:$xk)>; + // XVSADD_{B/H/W/D}[U], XVSSUB_{B/H/W/D}[U] defm : PatXrXr; defm : PatXrXr; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 2c36099f8eb71..8d1dc99e316c9 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -2154,10 +2154,26 @@ def : Pat<(f32 f32imm_vldi:$in), def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; +// abs +def : Pat<(abs v16i8:$vj), (VMAX_B v16i8:$vj, (VNEG_B v16i8:$vj))>; +def : Pat<(abs v8i16:$vj), (VMAX_H v8i16:$vj, (VNEG_H v8i16:$vj))>; +def : Pat<(abs v4i32:$vj), (VMAX_W v4i32:$vj, (VNEG_W v4i32:$vj))>; +def : Pat<(abs 
v2i64:$vj), (VMAX_D v2i64:$vj, (VNEG_D v2i64:$vj))>; + // VABSD_{B/H/W/D}[U] defm : PatVrVr; defm : PatVrVrU; +// VADDA_{B/H/W/D} +def : Pat<(add (v16i8 (abs v16i8:$vj)), (v16i8 (abs v16i8:$vk))), + (VADDA_B v16i8:$vj, v16i8:$vk)>; +def : Pat<(add (v8i16 (abs v8i16:$vj)), (v8i16 (abs v8i16:$vk))), + (VADDA_H v8i16:$vj, v8i16:$vk)>; +def : Pat<(add (v4i32 (abs v4i32:$vj)), (v4i32 (abs v4i32:$vk))), + (VADDA_W v4i32:$vj, v4i32:$vk)>; +def : Pat<(add (v2i64 (abs v2i64:$vj)), (v2i64 (abs v2i64:$vk))), + (VADDA_D v2i64:$vj, v2i64:$vk)>; + // VSADD_{B/H/W/D}[U], VSSUB_{B/H/W/D}[U] defm : PatVrVr; defm : PatVrVr; diff --git a/llvm/test/CodeGen/LoongArch/lasx/abs.ll b/llvm/test/CodeGen/LoongArch/lasx/abs.ll new file mode 100644 index 0000000000000..e3b0d04d92d75 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/abs.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @vabs_b(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.b $xr1, $xr0 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <32 x i8>, ptr %src + %b = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a, i1 true) + store <32 x i8> %b, ptr %dst + ret void +} + +define void @vabs_b_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.b $xr1, $xr0 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <32 x i8>, ptr %src + %b = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a, i1 false) + store <32 x i8> %b, ptr %dst + ret void +} + +define void @vabs_h(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.h $xr1, $xr0 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i16>, ptr %src + %b = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a, i1 true) + store <16 x i16> %b, ptr %dst + ret void +} + +define void @vabs_h_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.h $xr1, $xr0 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i16>, ptr %src + %b = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a, i1 false) + store <16 x i16> %b, ptr %dst + ret void +} + +define void @vabs_w(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.w $xr1, $xr0 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i32>, ptr %src + %b = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 true) + store <8 x i32> %b, ptr %dst + ret void +} + +define void @vabs_w_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_w_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.w $xr1, $xr0 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i32>, ptr %src + %b = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 false) + store <8 x i32> %b, ptr %dst + ret void +} + 
+define void @vabs_d(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.d $xr1, $xr0 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %src + %b = tail call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a, i1 true) + store <4 x i64> %b, ptr %dst + ret void +} + +define void @vabs_d_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.d $xr1, $xr0 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %src + %b = tail call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a, i1 false) + store <4 x i64> %b, ptr %dst + ret void +} + +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll index e66a15291fb18..98687755fcfb4 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll @@ -7,11 +7,7 @@ define void @vadda_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvneg.b $xr2, $xr0 -; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr2 -; CHECK-NEXT: xvneg.b $xr2, $xr1 -; CHECK-NEXT: xvmax.b $xr1, $xr1, $xr2 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvadda.b $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -33,11 +29,7 @@ define void @vadda_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvneg.h $xr2, $xr0 -; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr2 -; CHECK-NEXT: xvneg.h $xr2, $xr1 -; CHECK-NEXT: xvmax.h $xr1, $xr1, $xr2 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvadda.h $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -59,11 +51,7 @@ define void @vadda_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvneg.w $xr2, $xr0 -; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr2 -; CHECK-NEXT: xvneg.w $xr2, $xr1 -; CHECK-NEXT: xvmax.w $xr1, $xr1, $xr2 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvadda.w $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -85,11 +73,7 @@ define void @vadda_d(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: xvneg.d $xr2, $xr0 -; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr2 -; CHECK-NEXT: xvneg.d $xr2, $xr1 -; CHECK-NEXT: xvmax.d $xr1, $xr1, $xr2 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvadda.d $xr0, $xr0, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/abs.ll b/llvm/test/CodeGen/LoongArch/lsx/abs.ll new file mode 100644 index 0000000000000..85fe1fe5c0da7 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/abs.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc 
--mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @vabs_b(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.b $vr1, $vr0 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i8>, ptr %src + %b = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 true) + store <16 x i8> %b, ptr %dst + ret void +} + +define void @vabs_b_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.b $vr1, $vr0 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i8>, ptr %src + %b = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false) + store <16 x i8> %b, ptr %dst + ret void +} + +define void @vabs_h(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.h $vr1, $vr0 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i16>, ptr %src + %b = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 true) + store <8 x i16> %b, ptr %dst + ret void +} + +define void @vabs_h_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.h $vr1, $vr0 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i16>, ptr %src + %b = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 false) + store <8 x i16> %b, ptr %dst + ret void +} + +define void @vabs_w(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.w $vr1, $vr0 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i32>, ptr %src + %b = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 true) + store <4 x i32> %b, ptr %dst + ret void +} + +define void @vabs_w_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_w_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.w $vr1, $vr0 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i32>, ptr %src + %b = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false) + store <4 x i32> %b, ptr %dst + ret void +} + +define void @vabs_d(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.d $vr1, $vr0 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %src + %b = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a, i1 true) + store <2 x i64> %b, ptr %dst + ret void +} + +define void @vabs_d_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.d $vr1, $vr0 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %src + %b = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a, i1 false) + store <2 x i64> %b, ptr %dst + ret void +} + +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) diff --git 
a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll index 2bd0b597d79ac..34f22e1f6bf45 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll @@ -7,11 +7,7 @@ define void @vadda_b(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vneg.b $vr2, $vr0 -; CHECK-NEXT: vmax.b $vr0, $vr0, $vr2 -; CHECK-NEXT: vneg.b $vr2, $vr1 -; CHECK-NEXT: vmax.b $vr1, $vr1, $vr2 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vadda.b $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -33,11 +29,7 @@ define void @vadda_h(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vneg.h $vr2, $vr0 -; CHECK-NEXT: vmax.h $vr0, $vr0, $vr2 -; CHECK-NEXT: vneg.h $vr2, $vr1 -; CHECK-NEXT: vmax.h $vr1, $vr1, $vr2 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vadda.h $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -59,11 +51,7 @@ define void @vadda_w(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vneg.w $vr2, $vr0 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr2 -; CHECK-NEXT: vneg.w $vr2, $vr1 -; CHECK-NEXT: vmax.w $vr1, $vr1, $vr2 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vadda.w $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -85,11 +73,7 @@ define void @vadda_d(ptr %res, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vld $vr1, $a2, 0 -; CHECK-NEXT: vneg.d $vr2, $vr0 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr2 -; CHECK-NEXT: vneg.d $vr2, $vr1 -; CHECK-NEXT: vmax.d $vr1, $vr1, $vr2 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vadda.d $vr0, $vr0, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: From 2284ce0596ecd66849099f4918f726f8e27607aa Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 28 Sep 2025 10:29:43 +0100 Subject: [PATCH 033/878] [VPlan] Move using VPlanPatternMatch to top in VPlanUtils.cpp (NFC). Only VPlan pattern matching is used in the file, move the using statement to the top level. --- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index eac0e705a877d..059993043dcda 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -13,6 +13,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" using namespace llvm; +using namespace llvm::VPlanPatternMatch; bool vputils::onlyFirstLaneUsed(const VPValue *Def) { return all_of(Def->users(), @@ -63,7 +64,6 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { }; VPValue *A, *B; - using namespace VPlanPatternMatch; if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One()))) return B == Plan.getTripCount() && @@ -90,7 +90,6 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) { } bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { - using namespace VPlanPatternMatch; // Live-ins are uniform. 
if (V->isLiveIn()) return true; @@ -159,7 +158,6 @@ std::optional vputils::getRecipesForUncountableExit(VPlan &Plan, SmallVectorImpl &Recipes, SmallVectorImpl &GEPs) { - using namespace llvm::VPlanPatternMatch; // Given a VPlan like the following (just including the recipes contributing // to loop control exiting here, not the actual work), we're looking to match // the recipes contributing to the uncountable exit condition comparison From 58805dd9ede08df777e3e4486493b3a70c5124c7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sun, 28 Sep 2025 13:13:41 +0200 Subject: [PATCH 034/878] [SPIRV] Fix type mismatch assertion in insertvalue. (#143131) The code was incorrectly converting all `undef` arguments to `i32`, while the `spv_insertv` intrinsics only expects that for the first operand, representing the aggregate type. Fixes https://github.com/llvm/llvm-project/issues/127977 --------- Co-authored-by: Michal Paszkowski --- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 11 ++++---- .../instructions/insertvalue-undef-ptr.ll | 28 +++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index f5a49e2b47363..704edd3139260 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1909,11 +1909,12 @@ Instruction *SPIRVEmitIntrinsics::visitInsertValueInst(InsertValueInst &I) { B.SetInsertPoint(&I); SmallVector Types = {I.getInsertedValueOperand()->getType()}; SmallVector Args; - for (auto &Op : I.operands()) - if (isa(Op)) - Args.push_back(UndefValue::get(B.getInt32Ty())); - else - Args.push_back(Op); + Value *AggregateOp = I.getAggregateOperand(); + if (isa(AggregateOp)) + Args.push_back(UndefValue::get(B.getInt32Ty())); + else + Args.push_back(AggregateOp); + Args.push_back(I.getInsertedValueOperand()); for (auto &Op : I.indices()) Args.push_back(B.getInt32(Op)); Instruction *NewI = diff --git a/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll b/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll new file mode 100644 index 0000000000000..b788f34bf7238 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll @@ -0,0 +1,28 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-LABEL: Begin function original_testcase +define fastcc void @original_testcase() { +top: + ; CHECK: OpCompositeInsert + %0 = insertvalue [1 x ptr] zeroinitializer, ptr poison, 0 + ret void +} + +; CHECK-LABEL: Begin function additional_testcases +define fastcc void @additional_testcases() { +top: + ; Test with different pointer types + ; CHECK: OpCompositeInsert + %1 = insertvalue [1 x ptr] zeroinitializer, ptr undef, 0 + ; CHECK-NEXT: OpCompositeInsert + %2 = insertvalue {ptr, i32} zeroinitializer, ptr poison, 0 + ; CHECK-NEXT: OpCompositeInsert + %3 = insertvalue {ptr, ptr} undef, ptr null, 0 + + ; Test with undef aggregate + ; CHECK-NEXT: OpCompositeInsert + %4 = insertvalue [1 x ptr] undef, ptr undef, 0 + + ret void +} From 3407fedb8628fab23a251ee804ee280250966c40 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Sun, 28 Sep 2025 19:28:16 +0800 Subject: [PATCH 035/878] [mlir][dataflow] Use skipRegions to print region op (NFC) (#161066) The print 
region op prints a lot of useless IR. Use OpWithFlags(op, OpPrintingFlags().skipRegions()) to avoid this. --- .../lib/Analysis/DataFlow/LivenessAnalysis.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp index fdb97d5963299..d705d8d4c7819 100644 --- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp @@ -109,19 +109,19 @@ LivenessAnalysis::visitOperation(Operation *op, ArrayRef operands, foundLiveResult = true; } LDBG() << "[visitOperation] Adding dependency for result: " << r - << " after op: " << *op; + << " after op: " << OpWithFlags(op, OpPrintingFlags().skipRegions()); addDependency(const_cast(r), getProgramPointAfter(op)); } return success(); } void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { + Operation *op = operand.getOwner(); LDBG() << "Visiting branch operand: " << operand.get() - << " in op: " << *operand.getOwner(); + << " in op: " << OpWithFlags(op, OpPrintingFlags().skipRegions()); // We know (at the moment) and assume (for the future) that `operand` is a // non-forwarded branch operand of a `RegionBranchOpInterface`, // `BranchOpInterface`, `RegionBranchTerminatorOpInterface` or return-like op. - Operation *op = operand.getOwner(); assert((isa(op) || isa(op) || isa(op)) && "expected the op to be `RegionBranchOpInterface`, " @@ -146,12 +146,13 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { // Therefore, if the result value is live, we conservatively consider the // non-forwarded operand of the region branch operation with result may // live and record all result. - for (Value result : op->getResults()) { + for (auto [resultIndex, result] : llvm::enumerate(op->getResults())) { if (getLatticeElement(result)->isLive) { mayLive = true; - LDBG() << "[visitBranchOperand] Non-forwarded branch " - "operand may be live due to live result: " - << result; + LDBG() << "[visitBranchOperand] Non-forwarded branch operand may be " + "live due to live result #" + << resultIndex << ": " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); break; } } @@ -233,7 +234,8 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { SmallVector resultsLiveness; for (const Value result : op->getResults()) resultsLiveness.push_back(getLatticeElement(result)); - LDBG() << "Visiting operation for non-forwarded branch operand: " << *op; + LDBG() << "Visiting operation for non-forwarded branch operand: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); (void)visitOperation(op, operandLiveness, resultsLiveness); // We also visit the parent op with the parent's results and this operand if From 6167f0c818051e378fd8b0efb6e93adf13e53fae Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 11:28:18 -0700 Subject: [PATCH 036/878] [MLIR] Apply clang-tidy fixes for llvm-qualified-auto in Mem2Reg.cpp (NFC) --- mlir/lib/Transforms/Mem2Reg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp index d36a3c1362c19..b3057129fb9fd 100644 --- a/mlir/lib/Transforms/Mem2Reg.cpp +++ b/mlir/lib/Transforms/Mem2Reg.cpp @@ -286,7 +286,7 @@ LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses( mlir::getForwardSlice(slot.ptr, &forwardSlice); for (Operation *user : forwardSlice) { // If the next operation has no blocking uses, everything is fine. 
- auto it = userToBlockingUses.find(user); + auto *it = userToBlockingUses.find(user); if (it == userToBlockingUses.end()) continue; From 062c0fcf4b8b29a12191d782f4e23623cef061db Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 05:25:45 -0700 Subject: [PATCH 037/878] [MLIR] Apply clang-tidy fixes for misc-use-internal-linkage in GPUOpsLowering.cpp (NFC) --- mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index 1037e296c8128..a73afbcb6474b 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -663,7 +663,7 @@ static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) { /// Generates a symbol with 0-sized array type for dynamic shared memory usage, /// or uses existing symbol. -LLVM::GlobalOp getDynamicSharedMemorySymbol( +static LLVM::GlobalOp getDynamicSharedMemorySymbol( ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp, gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter, MemRefType memrefType, unsigned alignmentBit) { From 60e41d241c3a4ba63e777986688ec75cc0612d0f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 05:58:59 -0700 Subject: [PATCH 038/878] [MLIR] Apply clang-tidy fixes for bugprone-argument-comment in VectorToGPU.cpp (NFC) --- mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index 79cb49a4f7dbc..d6a262275be3d 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -741,7 +741,7 @@ creatLdMatrixCompatibleLoads(RewriterBase &rewriter, vector::TransferReadOp op, } // Adjust the load offset. - auto laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr); + auto laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); FailureOr offsets = nvgpu::getLaneIdToLdMatrixMatrixCoord(rewriter, loc, *params); if (failed(offsets)) { @@ -781,7 +781,7 @@ createNonLdMatrixLoads(RewriterBase &rewriter, vector::TransferReadOp op, "conversion to distributed non-ldmatrix compatible load"); } - Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr); + Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); // This is the individual element type. 
Type loadedElType = regInfo->registerLLVMType; @@ -915,7 +915,7 @@ convertTransferWriteToStores(RewriterBase &rewriter, vector::TransferWriteOp op, return rewriter.notifyMatchFailure(op, "not mma sync reg info"); VectorType vectorType = getMmaSyncVectorOperandType(*regInfo); - Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr); + Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); for (unsigned i = 0; i < vectorType.getShape()[0]; i++) { Value logicalValueId = arith::ConstantOp::create( From c058ebda67ed85d3b45140bd25d504e1f7400441 Mon Sep 17 00:00:00 2001 From: Ebin-McW Date: Sun, 28 Sep 2025 17:19:18 +0530 Subject: [PATCH 039/878] [SPIRV] Porting tests to transcoding directory from translator (#151661) Checks for built-in variables, saturating conversion, half precision fract, and workgroup variable initialization --------- Co-authored-by: Michal Paszkowski --- .../transcoding/OpVariable_Initializer.ll | 11 +++++++ .../SPIRV/transcoding/builtin_vars_gep.ll | 16 ++++++++++ .../transcoding/decoration-forward-decl.ll | 30 +++++++++++++++++++ .../test/CodeGen/SPIRV/transcoding/float16.ll | 25 ++++++++++++++++ 4 files changed, 82 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/float16.ll diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll new file mode 100644 index 0000000000000..c8953c701d47d --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll @@ -0,0 +1,11 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV: [[#PtrT:]] = OpTypePointer Workgroup %[[#]] +; CHECK-SPIRV: %[[#]] = OpVariable %[[#PtrT]] Workgroup + +@test_atomic_fn.L = internal addrspace(3) global [64 x i32] zeroinitializer, align 4 + +define spir_kernel void @test_atomic_fn() { + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll new file mode 100644 index 0000000000000..4c64a127a7019 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll @@ -0,0 +1,16 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpDecorate %[[#Id:]] BuiltIn GlobalInvocationId +; CHECK: %[[#Id]] = OpVariable %[[#]] CrossWorkgroup + +@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 + +define spir_kernel void @f() { +entry: + %0 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32 + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll b/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll new file mode 100644 index 0000000000000..74ce26bee9cf3 --- 
/dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll @@ -0,0 +1,30 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Check that saturating conversion is translated when there is a forward +; declaration of the SPIR-V entry. + +; CHECK: OpDecorate %[[#SAT:]] SaturatedConversion +; CHECK: %[[#SAT]] = OpConvertFToU %[[#]] %[[#]] + +declare spir_func zeroext i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float) + +define spir_func void @forward(float %val, i8 %initval, ptr addrspace(1) %dst) { +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %new_val.0 = phi i8 [ %initval, %entry ], [ %call1, %for.body ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp ult i32 %i.0, 1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call spir_func zeroext i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float noundef %val) + %inc = add i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + store i8 %new_val.0, ptr addrspace(1) %dst, align 1 + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/float16.ll b/llvm/test/CodeGen/SPIRV/transcoding/float16.ll new file mode 100644 index 0000000000000..0018dba68d4ea --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/float16.ll @@ -0,0 +1,25 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV: %[[#HALF:]] = OpTypeFloat 16 +; CHECK-SPIRV: %[[#HALFPTR:]] = OpTypePointer Function %[[#HALF]] +; CHECK-SPIRV: %[[#HALFV2:]] = OpTypeVector %[[#HALF]] 2 +; CHECK-SPIRV: %[[#HALFV2PTR:]] = OpTypePointer Function %[[#HALFV2]] +; CHECK-SPIRV: %[[#CONST:]] = OpConstant %[[#HALF]] 14788 +; CHECK-SPIRV: %[[#ADDR:]] = OpVariable %[[#HALFPTR]] Function +; CHECK-SPIRV: %[[#ADDR2:]] = OpVariable %[[#HALFV2PTR]] Function +; CHECK-SPIRV: %[[#]] = OpExtInst %[[#HALF]] %[[#]] fract %[[#CONST]] %[[#ADDR]] +; CHECK-SPIRV: %[[#]] = OpExtInst %[[#HALFV2]] %[[#]] fract %[[#]] %[[#ADDR2]] + +define spir_kernel void @test() {
+entry: + %addr = alloca half + %addr2 = alloca <2 x half> + %res = call spir_func noundef half @_Z17__spirv_ocl_fractDF16_PU3AS0DF16_(half noundef 0xH39C4, ptr noundef %addr) + %res2 = call spir_func noundef <2 x half> @_Z17__spirv_ocl_fractDv2_DF16_PU3AS0S_(<2 x half> noundef <half 0xH39C4, half 0xH39C4>, ptr noundef %addr2) + ret void +} + +declare spir_func noundef half @_Z17__spirv_ocl_fractDF16_PU3AS0DF16_(half noundef, ptr noundef) local_unnamed_addr + +declare spir_func noundef <2 x half> @_Z17__spirv_ocl_fractDv2_DF16_PU3AS0S_(<2 x half> noundef, ptr noundef) local_unnamed_addr From 47981627ddb5bfb49e383474fb1db0c95a2e3b86 Mon Sep 17 00:00:00 2001 From: macurtis-amd Date: Sun, 28 Sep 2025 06:57:09 -0500 Subject: [PATCH 040/878] [AMDGPU] Regenerate checks for test/CodeGen/AMDGPU/bf16.ll (#161069) Looks like there were some checks leftover from before the GFX1250TRUE16 run line was disabled. These were causing problems downstream. Not sure why update_llc_test_checks did not clean these up. I removed all existing checks and re-ran update_llc_test_checks.
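For reference, the regeneration step amounts to rerunning the update script over the test so the CHECK blocks are rebuilt from current llc output. A minimal sketch, assuming an in-tree build directory (the exact llc binary path is an assumption, not taken from this patch):

  llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc llvm/test/CodeGen/AMDGPU/bf16.ll

The script derives check prefixes from the RUN lines, so checks belonging to a removed or disabled prefix are normally dropped when the file is regenerated from a clean slate.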
--- llvm/test/CodeGen/AMDGPU/bf16.ll | 4742 +++++++++++++++++++++++++++++- 1 file changed, 4659 insertions(+), 83 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 0490e5a19b4b7..94ba5cdd09df4 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -10908,12 +10908,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: v_fadd_v2bf16: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -11446,13 +11447,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: v_fadd_v4bf16: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 -; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -49991,6 +49993,622 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ret <4 x bfloat> %op } +define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { +; GCN-LABEL: v_fma_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; 
GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_fma_f32 v7, v7, v15, v23 +; GCN-NEXT: v_fma_f32 v6, v6, v14, v22 +; GCN-NEXT: v_fma_f32 v5, v5, v13, v21 +; GCN-NEXT: v_fma_f32 v4, v4, v12, v20 +; GCN-NEXT: v_fma_f32 v3, v3, v11, v19 +; GCN-NEXT: v_fma_f32 v2, v2, v10, v18 +; GCN-NEXT: v_fma_f32 v1, v1, v9, v17 +; GCN-NEXT: v_fma_f32 v0, v0, v8, v16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fma_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_fma_f32 v7, v7, v15, v23 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_fma_f32 v6, v6, v14, v15 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_fma_f32 v5, v5, v13, v14 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_fma_f32 v4, v4, v12, v13 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_fma_f32 v3, v3, v11, v12 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_fma_f32 v2, v2, v10, v11 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v16 +; GFX7-NEXT: 
v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_fma_f32 v1, v1, v9, v11 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_fma_f32 v0, v0, v8, v9 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX8-NEXT: v_fma_f32 v12, v14, v13, v12 +; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 +; GFX8-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; GFX8-NEXT: v_fma_f32 v7, v13, v11, v7 +; GFX8-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 +; GFX8-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX8-NEXT: v_fma_f32 v6, v11, v10, v6 +; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 +; GFX8-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, 
v9, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; GFX8-NEXT: v_fma_f32 v5, v10, v9, v5 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_fma_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX900-NEXT: v_fma_f32 v12, v14, v13, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; GFX900-NEXT: v_fma_f32 v7, v13, v11, v7 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX900-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX900-NEXT: v_add3_u32 v11, v11, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX900-NEXT: v_fma_f32 v6, v11, v10, v6 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX900-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX900-NEXT: v_add3_u32 v10, v10, v6, s4 +; GFX900-NEXT: v_or_b32_e32 
v11, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; GFX900-NEXT: v_fma_f32 v5, v10, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_fmac_f32_e32 v12, v14, v13 +; GFX950-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; GFX950-NEXT: v_fmac_f32_e32 v3, v13, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_fmac_f32_e32 v7, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX950-NEXT: v_fmac_f32_e32 v2, v10, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v6, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_fmac_f32_e32 v1, v9, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v5, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v6, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v7, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v11, v12 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; 
GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; GFX10-NEXT: v_fmac_f32_e32 v12, v14, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; GFX10-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX10-NEXT: v_fmac_f32_e32 v3, v14, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GFX10-NEXT: v_bfe_u32 v16, v11, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo +; GFX10-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v6 +; GFX10-NEXT: v_add3_u32 v12, v16, v11, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_add3_u32 v13, v13, v3, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_fmac_f32_e32 v2, v14, v6 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_add3_u32 v6, v16, v7, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v15, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_fmac_f32_e32 v9, v1, v5 +; GFX10-NEXT: v_fmac_f32_e32 v15, v18, v16 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX10-NEXT: v_fmac_f32_e32 v8, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo +; GFX10-NEXT: v_add3_u32 v0, v14, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v4, v9, 16, 1 +; GFX10-NEXT: v_bfe_u32 v5, v15, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; GFX10-NEXT: v_add3_u32 v2, v5, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v0, v4, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v15 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_add3_u32 v5, v7, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_perm_b32 v0, v4, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v6, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v7, v10, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11TRUE16-LABEL: v_fma_v8bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; 
GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v7, v2, v6 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v12, v14, v13 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 +; GFX11TRUE16-NEXT: v_bfe_u32 v10, v11, 16, 1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v14, v16, v15 +; GFX11TRUE16-NEXT: v_add3_u32 v2, v10, v11, 0x7fff +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11TRUE16-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_add3_u32 v10, v15, v14, 0x7fff +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v9 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v14 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v9, v1, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v12, vcc_lo +; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v7, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v4, v10, v11 :: v_dual_and_b32 v5, 0xffff0000, v8 +; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v9, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v15, v17, v16 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: 
v_bfe_u32 v12, v15, 16, 1 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v13, v16, v14 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v0, v1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v15 +; GFX11TRUE16-NEXT: v_add3_u32 v8, v12, v15, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v13, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v13, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v9, v11, v5, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v0, v12, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fma_v8bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v12, v14, v13 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11FAKE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v14, v7 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_and_b32 v7, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_add3_u32 v12, v16, v11, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; GFX11FAKE16-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_add3_u32 v13, v13, v3, 0x7fff +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v3, v13, v15 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v7, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v8 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX11FAKE16-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v14, v6 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11FAKE16-NEXT: v_add3_u32 v6, v16, v7, 0x7fff +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v9, v1, v5 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11FAKE16-NEXT: v_bfe_u32 v14, v2, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v8, v0, v4 +; GFX11FAKE16-NEXT: v_add3_u32 v0, v14, v2, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v18, v16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_add3_u32 v0, v4, v9, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v5, v15, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v15 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add3_u32 v2, v5, v15, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v8, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v2, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v6, v3, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v10, 0x7060302 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; ; GFX1250-LABEL: v_fma_v8bf16: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -50000,85 +50618,4043 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10 ; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] -define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) ret <8 x bfloat> %op } -; GFX1250-LABEL: v_fma_v16bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16 -; GFX1250-NEXT: 
v_pk_fma_bf16 v1, v1, v9, v17 -; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18 -; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19 -; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20 -; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21 -; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22 -; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) { - %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) - ret <16 x bfloat> %op -} - -; GFX1250-LABEL: v_fma_v32bf16: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_clause 0x10 -; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64 -; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16 -; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20 -; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24 -; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28 -; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32 -; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36 -; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40 -; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44 -; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48 -; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52 -; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56 -; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60 -; GFX1250-NEXT: scratch_load_b32 v55, off, s32 -; GFX1250-NEXT: s_wait_loadcnt 0xf -; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32 -; GFX1250-NEXT: s_wait_loadcnt 0xe -; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33 -; GFX1250-NEXT: s_wait_loadcnt 0xd -; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34 -; GFX1250-NEXT: s_wait_loadcnt 0xc -; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35 -; GFX1250-NEXT: s_wait_loadcnt 0xb -; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36 -; GFX1250-NEXT: s_wait_loadcnt 0xa -; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37 -; GFX1250-NEXT: s_wait_loadcnt 0x9 -; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38 -; GFX1250-NEXT: s_wait_loadcnt 0x8 -; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39 -; GFX1250-NEXT: s_wait_loadcnt 0x7 -; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48 -; GFX1250-NEXT: s_wait_loadcnt 0x6 -; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49 -; GFX1250-NEXT: s_wait_loadcnt 0x5 -; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50 -; GFX1250-NEXT: s_wait_loadcnt 0x4 -; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51 -; GFX1250-NEXT: s_wait_loadcnt 0x3 -; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52 -; GFX1250-NEXT: s_wait_loadcnt 0x2 -; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53 -; GFX1250-NEXT: s_wait_loadcnt 0x1 -; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54 -; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31 -; GFX1250-NEXT: s_set_pc_i64 s[30:31] -define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) { +; GCN-LABEL: v_fma_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: 
v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_fma_f32 v15, v15, v31, v32 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_fma_f32 v14, v14, v30, v31 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_fma_f32 v13, v13, v29, v30 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_fma_f32 v12, v12, v28, v29 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_fma_f32 v11, v11, v27, v28 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_fma_f32 v10, v10, v26, v27 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_fma_f32 v9, v9, v25, v26 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_fma_f32 v8, v8, v24, v25 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_fma_f32 v7, v7, v23, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_fma_f32 v6, v6, v22, v23 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_fma_f32 v5, v5, v21, v22 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_fma_f32 v4, v4, v20, v21 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_fma_f32 v3, v3, v19, v20 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_fma_f32 v2, v2, v18, v19 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_fma_f32 v1, v1, v17, v18 +; GCN-NEXT: v_fma_f32 v0, v0, v16, v19 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fma_v16bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_fma_f32 v15, v15, v31, v32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_fma_f32 v14, v14, v30, v31 +; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_fma_f32 v13, v13, v29, v30 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_fma_f32 v12, v12, v28, v29 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_fma_f32 v11, v11, v27, v28 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_fma_f32 v10, v10, v26, v27 +; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_fma_f32 v9, v9, v25, v26 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_fma_f32 v8, v8, v24, v25 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_fma_f32 v7, v7, v23, v24 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_fma_f32 v6, v6, v22, v23 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_fma_f32 v5, v5, v21, v22 +; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_fma_f32 v4, v4, v20, v21 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_fma_f32 v3, v3, v19, v20 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_fma_f32 v2, v2, v18, v19 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; 
GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_fma_f32 v1, v1, v17, v18
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v19
+; GFX7-NEXT: v_fma_f32 v0, v0, v16, v17
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v16bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_fma_f32 v24, v26, v25, v24
+; GFX8-NEXT: v_fma_f32 v7, v7, v15, v23
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_fma_f32 v15, v25, v23, v15
+; GFX8-NEXT: v_fma_f32 v6, v6, v14, v22
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_fma_f32 v14, v23, v22, v14
+; GFX8-NEXT: v_fma_f32 v5, v5, v13, v21
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_fma_f32 v13, v22, v21, v13
+; GFX8-NEXT: v_fma_f32 v4, v4, v12, v20
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_fma_f32 v12, v21, v20, v12
+; GFX8-NEXT: v_fma_f32 v3, v3, v11, v19
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_fma_f32 v11, v20, v19, v11
+; GFX8-NEXT: v_fma_f32 v2, v2, v10, v18
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_fma_f32 v10, v19, v18, v10
+; GFX8-NEXT: v_fma_f32 v1, v1, v9, v17
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_fma_f32 v0, v0, v8, v16
+; GFX8-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v24
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
+; GFX8-NEXT: v_or_b32_e32 v16, 0x400000, v24
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v7
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_fma_f32 v9, v18, v17, v9
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v15
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v6
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v14
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v14
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v5
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v13, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v13
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v4
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v12
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v3
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v11
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v2
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v10
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v9
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc
+; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
+; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16
+; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16
+; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16
+; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_fma_v16bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_fma_f32 v24, v26, v25, v24
+; GFX900-NEXT: v_fma_f32 v7, v7, v15, v23
+; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v14
+; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v6
+; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_fma_f32 v15, v25, v23, v15
+; GFX900-NEXT: v_fma_f32 v6, v6, v14, v22
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v13
+; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v5
+; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_fma_f32 v14, v23, v22, v14
+; GFX900-NEXT: v_fma_f32 v5, v5, v13, v21
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v12
+; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v4
+; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_fma_f32 v13, v22, v21, v13
+; GFX900-NEXT: v_fma_f32 v4, v4, v12, v20
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v3
+; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_fma_f32 v12, v21, v20, v12
+; GFX900-NEXT: v_fma_f32 v3, v3, v11, v19
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v2
+; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_fma_f32 v11, v20, v19, v11
+; GFX900-NEXT: v_fma_f32 v2, v2, v10, v18
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v1
+; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_fma_f32 v10, v19, v18, v10
+; GFX900-NEXT: v_fma_f32 v1, v1, v9, v17
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_fma_f32 v0, v0, v8, v16
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX900-NEXT: v_add3_u32 v8, v8, v24, s4
+; GFX900-NEXT: v_or_b32_e32 v16, 0x400000, v24
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX900-NEXT: v_fma_f32 v9, v18, v17, v9
+; GFX900-NEXT: v_add3_u32 v16, v16, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v15, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX900-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v6, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v14, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v14, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v14
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX900-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v13, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v13, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v13
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX900-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v4, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v4, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX900-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v12, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v3, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v11, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v11
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX900-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v2, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v10, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v10, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v10
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v1, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v9, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v9, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX900-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc
+; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1
+; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4
+; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4
+; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4
+; GFX900-NEXT: v_perm_b32 v7, v7, v8, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v16bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX950-NEXT: v_fmac_f32_e32 v23, v7, v15
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v22
+; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v6
+; GFX950-NEXT: v_fmac_f32_e32 v7, v25, v15
+; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v22
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_fmac_f32_e32 v15, v6, v14
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v21
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13
+; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v5
+; GFX950-NEXT: v_fmac_f32_e32 v6, v22, v14
+; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v21
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_fmac_f32_e32 v14, v5, v13
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v20
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v4
+; GFX950-NEXT: v_fmac_f32_e32 v5, v21, v13
+; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v20
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_fmac_f32_e32 v13, v4, v12
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v19
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
+; GFX950-NEXT: v_fmac_f32_e32 v4, v20, v12
+; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v19
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_fmac_f32_e32 v12, v3, v11
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v18
+; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v2
+; GFX950-NEXT: v_fmac_f32_e32 v3, v19, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_fmac_f32_e32 v11, v2, v10
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
+; GFX950-NEXT: v_fmac_f32_e32 v2, v18, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v17
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v10, v1, v9
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v1, v17, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v16
+; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v9, v0, v8
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v9, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v10, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v11, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v12, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v13, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v14, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v15, v7
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v23, v24
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v16bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_fmac_f32_e32 v23, v7, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX10-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX10-NEXT: v_bfe_u32 v28, v23, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX10-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v7, v26, v15
+; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v24, v28, v23, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v26, v7, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v15, v6, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v22, v25, v27, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_fmac_f32_e32 v6, v27, v14
+; GFX10-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo
+; GFX10-NEXT: v_add3_u32 v24, v26, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v26, v15, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_add3_u32 v21, v26, v15, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v14, v5, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v15
+; GFX10-NEXT: v_bfe_u32 v25, v6, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_fmac_f32_e32 v5, v26, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v21, v24, vcc_lo
+; GFX10-NEXT: v_add3_u32 v21, v25, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX10-NEXT: v_bfe_u32 v25, v14, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v20
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_add3_u32 v20, v25, v14, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v13, v4, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v21, v24, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX10-NEXT: v_bfe_u32 v24, v5, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_fmac_f32_e32 v4, v25, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v14, v20, v21, vcc_lo
+; GFX10-NEXT: v_add3_u32 v20, v24, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v5
+; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v12, v3, v11
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX10-NEXT: v_fmac_f32_e32 v19, v26, v25
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v20, v21, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX10-NEXT: v_add3_u32 v21, v24, v13, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v24, v12, 16, 1
+; GFX10-NEXT: v_bfe_u32 v25, v19, 16, 1
+; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v19
+; GFX10-NEXT: v_fmac_f32_e32 v18, v2, v10
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v20, vcc_lo
+; GFX10-NEXT: v_add3_u32 v11, v24, v12, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX10-NEXT: v_add3_u32 v24, v25, v19, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_fmac_f32_e32 v2, v25, v10
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX10-NEXT: v_bfe_u32 v20, v2, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v17, v1, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v24, v26, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v16
+; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT: v_add3_u32 v1, v20, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_fmac_f32_e32 v16, v0, v8
+; GFX10-NEXT: v_bfe_u32 v0, v17, 16, 1
+; GFX10-NEXT: v_bfe_u32 v27, v18, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v17
+; GFX10-NEXT: v_add3_u32 v0, v0, v17, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX10-NEXT: v_bfe_u32 v2, v16, 16, 1
+; GFX10-NEXT: v_add3_u32 v8, v8, v24, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v24
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v16
+; GFX10-NEXT: v_cndmask_b32_e32 v9, v0, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX10-NEXT: v_add3_u32 v2, v2, v16, 0x7fff
+; GFX10-NEXT: v_add3_u32 v12, v27, v18, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v13
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v20, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX10-NEXT: v_perm_b32 v1, v9, v1, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v25, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX10-NEXT: v_perm_b32 v2, v8, v10, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v12, v21, v3, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v11, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v4, v12, v5, 0x7060302
+; GFX10-NEXT: v_perm_b32 v5, v14, v6, 0x7060302
+; GFX10-NEXT: v_perm_b32 v6, v15, v7, 0x7060302
+; GFX10-NEXT: v_perm_b32 v7, v23, v22, 0x7060302
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_fma_v16bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v7
+; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v6
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v15
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v24, v26, v25 :: v_dual_lshlrev_b32 v7, 16, v7
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v22
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v26, v28, v27
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v22, v6, v14
+; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v13
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11TRUE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v24
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v23, v7, v15
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v25, v29, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v5
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v15, v23, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v24, v26, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26
+; GFX11TRUE16-NEXT: v_add3_u32 v15, v15, v23, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v14, v29, v28 :: v_dual_cndmask_b32 v15, v15, v25
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v4
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v24, v27, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v20
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11TRUE16-NEXT: v_add3_u32 v23, v23, v22, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v22
+; GFX11TRUE16-NEXT: v_bfe_u32 v28, v14, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v20, v4, v12
+; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v19
+; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v11
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v14
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v21, v5, v13
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v23, v27, vcc_lo
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v28, v14, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v27, v20, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v22, vcc_lo
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX11TRUE16-NEXT: v_add3_u32 v14, v23, v21, 0x7fff
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11TRUE16-NEXT: v_add3_u32 v23, v25, v24, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v21, v27, v20, 0x7fff
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v22, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v18
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v14.h
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v12, v25, v4
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v23, v26, vcc_lo
+; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v10
+; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v12, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v20, v21, v22 :: v_dual_and_b32 v25, 0xffff0000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v21, v23, v12, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v12
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v20.h
+; GFX11TRUE16-NEXT: v_add3_u32 v12, v23, v24, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v19, v3, v11
+; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v21, v22 :: v_dual_and_b32 v22, 0xffff0000, v17
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11TRUE16-NEXT: v_bfe_u32 v18, v19, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v22, v25, v23 :: v_dual_fmac_f32 v11, v2, v10
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v19
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v18, v19, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v24
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8
+; GFX11TRUE16-NEXT: v_bfe_u32 v21, v11, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_dual_fmac_f32 v17, v1, v9 :: v_dual_cndmask_b32 v10, v2, v10
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0
+; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v16
+; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v12, v18, vcc_lo
+; GFX11TRUE16-NEXT: v_add3_u32 v12, v21, v11, 0x7fff
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16
+; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11TRUE16-NEXT: v_bfe_u32 v11, v17, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v21, v24, v23
+; GFX11TRUE16-NEXT: v_fmac_f32_e32 v9, v0, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo
+; GFX11TRUE16-NEXT: v_add3_u32 v11, v11, v17, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v17
+; GFX11TRUE16-NEXT: v_bfe_u32 v0, v21, 16, 1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11TRUE16-NEXT: v_add3_u32 v12, v19, v22, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v18, v9, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21
+; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v21, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v22
+; GFX11TRUE16-NEXT: v_add3_u32 v16, v18, v9, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v0, v19, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v11.h
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v18.h
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fma_v16bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v7
+; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v24, v26, v25 :: v_dual_and_b32 v23, 0xffff0000, v23
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v23, v7, v15 :: v_dual_lshlrev_b32 v26, 16, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v14
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11FAKE16-NEXT: v_bfe_u32 v28, v23, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11FAKE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_add3_u32 v24, v28, v23, 0x7fff
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v26, v15
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v27 :: v_dual_and_b32 v15, 0xffff0000, v22
+; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11FAKE16-NEXT: v_bfe_u32 v26, v7, 16, 1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v5
+; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_add3_u32 v24, v26, v7, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v7, v24, v25 :: v_dual_and_b32 v6, 0xffff0000, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v15, v6, v14 :: v_dual_lshlrev_b32 v14, 16, v13
+; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v15
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v21
+; GFX11FAKE16-NEXT: v_bfe_u32 v26, v15, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v6, v27, v14
+; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v21
+; GFX11FAKE16-NEXT: v_add3_u32 v21, v26, v15, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v4
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v6, 16, 1
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v14, v5, v13 :: v_dual_lshlrev_b32 v5, 16, v20
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v21, v24, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_add3_u32 v21, v25, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v14, 16, 1
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v5, v26, v13 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v20
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v2
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v20, v25, v14, 0x7fff
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v21, v24 :: v_dual_lshlrev_b32 v25, 16, v3
+; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v14
+; GFX11FAKE16-NEXT: v_bfe_u32 v24, v5, 16, 1
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v13, v4, v12 :: v_dual_lshlrev_b32 v4, 16, v19
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v25, v12
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v20, v21, vcc_lo
+; GFX11FAKE16-NEXT: v_add3_u32 v20, v24, v5, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v5
+; GFX11FAKE16-NEXT: v_bfe_u32 v24, v13, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v19
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10
+; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v12, v3, v11 :: v_dual_cndmask_b32 v5, v20, v21
+; GFX11FAKE16-NEXT: v_add3_u32 v21, v24, v13, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v13
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18
+; GFX11FAKE16-NEXT: v_bfe_u32 v20, v4, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v24, v12, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v19, v26, v25
+; GFX11FAKE16-NEXT: v_add3_u32 v11, v20, v4, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v18, v2, v10
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GFX11FAKE16-NEXT: v_bfe_u32 v25, v19, 16, 1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v20, vcc_lo
+; GFX11FAKE16-NEXT: v_add3_u32 v11, v24, v12, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v12
+; GFX11FAKE16-NEXT: v_add3_u32 v24, v25, v19, 0x7fff
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v1
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v19
+; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v25, v10 :: v_dual_and_b32 v9, 0xffff0000, v9
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v8
+; GFX11FAKE16-NEXT: v_bfe_u32 v20, v2, 16, 1
+; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v24, v26, vcc_lo
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v16
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11FAKE16-NEXT: v_dual_fmac_f32 v17, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v1, v20, v2, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v24, v26, v25
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11FAKE16-NEXT: v_fmac_f32_e32 v16, v0, v8
+; GFX11FAKE16-NEXT: v_bfe_u32 v0, v17, 16, 1
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11FAKE16-NEXT: v_bfe_u32 v8, v24, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v24
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v16, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v17, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v17
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v24, 0x7fff
+; GFX11FAKE16-NEXT: v_bfe_u32 v27, v18, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v16, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v16
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v0, v9, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11FAKE16-NEXT: v_add3_u32 v12, v27, v18, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v18
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v1, v9, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v20, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v25, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_perm_b32 v2, v8, v10, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v21, v3, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v3, v11, v4, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v4, v12, v5, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v5, v14, v6, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v6, v15, v7, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v7, v23, v22, 0x7060302
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_fma_v16bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16
+; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v9, v17
+; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18
+; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19
+; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20
+; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21
+; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22
+; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c)
+ ret <16 x bfloat> %op
+}
+
+define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) {
+; GCN-LABEL: v_fma_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_fma_f32 v31, v31, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v30, v30, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v29, v29, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v28, v28, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v27, v27, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v26, v26, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v25, v25, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v24, v24, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v23, v23, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v22, v22, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v21, v21, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v20, v20, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v19, v19, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v18, v18, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v17, v17, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v16, v16, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v15, v15, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v14, v14, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v13, v13, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v12, v12, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v11, v11, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v10, v10, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v9, v9, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v8, v8, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v7, v7, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v6, v6, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v5, v5, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v4, v4, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v3, v3, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v2, v2, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v1, v1, v32, v33
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_fma_f32 v0, v0, v32, v33
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_fma_f32 v31, v31, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v30, v30, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v29, v29, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v28, v28, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v27, v27, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v26, v26, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v25, v25, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v24, v24, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v23, v23, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v22, v22, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_fma_f32 v21, v21, v32, v33
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
+;
GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v20, v20, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v19, v19, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v18, v18, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v17, v17, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v16, v16, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v15, v15, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v14, v14, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; 
GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v13, v13, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v12, v12, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v11, v11, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v10, v10, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v9, v9, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v8, v8, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v7, v7, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v6, v6, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152 +; GFX7-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v5, v5, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v4, v4, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v3, v3, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v2, v2, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v1, v1, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v0, v0, v32, v33 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v32bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v15, v15, v33, v32 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX8-NEXT: v_fma_f32 v31, v31, v35, v34 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v34, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v14, v14, v30, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX8-NEXT: v_fma_f32 v32, v34, v32, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v13, v13, v29, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GFX8-NEXT: v_fma_f32 v30, v34, v30, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v12, v12, v28, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GFX8-NEXT: v_fma_f32 v29, v34, v29, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v11, v11, v27, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GFX8-NEXT: v_fma_f32 v28, v34, v28, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v10, v10, v26, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX8-NEXT: v_fma_f32 v27, v34, v27, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v9, v9, v25, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GFX8-NEXT: v_fma_f32 v26, v35, v34, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v8 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v8, v8, v24, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GFX8-NEXT: v_fma_f32 v25, v35, v34, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v7, v7, v23, v33 +; GFX8-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX8-NEXT: v_fma_f32 v24, v35, v34, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v6, v6, v22, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GFX8-NEXT: v_fma_f32 v23, v35, v34, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v5, v5, v21, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX8-NEXT: v_fma_f32 v22, v35, v34, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v4, v4, v20, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GFX8-NEXT: v_fma_f32 v21, v35, v34, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v3, v3, v19, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX8-NEXT: v_fma_f32 v20, v35, v34, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v2, v2, v18, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX8-NEXT: v_fma_f32 v19, v35, v34, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v1, v1, v17, v33 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 +; GFX8-NEXT: v_fma_f32 v18, v35, v34, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_fma_f32 v0, v0, v16, v17 +; GFX8-NEXT: v_bfe_u32 v16, v31, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, 
vcc, v17, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; GFX8-NEXT: v_bfe_u32 v17, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v32 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v31, vcc +; GFX8-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc +; GFX8-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_or_b32_e32 v30, 0x400000, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc +; GFX8-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v13 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc +; GFX8-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v29 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_or_b32_e32 v29, 0x400000, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc +; GFX8-NEXT: v_bfe_u32 v31, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v12 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc +; GFX8-NEXT: v_bfe_u32 v31, v28, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v28 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_or_b32_e32 v28, 0x400000, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc +; GFX8-NEXT: v_bfe_u32 v31, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v11 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc +; GFX8-NEXT: v_bfe_u32 v31, v27, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v27 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc +; GFX8-NEXT: v_bfe_u32 v31, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v10 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc +; GFX8-NEXT: v_bfe_u32 v31, v26, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v26 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc +; GFX8-NEXT: v_bfe_u32 v31, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v9 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc +; GFX8-NEXT: v_bfe_u32 v31, v25, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, 
vcc, v31, v25 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc +; GFX8-NEXT: v_bfe_u32 v31, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v8 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc +; GFX8-NEXT: v_bfe_u32 v31, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v24 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc +; GFX8-NEXT: v_bfe_u32 v31, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v7 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc +; GFX8-NEXT: v_bfe_u32 v31, v23, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v23 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc +; GFX8-NEXT: v_bfe_u32 v31, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v6 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v31, v6, vcc +; GFX8-NEXT: v_bfe_u32 v31, v22, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v22 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc +; GFX8-NEXT: v_bfe_u32 v31, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v5 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v31, v5, vcc +; GFX8-NEXT: v_bfe_u32 v31, v21, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v21 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc +; GFX8-NEXT: v_bfe_u32 v31, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v4 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v31, v4, vcc +; GFX8-NEXT: v_bfe_u32 v31, v20, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v20 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc +; GFX8-NEXT: v_bfe_u32 v31, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v3 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v31, v3, vcc +; GFX8-NEXT: v_bfe_u32 v31, v19, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v19 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX8-NEXT: v_or_b32_e32 v19, 0x400000, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v31, v19, vcc +; GFX8-NEXT: v_bfe_u32 v31, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v2 +; GFX8-NEXT: v_add_u32_e32 
v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc +; GFX8-NEXT: v_bfe_u32 v31, v18, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v18 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc +; GFX8-NEXT: v_bfe_u32 v31, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_fma_f32 v33, v35, v34, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v31, v1, vcc +; GFX8-NEXT: v_bfe_u32 v31, v33, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v33 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v33 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc +; GFX8-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v0 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v30, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_fma_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v15, v15, v33, v32 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX900-NEXT: v_fma_f32 v31, v31, v35, v34 +; GFX900-NEXT: 
v_lshlrev_b32_e32 v32, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v14, v14, v30, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX900-NEXT: v_fma_f32 v32, v34, v32, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v13, v13, v29, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GFX900-NEXT: v_fma_f32 v30, v34, v30, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v12, v12, v28, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GFX900-NEXT: v_fma_f32 v29, v34, v29, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v11, v11, v27, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GFX900-NEXT: v_fma_f32 v28, v34, v28, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v10, v10, v26, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX900-NEXT: v_fma_f32 v27, v34, v27, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v9, v9, v25, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GFX900-NEXT: v_fma_f32 v26, v35, v34, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v8 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v8, v8, v24, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GFX900-NEXT: v_fma_f32 v25, v35, v34, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, 
v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v7, v7, v23, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX900-NEXT: v_fma_f32 v24, v35, v34, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v6, v6, v22, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GFX900-NEXT: v_fma_f32 v23, v35, v34, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v5, v5, v21, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX900-NEXT: v_fma_f32 v22, v35, v34, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v4, v4, v20, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GFX900-NEXT: v_fma_f32 v21, v35, v34, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v3, v3, v19, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX900-NEXT: v_fma_f32 v20, v35, v34, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v2, v2, v18, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX900-NEXT: v_fma_f32 v19, v35, v34, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v1, v1, v17, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_fma_f32 v18, v35, v34, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; 
GFX900-NEXT: v_fma_f32 v0, v0, v16, v33 +; GFX900-NEXT: v_bfe_u32 v16, v31, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v31 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v31, vcc +; GFX900-NEXT: v_bfe_u32 v31, v15, 16, 1 +; GFX900-NEXT: v_add3_u32 v31, v31, v15, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc +; GFX900-NEXT: v_bfe_u32 v31, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v31, v31, v32, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc +; GFX900-NEXT: v_bfe_u32 v32, v14, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v14, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_or_b32_e32 v30, 0x400000, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v30, vcc +; GFX900-NEXT: v_bfe_u32 v32, v13, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v13, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc +; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v29, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_or_b32_e32 v29, 0x400000, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v32, v29, vcc +; GFX900-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v12, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc +; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_or_b32_e32 v28, 0x400000, v28 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v28, vcc +; GFX900-NEXT: v_bfe_u32 v32, v11, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v11, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc +; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v27, vcc +; GFX900-NEXT: v_bfe_u32 v32, v10, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v10, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc +; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v26, vcc +; GFX900-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v9, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc +; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v25 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v25, vcc +; GFX900-NEXT: v_bfe_u32 v32, v8, 16, 1 +; GFX900-NEXT: 
v_add3_u32 v32, v32, v8, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc +; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v24 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc +; GFX900-NEXT: v_bfe_u32 v32, v7, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v7, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc +; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v23 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc +; GFX900-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v6, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc +; GFX900-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v22 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc +; GFX900-NEXT: v_bfe_u32 v32, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v5, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc +; GFX900-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v21, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v21 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc +; GFX900-NEXT: v_bfe_u32 v32, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v4, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc +; GFX900-NEXT: v_bfe_u32 v32, v20, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v20, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v20 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc +; GFX900-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v3, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX900-NEXT: v_bfe_u32 v32, v19, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v19, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_or_b32_e32 v19, 0x400000, v19 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc +; GFX900-NEXT: v_bfe_u32 v32, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v2, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc +; GFX900-NEXT: v_bfe_u32 v32, v18, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v18, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc +; GFX900-NEXT: v_bfe_u32 v32, v1, 16, 1 +; GFX900-NEXT: v_fma_f32 v17, v35, v34, v17 +; GFX900-NEXT: v_add3_u32 v32, v32, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v32, v1, vcc +; GFX900-NEXT: v_bfe_u32 v32, v17, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v17, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 
vcc, v17, v17 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc +; GFX900-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v15, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v36, off, s32 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:40 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:12 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:16 +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:24 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v43, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v45, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v46, 0xffff0000, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v56, 16, v29 +; GFX950-NEXT: v_and_b32_e32 v59, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v61, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v62, 0xffff0000, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v30 +; GFX950-NEXT: v_and_b32_e32 v47, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v58, 0xffff0000, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v60, 16, v28 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v36 +; GFX950-NEXT: v_lshlrev_b32_e32 v63, 16, v36 +; GFX950-NEXT: s_waitcnt vmcnt(14) +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; GFX950-NEXT: s_waitcnt vmcnt(11) +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v49 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v11 +; GFX950-NEXT: v_fmac_f32_e32 v36, v38, v62 +; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v49 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; GFX950-NEXT: v_fmac_f32_e32 v38, v11, v27 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; GFX950-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v10 +; GFX950-NEXT: v_fmac_f32_e32 v11, v39, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_fmac_f32_e32 v27, v10, v26 +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 +; GFX950-NEXT: v_fmac_f32_e32 v10, v39, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v51 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_fmac_f32_e32 v26, v9, v25 +; GFX950-NEXT: s_waitcnt vmcnt(8) +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 +; GFX950-NEXT: v_fmac_f32_e32 v9, v39, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_fmac_f32_e32 v25, v8, v24 +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v53 +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 +; GFX950-NEXT: v_fmac_f32_e32 v8, v39, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_fmac_f32_e32 v24, v7, v23 +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v54 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v6 +; GFX950-NEXT: v_fmac_f32_e32 v7, v39, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v54 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_fmac_f32_e32 v23, v6, v22 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v55 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v5 +; GFX950-NEXT: v_fmac_f32_e32 v6, v39, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_fmac_f32_e32 v22, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; GFX950-NEXT: v_fmac_f32_e32 v5, v39, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_fmac_f32_e32 v21, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v34 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; GFX950-NEXT: v_fmac_f32_e32 v4, v37, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_fmac_f32_e32 v20, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX950-NEXT: v_fmac_f32_e32 v3, v34, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_fmac_f32_e32 v19, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX950-NEXT: v_fmac_f32_e32 v2, v33, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v18, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; GFX950-NEXT: v_fmac_f32_e32 v15, v40, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v48 +; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v48 +; GFX950-NEXT: v_fmac_f32_e32 v1, v32, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v28, v41, v63 +; GFX950-NEXT: v_fmac_f32_e32 v14, v43, v42 +; GFX950-NEXT: v_fmac_f32_e32 v29, v45, v44 +; GFX950-NEXT: v_fmac_f32_e32 v13, v47, v46 +; GFX950-NEXT: v_fmac_f32_e32 v30, v57, v56 +; GFX950-NEXT: v_fmac_f32_e32 v12, v59, v58 +; GFX950-NEXT: v_fmac_f32_e32 v35, v61, v60 +; GFX950-NEXT: v_fmac_f32_e32 v17, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v17, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v18, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v19, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v20, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v21, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v22, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v23, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v24, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v25, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v26, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v27, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v38, v36 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v35, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v30, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v29, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v28, v15 +; GFX950-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, 
a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v32bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x8 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v10 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v33 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GFX10-NEXT: v_fmac_f32_e32 v31, v49, v50 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v14 +; GFX10-NEXT: v_fmac_f32_e32 v15, v51, v32 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v34 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; GFX10-NEXT: v_fmac_f32_e32 v32, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; GFX10-NEXT: v_fmac_f32_e32 v14, v51, v30 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v35 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_fmac_f32_e32 v30, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; GFX10-NEXT: v_fmac_f32_e32 v13, v51, v29 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v36 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v36 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; GFX10-NEXT: v_fmac_f32_e32 v29, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GFX10-NEXT: v_fmac_f32_e32 v12, v51, v28 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 
16, v37 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 +; GFX10-NEXT: v_fmac_f32_e32 v28, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX10-NEXT: v_fmac_f32_e32 v11, v51, v27 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v25 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; GFX10-NEXT: v_fmac_f32_e32 v27, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GFX10-NEXT: v_fmac_f32_e32 v10, v52, v51 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v31 +; GFX10-NEXT: v_fmac_f32_e32 v26, v49, v38 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX10-NEXT: v_fmac_f32_e32 v9, v49, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v48 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; GFX10-NEXT: v_fmac_f32_e32 v25, v49, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v7 +; GFX10-NEXT: v_fmac_f32_e32 v48, v8, v24 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v22 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX10-NEXT: v_fmac_f32_e32 v8, v49, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_fmac_f32_e32 v33, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_fmac_f32_e32 v7, v39, v24 +; GFX10-NEXT: v_fmac_f32_e32 v34, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_fmac_f32_e32 v6, v23, v49 +; GFX10-NEXT: v_fmac_f32_e32 v35, v5, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v36, 
0xffff0000, v36 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v2 +; GFX10-NEXT: v_fmac_f32_e32 v5, v39, v24 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_fmac_f32_e32 v36, v4, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v37 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX10-NEXT: v_fmac_f32_e32 v39, v23, v22 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_fmac_f32_e32 v23, v3, v19 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v51 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v33, v33 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v38 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; GFX10-NEXT: v_fmac_f32_e32 v37, v21, v49 +; GFX10-NEXT: v_fmac_f32_e32 v50, v2, v18 +; GFX10-NEXT: v_fmac_f32_e32 v19, v1, v17 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v48 +; GFX10-NEXT: v_fmac_f32_e32 v38, v0, v16 +; GFX10-NEXT: v_bfe_u32 v0, v48, 16, 1 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v8, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX10-NEXT: v_bfe_u32 v21, v34, 16, 1 +; GFX10-NEXT: v_add3_u32 v0, v0, v48, 0x7fff +; GFX10-NEXT: v_bfe_u32 v48, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v24 +; GFX10-NEXT: v_fmac_f32_e32 v51, v22, v20 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v7 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v24, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v8, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v35 +; GFX10-NEXT: v_add3_u32 v18, v18, v7, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v7, v7 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v21, v21, v34, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v34, v34 +; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v35, v35 +; GFX10-NEXT: v_bfe_u32 v35, v23, 16, 1 +; GFX10-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v5, v5 +; GFX10-NEXT: v_bfe_u32 v5, v37, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v24, v24, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v6, v6 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v39 +; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v39, v39 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v23 +; GFX10-NEXT: v_add3_u32 v35, v35, v23, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v23, v23 +; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v37 +; GFX10-NEXT: v_add3_u32 v5, v5, v37, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s13, v37, v37 +; GFX10-NEXT: v_bfe_u32 v37, v31, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v53, v2, v4, s4 +; GFX10-NEXT: v_bfe_u32 
v4, v3, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v17, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v20, s6 +; GFX10-NEXT: v_add3_u32 v37, v37, v31, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v18, v21, v22, s7 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v31, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v15 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v15, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v24, v49, s8 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v37, v37, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v7, v33, v7, s10 +; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, v6, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v32 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v32, v32 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51 +; GFX10-NEXT: v_cndmask_b32_e64 v35, v35, v39, s12 +; GFX10-NEXT: v_add3_u32 v37, v37, v32, 0x7fff +; GFX10-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v23, s13 +; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v38 +; GFX10-NEXT: v_cndmask_b32_e64 v32, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v14, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v14 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v14, v14 +; GFX10-NEXT: v_add3_u32 v39, v39, v38, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v50 +; GFX10-NEXT: v_add3_u32 v37, v37, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s9 +; GFX10-NEXT: v_perm_b32 v15, v15, v31, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v30 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v30, v30 +; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v30, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v13, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v13 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v13, v13 +; GFX10-NEXT: v_add3_u32 v37, v37, v13, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v13, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v29, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v29 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v29, v29 +; GFX10-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v29, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v12, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v12, v12 +; GFX10-NEXT: v_add3_u32 v37, v37, v12, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v12, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v28, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v28 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v28, v28 +; GFX10-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v28, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v28, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v11, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v11 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v11, v11 +; GFX10-NEXT: v_add3_u32 v37, v37, v11, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v11, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v27 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v27, v27 +; 
GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v27, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v27, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v10 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v10, v10 +; GFX10-NEXT: v_add3_u32 v37, v37, v10, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v10, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v26, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v26, v26 +; GFX10-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v26, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v26, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v9, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v9 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 +; GFX10-NEXT: v_add3_u32 v37, v37, v9, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v9, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v25, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v25, v25 +; GFX10-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v25, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v25, v37, v52, s14 +; GFX10-NEXT: v_cndmask_b32_e32 v52, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_bfe_u32 v1, v50, 16, 1 +; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v36 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_add3_u32 v1, v1, v50, 0x7fff +; GFX10-NEXT: v_add3_u32 v37, v37, v36, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX10-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v23, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX10-NEXT: v_perm_b32 v1, v4, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v35, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v18, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v0, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v20, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v8, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v52, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v22, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v16, v53, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11TRUE16-LABEL: v_fma_v32bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: s_clause 0x10 +; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 +; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:60 +; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:56 +; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:52 +; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:48 +; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:44 +; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:40 +; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:36 +; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:32 +; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:28 +; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:24 +; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:20 +; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:16 
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:12 +; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:8 +; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v99, 0xffff0000, v21 +; GFX11TRUE16-NEXT: v_and_b32_e32 v100, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v20 +; GFX11TRUE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v17 +; GFX11TRUE16-NEXT: v_and_b32_e32 v116, 0xffff0000, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v22 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11TRUE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v0 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v103, 0xffff0000, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v112, 0xffff0000, v3 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v2 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11TRUE16-NEXT: v_and_b32_e32 v119, 0xffff0000, v31 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11TRUE16-NEXT: v_and_b32_e32 v128, 0xffff0000, v32 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11TRUE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v33 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v13 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11TRUE16-NEXT: v_and_b32_e32 v131, 0xffff0000, v35 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11TRUE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v37 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11TRUE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v38 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11TRUE16-NEXT: v_and_b32_e32 v144, 0xffff0000, v48 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11TRUE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v50 +; GFX11TRUE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v49 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11TRUE16-NEXT: v_and_b32_e32 v147, 0xffff0000, v51 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11TRUE16-NEXT: v_and_b32_e32 v96, 0xffff0000, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11TRUE16-NEXT: v_and_b32_e32 v148, 0xffff0000, v55 +; GFX11TRUE16-NEXT: v_and_b32_e32 v87, 0xffff0000, v23 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v25 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v146, v100, v99 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GFX11TRUE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v9 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v48, v7, v23 +; GFX11TRUE16-NEXT: v_and_b32_e32 v135, 0xffff0000, v39 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, 
v39 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v49, v6, v22 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v134, v84, v83 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX11TRUE16-NEXT: v_bfe_u32 v83, v146, 16, 1 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v51, v4, v20 :: v_dual_fmac_f32 v148, v118, v117 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v144, v96, v87 :: v_dual_and_b32 v81, 0xffff0000, v26 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v55, v0, v16 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v145, v98, v97 +; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v146 +; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v146, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v8 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v10 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v147, v102, v101 :: v_dual_lshlrev_b32 v10, 16, v10 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v28 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v37, v10, v26 :: v_dual_lshlrev_b32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v39, v8, v24 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v133, v82, v81 :: v_dual_and_b32 v70, 0xffff0000, v12 +; GFX11TRUE16-NEXT: v_bfe_u32 v97, v51, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v23, v37, 16, 1 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v135, v86, v85 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v36 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v133 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v37 +; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v51 +; GFX11TRUE16-NEXT: v_add3_u32 v23, v23, v37, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v27 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v51, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v15 +; GFX11TRUE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v34 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v35, v12, v28 :: v_dual_lshlrev_b32 v34, 16, v34 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v36, v11, v27 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v50, v5, v21 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v132, v80, v71 :: v_dual_and_b32 v67, 0xffff0000, v29 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v130, v68, v67 :: v_dual_and_b32 v65, 0xffff0000, v30 +; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v36 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v34, v13, v29 :: v_dual_fmac_f32 v31, v15, v32 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v119, v64, v128 :: v_dual_and_b32 v66, 0xffff0000, v14 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v52 +; GFX11TRUE16-NEXT: v_and_b32_e32 v128, 0xffff0000, v53 +; 
GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v129, v66, v65 :: v_dual_lshlrev_b32 v30, 16, v30 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v64, v112, v103 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v38, v9, v25 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v131, v70, v69 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v53, v2, v18 +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v119, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v2, v31, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v33, v14, v30 :: v_dual_fmac_f32 v52, v3, v19 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v54, v1, v17 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v119 +; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v31 +; GFX11TRUE16-NEXT: v_bfe_u32 v4, v129, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v119, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v119, v119 +; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v31, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e64 s0, v31, v31 +; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v129 +; GFX11TRUE16-NEXT: v_bfe_u32 v6, v33, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v14, v132, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v0, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v149, v2, v3, s0 +; GFX11TRUE16-NEXT: v_add3_u32 v2, v4, v129, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 +; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v33 +; GFX11TRUE16-NEXT: v_bfe_u32 v8, v130, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v3, v6, v33, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v150, v14, v132, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v2, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v130 +; GFX11TRUE16-NEXT: v_bfe_u32 v10, v34, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v13, v35, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v4, v8, v130, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v33, v3, v7, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 +; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v34 +; GFX11TRUE16-NEXT: v_bfe_u32 v12, v131, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v6, v10, v34, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v35, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v4, v9, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v131 +; GFX11TRUE16-NEXT: v_add3_u32 v8, v12, v131, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v35 +; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v132 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v34, v6, v11, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v131, v131 +; GFX11TRUE16-NEXT: v_bfe_u32 v19, v36, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v21, v133, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v25, v134, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v134 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v8, v16, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v36, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v21, v21, v133, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v27, v38, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v25, v25, v134, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v10, v17, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX11TRUE16-NEXT: 
v_or_b32_e32 v28, 0x400000, v38 +; GFX11TRUE16-NEXT: v_bfe_u32 v29, v135, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v27, v27, v38, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v135 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v150, v18, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11TRUE16-NEXT: v_bfe_u32 v65, v39, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v29, v29, v135, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v39 +; GFX11TRUE16-NEXT: v_bfe_u32 v67, v144, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 +; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v39, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v144 +; GFX11TRUE16-NEXT: v_bfe_u32 v69, v48, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v144, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v21, v22, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v48 +; GFX11TRUE16-NEXT: v_bfe_u32 v71, v145, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v48, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v145 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v23, v24, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 +; GFX11TRUE16-NEXT: v_bfe_u32 v81, v49, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v145, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v49 +; GFX11TRUE16-NEXT: v_bfe_u32 v85, v50, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v25, v26, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v49, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v50 +; GFX11TRUE16-NEXT: v_bfe_u32 v87, v147, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v50, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v27, v28, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 +; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v147 +; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v147, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v99, v64, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v64 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v29, v30, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11TRUE16-NEXT: v_bfe_u32 v101, v52, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v64, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v52 +; GFX11TRUE16-NEXT: v_bfe_u32 v117, v54, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v65, v66, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v52, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v54 +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v55, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v54, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v67, v68, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v55 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v55, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v119, v148, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v148 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v69, v70, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v20.h +; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v148, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v19.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v71, v80, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11TRUE16-NEXT: 
v_mov_b16_e32 v10.l, v18.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v17.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v16.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v34.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v81, v82, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v33.h +; GFX11TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v22.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v83, v84, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v85, v86, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v23.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v87, v96, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v128, v114, v113 +; GFX11TRUE16-NEXT: v_bfe_u32 v113, v53, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v53 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v97, v98, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11TRUE16-NEXT: v_bfe_u32 v103, v128, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v128 +; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v53, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v99, v100, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v128, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v101, v102, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v128, v128 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v32, v116, v115 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v103, v112, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11TRUE16-NEXT: v_bfe_u32 v115, v32, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v32 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v113, v114, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v32, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v117, v118, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v0, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v115, v116, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v148, v148 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v27.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v119, v31, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v28.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fma_v32bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: s_clause 0x10 +; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 +; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:60 +; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:56 +; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:52 +; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:48 +; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:44 +; 
GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:40 +; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:36 +; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:32 +; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:28 +; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:24 +; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:20 +; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:16 +; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:12 +; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:8 +; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:4 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v21 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v97, 16, v22 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v101, 16, v20 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v117, 16, v16 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v118, 16, v0 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v23 +; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v103, 16, v19 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v24 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v113, 16, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v2 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v17 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v32 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v129, 16, v33 +; GFX11FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v13 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v35 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v37 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v38 +; GFX11FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v37 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v144, 16, v48 +; GFX11FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v50 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v145, 16, v49 +; GFX11FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v51 +; GFX11FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v25 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v146, 
v100, v99 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v135, 16, v39 +; GFX11FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v48, v7, v23 :: v_dual_fmac_f32 v49, v6, v22 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v134, v84, v83 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v51, v4, v20 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v144, v96, v87 :: v_dual_lshlrev_b32 v81, 16, v26 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v145, v98, v97 :: v_dual_and_b32 v26, 0xffff0000, v26 +; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v146 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v8 +; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v10 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v147, v102, v101 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v28 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v37, v10, v26 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v39, v8, v24 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v133, v82, v81 :: v_dual_lshlrev_b32 v70, 16, v12 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v135, v86, v85 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v11 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v132, 16, v36 +; GFX11FAKE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; GFX11FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v133 +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v37 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v27 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v34 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v35, v12, v28 :: v_dual_and_b32 v34, 0xffff0000, v34 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v36, v11, v27 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v50, v5, v21 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v132, v80, v71 :: v_dual_lshlrev_b32 v67, 16, v29 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v51 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v31 +; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v15 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v34, v13, v29 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v130, v68, v67 :: v_dual_lshlrev_b32 v65, 16, v30 +; GFX11FAKE16-NEXT: v_bfe_u32 v23, v37, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v31, v15, v32 :: v_dual_lshlrev_b32 v66, 16, v14 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v119, v64, v128 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v52 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v53 +; GFX11FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 
16, v54 +; GFX11FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; GFX11FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v129, v66, v65 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11FAKE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v64, v112, v103 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v38, v9, v25 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v131, v70, v69 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v53, v2, v18 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v55, v0, v16 +; GFX11FAKE16-NEXT: v_bfe_u32 v0, v119, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v2, v31, 16, 1 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v33, v14, v30 :: v_dual_fmac_f32 v52, v3, v19 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v32, v1, v17 +; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v119 +; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v31 +; GFX11FAKE16-NEXT: v_bfe_u32 v4, v129, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v119, 0x7fff +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v119, v119 +; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v31, 0x7fff +; GFX11FAKE16-NEXT: v_cmp_u_f32_e64 s0, v31, v31 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v128, v114, v113 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v54, v118, v117 +; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v129 +; GFX11FAKE16-NEXT: v_bfe_u32 v6, v33, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v10, v34, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v14, v35, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v19, v36, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v27, v38, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v65, v39, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v69, v48, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v81, v49, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v85, v50, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v97, v51, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v101, v52, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v113, v53, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v117, v32, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v148, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v149, v2, v3, s0 +; GFX11FAKE16-NEXT: v_add3_u32 v2, v4, v129, 0x7fff +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v116, v115 +; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v33 +; GFX11FAKE16-NEXT: v_bfe_u32 v8, v130, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v3, v6, v33, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v6, v10, v34, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v10, v14, v35, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v14, v19, v36, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v19, v23, v37, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v23, v27, v38, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v27, v65, v39, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v65, v69, v48, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v69, v81, v49, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v81, v85, v50, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v85, v97, v51, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v97, v101, v52, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v101, v113, v53, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v113, v117, v32, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v117, v2, v5, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v130 +; GFX11FAKE16-NEXT: v_bfe_u32 v12, v131, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v17, v132, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v21, v133, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v25, v134, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v29, v135, 16, 1 +; 
GFX11FAKE16-NEXT: v_bfe_u32 v67, v144, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v71, v145, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v83, v146, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v87, v147, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v99, v64, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v103, v128, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v115, v15, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v119, v54, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v4, v8, v130, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v33, v3, v7, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 +; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v34 +; GFX11FAKE16-NEXT: v_add3_u32 v8, v12, v131, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v12, v17, v132, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v17, v21, v133, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v21, v25, v134, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v25, v29, v135, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v29, v67, v144, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v67, v71, v145, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v71, v83, v146, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v83, v87, v147, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v87, v99, v64, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v99, v103, v128, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v103, v115, v15, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v115, v119, v54, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v119, v4, v9, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v131 +; GFX11FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v35 +; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v132 +; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v36 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v34, v6, v11, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v131, v131 +; GFX11FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v134 +; GFX11FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v38 +; GFX11FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v135 +; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v39 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v144 +; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v48 +; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v145 +; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v49 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v10, v16, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v50 +; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v147 +; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v64 +; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v52 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v18, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v128 +; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v15 +; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v32 +; GFX11FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v54 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v20, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 +; GFX11FAKE16-NEXT: v_bfe_u32 v0, v55, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v55 +; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v53 +; GFX11FAKE16-NEXT: v_perm_b32 v11, v12, v11, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v17, v22, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v55, 0x7fff +; GFX11FAKE16-NEXT: v_perm_b32 v12, v16, v13, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v13, 
v34, v119, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v19, v24, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v10, v14, v10, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v26, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v33, v117, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v28, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v9, v17, v9, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v25, v30, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v27, v66, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX11FAKE16-NEXT: v_perm_b32 v8, v18, v8, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v29, v68, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v65, v70, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v67, v80, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v69, v82, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11FAKE16-NEXT: v_perm_b32 v6, v20, v6, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v71, v84, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v81, v86, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v5, v21, v5, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v83, v96, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v87, v100, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v97, v102, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v128, v128 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v99, v112, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v103, v116, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v113, v118, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v115, v31, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v23, v15, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v15, v149, v148, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v24, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v101, v114, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v25, v2, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v85, v98, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_perm_b32 v4, v26, v4, 0x7060302 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x10 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX1250-NEXT: scratch_load_b32 v55, off, s32 +; GFX1250-NEXT: s_wait_loadcnt 0xf +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32 +; GFX1250-NEXT: s_wait_loadcnt 0xe +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33 +; GFX1250-NEXT: s_wait_loadcnt 0xd +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35 +; GFX1250-NEXT: s_wait_loadcnt 0xb +; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36 +; GFX1250-NEXT: s_wait_loadcnt 0xa +; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37 +; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38 +; GFX1250-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39 +; GFX1250-NEXT: s_wait_loadcnt 0x7 +; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48 +; GFX1250-NEXT: s_wait_loadcnt 0x6 +; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50 +; GFX1250-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52 +; GFX1250-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) ret <32 x bfloat> %op } From 63d866e04b86f2bf18206f02acf4f25f7be44111 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 10:10:19 -0700 Subject: [PATCH 041/878] [MLIR] Apply clang-tidy fixes for misc-use-internal-linkage in XeGPUOps.cpp (NFC) --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 20608f97611bb..81b5788d0b9b4 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -23,7 +23,7 @@ namespace mlir { namespace xegpu { -bool isSharedMemory(const MemRefType &memrefTy) { +static bool isSharedMemory(const MemRefType &memrefTy) { Attribute attr = memrefTy.getMemorySpace(); if (auto 
intAttr = llvm::dyn_cast<IntegerAttr>(attr)) return intAttr.getInt() == 3; @@ -340,7 +340,7 @@ LogicalResult CreateNdDescOp::verify() { return success(); } -ParseResult parseOptionalDynamicIndexList( +static ParseResult parseOptionalDynamicIndexList( OpAsmParser &parser, SmallVectorImpl<OpAsmParser::UnresolvedOperand> &values, DenseI64ArrayAttr &integers, SmallVectorImpl<Type> *valueTypes = nullptr, @@ -378,9 +378,9 @@ ParseResult parseOptionalDynamicIndexList( return success(); } -void printOptionalDynamicIndexList(OpAsmPrinter &printer, Operation *op, - OperandRange values, - DenseI64ArrayAttr integers) { +static void printOptionalDynamicIndexList(OpAsmPrinter &printer, Operation *op, + OperandRange values, + DenseI64ArrayAttr integers) { if (!integers || integers.empty()) return; printDynamicIndexList(printer, op, values, integers, From aaf23f0887969130fbbfedc2b525921c1c7b687c Mon Sep 17 00:00:00 2001 From: Justin Kim Date: Sun, 28 Sep 2025 21:16:27 +0900 Subject: [PATCH 042/878] [mlir][mlir-tblgen] Emit correct error message if method is pruned (#160334) Add verification for pruned methods in `emitCustomBuilder` and `emitCheckedCustomBuilder`, with a proper diagnostic about the shadowed methods. Without this verification, `mlir-tblgen` with `--gen-attrdef-decls` would crash with a segmentation fault when a custom builder was provided with a body but the method was pruned out due to duplication with another builder. Fixes #160227 --------- Co-authored-by: Justin Kim --- .../mlir/Dialect/Tosa/IR/TosaTypesBase.td | 3 +- mlir/include/mlir/TableGen/Class.h | 4 ++ .../attr-duplicated-builder-error.td | 48 ++++++++++++++ .../attr-duplicated-custom-builders-error.td | 52 +++++++++++++++ mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 65 +++++++++++++++++-- mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 4 +- 6 files changed, 165 insertions(+), 11 deletions(-) create mode 100644 mlir/test/mlir-tblgen/attr-duplicated-builder-error.td create mode 100644 mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index 553d69cc21d17..93ab120339d55 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -282,8 +282,7 @@ def Tosa_Shape : Tosa_Type<"shape", "shape"> { !tosa.shape<0> ``` }]; - let parameters = (ins "int" : $rank); - let builders = [TypeBuilder<(ins "int" : $rank)>]; + let parameters = (ins "int":$rank); let assemblyFormat = "`<` $rank `>`"; let genVerifyDecl = 1; diff --git a/mlir/include/mlir/TableGen/Class.h b/mlir/include/mlir/TableGen/Class.h index 10349676625d1..e6bedc7cc896d 100644 --- a/mlir/include/mlir/TableGen/Class.h +++ b/mlir/include/mlir/TableGen/Class.h @@ -789,6 +789,10 @@ class Class { std::forward<Args>(args)...); } + const std::vector<std::unique_ptr<Method>> &getMethods() const { + return methods; + } + /// Add a new field to the class. Class fields added this way are always /// private.
   template <typename TypeT, typename NameT>
diff --git a/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td b/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td
new file mode 100644
index 0000000000000..5f1c61a3a505d
--- /dev/null
+++ b/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td
@@ -0,0 +1,48 @@
+// RUN: not mlir-tblgen -gen-attrdef-decls -I %S/../../include %s 2>&1 | FileCheck %s
+
+include "mlir/IR/OpBase.td"
+
+def Test_Dialect : Dialect {
+  let name = "test";
+  let cppNamespace = "::test";
+}
+
+class TestAttr<string attrName, string attrMnemonic, list<Trait> traits = []>
+    : AttrDef<Test_Dialect, attrName, traits> {
+  let mnemonic = attrMnemonic;
+}
+
+def TestAttr : TestAttr<"Test", "test"> {
+  let summary = "Test attribute";
+  let description = "Test attribute";
+
+  let parameters = (ins AttrParameter<"std::int64_t", "arg">:$arg);
+  let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+    return $_get($_ctxt, arg);
+  }]>];
+
+  let assemblyFormat = "`<` $arg `>`";
+
+  let skipDefaultBuilders = 0;
+  let genVerifyDecl = 1;
+  let genMnemonicAlias = 1;
+}
+
+def Test_TestAttrOp : Op<Test_Dialect, "test_attr_op"> {
+  let summary = "test operation with attribute";
+  let description = "test operation with attribute";
+
+  let arguments = (ins TestAttr:$testAttr);
+  let assemblyFormat = "$testAttr attr-dict";
+}
+
+// CHECK: attr-duplicated-builder-error.td:20:7: error: builder `get` conflicts with an existing builder.
+// CHECK-NEXT: let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+// CHECK-NEXT: ^
+// CHECK-NEXT: note: A new builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: is shadowed by an existing builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: Please remove one of the conflicting definitions.
diff --git a/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td b/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td
new file mode 100644
index 0000000000000..0e09f667c1ccd
--- /dev/null
+++ b/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td
@@ -0,0 +1,52 @@
+// RUN: not mlir-tblgen -gen-attrdef-decls -I %S/../../include %s 2>&1 | FileCheck %s
+
+include "mlir/IR/OpBase.td"
+
+def Test_Dialect : Dialect {
+  let name = "test";
+  let cppNamespace = "::test";
+}
+
+class TestAttr<string attrName, string attrMnemonic, list<Trait> traits = []>
+    : AttrDef<Test_Dialect, attrName, traits> {
+  let mnemonic = attrMnemonic;
+}
+
+def TestAttr : TestAttr<"Test", "test"> {
+  let summary = "Test attribute";
+  let description = "Test attribute";
+
+  let parameters = (ins AttrParameter<"std::int64_t", "arg">:$arg);
+  let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+    return $_get($_ctxt, arg);
+  }]>,
+  AttrBuilder<(ins "std::int64_t":$arg), [{
+    // Duplicated builder
+    return $_get($_ctxt, arg);
+  }]>];
+
+  let assemblyFormat = "`<` $arg `>`";
+
+  let skipDefaultBuilders = 1;
+  let genVerifyDecl = 1;
+  let genMnemonicAlias = 1;
+}
+
+def Test_TestAttrOp : Op<Test_Dialect, "test_attr_op"> {
+  let summary = "test operation with attribute";
+  let description = "test operation with attribute";
+
+  let arguments = (ins TestAttr:$testAttr);
+  let assemblyFormat = "$testAttr attr-dict";
+}
+
+// CHECK: attr-duplicated-custom-builders-error.td:20:7: error: builder `get` conflicts with an existing builder.
+// CHECK-NEXT: let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
+// CHECK-NEXT: ^
+// CHECK-NEXT: note: A new builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: is shadowed by an existing builder with signature:
+// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg);
+// CHECK-EMPTY:
+// CHECK-NEXT: Please remove one of the conflicting definitions.
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
index 3140f12c0b7e8..b9115657d6bf3 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
@@ -513,14 +513,57 @@ getCustomBuilderParams(std::initializer_list<MethodParameter> prefix,
   return builderParams;
 }
 
+static std::string getSignature(const Method &m) {
+  std::string signature;
+  llvm::raw_string_ostream os(signature);
+  raw_indented_ostream indentedOs(os);
+  m.writeDeclTo(indentedOs);
+  return signature;
+}
+
+static void emitDuplicatedBuilderError(const Method &currentMethod,
+                                       StringRef methodName,
+                                       const Class &defCls,
+                                       const AttrOrTypeDef &def) {
+
+  // Try to search for the method that makes `get` redundant.
+  auto loc = def.getDef()->getFieldLoc("builders");
+  for (auto &method : defCls.getMethods()) {
+    if (method->getName() == methodName &&
+        method->makesRedundant(currentMethod)) {
+      PrintError(loc, llvm::Twine("builder `") + methodName +
+                          "` conflicts with an existing builder. ");
+      PrintFatalNote(llvm::Twine("A new builder with signature:\n") +
+                     getSignature(currentMethod) +
+                     "\nis shadowed by an existing builder with signature:\n" +
+                     getSignature(*method) +
+                     "\nPlease remove one of the conflicting "
+                     "definitions.");
+    }
+  }
+
+  // This code shouldn't be reached, but leaving this here for potential future
+  // use.
+  PrintFatalError(loc, "Failed to generate builder " + methodName);
+}
+
 void DefGen::emitCustomBuilder(const AttrOrTypeBuilder &builder) {
   // Don't emit a body if there isn't one.
   auto props = builder.getBody() ? Method::Static : Method::StaticDeclaration;
   StringRef returnType = def.getCppClassName();
   if (std::optional<StringRef> builderReturnType = builder.getReturnType())
     returnType = *builderReturnType;
-  Method *m = defCls.addMethod(returnType, "get", props,
-                               getCustomBuilderParams({}, builder));
+
+  llvm::StringRef methodName = "get";
+  const auto parameters = getCustomBuilderParams({}, builder);
+  Method *m = defCls.addMethod(returnType, methodName, props, parameters);
+
+  // If method is pruned, report error and terminate.
+  if (!m) {
+    auto curMethod = Method(returnType, methodName, props, parameters);
+    emitDuplicatedBuilderError(curMethod, methodName, defCls, def);
+  }
+
   if (!builder.getBody())
     return;
@@ -547,11 +590,19 @@ void DefGen::emitCheckedCustomBuilder(const AttrOrTypeBuilder &builder) {
   StringRef returnType = def.getCppClassName();
   if (std::optional<StringRef> builderReturnType = builder.getReturnType())
     returnType = *builderReturnType;
-  Method *m = defCls.addMethod(
-      returnType, "getChecked", props,
-      getCustomBuilderParams(
-          {{"::llvm::function_ref<::mlir::InFlightDiagnostic()>", "emitError"}},
-          builder));
+
+  llvm::StringRef methodName = "getChecked";
+  auto parameters = getCustomBuilderParams(
+      {{"::llvm::function_ref<::mlir::InFlightDiagnostic()>", "emitError"}},
+      builder);
+  Method *m = defCls.addMethod(returnType, methodName, props, parameters);
+
+  // If method is pruned, report error and terminate.
+ if (!m) { + auto curMethod = Method(returnType, methodName, props, parameters); + emitDuplicatedBuilderError(curMethod, methodName, defCls, def); + } + if (!builder.getBody()) return; diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 4fdde76a613bb..7e8e559baf878 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -3104,8 +3104,8 @@ void OpEmitter::genBuilder() { std::optional body = builder.getBody(); auto properties = body ? Method::Static : Method::StaticDeclaration; auto *method = opClass.addMethod("void", "build", properties, arguments); - if (body) - ERROR_IF_PRUNED(method, "build", op); + + ERROR_IF_PRUNED(method, "build", op); if (method) method->setDeprecated(builder.getDeprecatedMessage()); From dcfb904473dab6c45855b6ab364521124beb5ce1 Mon Sep 17 00:00:00 2001 From: Ebin-McW Date: Sun, 28 Sep 2025 17:50:20 +0530 Subject: [PATCH 043/878] [SPIRV] Test file for memmove intrinsic (#152640) - Added test for checking the lowering of memmove to OpCopyMemorySized - Modified NoSignedUnsignedWrap.ll by adding a RUN line --- .../CodeGen/SPIRV/llvm-intrinsics/memmove.ll | 86 +++++++++++++++++++ .../SPIRV/transcoding/NoSignedUnsignedWrap.ll | 3 +- 2 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll new file mode 100644 index 0000000000000..51b76640cc056 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll @@ -0,0 +1,86 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-NOT: llvm.memmove + +; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#Int32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#Ptr_CrossWG_8:]] = OpTypePointer CrossWorkgroup %[[#Int8]] +; CHECK-DAG: %[[#Ptr_Generic_32:]] = OpTypePointer Generic %[[#Int32]] +; CHECK-DAG: %[[#Const_64:]] = OpConstant %[[#Int32]] 64 +; CHECK-DAG: %[[#Const_36:]] = OpConstant %[[#Int32]] 36 +; CHECK-DAG: %[[#Const_30:]] = OpConstant %[[#Int32]] 30 +; CHECK-DAG: %[[#Const_32_64:]] = OpConstant %[[#Int64]] 32 + +; CHECK: %[[#Param1:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Param2:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Size1:]] = OpUConvert %[[#Int64]] %[[#Const_64]] +; CHECK: OpCopyMemorySized %[[#Param2]] %[[#Param1]] %[[#Size1]] Aligned 64 + +; CHECK: %[[#Src:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#CastDst2:]] = OpGenericCastToPtr %[[#Ptr_CrossWG_8]] %[[#GenPtr:]] +; CHECK: %[[#Size2:]] = OpUConvert %[[#Int64]] %[[#Const_36]] +; CHECK: OpCopyMemorySized %[[#CastDst2]] %[[#Src]] %[[#Size2]] Aligned 64 + +; CHECK: %[[#Param1:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Param2:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Size3:]] = OpUConvert %[[#Int64]] %[[#Const_30]] +; CHECK: OpCopyMemorySized %[[#Param2]] %[[#Param1]] %[[#Size3]] Aligned 1 + +; CHECK: %[[#Phi:]] = OpPhi %[[#Ptr_Generic_32]] %[[#Op1:]] %[[#Lbl1:]] %[[#Op2:]] %[[#Lbl2:]] +; CHECK: %[[#Cast:]] = OpPtrCastToGeneric %[[#]] %[[#]] +; CHECK: OpCopyMemorySized %[[#Cast]] %[[#Phi]] %[[#Const_32_64]] Aligned 8 + +%struct.SomeStruct = type { <16 x float>, i32, [60 x i8] } +%class.kfunc = 
type <{ i32, i32, i32, [4 x i8] }> + +@InvocIndex = external local_unnamed_addr addrspace(1) constant i64, align 8 +@"func_object1" = internal addrspace(3) global %class.kfunc zeroinitializer, align 8 + +define spir_kernel void @test_full_move(%struct.SomeStruct addrspace(1)* captures(none) readonly %in, %struct.SomeStruct addrspace(1)* captures(none) %out) { + %1 = bitcast %struct.SomeStruct addrspace(1)* %in to i8 addrspace(1)* + %2 = bitcast %struct.SomeStruct addrspace(1)* %out to i8 addrspace(1)* + call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* align 64 %2, i8 addrspace(1)* align 64 %1, i32 64, i1 false) + ret void +} + +define spir_kernel void @test_partial_move(%struct.SomeStruct addrspace(1)* captures(none) readonly %in, %struct.SomeStruct addrspace(4)* captures(none) %out) { + %1 = bitcast %struct.SomeStruct addrspace(1)* %in to i8 addrspace(1)* + %2 = bitcast %struct.SomeStruct addrspace(4)* %out to i8 addrspace(4)* + %3 = addrspacecast i8 addrspace(4)* %2 to i8 addrspace(1)* + call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* align 64 %3, i8 addrspace(1)* align 64 %1, i32 36, i1 false) + ret void +} + +define spir_kernel void @test_array(i8 addrspace(1)* %in, i8 addrspace(1)* %out) { + call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 30, i1 false) + ret void +} + +define weak_odr dso_local spir_kernel void @test_phi() local_unnamed_addr { +entry: + %0 = alloca i32, align 8 + %1 = addrspacecast i32* %0 to i32 addrspace(4)* + %2 = load i64, i64 addrspace(1)* @InvocIndex, align 8 + %cmp = icmp eq i64 %2, 0 + br i1 %cmp, label %leader, label %entry.merge_crit_edge + +entry.merge_crit_edge: ; preds = %entry + %3 = bitcast i32 addrspace(4)* %1 to i8 addrspace(4)* + br label %merge + +leader: ; preds = %entry + %4 = bitcast i32 addrspace(4)* %1 to i8 addrspace(4)* + br label %merge + +merge: ; preds = %entry.merge_crit_edge, %leader + %phi = phi i8 addrspace(4)* [ %3, %entry.merge_crit_edge ], [ %4, %leader ] + %5 = addrspacecast i8 addrspace(3)* bitcast (%class.kfunc addrspace(3)* @"func_object1" to i8 addrspace(3)*) to i8 addrspace(4)* + call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* align 8 dereferenceable(32) %5, i8 addrspace(4)* align 8 dereferenceable(32) %phi, i64 32, i1 false) + ret void +} + +declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* captures(none) writeonly, i8 addrspace(4)* captures(none) readonly, i64, i1 immarg) + +declare void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* captures(none), i8 addrspace(1)* captures(none) readonly, i32, i1) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll b/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll index e405ef0ed58a5..5e66b8b639f17 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll @@ -7,10 +7,11 @@ ;; ;; Positive tests: ;; -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-NEGATIVE +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV ;; ;; Negative tests: ;; +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV-NEGATIVE ;; Check that backend is able to skip nsw/nuw attributes if extension is ;; disabled implicitly or explicitly and if max SPIR-V version is lower then 1.4 From 
047ddbf263f6e64c6ec1ef148596661d28544e3a Mon Sep 17 00:00:00 2001
From: Subash B
Date: Sun, 28 Sep 2025 18:12:16 +0530
Subject: [PATCH 044/878] [SPIRV] Added support for the constrained comparison
 intrinsics (#157439)

Added SPIR-V support for constrained floating-point comparison
intrinsics (fcmp, fcmps) with lowering and tests.
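As a reading aid, here is a minimal standalone sketch of the transformation
this patch performs; the helper name and the worklist parameter are
illustrative, not the patch's exact code:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Rewrite one llvm.experimental.constrained.fcmp/fcmps call as a plain
// fcmp with the same predicate; the caller erases the dead call later.
static void lowerOneConstrainedCmp(ConstrainedFPCmpIntrinsic *Cmp,
                                   SmallVectorImpl<Instruction *> &DeadCalls) {
  IRBuilder<> Builder(Cmp);
  // getPredicate() decodes the metadata predicate operand ("oeq", "ult",
  // ...), so the ordinary fcmp keeps the comparison semantics.
  Value *Plain = Builder.CreateFCmp(Cmp->getPredicate(), Cmp->getArgOperand(0),
                                    Cmp->getArgOperand(1));
  Cmp->replaceAllUsesWith(Plain);
  DeadCalls.push_back(Cmp);
}
```

Erasure is deferred to a worklist so iteration over the basic block stays
valid; the strict exception-semantics metadata is intentionally dropped,
matching the OpFOrd*/OpFUnord* instructions the tests expect.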
---
 .../Target/SPIRV/SPIRVPrepareFunctions.cpp    | 24 ++++++++
 .../llvm-intrinsics/constrained-comparison.ll | 56 +++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index 2b34f61fa2434..4e4e6fb4ab791 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -335,6 +335,21 @@ static void lowerFunnelShifts(IntrinsicInst *FSHIntrinsic) {
   FSHIntrinsic->setCalledFunction(FSHFunc);
 }
 
+static void lowerConstrainedFPCmpIntrinsic(
+    ConstrainedFPCmpIntrinsic *ConstrainedCmpIntrinsic,
+    SmallVector<Instruction *> &EraseFromParent) {
+  if (!ConstrainedCmpIntrinsic)
+    return;
+  // Extract the floating-point values being compared
+  Value *LHS = ConstrainedCmpIntrinsic->getArgOperand(0);
+  Value *RHS = ConstrainedCmpIntrinsic->getArgOperand(1);
+  FCmpInst::Predicate Pred = ConstrainedCmpIntrinsic->getPredicate();
+  IRBuilder<> Builder(ConstrainedCmpIntrinsic);
+  Value *FCmp = Builder.CreateFCmp(Pred, LHS, RHS);
+  ConstrainedCmpIntrinsic->replaceAllUsesWith(FCmp);
+  EraseFromParent.push_back(dyn_cast<Instruction>(ConstrainedCmpIntrinsic));
+}
+
 static void lowerExpectAssume(IntrinsicInst *II) {
   // If we cannot use the SPV_KHR_expect_assume extension, then we need to
   // ignore the intrinsic and move on. It should be removed later on by LLVM.
@@ -376,6 +391,7 @@ static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) {
 bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
   bool Changed = false;
   const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
+  SmallVector<Instruction *> EraseFromParent;
   for (BasicBlock &BB : *F) {
     for (Instruction &I : make_early_inc_range(BB)) {
       auto Call = dyn_cast<CallInst>(&I);
@@ -423,9 +439,17 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
         lowerPtrAnnotation(II);
         Changed = true;
         break;
+      case Intrinsic::experimental_constrained_fcmp:
+      case Intrinsic::experimental_constrained_fcmps:
+        lowerConstrainedFPCmpIntrinsic(dyn_cast<ConstrainedFPCmpIntrinsic>(II),
+                                       EraseFromParent);
+        Changed = true;
+        break;
       }
     }
   }
+  for (auto *I : EraseFromParent)
+    I->eraseFromParent();
   return Changed;
 }
 
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll
new file mode 100644
index 0000000000000..49bb8eac10be8
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll
@@ -0,0 +1,56 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpFOrdEqual
+; CHECK-DAG: OpFOrdGreaterThan
+; CHECK-DAG: OpFOrdGreaterThanEqual
+; CHECK-DAG: OpFOrdLessThan
+; CHECK-DAG: OpFOrdLessThanEqual
+; CHECK-DAG: OpFOrdNotEqual
+; CHECK-DAG: OpOrdered
+; CHECK-DAG: OpFUnordEqual
+; CHECK-DAG: OpFUnordGreaterThan
+; CHECK-DAG: OpFUnordGreaterThanEqual
+; CHECK-DAG: OpFUnordLessThan
+; CHECK-DAG: OpFUnordLessThanEqual
+; CHECK-DAG: OpFUnordNotEqual
+; CHECK-DAG: OpUnordered
+
+define dso_local spir_kernel void @test(float %a){
+entry:
+  %cmp = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"oeq", metadata !"fpexcept.strict")
+  %cmp1 = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"ogt", metadata !"fpexcept.strict")
+  %cmp2 = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"oge", metadata !"fpexcept.strict")
+  %cmp3 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"olt", metadata !"fpexcept.strict")
+  %cmp4 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ole", metadata !"fpexcept.strict")
+  %cmp5 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"one", metadata !"fpexcept.strict")
+  %cmp6 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ord", metadata !"fpexcept.strict")
+  %cmp7 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ueq", metadata !"fpexcept.strict")
+  %cmp8 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ugt", metadata !"fpexcept.strict")
+  %cmp9 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"uge", metadata !"fpexcept.strict")
+  %cmp10 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ult", metadata !"fpexcept.strict")
+  %cmp11 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ule", metadata !"fpexcept.strict")
+  %cmp12 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"une", metadata !"fpexcept.strict")
+  %cmp13 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a,
metadata !"uno", metadata !"fpexcept.strict") + + %or1 = or i1 %cmp, %cmp1 + %or2 = or i1 %or1, %cmp2 + %or3 = or i1 %or2, %cmp3 + %or4 = or i1 %or3, %cmp4 + %or5 = or i1 %or4, %cmp5 + %or6 = or i1 %or5, %cmp6 + %or7 = or i1 %or6, %cmp7 + %or8 = or i1 %or7, %cmp8 + %or9 = or i1 %or8, %cmp9 + %or10 = or i1 %or9, %cmp10 + %or11 = or i1 %or10, %cmp11 + %or12 = or i1 %or11, %cmp12 + %or13 = or i1 %or12, %cmp13 + br i1 %or13, label %true_block, label %false_block +true_block: + ret void +false_block: + ret void +} +declare i1 @llvm.experimental.constrained.fcmps.f32(float, float, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) From 0c1acc98140c642ff87c1a759bda3ddfaaf23964 Mon Sep 17 00:00:00 2001 From: Subash B Date: Sun, 28 Sep 2025 18:15:14 +0530 Subject: [PATCH 045/878] [SPIRV] Added lowering for the debugtrap intrinsic (#157442) Mapped llvm.debugtrap intrinsic to OpNop in the SPIR-V backend, since SPIR-V has no direct equivalent with tests. --- llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 14 +++++++++++++- .../CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll | 14 ++++++++++++++ .../SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll | 1 - 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index a7b2179a312e1..5266e204bb32f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -197,6 +197,8 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectOverflowArith(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, unsigned Opcode) const; + bool selectDebugTrap(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool Signed) const; @@ -999,16 +1001,26 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, // represent code after lowering or intrinsics which are not implemented but // should not crash when found in a customer's LLVM IR input. 
case TargetOpcode::G_TRAP: - case TargetOpcode::G_DEBUGTRAP: case TargetOpcode::G_UBSANTRAP: case TargetOpcode::DBG_LABEL: return true; + case TargetOpcode::G_DEBUGTRAP: + return selectDebugTrap(ResVReg, ResType, I); default: return false; } } +bool SPIRVInstructionSelector::selectDebugTrap(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + unsigned Opcode = SPIRV::OpNop; + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectExtInst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll new file mode 100644 index 0000000000000..fd8cb9d7ff6f0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll @@ -0,0 +1,14 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s + +; CHECK: OpNop +; CHECK-NEXT: OpReturn + +declare void @llvm.debugtrap() + +define spir_kernel void @foo(ptr addrspace(1) %a){ +entry: + %a.addr = alloca ptr addrspace(1), align 4 + store ptr addrspace(1) %a, ptr %a.addr, align 4 + call void @llvm.debugtrap() + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll index a15a80754cd60..b3ef6d6bbced9 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll @@ -11,7 +11,6 @@ define spir_kernel void @foo(ptr %p) { entry: call void @llvm.trap() - call void @llvm.debugtrap() call void @llvm.ubsantrap(i8 100) %r1 = call ptr @llvm.invariant.start.p0(i64 1024, ptr %p) From 5d85d54feb4e17fa449fa9d9963aabac8b403d7b Mon Sep 17 00:00:00 2001 From: Ebin-McW Date: Sun, 28 Sep 2025 18:20:15 +0530 Subject: [PATCH 046/878] [SPIRV] Add support for the extension SPV_EXT_relaxed_printf_string_address_space (#160245) Added support for the extension to support more storageclass for printf strings. 
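For reference, a self-contained sketch of the gating rule this patch
enforces (the enumerator values are the standard SPIR-V storage-class
encodings; the helper name is invented for illustration):

```cpp
#include <cstdio>

// SPIR-V storage-class encodings relevant to the tests below.
enum StorageClass : unsigned {
  UniformConstant = 0,
  Workgroup = 4,
  CrossWorkgroup = 5,
  Function = 7,
};

// Core SPIR-V only allows a printf format string in UniformConstant
// storage; any other storage class requires the extension.
static bool printfNeedsRelaxedExtension(StorageClass FormatStringSC) {
  return FormatStringSC != UniformConstant;
}

int main() {
  std::printf("%d\n", printfNeedsRelaxedExtension(Workgroup));       // 1
  std::printf("%d\n", printfNeedsRelaxedExtension(UniformConstant)); // 0
}
```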
--- llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 5 +- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 31 ++++++++++++ .../builtin_printf.ll | 24 ++++++++++ .../non-constant-printf.ll | 48 +++++++++++++++++++ 4 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 993de9e9f64ec..85ea9e156cb97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -148,7 +148,10 @@ static const std::map> SPIRV::Extension::Extension::SPV_KHR_float_controls2}, {"SPV_INTEL_tensor_float32_conversion", SPIRV::Extension::Extension::SPV_INTEL_tensor_float32_conversion}, - {"SPV_KHR_bfloat16", SPIRV::Extension::Extension::SPV_KHR_bfloat16}}; + {"SPV_KHR_bfloat16", SPIRV::Extension::Extension::SPV_KHR_bfloat16}, + {"SPV_EXT_relaxed_printf_string_address_space", + SPIRV::Extension::Extension:: + SPV_EXT_relaxed_printf_string_address_space}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index a95f393b75605..0a122fc994810 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1222,6 +1222,31 @@ static void AddDotProductRequirements(const MachineInstr &MI, } } +void addPrintfRequirements(const MachineInstr &MI, + SPIRV::RequirementHandler &Reqs, + const SPIRVSubtarget &ST) { + SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry(); + const SPIRVType *PtrType = GR->getSPIRVTypeForVReg(MI.getOperand(4).getReg()); + if (PtrType) { + MachineOperand ASOp = PtrType->getOperand(1); + if (ASOp.isImm()) { + unsigned AddrSpace = ASOp.getImm(); + if (AddrSpace != SPIRV::StorageClass::UniformConstant) { + if (!ST.canUseExtension( + SPIRV::Extension:: + SPV_EXT_relaxed_printf_string_address_space)) { + report_fatal_error("SPV_EXT_relaxed_printf_string_address_space is " + "required because printf uses a format string not " + "in constant address space.", + false); + } + Reqs.addExtension( + SPIRV::Extension::SPV_EXT_relaxed_printf_string_address_space); + } + } + } +} + static bool isBFloat16Type(const SPIRVType *TypeDef) { return TypeDef && TypeDef->getNumOperands() == 3 && TypeDef->getOpcode() == SPIRV::OpTypeFloat && @@ -1321,6 +1346,12 @@ void addInstrRequirements(const MachineInstr &MI, static_cast( SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100)) { Reqs.addExtension(SPIRV::Extension::SPV_KHR_non_semantic_info); + break; + } + if (MI.getOperand(3).getImm() == + static_cast(SPIRV::OpenCLExtInst::printf)) { + addPrintfRequirements(MI, Reqs, ST); + break; } break; } diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll new file mode 100644 index 0000000000000..093d172c5c1b1 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll @@ -0,0 +1,24 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_relaxed_printf_string_address_space %s -o - | FileCheck %s +; RUN: not llc -O0 
-mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; CHECK: OpExtension "SPV_EXT_relaxed_printf_string_address_space" +; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] printf + +; CHECK-ERROR: LLVM ERROR: SPV_EXT_relaxed_printf_string_address_space is required because printf uses a format string not in constant address space. + +@.str = private unnamed_addr addrspace(1) constant [4 x i8] c"%d\0A\00", align 1 + +declare spir_func i32 @printf(ptr addrspace(4), ...) + +define spir_kernel void @test_kernel() { +entry: + ; Format string in addrspace(1) → cast to addrspace(4) + %format = addrspacecast ptr addrspace(1) @.str to ptr addrspace(4) + %val = alloca i32, align 4 + store i32 123, ptr %val, align 4 + %loaded = load i32, ptr %val, align 4 + + ; Call printf with non-constant format string + %call = call spir_func i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) %format, i32 %loaded) + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll new file mode 100644 index 0000000000000..b54d59b30309f --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll @@ -0,0 +1,48 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_relaxed_printf_string_address_space %s -o - | FileCheck %s +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; CHECK: OpExtension "SPV_EXT_relaxed_printf_string_address_space" +; CHECK: %[[#ExtInstSetId:]] = OpExtInstImport "OpenCL.std" +; CHECK-DAG: %[[#TypeInt32Id:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#TypeInt8Id:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#TypeInt64Id:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#TypeArrayId:]] = OpTypeArray %[[#TypeInt8Id]] %[[#]] +; CHECK-DAG: %[[#ConstantStorClassGlobalPtrTy:]] = OpTypePointer UniformConstant %[[#TypeArrayId]] +; CHECK-DAG: %[[#WGStorClassGlobalPtrTy:]] = OpTypePointer Workgroup %[[#TypeArrayId]] +; CHECK-DAG: %[[#CrossWFStorClassGlobalPtrTy:]] = OpTypePointer CrossWorkgroup %[[#TypeArrayId]] +; CHECK-DAG: %[[#FunctionStorClassPtrTy:]] = OpTypePointer Function %[[#TypeInt8Id]] +; CHECK-DAG: %[[#WGStorClassPtrTy:]] = OpTypePointer Workgroup %[[#TypeInt8Id]] +; CHECK-DAG: %[[#CrossWFStorClassPtrTy:]] = OpTypePointer CrossWorkgroup %[[#TypeInt8Id]] +; CHECK: %[[#ConstantCompositeId:]] = OpConstantComposite %[[#TypeArrayId]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpVariable %[[#ConstantStorClassGlobalPtrTy]] UniformConstant %[[#ConstantCompositeId]] +; CHECK: %[[#]] = OpVariable %[[#CrossWFStorClassGlobalPtrTy]] CrossWorkgroup %[[#ConstantCompositeId]] +; CHECK: %[[#]] = OpVariable %[[#WGStorClassGlobalPtrTy]] Workgroup %[[#ConstantCompositeId]] +; CHECK: %[[#GEP1:]] = OpInBoundsPtrAccessChain %[[#FunctionStorClassPtrTy]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP1]] +; CHECK: %[[#GEP2:]] = OpInBoundsPtrAccessChain %[[#CrossWFStorClassPtrTy]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP2]] +; CHECK: %[[#GEP3:]] = OpInBoundsPtrAccessChain %[[#WGStorClassPtrTy]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP3]] + +; CHECK-ERROR: LLVM ERROR: SPV_EXT_relaxed_printf_string_address_space is 
required because printf uses a format string not in constant address space. + +@0 = internal unnamed_addr addrspace(2) constant [6 x i8] c"Test\0A\00", align 1 +@1 = internal unnamed_addr addrspace(1) constant [6 x i8] c"Test\0A\00", align 1 +@2 = internal unnamed_addr addrspace(3) constant [6 x i8] c"Test\0A\00", align 1 + +define spir_kernel void @test() { + %tmp1 = alloca [6 x i8], align 1 + call void @llvm.memcpy.p0.p2.i64(ptr align 1 %tmp1, ptr addrspace(2) align 1 @0, i64 6, i1 false) + %1 = getelementptr inbounds [6 x i8], ptr %tmp1, i32 0, i32 0 + %2 = call spir_func i32 @_Z18__spirv_ocl_printfPc(ptr %1) + %3 = getelementptr inbounds [6 x i8], ptr addrspace(1) @1, i32 0, i32 0 + %4 = call spir_func i32 @_Z18__spirv_ocl_printfPU3AS1c(ptr addrspace(1) %3) + %5 = getelementptr inbounds [6 x i8], ptr addrspace(3) @2, i32 0, i32 0 + %6 = call spir_func i32 @_Z18__spirv_ocl_printfPU3AS3c(ptr addrspace(3) %5) + ret void +} + +declare spir_func i32 @_Z18__spirv_ocl_printfPc(ptr) +declare spir_func i32 @_Z18__spirv_ocl_printfPU3AS1c(ptr addrspace(1)) +declare spir_func i32 @_Z18__spirv_ocl_printfPU3AS3c(ptr addrspace(3)) +declare void @llvm.memcpy.p0.p2.i64(ptr captures(none), ptr addrspace(2) captures(none) readonly, i64, i1) From 0dbc1e2dff5a4ff0bdc05b310bd93d1eff9bae23 Mon Sep 17 00:00:00 2001 From: Ebin-McW Date: Sun, 28 Sep 2025 18:22:19 +0530 Subject: [PATCH 047/878] [SPIRV] Added constraint for SPV_INTEL_bindless_image extension (#160249) Added constraints related to Addressing model as specified in the specification. It conforms with the implementation in translator Same as PR #160089 Solved all issues --- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 37 +++++++++++++++++-- .../i32-in-physical64.ll | 19 ++++++++++ 2 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 0a122fc994810..bc159d5c9a113 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1255,8 +1255,9 @@ static bool isBFloat16Type(const SPIRVType *TypeDef) { } void addInstrRequirements(const MachineInstr &MI, - SPIRV::RequirementHandler &Reqs, + SPIRV::ModuleAnalysisInfo &MAI, const SPIRVSubtarget &ST) { + SPIRV::RequirementHandler &Reqs = MAI.Reqs; switch (MI.getOpcode()) { case SPIRV::OpMemoryModel: { int64_t Addr = MI.getOperand(0).getImm(); @@ -1812,15 +1813,45 @@ void addInstrRequirements(const MachineInstr &MI, break; case SPIRV::OpConvertHandleToImageINTEL: case SPIRV::OpConvertHandleToSamplerINTEL: - case SPIRV::OpConvertHandleToSampledImageINTEL: + case SPIRV::OpConvertHandleToSampledImageINTEL: { if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bindless_images)) report_fatal_error("OpConvertHandleTo[Image/Sampler/SampledImage]INTEL " "instructions require the following SPIR-V extension: " "SPV_INTEL_bindless_images", false); + SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry(); + SPIRV::AddressingModel::AddressingModel AddrModel = MAI.Addr; + SPIRVType *TyDef = GR->getSPIRVTypeForVReg(MI.getOperand(1).getReg()); + if (MI.getOpcode() == SPIRV::OpConvertHandleToImageINTEL && + TyDef->getOpcode() != SPIRV::OpTypeImage) { + report_fatal_error("Incorrect return type for the instruction " + "OpConvertHandleToImageINTEL", + false); + } else if (MI.getOpcode() == SPIRV::OpConvertHandleToSamplerINTEL && + TyDef->getOpcode() != 
SPIRV::OpTypeSampler) { + report_fatal_error("Incorrect return type for the instruction " + "OpConvertHandleToSamplerINTEL", + false); + } else if (MI.getOpcode() == SPIRV::OpConvertHandleToSampledImageINTEL && + TyDef->getOpcode() != SPIRV::OpTypeSampledImage) { + report_fatal_error("Incorrect return type for the instruction " + "OpConvertHandleToSampledImageINTEL", + false); + } + SPIRVType *SpvTy = GR->getSPIRVTypeForVReg(MI.getOperand(2).getReg()); + unsigned Bitwidth = GR->getScalarOrVectorBitWidth(SpvTy); + if (!(Bitwidth == 32 && AddrModel == SPIRV::AddressingModel::Physical32) && + !(Bitwidth == 64 && AddrModel == SPIRV::AddressingModel::Physical64)) { + report_fatal_error( + "Parameter value must be a 32-bit scalar in case of " + "Physical32 addressing model or a 64-bit scalar in case of " + "Physical64 addressing model", + false); + } Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bindless_images); Reqs.addCapability(SPIRV::Capability::BindlessImagesINTEL); break; + } case SPIRV::OpSubgroup2DBlockLoadINTEL: case SPIRV::OpSubgroup2DBlockLoadTransposeINTEL: case SPIRV::OpSubgroup2DBlockLoadTransformINTEL: @@ -1958,7 +1989,7 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, continue; for (const MachineBasicBlock &MBB : *MF) for (const MachineInstr &MI : MBB) - addInstrRequirements(MI, MAI.Reqs, ST); + addInstrRequirements(MI, MAI, ST); } // Collect requirements for OpExecutionMode instructions. auto Node = M.getNamedMetadata("spirv.ExecutionMode"); diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll new file mode 100644 index 0000000000000..3624f149cb491 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll @@ -0,0 +1,19 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bindless_images %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; CHECK-ERROR: LLVM ERROR: Parameter value must be a 32-bit scalar in case of Physical32 addressing model or a 64-bit scalar in case of Physical64 addressing model + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_func void @foo(i32 %in) { + %img = call spir_func target("spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELi(i32 %in) + %samp = call spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64 42) + %sampImage = call spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64 43) + ret void +} + +declare spir_func target("spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELi(i32) + +declare spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64) + +declare spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64) From 2cf71fcb9751a347f39035739f8865ef7a53ee68 Mon Sep 17 00:00:00 2001 From: Ebin-McW Date: Sun, 28 Sep 2025 18:24:59 +0530 Subject: [PATCH 048/878] [SPIRV] Added opencl Pipe builtins (#135335) - Added opencl Pipe builtins - Pipe instructions were added in tablegen and lowered in SPIRVBuiltins.cpp --------- Co-authored-by: Michal Paszkowski Co-authored-by: Dmitry Sidorov --- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 51 +++++++ 
llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 24 +++ llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 33 ++++- .../CodeGen/SPIRV/transcoding/builtin_pipe.ll | 140 ++++++++++++++++++ 4 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 86f445954400e..f704d3afdea78 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1096,6 +1096,41 @@ static bool build2DBlockIOINTELInst(const SPIRV::IncomingCall *Call, return true; } +static bool buildPipeInst(const SPIRV::IncomingCall *Call, unsigned Opcode, + unsigned Scope, MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + switch (Opcode) { + case SPIRV::OpCommitReadPipe: + case SPIRV::OpCommitWritePipe: + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); + case SPIRV::OpGroupCommitReadPipe: + case SPIRV::OpGroupCommitWritePipe: + case SPIRV::OpGroupReserveReadPipePackets: + case SPIRV::OpGroupReserveWritePipePackets: { + Register ScopeConstReg = + MIRBuilder.buildConstant(LLT::scalar(32), Scope).getReg(0); + MachineRegisterInfo *MRI = MIRBuilder.getMRI(); + MRI->setRegClass(ScopeConstReg, &SPIRV::iIDRegClass); + MachineInstrBuilder MIB; + MIB = MIRBuilder.buildInstr(Opcode); + // Add Return register and type. + if (Opcode == SPIRV::OpGroupReserveReadPipePackets || + Opcode == SPIRV::OpGroupReserveWritePipePackets) + MIB.addDef(Call->ReturnRegister) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)); + + MIB.addUse(ScopeConstReg); + for (unsigned int i = 0; i < Call->Arguments.size(); ++i) + MIB.addUse(Call->Arguments[i]); + + return true; + } + default: + return buildOpFromWrapper(MIRBuilder, Opcode, Call, + GR->getSPIRVTypeID(Call->ReturnType)); + } +} + static unsigned getNumComponentsForDim(SPIRV::Dim::Dim dim) { switch (dim) { case SPIRV::Dim::DIM_1D: @@ -2350,6 +2385,20 @@ static bool generate2DBlockIOINTELInst(const SPIRV::IncomingCall *Call, return build2DBlockIOINTELInst(Call, Opcode, MIRBuilder, GR); } +static bool generatePipeInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + + unsigned Scope = SPIRV::Scope::Workgroup; + if (Builtin->Name.contains("sub_group")) + Scope = SPIRV::Scope::Subgroup; + + return buildPipeInst(Call, Opcode, Scope, MIRBuilder, GR); +} + static bool buildNDRange(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -2948,6 +2997,8 @@ std::optional lowerBuiltin(const StringRef DemangledCall, return generateTernaryBitwiseFunctionINTELInst(Call.get(), MIRBuilder, GR); case SPIRV::Block2DLoadStore: return generate2DBlockIOINTELInst(Call.get(), MIRBuilder, GR); + case SPIRV::Pipe: + return generatePipeInst(Call.get(), MIRBuilder, GR); } return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index d08560bb6565a..2a8deb6bf498b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -69,6 +69,7 @@ def ExtendedBitOps : BuiltinGroup; def BindlessINTEL : BuiltinGroup; def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; +def Pipe : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled 
builtin record. The information in the record @@ -633,6 +634,29 @@ defm : DemangledNativeBuiltin<"__spirv_AtomicSMax", OpenCL_std, Atomic, 4, 4, Op defm : DemangledNativeBuiltin<"__spirv_AtomicUMin", OpenCL_std, Atomic, 4, 4, OpAtomicUMin>; defm : DemangledNativeBuiltin<"__spirv_AtomicUMax", OpenCL_std, Atomic, 4, 4, OpAtomicUMax>; +// Pipe Instruction +defm : DemangledNativeBuiltin<"__read_pipe_2", OpenCL_std, Pipe,2, 2, OpReadPipe>; +defm : DemangledNativeBuiltin<"__write_pipe_2", OpenCL_std, Pipe, 2, 2, OpWritePipe>; +defm : DemangledNativeBuiltin<"__read_pipe_4", OpenCL_std, Pipe,4, 4, OpReservedReadPipe>; +defm : DemangledNativeBuiltin<"__write_pipe_4", OpenCL_std, Pipe, 4, 4, OpReservedWritePipe>; +defm : DemangledNativeBuiltin<"__reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpReserveReadPipePackets>; +defm : DemangledNativeBuiltin<"__reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpReserveWritePipePackets>; +defm : DemangledNativeBuiltin<"__commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpCommitReadPipe>; +defm : DemangledNativeBuiltin<"__commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpCommitWritePipe>; +defm : DemangledNativeBuiltin<"is_valid_reserve_id", OpenCL_std, Pipe, 1, 1, OpIsValidReserveId>; +defm : DemangledNativeBuiltin<"__get_pipe_num_packets_ro", OpenCL_std, Pipe, 1, 1, OpGetNumPipePackets>; +defm : DemangledNativeBuiltin<"__get_pipe_max_packets_ro", OpenCL_std, Pipe, 1, 1, OpGetMaxPipePackets>; +defm : DemangledNativeBuiltin<"__get_pipe_num_packets_wo", OpenCL_std, Pipe, 1, 1, OpGetNumPipePackets>; +defm : DemangledNativeBuiltin<"__get_pipe_max_packets_wo", OpenCL_std, Pipe, 1, 1, OpGetMaxPipePackets>; +defm : DemangledNativeBuiltin<"__work_group_reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveReadPipePackets>; +defm : DemangledNativeBuiltin<"__work_group_reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveWritePipePackets>; +defm : DemangledNativeBuiltin<"__work_group_commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitReadPipe>; +defm : DemangledNativeBuiltin<"__work_group_commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitWritePipe>; +defm : DemangledNativeBuiltin<"__sub_group_reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveReadPipePackets>; +defm : DemangledNativeBuiltin<"__sub_group_reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveWritePipePackets>; +defm : DemangledNativeBuiltin<"__sub_group_commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitReadPipe>; +defm : DemangledNativeBuiltin<"__sub_group_commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitWritePipe>; + // Barrier builtin records: defm : DemangledNativeBuiltin<"barrier", OpenCL_std, Barrier, 1, 3, OpControlBarrier>; defm : DemangledNativeBuiltin<"work_group_barrier", OpenCL_std, Barrier, 1, 3, OpControlBarrier>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 496dcba17c10d..1723bfb639189 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -763,7 +763,38 @@ def OpGetDefaultQueue: Op<303, (outs ID:$res), (ins TYPE:$type), def OpBuildNDRange: Op<304, (outs ID:$res), (ins TYPE:$type, ID:$GWS, ID:$LWS, ID:$GWO), "$res = OpBuildNDRange $type $GWS $LWS $GWO">; -// TODO: 3.42.23. Pipe Instructions +// 3.42.23. 
Pipe Instructions + +def OpReadPipe: Op<274, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReadPipe $type $Pipe $Pointer $PcktSize $PcktAlign">; +def OpWritePipe: Op<275, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpWritePipe $type $Pipe $Pointer $PcktSize $PcktAlign">; +def OpReservedReadPipe : Op<276, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$ReserveId, ID:$Index, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReservedReadPipe $type $Pipe $ReserveId $Index $Pointer $PcktSize $PcktAlign">; +def OpReservedWritePipe : Op<277, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$ReserveId, ID:$Index, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReservedWritePipe $type $Pipe $ReserveId $Index $Pointer $PcktSize $PcktAlign">; +def OpReserveReadPipePackets : Op<278, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$NumPckts, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReserveReadPipePackets $type $Pipe $NumPckts $PcktSize $PcktAlign">; +def OpReserveWritePipePackets : Op<279, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$NumPckts, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReserveWritePipePackets $type $Pipe $NumPckts $PcktSize $PcktAlign">; +def OpCommitReadPipe : Op<280, (outs), (ins ID:$Pipe, ID:$ReserveId, ID:$PcktSize, ID:$PcktAlign), + "OpCommitReadPipe $Pipe $ReserveId $PcktSize $PcktAlign">; +def OpCommitWritePipe : Op<281, (outs), (ins ID:$Pipe, ID:$ReserveId, ID:$PcktSize, ID:$PcktAlign), + "OpCommitWritePipe $Pipe $ReserveId $PcktSize $PcktAlign">; +def OpIsValidReserveId : Op<282, (outs ID:$res), (ins TYPE:$type, ID:$ReserveId), + "$res = OpIsValidReserveId $type $ReserveId">; +def OpGetNumPipePackets : Op<283, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGetNumPipePackets $type $Pipe $PacketSize $PacketAlign">; +def OpGetMaxPipePackets : Op<284, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGetMaxPipePackets $type $Pipe $PacketSize $PacketAlign">; +def OpGroupReserveReadPipePackets : Op<285, (outs ID:$res), (ins TYPE:$type, ID:$Scope, ID:$Pipe, ID:$NumPckts, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGroupReserveReadPipePackets $type $Scope $Pipe $NumPckts $PacketSize $PacketAlign">; +def OpGroupReserveWritePipePackets : Op<286, (outs ID:$res), (ins TYPE:$type, ID:$Scope, ID:$Pipe, ID:$NumPckts, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGroupReserveWritePipePackets $type $Scope $Pipe $NumPckts $PacketSize $PacketAlign">; +def OpGroupCommitReadPipe : Op<287, (outs), (ins ID:$Scope, ID:$Pipe, ID:$ReserveId, ID:$PacketSize, ID:$PacketAlign), + "OpGroupCommitReadPipe $Scope $Pipe $ReserveId $PacketSize $PacketAlign">; +def OpGroupCommitWritePipe : Op<288, (outs), (ins ID:$Scope, ID:$Pipe, ID:$ReserveId, ID:$PacketSize, ID:$PacketAlign), + "OpGroupCommitWritePipe $Scope $Pipe $ReserveId $PacketSize $PacketAlign">; // 3.42.24. 
Non-Uniform Instructions diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll new file mode 100644 index 0000000000000..607997d034f09 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll @@ -0,0 +1,140 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Kernel +; CHECK: OpCapability Addresses +; CHECK: OpCapability Pipes +; CHECK: OpCapability Int8 +; CHECK: OpCapability GenericPointer + +; CHECK-DAG: %[[#PipeWriteTy:]] = OpTypePipe WriteOnly +; CHECK-DAG: %[[#PipeReadTy:]] = OpTypePipe ReadOnly +; CHECK-DAG: %[[#ReserveIdTy:]] = OpTypeReserveId +; CHECK-DAG: %[[#BoolTy:]] = OpTypeBool +; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#Uint1:]] = OpConstant %[[#Int32Ty]] 1 +; CHECK-DAG: %[[#Uint2:]] = OpConstant %[[#Int32Ty]] 2 +; CHECK-DAG: %[[#Uint3:]] = OpConstant %[[#Int32Ty]] 3 +; CHECK-DAG: %[[#Uint4:]] = OpConstant %[[#Int32Ty]] 4 +; CHECK-DAG: %[[#NullUint:]] = OpConstantNull %[[#Int32Ty]] + +; CHECK: OpFunction +; CHECK: %[[#FuncParam1:]] = OpFunctionParameter %[[#PipeWriteTy]] +; CHECK: %[[#FuncParam2:]] = OpFunctionParameter %[[#PipeReadTy]] + +; CHECK: %[[#BasicWriteReserve:]] = OpReserveWritePipePackets %[[#ReserveIdTy]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpWritePipe %[[#Int32Ty]] %[[#FuncParam1]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpCommitWritePipe %[[#FuncParam1]] %[[#BasicWriteReserve]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#BasicReadReserve:]] = OpReserveReadPipePackets %[[#ReserveIdTy]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpReadPipe %[[#Int32Ty]] %[[#FuncParam2]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpCommitReadPipe %[[#FuncParam2]] %[[#BasicReadReserve]] %[[#Uint4]] %[[#Uint4]] + +; --- Reserved pipe operations --- +; CHECK: %[[#ReservedWriteReserve:]] = OpReserveWritePipePackets %[[#ReserveIdTy]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#ReservedWrite:]] = OpReservedWritePipe %[[#Int32Ty]] %[[#FuncParam1]] %[[#ReservedWriteReserve]] %[[#NullUint]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#IsValidWrite:]] = OpIsValidReserveId %[[#BoolTy]] %[[#ReservedWriteReserve]] +; CHECK: OpCommitWritePipe %[[#FuncParam1]] %[[#ReservedWriteReserve]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#ReservedReadReserve:]] = OpReserveReadPipePackets %[[#ReserveIdTy]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#ReservedRead:]] = OpReservedReadPipe %[[#Int32Ty]] %[[#FuncParam2]] %[[#ReservedReadReserve]] %[[#NullUint]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#IsValidRead:]] = OpIsValidReserveId %[[#BoolTy]] %[[#ReservedReadReserve]] +; CHECK: OpCommitReadPipe %[[#FuncParam2]] %[[#ReservedReadReserve]] %[[#Uint4]] %[[#Uint4]] + +; --- Pipe packet queries --- +; CHECK: %[[#MaxPacketsWO:]] = OpGetMaxPipePackets %[[#Int32Ty]] %[[#FuncParam1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#MaxPacketsWO]] Aligned 4 +; CHECK: %[[#NumPacketsWO:]] = OpGetNumPipePackets %[[#Int32Ty]] %[[#FuncParam1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#NumPacketsWO]] Aligned 4 +; CHECK: %[[#MaxPacketsRO:]] = OpGetMaxPipePackets %[[#Int32Ty]] %[[#FuncParam2]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#MaxPacketsRO]] Aligned 4 +; CHECK: %[[#NumPacketsRO:]] = OpGetNumPipePackets %[[#Int32Ty]] %[[#FuncParam2]] 
%[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#NumPacketsRO]] Aligned 4 + +; --- Workgroup operations --- +; CHECK: %[[#WorkgroupWriteReserve:]] = OpGroupReserveWritePipePackets %[[#ReserveIdTy]] %[[#Uint2]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint1]] %[[#Uint1]] +; CHECK: OpGroupCommitWritePipe %[[#Uint2]] %[[#FuncParam1]] %[[#WorkgroupWriteReserve]] %[[#Uint1]] %[[#Uint1]] +; CHECK: %[[#WorkgroupReadReserve:]] = OpGroupReserveReadPipePackets %[[#ReserveIdTy]] %[[#Uint2]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint1]] %[[#Uint1]] +; CHECK: OpGroupCommitReadPipe %[[#Uint2]] %[[#FuncParam2]] %[[#WorkgroupReadReserve]] %[[#Uint1]] %[[#Uint1]] + +; --- Subgroup operations --- +; CHECK: %[[#SubgroupWriteReserve:]] = OpGroupReserveWritePipePackets %[[#ReserveIdTy]] %[[#Uint3]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpGroupCommitWritePipe %[[#Uint3]] %[[#FuncParam1]] %[[#SubgroupWriteReserve]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#SubgroupReadReserve:]] = OpGroupReserveReadPipePackets %[[#ReserveIdTy]] %[[#Uint3]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpGroupCommitReadPipe %[[#Uint3]] %[[#FuncParam2]] %[[#SubgroupReadReserve]] %[[#Uint4]] %[[#Uint4]] + +define spir_kernel void @test_pipe_builtins( + target("spirv.Pipe", 1) %out_pipe, + target("spirv.Pipe", 0) %in_pipe, + ptr addrspace(4) %src, + ptr addrspace(4) %dst, + ptr addrspace(1) %max_packets_wo, + ptr addrspace(1) %num_packets_wo, + ptr addrspace(1) %max_packets_ro, + ptr addrspace(1) %num_packets_ro +) { +entry: + ; Basic pipe operations + %0 = call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4) + %1 = call spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1) %out_pipe, ptr addrspace(4) %src, i32 4, i32 4) + call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %0, i32 4, i32 4) + + %2 = call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4) + %3 = call spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0) %in_pipe, ptr addrspace(4) %dst, i32 4, i32 4) + call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %2, i32 4, i32 4) + + ; Reserved pipe operations + %4 = call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4) + %5 = call spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %4, i32 0, ptr addrspace(4) %src, i32 4, i32 4) + %6 = call spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId") %4) + call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %4, i32 4, i32 4) + + %7 = call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4) + %8 = call spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %7, i32 0, ptr addrspace(4) %dst, i32 4, i32 4) + %9 = call spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId") %7) + call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %7, i32 4, i32 4) + + ; Pipe packet queries + %10 = call spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1) %out_pipe, i32 4, i32 4) + store i32 %10, ptr addrspace(1) %max_packets_wo, align 4 + %11 = call spir_func i32 
@__get_pipe_num_packets_wo(target("spirv.Pipe", 1) %out_pipe, i32 4, i32 4) + store i32 %11, ptr addrspace(1) %num_packets_wo, align 4 + %12 = call spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0) %in_pipe, i32 4, i32 4) + store i32 %12, ptr addrspace(1) %max_packets_ro, align 4 + %13 = call spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0) %in_pipe, i32 4, i32 4) + store i32 %13, ptr addrspace(1) %num_packets_ro, align 4 + + ; Workgroup operations + %14 = call spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 1, i32 1) + call spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %14, i32 1, i32 1) + %15 = call spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 1, i32 1) + call spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %15, i32 1, i32 1) + + ; Subgroup operations + %16 = call spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4) + call spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %16, i32 4, i32 4) + %17 = call spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4) + call spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %17, i32 4, i32 4) + + ret void +} + +declare spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32) +declare spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32) +declare spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) +declare spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) +declare spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, ptr addrspace(4), i32, i32) +declare spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, ptr addrspace(4), i32, i32) +declare spir_func void @__commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32) +declare spir_func void @__commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32) +declare spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId")) +declare spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1), i32, i32) +declare spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1), i32, i32) +declare spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0), i32, i32) +declare spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0), i32, i32) +declare spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32) +declare spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32) +declare spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32) +declare spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32) +declare spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32) +declare spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 
1), target("spirv.ReserveId"), i32, i32) +declare spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32) +declare spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32) From 9288b084fd9c4301946dff838925c7385c1168e5 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 07:24:13 -0700 Subject: [PATCH 049/878] [MLIR] Apply clang-tidy fixes for bugprone-argument-comment in PadTilingInterface.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp index 8942670767231..0956c5d771394 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp @@ -141,7 +141,7 @@ SmallVector linalg::computePaddedShape( projectedDims.flip(paddingDim); AffineMap projectedMap = mlir::projectDims(partialIndexingMap, projectedDims, - /*compressDims=*/true); + /*compressDimsFlag=*/true); // If we are padding to the next multiple of, compose with ceil(sz) * sz. OpFoldResult paddingDimOfr; From a558d656043734cc4d02e0a0a12e4c308c28f8c7 Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Sun, 28 Sep 2025 21:36:03 +0800 Subject: [PATCH 050/878] [CodeGen] Get rid of incorrect `std` template specializations (#160804) This patch renames comparators - from `std::equal_to` to `llvm::rdf::RegisterRefEqualTo`, and - from `std::less` to `llvm::rdf::RegisterRefLess`. The original specializations don't satisfy the requirements for the original `std` templates by being stateful and non-default-constructible, so they make the program have UB due to C++17 [namespace.std]/2, C++20/23 [namespace.std]/5. > A program may explicitly instantiate a class template defined in the standard library only if the declaration > - depends on the name of at least one program-defined type, and > - the instantiation meets the standard library requirements for the original template. --- llvm/include/llvm/CodeGen/RDFGraph.h | 2 +- llvm/include/llvm/CodeGen/RDFRegisters.h | 54 ++++++++++++----------- llvm/lib/CodeGen/RDFLiveness.cpp | 9 ++-- llvm/lib/Target/Hexagon/RDFCopy.cpp | 2 +- llvm/lib/Target/Hexagon/RDFCopy.h | 8 ++-- llvm/unittests/CodeGen/TypeTraitsTest.cpp | 35 +++++++++++++++ 6 files changed, 74 insertions(+), 36 deletions(-) diff --git a/llvm/include/llvm/CodeGen/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h index 8a93afbcb5491..6bb6033a8a2f2 100644 --- a/llvm/include/llvm/CodeGen/RDFGraph.h +++ b/llvm/include/llvm/CodeGen/RDFGraph.h @@ -447,7 +447,7 @@ struct NodeAllocator { AllocatorTy MemPool; }; -using RegisterSet = std::set; +using RegisterSet = std::set; struct TargetOperandInfo { TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {} diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h index 4a9a4063c9e83..82027cad53bdb 100644 --- a/llvm/include/llvm/CodeGen/RDFRegisters.h +++ b/llvm/include/llvm/CodeGen/RDFRegisters.h @@ -199,6 +199,33 @@ struct PhysicalRegisterInfo { std::vector AliasInfos; }; +struct RegisterRefEqualTo { + constexpr RegisterRefEqualTo(const llvm::rdf::PhysicalRegisterInfo &pri) + : PRI(&pri) {} + + bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { + return PRI->equal_to(A, B); + } + +private: + // Make it a pointer just in case. 
See comment in `RegisterRefLess` below. + const llvm::rdf::PhysicalRegisterInfo *PRI; +}; + +struct RegisterRefLess { + constexpr RegisterRefLess(const llvm::rdf::PhysicalRegisterInfo &pri) + : PRI(&pri) {} + + bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { + return PRI->less(A, B); + } + +private: + // Make it a pointer because apparently some versions of MSVC use std::swap + // on the comparator object. + const llvm::rdf::PhysicalRegisterInfo *PRI; +}; + struct RegisterAggr { RegisterAggr(const PhysicalRegisterInfo &pri) : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {} @@ -334,18 +361,6 @@ template <> struct hash { } }; -template <> struct equal_to { - constexpr equal_to(const llvm::rdf::PhysicalRegisterInfo &pri) : PRI(&pri) {} - - bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { - return PRI->equal_to(A, B); - } - -private: - // Make it a pointer just in case. See comment in `less` below. - const llvm::rdf::PhysicalRegisterInfo *PRI; -}; - template <> struct equal_to { bool operator()(const llvm::rdf::RegisterAggr &A, const llvm::rdf::RegisterAggr &B) const { @@ -353,23 +368,10 @@ template <> struct equal_to { } }; -template <> struct less { - constexpr less(const llvm::rdf::PhysicalRegisterInfo &pri) : PRI(&pri) {} - - bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { - return PRI->less(A, B); - } - -private: - // Make it a pointer because apparently some versions of MSVC use std::swap - // on the std::less specialization. - const llvm::rdf::PhysicalRegisterInfo *PRI; -}; - } // namespace std namespace llvm::rdf { -using RegisterSet = std::set>; +using RegisterSet = std::set; } // namespace llvm::rdf #endif // LLVM_CODEGEN_RDFREGISTERS_H diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp index 318422b46e811..2e1cf499eab41 100644 --- a/llvm/lib/CodeGen/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -652,8 +652,9 @@ void Liveness::computePhiInfo() { // defs, cache the result of subtracting these defs from a given register // ref. 
using RefHash = std::hash; - using RefEqual = std::equal_to; - using SubMap = std::unordered_map; + using RefEqual = RegisterRefEqualTo; + using SubMap = + std::unordered_map; std::unordered_map Subs; auto ClearIn = [](RegisterRef RR, const RegisterAggr &Mid, SubMap &SM) { if (Mid.empty()) @@ -868,7 +869,7 @@ void Liveness::computeLiveIns() { std::vector LV; for (const MachineBasicBlock::RegisterMaskPair &LI : B.liveins()) LV.push_back(RegisterRef(LI.PhysReg, LI.LaneMask)); - llvm::sort(LV, std::less(PRI)); + llvm::sort(LV, RegisterRefLess(PRI)); dbgs() << printMBBReference(B) << "\t rec = {"; for (auto I : LV) dbgs() << ' ' << Print(I, DFG); @@ -878,7 +879,7 @@ void Liveness::computeLiveIns() { LV.clear(); for (RegisterRef RR : LiveMap[&B].refs()) LV.push_back(RR); - llvm::sort(LV, std::less(PRI)); + llvm::sort(LV, RegisterRefLess(PRI)); dbgs() << "\tcomp = {"; for (auto I : LV) dbgs() << ' ' << Print(I, DFG); diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index fafdad08909dd..3b1d3bd89680b 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -108,7 +108,7 @@ bool CopyPropagation::scanBlock(MachineBasicBlock *B) { for (NodeAddr IA : BA.Addr->members(DFG)) { if (DFG.IsCode(IA)) { NodeAddr SA = IA; - EqualityMap EM(std::less(DFG.getPRI())); + EqualityMap EM(RegisterRefLess(DFG.getPRI())); if (interpretAsCopy(SA.Addr->getCode(), EM)) recordCopy(SA, EM); } diff --git a/llvm/lib/Target/Hexagon/RDFCopy.h b/llvm/lib/Target/Hexagon/RDFCopy.h index e4fb89892831d..92b2c65982655 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.h +++ b/llvm/lib/Target/Hexagon/RDFCopy.h @@ -25,8 +25,8 @@ class MachineInstr; namespace rdf { struct CopyPropagation { - CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg), - RDefMap(std::less(DFG.getPRI())) {} + CopyPropagation(DataFlowGraph &dfg) + : MDT(dfg.getDT()), DFG(dfg), RDefMap(RegisterRefLess(DFG.getPRI())) {} virtual ~CopyPropagation() = default; @@ -35,7 +35,7 @@ namespace rdf { bool trace() const { return Trace; } DataFlowGraph &getDFG() { return DFG; } - using EqualityMap = std::map; + using EqualityMap = std::map; virtual bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM); private: @@ -45,7 +45,7 @@ namespace rdf { bool Trace = false; // map: register -> (map: stmt -> reaching def) - std::map> RDefMap; + std::map, RegisterRefLess> RDefMap; // map: statement -> (map: dst reg -> src reg) std::map CopyMap; std::vector Copies; diff --git a/llvm/unittests/CodeGen/TypeTraitsTest.cpp b/llvm/unittests/CodeGen/TypeTraitsTest.cpp index dde86280cff6a..1c8852fc1f071 100644 --- a/llvm/unittests/CodeGen/TypeTraitsTest.cpp +++ b/llvm/unittests/CodeGen/TypeTraitsTest.cpp @@ -6,13 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "gtest/gtest.h" +#include #include +#include using namespace llvm; @@ -23,3 +26,35 @@ static_assert(std::is_trivially_copyable_v, "trivially copyable"); static_assert(std::is_trivially_copyable_v, "trivially copyable"); static_assert(std::is_trivially_copyable_v, "trivially copyable"); + +// https://llvm.org/PR105169 +// Verify that we won't accidently specialize std::less and std::equal_to in a +// wrong way. 
+// C++17 [namespace.std]/2, C++20/23 [namespace.std]/5: +// A program may explicitly instantiate a template defined in the standard +// library only if the declaration +// - depends on the name of a user-defined type and +// - the instantiation meets the standard library requirements for the +// original template. +template constexpr bool CheckStdCmpRequirements() { + // std::less and std::equal_to are literal, default constructible, and + // copyable classes. + Fn f1; + auto f2 = f1; + auto f3 = std::move(f2); + f2 = f3; + f2 = std::move(f3); + + // Properties held on all known implementations, although not guaranteed by + // the standard. + static_assert(std::is_empty_v); + static_assert(std::is_trivially_default_constructible_v); + static_assert(std::is_trivially_copyable_v); + + return true; +} + +static_assert(CheckStdCmpRequirements>(), + "same as the original template"); +static_assert(CheckStdCmpRequirements>(), + "same as the original template"); From c05c90949245a5beb15d425caf60020813423e72 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 08:01:11 -0700 Subject: [PATCH 051/878] [MLIR] Apply clang-tidy fixes for llvm-qualified-auto in MemRefUtils.cpp (NFC) --- mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp index 3de9c3898c713..6200366cded29 100644 --- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp +++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp @@ -191,7 +191,7 @@ computeSuffixProductIRBlock(Location loc, OpBuilder &builder, } MemrefValue skipFullyAliasingOperations(MemrefValue source) { - while (auto op = source.getDefiningOp()) { + while (auto *op = source.getDefiningOp()) { if (auto subViewOp = dyn_cast(op); subViewOp && subViewOp.hasZeroOffset() && subViewOp.hasUnitStride()) { // A `memref.subview` with an all zero offset, and all unit strides, still @@ -208,7 +208,7 @@ MemrefValue skipFullyAliasingOperations(MemrefValue source) { } MemrefValue skipViewLikeOps(MemrefValue source) { - while (auto op = source.getDefiningOp()) { + while (auto *op = source.getDefiningOp()) { if (auto viewLike = dyn_cast(op)) { if (source == viewLike.getViewDest()) { source = cast(viewLike.getViewSource()); From 41f3438362f5ae2a06544fec7db18c37f5ecd79b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 28 Sep 2025 17:30:55 +0100 Subject: [PATCH 052/878] [VPlan] Remove dead code for scalar VFs in VPRegionBlock::cost (NFC). The VPlan cost model is not used to compute costs of scalar VFs currently, as conversion to replicate regions makes accurately computing the original scalar cost difficult. Remove left over, dead code. --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index a1c6f7977885f..81f1956c96254 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -845,19 +845,10 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { if (VF.isScalable()) return InstructionCost::getInvalid(); - // First compute the cost of the conditionally executed recipes, followed by - // account for the branching cost, except if the mask is a header mask or - // uniform condition. - using namespace llvm::VPlanPatternMatch; + // Compute and return the cost of the conditionally executed recipes. 
+ assert(VF.isVector() && "Can only compute vector cost at the moment."); VPBasicBlock *Then = cast(getEntry()->getSuccessors()[0]); - InstructionCost ThenCost = Then->cost(VF, Ctx); - - // For the scalar case, we may not always execute the original predicated - // block, Thus, scale the block's cost by the probability of executing it. - if (VF.isScalar()) - return ThenCost / getPredBlockCostDivisor(Ctx.CostKind); - - return ThenCost; + return Then->cost(VF, Ctx); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) From d30fe62cf15960b0b0b2c15d315fe51bb718822a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 28 Sep 2025 10:15:08 -0700 Subject: [PATCH 053/878] ELF: Test .eh_frame relocation EhInputSection currently uses scanSection path, getting ignored marker relocations and undefined symbol diagnostics for free. This might change in the future. Add test coverage. --- lld/test/ELF/eh-frame-relocation.s | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 lld/test/ELF/eh-frame-relocation.s diff --git a/lld/test/ELF/eh-frame-relocation.s b/lld/test/ELF/eh-frame-relocation.s new file mode 100644 index 0000000000000..9c1fe40dba7d3 --- /dev/null +++ b/lld/test/ELF/eh-frame-relocation.s @@ -0,0 +1,29 @@ +# REQUIRES: x86 +## Test that marker relocations are ignored and undefined symbols lead to errors. + +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 abi.s -o abi.o +# RUN: ld.lld a.o abi.o -o a +# RUN: llvm-readelf -s a | FileCheck %s + +# CHECK: 00000000002{{.*}} 0 FUNC GLOBAL DEFAULT [[#]] __gxx_personality_v0 + +# RUN: not ld.lld a.o 2>&1 | FileCheck %s --check-prefix=ERR + +# ERR: error: undefined symbol: __gxx_personality_v0 +# ERR-NEXT: >>> referenced by a.o:(.eh_frame+0x12) + +#--- a.s +.cfi_startproc +.cfi_personality 0, __gxx_personality_v0 + ret +.cfi_endproc + +.section .eh_frame,"a",@unwind +.reloc ., BFD_RELOC_NONE, ignore + +#--- abi.s +.globl __gxx_personality_v0 +.type __gxx_personality_v0, @function +__gxx_personality_v0: From f7dd258635af4d85bd8f25729c7f56aadb4c5913 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 28 Sep 2025 10:27:05 -0700 Subject: [PATCH 054/878] [ADT] Fix a bug in PackedVector::setValue for signed types (#159239) Without this patch, we forget to update the sign bit. When we assign: Vec[0] = -1; the sign bit is correctly set to 1. Overwriting the same element: Vec[0] = 1; does not update the sign bit, leaving the 4-bit as 0b1001, which reads -2 according to PackedVector's encoding. (It does not use two's complement.) This patch fixes the bug by clearing the sign bit when we are assigning a non-negative value. 
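Concretely (a minimal sketch assuming four bits per element, e.g. a
PackedVector<int, 4>; the exact width is illustrative):

  Vec[0] = -1;    // stored as 0b1000: sign bit set, low bits hold ~(-1) == 0
  Vec[0] = 1;     // low bits become 001; the stale sign bit leaves 0b1001
  int V = Vec[0]; // decodes as ~1 == -2 instead of 1

With the fix, the second store clears the sign bit, leaving 0b0001, which
decodes back to 1.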
--- llvm/include/llvm/ADT/PackedVector.h | 2 ++ llvm/unittests/ADT/PackedVectorTest.cpp | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/llvm/include/llvm/ADT/PackedVector.h b/llvm/include/llvm/ADT/PackedVector.h index 1146cc4bd6d23..4e31d3f098d44 100644 --- a/llvm/include/llvm/ADT/PackedVector.h +++ b/llvm/include/llvm/ADT/PackedVector.h @@ -58,6 +58,8 @@ class PackedVectorBase { if (val < 0) { val = ~val; Bits.set((Idx * BitNum) + BitNum - 1); + } else { + Bits.reset((Idx * BitNum) + BitNum - 1); } assert((val >> (BitNum-1)) == 0 && "value is too big"); for (unsigned i = 0; i != BitNum-1; ++i) diff --git a/llvm/unittests/ADT/PackedVectorTest.cpp b/llvm/unittests/ADT/PackedVectorTest.cpp index 30fc7c0b6d07f..df2cbf0e7f0f8 100644 --- a/llvm/unittests/ADT/PackedVectorTest.cpp +++ b/llvm/unittests/ADT/PackedVectorTest.cpp @@ -71,6 +71,14 @@ TEST(PackedVectorTest, RawBitsSize) { EXPECT_EQ(12u, Vec.raw_bits().size()); } +TEST(PackedVectorTest, SignedValueOverwrite) { + PackedVector Vec(1); + Vec[0] = -1; + EXPECT_EQ(-1, Vec[0]); + Vec[0] = 1; + EXPECT_EQ(1, Vec[0]); +} + #ifdef EXPECT_DEBUG_DEATH TEST(PackedVectorTest, UnsignedValues) { From 0de265c3e3beaa0ac0a2976b8c7905ce321987c1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 28 Sep 2025 10:27:13 -0700 Subject: [PATCH 055/878] [ADT] Consolidate assertSafeToReferenceAfterClear with "if constexpr" (NFC) (#161042) This patch consolidates two implementations of assertSafeToReferenceAfterClear into a single template function. --- llvm/include/llvm/ADT/SmallVector.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index 36b324355ee10..77805f5c03c14 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -199,17 +199,18 @@ class SmallVectorTemplateCommon } /// Check whether any part of the range will be invalidated by clearing. - void assertSafeToReferenceAfterClear(const T *From, const T *To) { - if (From == To) - return; - this->assertSafeToReferenceAfterResize(From, 0); - this->assertSafeToReferenceAfterResize(To - 1, 0); - } - template < - class ItTy, - std::enable_if_t, T *>::value, - bool> = false> - void assertSafeToReferenceAfterClear(ItTy, ItTy) {} + template + void assertSafeToReferenceAfterClear(ItTy From, ItTy To) { + if constexpr (std::is_pointer_v && + std::is_same_v< + std::remove_const_t>, + std::remove_const_t>) { + if (From == To) + return; + this->assertSafeToReferenceAfterResize(From, 0); + this->assertSafeToReferenceAfterResize(To - 1, 0); + } + } /// Check whether any part of the range will be invalidated by growing. template void assertSafeToAddRange(ItTy From, ItTy To) { From 372f78643e7154f8b3f28a01aebdc1aab87168fb Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 28 Sep 2025 10:27:21 -0700 Subject: [PATCH 056/878] [ADT] Add [[nodiscard]] to more classes (NFC) (#161044) This patch adds [[nodiscard]] to user-facing functions in set/map classes if they return non-void values and appear to have no side effect. 
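As a sketch of the effect (the exact warning wording varies by compiler, and
`M` here is a hypothetical map):

  llvm::StringMap<int> M;
  M.lookup("key");        // warning: ignoring return value of function
                          // declared with 'nodiscard' attribute
  (void)M.lookup("key");  // an intentional discard still compiles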
--- llvm/include/llvm/ADT/ImmutableMap.h | 44 +++++++++++--------- llvm/include/llvm/ADT/SparseSet.h | 26 +++++++----- llvm/include/llvm/ADT/StringMap.h | 61 +++++++++++++++++----------- llvm/include/llvm/ADT/StringSet.h | 4 +- 4 files changed, 81 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/ADT/ImmutableMap.h b/llvm/include/llvm/ADT/ImmutableMap.h index 3d19ca41a5be0..32634a96ee9ea 100644 --- a/llvm/include/llvm/ADT/ImmutableMap.h +++ b/llvm/include/llvm/ADT/ImmutableMap.h @@ -111,25 +111,25 @@ class ImmutableMap { } }; - bool contains(key_type_ref K) const { + [[nodiscard]] bool contains(key_type_ref K) const { return Root ? Root->contains(K) : false; } - bool operator==(const ImmutableMap &RHS) const { + [[nodiscard]] bool operator==(const ImmutableMap &RHS) const { return Root && RHS.Root ? Root->isEqual(*RHS.Root.get()) : Root == RHS.Root; } - bool operator!=(const ImmutableMap &RHS) const { + [[nodiscard]] bool operator!=(const ImmutableMap &RHS) const { return Root && RHS.Root ? Root->isNotEqual(*RHS.Root.get()) : Root != RHS.Root; } - TreeTy *getRoot() const { + [[nodiscard]] TreeTy *getRoot() const { if (Root) { Root->retain(); } return Root.get(); } - TreeTy *getRootWithoutRetain() const { return Root.get(); } + [[nodiscard]] TreeTy *getRootWithoutRetain() const { return Root.get(); } void manualRetain() { if (Root) Root->retain(); @@ -139,7 +139,7 @@ class ImmutableMap { if (Root) Root->release(); } - bool isEmpty() const { return !Root; } + [[nodiscard]] bool isEmpty() const { return !Root; } public: //===--------------------------------------------------===// @@ -163,10 +163,10 @@ class ImmutableMap { data_type_ref getData() const { return (*this)->second; } }; - iterator begin() const { return iterator(Root.get()); } - iterator end() const { return iterator(); } + [[nodiscard]] iterator begin() const { return iterator(Root.get()); } + [[nodiscard]] iterator end() const { return iterator(); } - data_type* lookup(key_type_ref K) const { + [[nodiscard]] data_type *lookup(key_type_ref K) const { if (Root) { TreeTy* T = Root->find(K); if (T) return &T->getValue().second; @@ -178,7 +178,7 @@ class ImmutableMap { /// getMaxElement - Returns the pair in the ImmutableMap for /// which key is the highest in the ordering of keys in the map. This /// method returns NULL if the map is empty. - value_type* getMaxElement() const { + [[nodiscard]] value_type *getMaxElement() const { return Root ? &(Root->getMaxElement()->getValue()) : nullptr; } @@ -186,7 +186,9 @@ class ImmutableMap { // Utility methods. //===--------------------------------------------------===// - unsigned getHeight() const { return Root ? Root->getHeight() : 0; } + [[nodiscard]] unsigned getHeight() const { + return Root ? Root->getHeight() : 0; + } static inline void Profile(FoldingSetNodeID& ID, const ImmutableMap& M) { ID.AddPointer(M.Root.get()); @@ -250,7 +252,7 @@ class ImmutableMapRef { return ImmutableMapRef(NewT, Factory); } - bool contains(key_type_ref K) const { + [[nodiscard]] bool contains(key_type_ref K) const { return Root ? Root->contains(K) : false; } @@ -258,16 +260,16 @@ class ImmutableMapRef { return ImmutableMap(Factory->getCanonicalTree(Root.get())); } - bool operator==(const ImmutableMapRef &RHS) const { + [[nodiscard]] bool operator==(const ImmutableMapRef &RHS) const { return Root && RHS.Root ? 
Root->isEqual(*RHS.Root.get()) : Root == RHS.Root; } - bool operator!=(const ImmutableMapRef &RHS) const { + [[nodiscard]] bool operator!=(const ImmutableMapRef &RHS) const { return Root && RHS.Root ? Root->isNotEqual(*RHS.Root.get()) : Root != RHS.Root; } - bool isEmpty() const { return !Root; } + [[nodiscard]] bool isEmpty() const { return !Root; } //===--------------------------------------------------===// // For testing. @@ -293,10 +295,10 @@ class ImmutableMapRef { data_type_ref getData() const { return (*this)->second; } }; - iterator begin() const { return iterator(Root.get()); } - iterator end() const { return iterator(); } + [[nodiscard]] iterator begin() const { return iterator(Root.get()); } + [[nodiscard]] iterator end() const { return iterator(); } - data_type *lookup(key_type_ref K) const { + [[nodiscard]] data_type *lookup(key_type_ref K) const { if (Root) { TreeTy* T = Root->find(K); if (T) return &T->getValue().second; @@ -308,7 +310,7 @@ class ImmutableMapRef { /// getMaxElement - Returns the pair in the ImmutableMap for /// which key is the highest in the ordering of keys in the map. This /// method returns NULL if the map is empty. - value_type* getMaxElement() const { + [[nodiscard]] value_type *getMaxElement() const { return Root ? &(Root->getMaxElement()->getValue()) : nullptr; } @@ -316,7 +318,9 @@ class ImmutableMapRef { // Utility methods. //===--------------------------------------------------===// - unsigned getHeight() const { return Root ? Root->getHeight() : 0; } + [[nodiscard]] unsigned getHeight() const { + return Root ? Root->getHeight() : 0; + } static inline void Profile(FoldingSetNodeID &ID, const ImmutableMapRef &M) { ID.AddPointer(M.Root.get()); diff --git a/llvm/include/llvm/ADT/SparseSet.h b/llvm/include/llvm/ADT/SparseSet.h index 395cfc3ebfd43..9783301be4b64 100644 --- a/llvm/include/llvm/ADT/SparseSet.h +++ b/llvm/include/llvm/ADT/SparseSet.h @@ -171,23 +171,23 @@ class SparseSet { using iterator = typename DenseT::iterator; using const_iterator = typename DenseT::const_iterator; - const_iterator begin() const { return Dense.begin(); } - const_iterator end() const { return Dense.end(); } - iterator begin() { return Dense.begin(); } - iterator end() { return Dense.end(); } + [[nodiscard]] const_iterator begin() const { return Dense.begin(); } + [[nodiscard]] const_iterator end() const { return Dense.end(); } + [[nodiscard]] iterator begin() { return Dense.begin(); } + [[nodiscard]] iterator end() { return Dense.end(); } /// empty - Returns true if the set is empty. /// /// This is not the same as BitVector::empty(). /// - bool empty() const { return Dense.empty(); } + [[nodiscard]] bool empty() const { return Dense.empty(); } /// size - Returns the number of elements in the set. /// /// This is not the same as BitVector::size() which returns the size of the /// universe. /// - size_type size() const { return Dense.size(); } + [[nodiscard]] size_type size() const { return Dense.size(); } /// clear - Clears the set. This is a very fast constant time operation. /// @@ -222,21 +222,27 @@ class SparseSet { /// @param Key A valid key to find. /// @returns An iterator to the element identified by key, or end(). 
/// - iterator find(const KeyT &Key) { return findIndex(KeyIndexOf(Key)); } + [[nodiscard]] iterator find(const KeyT &Key) { + return findIndex(KeyIndexOf(Key)); + } - const_iterator find(const KeyT &Key) const { + [[nodiscard]] const_iterator find(const KeyT &Key) const { return const_cast(this)->findIndex(KeyIndexOf(Key)); } /// Check if the set contains the given \c Key. /// /// @param Key A valid key to find. - bool contains(const KeyT &Key) const { return find(Key) != end(); } + [[nodiscard]] bool contains(const KeyT &Key) const { + return find(Key) != end(); + } /// count - Returns 1 if this set contains an element identified by Key, /// 0 otherwise. /// - size_type count(const KeyT &Key) const { return contains(Key) ? 1 : 0; } + [[nodiscard]] size_type count(const KeyT &Key) const { + return contains(Key) ? 1 : 0; + } /// insert - Attempts to insert a new element. /// diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h index 2c146fbf08df1..01cbf2d3fff71 100644 --- a/llvm/include/llvm/ADT/StringMap.h +++ b/llvm/include/llvm/ADT/StringMap.h @@ -102,18 +102,18 @@ class StringMapImpl { return reinterpret_cast(TombstoneIntVal); } - unsigned getNumBuckets() const { return NumBuckets; } - unsigned getNumItems() const { return NumItems; } + [[nodiscard]] unsigned getNumBuckets() const { return NumBuckets; } + [[nodiscard]] unsigned getNumItems() const { return NumItems; } - bool empty() const { return NumItems == 0; } - unsigned size() const { return NumItems; } + [[nodiscard]] bool empty() const { return NumItems == 0; } + [[nodiscard]] unsigned size() const { return NumItems; } /// Returns the hash value that will be used for the given string. /// This allows precomputing the value and passing it explicitly /// to some of the functions. /// The implementation of this function is not guaranteed to be stable /// and may change. 
- LLVM_ABI static uint32_t hash(StringRef Key); + [[nodiscard]] LLVM_ABI static uint32_t hash(StringRef Key); void swap(StringMapImpl &Other) { std::swap(TheTable, Other.TheTable); @@ -220,30 +220,35 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap using const_iterator = StringMapIterBase; using iterator = StringMapIterBase; - iterator begin() { return iterator(TheTable, NumBuckets != 0); } - iterator end() { return iterator(TheTable + NumBuckets); } - const_iterator begin() const { + [[nodiscard]] iterator begin() { return iterator(TheTable, NumBuckets != 0); } + [[nodiscard]] iterator end() { return iterator(TheTable + NumBuckets); } + [[nodiscard]] const_iterator begin() const { return const_iterator(TheTable, NumBuckets != 0); } - const_iterator end() const { return const_iterator(TheTable + NumBuckets); } + [[nodiscard]] const_iterator end() const { + return const_iterator(TheTable + NumBuckets); + } - iterator_range> keys() const { + [[nodiscard]] iterator_range> keys() const { return make_range(StringMapKeyIterator(begin()), StringMapKeyIterator(end())); } - iterator find(StringRef Key) { return find(Key, hash(Key)); } + [[nodiscard]] iterator find(StringRef Key) { return find(Key, hash(Key)); } - iterator find(StringRef Key, uint32_t FullHashValue) { + [[nodiscard]] iterator find(StringRef Key, uint32_t FullHashValue) { int Bucket = FindKey(Key, FullHashValue); if (Bucket == -1) return end(); return iterator(TheTable + Bucket); } - const_iterator find(StringRef Key) const { return find(Key, hash(Key)); } + [[nodiscard]] const_iterator find(StringRef Key) const { + return find(Key, hash(Key)); + } - const_iterator find(StringRef Key, uint32_t FullHashValue) const { + [[nodiscard]] const_iterator find(StringRef Key, + uint32_t FullHashValue) const { int Bucket = FindKey(Key, FullHashValue); if (Bucket == -1) return end(); @@ -252,7 +257,7 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap /// lookup - Return the entry for the specified key, or a default /// constructed value if no such entry exists. - ValueTy lookup(StringRef Key) const { + [[nodiscard]] ValueTy lookup(StringRef Key) const { const_iterator Iter = find(Key); if (Iter != end()) return Iter->second; @@ -261,7 +266,7 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap /// at - Return the entry for the specified key, or abort if no such /// entry exists. - const ValueTy &at(StringRef Val) const { + [[nodiscard]] const ValueTy &at(StringRef Val) const { auto Iter = this->find(Val); assert(Iter != this->end() && "StringMap::at failed due to a missing key"); return Iter->second; @@ -272,18 +277,22 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap ValueTy &operator[](StringRef Key) { return try_emplace(Key).first->second; } /// contains - Return true if the element is in the map, false otherwise. - bool contains(StringRef Key) const { return find(Key) != end(); } + [[nodiscard]] bool contains(StringRef Key) const { + return find(Key) != end(); + } /// count - Return 1 if the element is in the map, 0 otherwise. - size_type count(StringRef Key) const { return contains(Key) ? 1 : 0; } + [[nodiscard]] size_type count(StringRef Key) const { + return contains(Key) ? 1 : 0; + } template - size_type count(const StringMapEntry &MapEntry) const { + [[nodiscard]] size_type count(const StringMapEntry &MapEntry) const { return count(MapEntry.getKey()); } /// equal - check whether both of the containers are equal. 
- bool operator==(const StringMap &RHS) const { + [[nodiscard]] bool operator==(const StringMap &RHS) const { if (size() != RHS.size()) return false; @@ -302,7 +311,9 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap return true; } - bool operator!=(const StringMap &RHS) const { return !(*this == RHS); } + [[nodiscard]] bool operator!=(const StringMap &RHS) const { + return !(*this == RHS); + } /// insert - Insert the specified key/value pair into the map. If the key /// already exists in the map, return false and ignore the request, otherwise @@ -447,8 +458,12 @@ template class StringMapIterBase { AdvancePastEmptyBuckets(); } - reference operator*() const { return *static_cast(*Ptr); } - pointer operator->() const { return static_cast(*Ptr); } + [[nodiscard]] reference operator*() const { + return *static_cast(*Ptr); + } + [[nodiscard]] pointer operator->() const { + return static_cast(*Ptr); + } StringMapIterBase &operator++() { // Preincrement ++Ptr; diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h index b4853423a1ef3..c8be3f2a503e4 100644 --- a/llvm/include/llvm/ADT/StringSet.h +++ b/llvm/include/llvm/ADT/StringSet.h @@ -57,7 +57,9 @@ class StringSet : public StringMap { } /// Check if the set contains the given \c key. - bool contains(StringRef key) const { return Base::contains(key); } + [[nodiscard]] bool contains(StringRef key) const { + return Base::contains(key); + } }; } // end namespace llvm From 9a5671efac31d91e3479c6ae6a0af6196100beb3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 28 Sep 2025 10:27:29 -0700 Subject: [PATCH 057/878] [Support] Deprecate one form of support::endian::byte_swap (NFC) (#161045) This is a follow-up to #156140 and #160979, which deprecated one form of write and read, respectively. We have two forms of byte_swap: template [[nodiscard]] inline value_type byte_swap(value_type value, endianness endian) template [[nodiscard]] inline value_type byte_swap(value_type value) The difference is that endian is a function parameter in the former but a template parameter in the latter. This patch streamlines the code by migrating the use of the latter to the former while deprecating the latter because the latter is just forwarded to the former. 
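The migration is mechanical; a sketch for a little-endian swap of a uint32_t
value V:

  // Deprecated: endianness as a template parameter.
  uint32_t A = support::endian::byte_swap<uint32_t, llvm::endianness::little>(V);

  // Preferred: endianness as a function argument (value_type is deduced).
  uint32_t B = support::endian::byte_swap(V, llvm::endianness::little);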
--- clang/lib/CodeGen/CodeGenPGO.cpp | 4 ++-- llvm/include/llvm/Bitstream/BitstreamWriter.h | 2 +- .../ProfileData/Coverage/CoverageMapping.h | 20 +++++++++---------- llvm/include/llvm/Support/Endian.h | 16 ++++++++------- llvm/lib/CGData/CodeGenDataWriter.cpp | 4 ++-- llvm/lib/MC/DXContainerRootSignature.cpp | 5 ++--- .../Coverage/CoverageMappingReader.cpp | 12 +++++------ .../Coverage/CoverageMappingWriter.cpp | 2 +- llvm/lib/ProfileData/InstrProf.cpp | 2 +- llvm/lib/ProfileData/InstrProfReader.cpp | 10 +++++----- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 8 ++++---- llvm/unittests/MC/StringTableBuilderTest.cpp | 4 ++-- 12 files changed, 45 insertions(+), 44 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 98b30e084b18b..8f095649f87ce 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -972,7 +972,7 @@ void PGOHash::combine(HashType Type) { if (Count && Count % NumTypesPerWord == 0) { using namespace llvm::support; uint64_t Swapped = - endian::byte_swap(Working); + endian::byte_swap(Working, llvm::endianness::little); MD5.update(llvm::ArrayRef((uint8_t *)&Swapped, sizeof(Swapped))); Working = 0; } @@ -999,7 +999,7 @@ uint64_t PGOHash::finalize() { } else { using namespace llvm::support; uint64_t Swapped = - endian::byte_swap(Working); + endian::byte_swap(Working, llvm::endianness::little); MD5.update(llvm::ArrayRef((uint8_t *)&Swapped, sizeof(Swapped))); } } diff --git a/llvm/include/llvm/Bitstream/BitstreamWriter.h b/llvm/include/llvm/Bitstream/BitstreamWriter.h index 5f53681320ce4..a2938642f824a 100644 --- a/llvm/include/llvm/Bitstream/BitstreamWriter.h +++ b/llvm/include/llvm/Bitstream/BitstreamWriter.h @@ -87,7 +87,7 @@ class BitstreamWriter { void WriteWord(unsigned Value) { Value = - support::endian::byte_swap(Value); + support::endian::byte_swap(Value, llvm::endianness::little); Buffer.append(reinterpret_cast(&Value), reinterpret_cast(&Value + 1)); } diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 7d1a85ba528fc..e09958160b9a0 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -1215,19 +1215,19 @@ namespace accessors { /// Return the structural hash associated with the function. template uint64_t getFuncHash(const FuncRecordTy *Record) { - return support::endian::byte_swap(Record->FuncHash); + return support::endian::byte_swap(Record->FuncHash, Endian); } /// Return the coverage map data size for the function. template uint64_t getDataSize(const FuncRecordTy *Record) { - return support::endian::byte_swap(Record->DataSize); + return support::endian::byte_swap(Record->DataSize, Endian); } /// Return the function lookup key. The value is considered opaque. template uint64_t getFuncNameRef(const FuncRecordTy *Record) { - return support::endian::byte_swap(Record->NameRef); + return support::endian::byte_swap(Record->NameRef, Endian); } /// Return the PGO name of the function. Used for formats in which the name is @@ -1280,14 +1280,14 @@ struct CovMapFunctionRecordV1 { /// Return function lookup key. The value is consider opaque. template IntPtrT getFuncNameRef() const { - return support::endian::byte_swap(NamePtr); + return support::endian::byte_swap(NamePtr, Endian); } /// Return the PGO name of the function. 
template Error getFuncName(InstrProfSymtab &ProfileNames, StringRef &FuncName) const { IntPtrT NameRef = getFuncNameRef(); - uint32_t NameS = support::endian::byte_swap(NameSize); + uint32_t NameS = support::endian::byte_swap(NameSize, Endian); FuncName = ProfileNames.getFuncName(NameRef, NameS); if (NameS && FuncName.empty()) return make_error(coveragemap_error::malformed, @@ -1385,7 +1385,7 @@ struct CovMapFunctionRecordV3 { /// Get the filename set reference. template uint64_t getFilenamesRef() const { - return support::endian::byte_swap(FilenamesRef); + return support::endian::byte_swap(FilenamesRef, Endian); } /// Read the inline coverage mapping. Ignore the buffer parameter, it is for @@ -1416,19 +1416,19 @@ struct CovMapHeader { #define COVMAP_HEADER(Type, LLVMType, Name, Init) Type Name; #include "llvm/ProfileData/InstrProfData.inc" template uint32_t getNRecords() const { - return support::endian::byte_swap(NRecords); + return support::endian::byte_swap(NRecords, Endian); } template uint32_t getFilenamesSize() const { - return support::endian::byte_swap(FilenamesSize); + return support::endian::byte_swap(FilenamesSize, Endian); } template uint32_t getCoverageSize() const { - return support::endian::byte_swap(CoverageSize); + return support::endian::byte_swap(CoverageSize, Endian); } template uint32_t getVersion() const { - return support::endian::byte_swap(Version); + return support::endian::byte_swap(Version, Endian); } }; diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h index 6c86feb78053c..51db225841dbe 100644 --- a/llvm/include/llvm/Support/Endian.h +++ b/llvm/include/llvm/Support/Endian.h @@ -49,7 +49,9 @@ template /// Swap the bytes of value to match the given endianness. template -[[nodiscard]] inline value_type byte_swap(value_type value) { +[[nodiscard]] +LLVM_DEPRECATED("Pass endian as a function argument instead", + "byte_swap") inline value_type byte_swap(value_type value) { return byte_swap(value, endian); } @@ -137,8 +139,8 @@ template LLVM_ASSUME_ALIGNED( memory, (detail::PickAlignment::value)), sizeof(value_type) * 2); - val[0] = byte_swap(val[0]); - val[1] = byte_swap(val[1]); + val[0] = byte_swap(val[0], endian); + val[1] = byte_swap(val[1], endian); // Shift bits from the lower value into place. make_unsigned_t lowerVal = val[0] >> startBit; @@ -172,8 +174,8 @@ inline void writeAtBitAlignment(void *memory, value_type value, LLVM_ASSUME_ALIGNED( memory, (detail::PickAlignment::value)), sizeof(value_type) * 2); - val[0] = byte_swap(val[0]); - val[1] = byte_swap(val[1]); + val[0] = byte_swap(val[0], endian); + val[1] = byte_swap(val[1], endian); // Mask off any existing bits in the upper part of the lower value that // we want to replace. @@ -201,8 +203,8 @@ inline void writeAtBitAlignment(void *memory, value_type value, val[1] |= upperVal; // Finally, rewrite values. 
- val[0] = byte_swap(val[0]); - val[1] = byte_swap(val[1]); + val[0] = byte_swap(val[0], endian); + val[1] = byte_swap(val[1], endian); memcpy(LLVM_ASSUME_ALIGNED( memory, (detail::PickAlignment::value)), &val[0], sizeof(value_type) * 2); diff --git a/llvm/lib/CGData/CodeGenDataWriter.cpp b/llvm/lib/CGData/CodeGenDataWriter.cpp index 14a8558ba63b7..a2bbceebd0317 100644 --- a/llvm/lib/CGData/CodeGenDataWriter.cpp +++ b/llvm/lib/CGData/CodeGenDataWriter.cpp @@ -40,7 +40,7 @@ void CGDataOStream::patch(ArrayRef P) { for (const auto &K : P) { for (size_t I = 0; I < K.D.size(); ++I) { uint64_t Bytes = - endian::byte_swap(K.D[I]); + endian::byte_swap(K.D[I], llvm::endianness::little); Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t), reinterpret_cast(&Bytes), sizeof(uint64_t)); } @@ -52,7 +52,7 @@ void CGDataOStream::patch(ArrayRef P) { for (const auto &K : P) { for (size_t I = 0; I < K.D.size(); ++I) { uint64_t Bytes = - endian::byte_swap(K.D[I]); + endian::byte_swap(K.D[I], llvm::endianness::little); VOStream.pwrite(reinterpret_cast(&Bytes), sizeof(uint64_t), K.Pos + I * sizeof(uint64_t)); } diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index 2338370d84389..713aa3d8143e8 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -23,9 +23,8 @@ static uint32_t writePlaceholder(raw_svector_ostream &Stream) { static uint32_t rewriteOffsetToCurrentByte(raw_svector_ostream &Stream, uint32_t Offset) { uint32_t ByteOffset = Stream.tell(); - uint32_t Value = - support::endian::byte_swap( - ByteOffset); + uint32_t Value = support::endian::byte_swap( + ByteOffset, llvm::endianness::little); Stream.pwrite(reinterpret_cast(&Value), sizeof(Value), Offset); return ByteOffset; } diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index fc2577e6ada5d..075ad8d7aec8b 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -949,9 +949,9 @@ loadTestingFormat(StringRef Data, StringRef CompilationDir) { if (Data.size() < sizeof(uint64_t)) return make_error(coveragemap_error::malformed, "the size of data is too small"); - auto TestingVersion = - support::endian::byte_swap( - *reinterpret_cast(Data.data())); + auto TestingVersion = support::endian::byte_swap( + *reinterpret_cast(Data.data()), + llvm::endianness::little); Data = Data.substr(sizeof(uint64_t)); // Read the ProfileNames data. @@ -1274,9 +1274,9 @@ BinaryCoverageReader::create( std::vector> Readers; if (ObjectBuffer.getBuffer().size() > sizeof(TestingFormatMagic)) { - uint64_t Magic = - support::endian::byte_swap( - *reinterpret_cast(ObjectBuffer.getBufferStart())); + uint64_t Magic = support::endian::byte_swap( + *reinterpret_cast(ObjectBuffer.getBufferStart()), + llvm::endianness::little); if (Magic == TestingFormatMagic) { // This is a special format used for testing. 
auto ReaderOrErr = diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp index 12b1687af69db..3875f01c48528 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp @@ -292,7 +292,7 @@ void CoverageMappingWriter::write(raw_ostream &OS) { void TestingFormatWriter::write(raw_ostream &OS, TestingFormatVersion Version) { auto ByteSwap = [](uint64_t N) { - return support::endian::byte_swap(N); + return support::endian::byte_swap(N, llvm::endianness::little); }; // Output a 64bit magic number. diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index e1c6315853b3b..3c8e44a18f533 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -292,7 +292,7 @@ void ProfOStream::patch(ArrayRef P) { for (const auto &K : P) { for (int I = 0, E = K.D.size(); I != E; I++) { uint64_t Bytes = - endian::byte_swap(K.D[I]); + endian::byte_swap(K.D[I], llvm::endianness::little); Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t), (const char *)&Bytes, sizeof(uint64_t)); } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 1da92eafa4b4a..d2ae4b5226ff6 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1186,10 +1186,10 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version, if (Version >= IndexedInstrProf::Version4) { const IndexedInstrProf::Summary *SummaryInLE = reinterpret_cast(Cur); - uint64_t NFields = endian::byte_swap( - SummaryInLE->NumSummaryFields); - uint64_t NEntries = endian::byte_swap( - SummaryInLE->NumCutoffEntries); + uint64_t NFields = endian::byte_swap( + SummaryInLE->NumSummaryFields, llvm::endianness::little); + uint64_t NEntries = endian::byte_swap( + SummaryInLE->NumCutoffEntries, llvm::endianness::little); uint32_t SummarySize = IndexedInstrProf::Summary::getSize(NFields, NEntries); std::unique_ptr SummaryData = @@ -1198,7 +1198,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version, const uint64_t *Src = reinterpret_cast(SummaryInLE); uint64_t *Dst = reinterpret_cast(SummaryData.get()); for (unsigned I = 0; I < SummarySize / sizeof(uint64_t); I++) - Dst[I] = endian::byte_swap(Src[I]); + Dst[I] = endian::byte_swap(Src[I], llvm::endianness::little); SummaryEntryVector DetailedSummary; for (unsigned I = 0; I < SummaryData->NumCutoffEntries; I++) { diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 31bf6a9d2d9c8..e09ddb45da6e9 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1519,10 +1519,10 @@ class MemoryMatcher { static StringRef detectStubKind(const Session::MemoryRegionInfo &Stub) { using namespace support::endian; - auto Armv7MovWTle = byte_swap(0xe300c000); - auto Armv7BxR12le = byte_swap(0xe12fff1c); - auto Thumbv7MovWTle = byte_swap(0x0c00f240); - auto Thumbv7BxR12le = byte_swap(0x4760); + auto Armv7MovWTle = byte_swap(0xe300c000, endianness::little); + auto Armv7BxR12le = byte_swap(0xe12fff1c, endianness::little); + auto Thumbv7MovWTle = byte_swap(0x0c00f240, endianness::little); + auto Thumbv7BxR12le = byte_swap(0x4760, endianness::little); MemoryMatcher M(Stub.getContent()); if (M.matchMask(Thumbv7MovWTle)) { diff --git a/llvm/unittests/MC/StringTableBuilderTest.cpp 
b/llvm/unittests/MC/StringTableBuilderTest.cpp index 05f469a229bf9..44a985be6cfcb 100644 --- a/llvm/unittests/MC/StringTableBuilderTest.cpp +++ b/llvm/unittests/MC/StringTableBuilderTest.cpp @@ -58,8 +58,8 @@ TEST(StringTableBuilderTest, BasicWinCOFF) { std::string Expected; - ExpectedSize = support::endian::byte_swap( - ExpectedSize); + ExpectedSize = support::endian::byte_swap(ExpectedSize, + llvm::endianness::little); Expected.append((const char*)&ExpectedSize, 4); Expected += "pygmy hippopotamus"; Expected += '\x00'; From 9c6d216576065479b6826ebead1ffe74ab5e1273 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 28 Sep 2025 10:27:36 -0700 Subject: [PATCH 058/878] [llvm] Proofread FuzzingLLVM.rst (#161046) --- llvm/docs/FuzzingLLVM.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/docs/FuzzingLLVM.rst b/llvm/docs/FuzzingLLVM.rst index a0355d7014c8d..76eb4288a1f2c 100644 --- a/llvm/docs/FuzzingLLVM.rst +++ b/llvm/docs/FuzzingLLVM.rst @@ -33,7 +33,7 @@ clang-proto-fuzzer A |protobuf fuzzer| that compiles valid C++ programs generated from a protobuf class that describes a subset of the C++ language. -This fuzzer accepts clang command line options after `ignore_remaining_args=1`. +This fuzzer accepts clang command-line options after `ignore_remaining_args=1`. For example, the following command will fuzz clang with a higher optimization level: @@ -106,7 +106,7 @@ llvm-opt-fuzzer A |LLVM IR fuzzer| aimed at finding bugs in optimization passes. -It receives optimization pipeline and runs it for each fuzzer input. +It receives an optimization pipeline and runs it for each fuzzer input. Interface of this fuzzer almost directly mirrors ``llvm-isel-fuzzer``. Both ``mtriple`` and ``passes`` arguments are required. Passes are specified in a @@ -117,7 +117,7 @@ this format in the doxygen for ``PassBuilder::parsePassPipeline``. % bin/llvm-opt-fuzzer -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine -Similarly to the ``llvm-isel-fuzzer`` arguments in some predefined configurations +Similarly to the ``llvm-isel-fuzzer``, arguments in some predefined configurations might be embedded directly into the binary file name: .. code-block:: shell @@ -176,7 +176,7 @@ mutations that a fuzzer in LLVM might want. Generic Random Fuzzing ---------------------- -The most basic form of input mutation is to use the built in mutators of +The most basic form of input mutation is to use the built-in mutators of LibFuzzer. These simply treat the input corpus as a bag of bits and make random mutations. This type of fuzzer is good for stressing the surface layers of a program, and is good at testing things like lexers, parsers, or binary @@ -244,7 +244,7 @@ by adding the following two flags to your CMake invocation: to avoid building the sanitizers themselves with sanitizers enabled. .. note:: You may run into issues if you build with BFD ld, which is the - default linker on many unix systems. These issues are being tracked + default linker on many Unix systems. These issues are being tracked in https://llvm.org/PR34636. Continuously Running and Finding Bugs @@ -280,6 +280,6 @@ your fuzzer can be built and tested when not built against libFuzzer. There is also some handling of the CMake config for fuzzers, where you should use the ``add_llvm_fuzzer`` to set up fuzzer targets. 
This function works -similarly to functions such as ``add_llvm_tool``, but they take care of linking +similarly to functions such as ``add_llvm_tool``, but it takes care of linking to LibFuzzer when appropriate and can be passed the ``DUMMY_MAIN`` argument to enable standalone testing. From e930644394735c9349bab73a8c33f0d215cd35f4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 28 Sep 2025 11:07:30 -0700 Subject: [PATCH 059/878] [ADT] clang-format PackedVector.h (NFC) I'm planning to modify this file. --- llvm/include/llvm/ADT/PackedVector.h | 35 +++++++++++----------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/ADT/PackedVector.h b/llvm/include/llvm/ADT/PackedVector.h index 4e31d3f098d44..77fcbf24b2861 100644 --- a/llvm/include/llvm/ADT/PackedVector.h +++ b/llvm/include/llvm/ADT/PackedVector.h @@ -47,7 +47,7 @@ class PackedVectorBase { protected: static T getValue(const BitVectorTy &Bits, unsigned Idx) { T val = T(); - for (unsigned i = 0; i != BitNum-1; ++i) + for (unsigned i = 0; i != BitNum - 1; ++i) val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i)); if (Bits[(Idx * BitNum) + BitNum - 1]) val = ~val; @@ -61,8 +61,8 @@ class PackedVectorBase { } else { Bits.reset((Idx * BitNum) + BitNum - 1); } - assert((val >> (BitNum-1)) == 0 && "value is too big"); - for (unsigned i = 0; i != BitNum-1; ++i) + assert((val >> (BitNum - 1)) == 0 && "value is too big"); + for (unsigned i = 0; i != BitNum - 1; ++i) Bits[(Idx * BitNum) + i] = val & (T(1) << i); } }; @@ -75,8 +75,9 @@ class PackedVectorBase { /// will create a vector accepting values -2, -1, 0, 1. Any other value will hit /// an assertion. template -class PackedVector : public PackedVectorBase::is_signed> { +class PackedVector + : public PackedVectorBase::is_signed> { BitVectorTy Bits; // Keep track of the number of elements on our own. // We always maintain Bits.size() == NumElements * BitNum. @@ -99,9 +100,7 @@ class PackedVector : public PackedVectorBase Date: Sun, 28 Sep 2025 20:17:01 +0200 Subject: [PATCH 060/878] [Object][Archive] Recompute headers and symbol map when switching from COFF to GNU64 (#160606) COFF format has no 64-bit version, so we use GNU64 instead. Since this changes the headers, we need to recalculate everything. Fixes #160112. --- llvm/lib/Object/ArchiveWriter.cpp | 20 +++++- llvm/test/tools/llvm-lib/sym64-threshold.test | 71 +++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-lib/sym64-threshold.test diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 6fc0889afc6a8..a11259748b9cc 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -1119,10 +1119,26 @@ Error writeArchiveToStream(raw_ostream &Out, // to switch to 64-bit. Note that the file can be larger than 4GB as long as // the last member starts before the 4GB offset. if (*HeadersSize + LastMemberHeaderOffset >= Sym64Threshold) { - if (Kind == object::Archive::K_DARWIN) + switch (Kind) { + case object::Archive::K_COFF: + // COFF format has no 64-bit version, so we use GNU64 instead. + if (!SymMap.Map.empty() && !SymMap.ECMap.empty()) + // Only the COFF format supports the ECSYMBOLS section, so don’t use + // GNU64 when two symbol maps are required. + return make_error( + "Archive is too large: ARM64X does not support archives larger " + "than 4GB"); + // Since this changes the headers, we need to recalculate everything. 
+ return writeArchiveToStream(Out, NewMembers, WriteSymtab, + object::Archive::K_GNU64, Deterministic, + Thin, IsEC, Warn); + case object::Archive::K_DARWIN: Kind = object::Archive::K_DARWIN64; - else + break; + default: Kind = object::Archive::K_GNU64; + break; + } HeadersSize.reset(); } } diff --git a/llvm/test/tools/llvm-lib/sym64-threshold.test b/llvm/test/tools/llvm-lib/sym64-threshold.test new file mode 100644 index 0000000000000..76f0a030274ef --- /dev/null +++ b/llvm/test/tools/llvm-lib/sym64-threshold.test @@ -0,0 +1,71 @@ +# RUN: yaml2obj --docnum=1 %s -o %t01234567890234567789.obj +# RUN: yaml2obj --docnum=2 %s -o %t-ec.obj +# RUN: env SYM64_THRESHOLD=100 llvm-lib -machine:amd64 -out:%t.lib %t01234567890234567789.obj +# RUN: llvm-nm --print-armap %t.lib | FileCheck --check-prefix=ARMAP %s +# ARMAP: Archive map +# ARMAP-NEXT: sym + +# RUN: env SYM64_THRESHOLD=100 not llvm-lib -machine:arm64x -out:%t-ec.lib %t-ec.obj %t01234567890234567789.obj 2>&1 | FileCheck %s +# CHECK: Archive is too large: ARM64X does not support archives larger than 4GB + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + Alignment: 4 + SectionData: '' +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 1 + - !Symbol + Name: sym + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL # (0) + ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2) + StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2) +... + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_ARM64 + Characteristics: [ ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + Alignment: 4 + SectionData: '' +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 1 + - !Symbol + Name: sym + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL # (0) + ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2) + StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2) +... From 71be13a6f09d59cf45863d36a3dcc5d72645e2d6 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 28 Sep 2025 21:53:51 +0100 Subject: [PATCH 061/878] [VPlan] Rewrite VPExpandSCEVExprs in replaceSymbolicStrides. Extend replaceSymbolicStrides to also replace SCEVUnknowns in VPExpandSCEVExprs using the information from StridesMaps. This results in simpler SCEV expansions in some cases. 
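For example, once the runtime stride check has versioned the stride to 1, an
expansion like

  %0 = mul i64 200, %offset.ext
  %ind.end = add i64 %iv.1, %0

folds to

  %ind.end = add i64 %iv.1, 200

as shown in the version-stride-with-integer-casts.ll updates below.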
--- .../Transforms/Vectorize/VPlanTransforms.cpp | 17 ++++++++++ .../reuse-lcssa-phi-scev-expansion.ll | 12 ++++--- .../version-stride-with-integer-casts.ll | 34 ++++++------------- .../AArch64/indvars-vectorization.ll | 21 ++++-------- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 58fab8f222d23..5252e1f928294 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2853,6 +2853,7 @@ void VPlanTransforms::replaceSymbolicStrides( return R->getParent()->getParent() || R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor(); }; + ValueToSCEVMapTy RewriteMap; for (const SCEV *Stride : StridesMap.values()) { using namespace SCEVPatternMatch; auto *StrideV = cast(Stride)->getValue(); @@ -2880,6 +2881,22 @@ void VPlanTransforms::replaceSymbolicStrides( VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C)); StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride); } + RewriteMap[StrideV] = PSE.getSCEV(StrideV); + } + + for (VPRecipeBase &R : *Plan.getEntry()) { + auto *ExpSCEV = dyn_cast(&R); + if (!ExpSCEV) + continue; + const SCEV *ScevExpr = ExpSCEV->getSCEV(); + auto *NewSCEV = + SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap); + if (NewSCEV != ScevExpr) { + VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV); + ExpSCEV->replaceAllUsesWith(NewExp); + if (Plan.getTripCount() == ExpSCEV) + Plan.resetTripCount(NewExp); + } } } diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index cb0c778b95026..73d5e26ef82a2 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -220,14 +220,18 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDVAR_LCSSA1]], 2 +; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; CHECK: [[VECTOR_SCEVCHECK]]: ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP15]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP15]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[IV_1_LCSSA]], [[N_VEC]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -239,7 +243,7 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop 
[[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP15]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[IV_1_LCSSA]], %[[LOOP_2_PREHEADER]] ], [ [[IV_1_LCSSA]], %[[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 0b86a2280b529..027dcaf771072 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -22,13 +22,11 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) { ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -94,13 +92,11 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) { ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -233,13 +229,11 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1, ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]] ; CHECK-NEXT: [[OFFSET_IDX2:%.*]] 
= trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX2]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX2]], 1 @@ -414,26 +408,20 @@ define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i16> splat (i16 1), ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i16 [[G_16]], ptr [[GEP]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll index b056f44a6c469..8d20a3ba8ed08 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll @@ -14,16 +14,9 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[XA]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SUB]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[XB]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[SMAX7:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP2]], i64 32000) -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i64 [[TMP2]], 32000 -; CHECK-NEXT: [[UMIN8:%.*]] = zext i1 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP2]], [[UMIN8]] -; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[SMAX7]], [[TMP4]] -; CHECK-NEXT: [[UMAX9:%.*]] = tail call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 [[TMP5]], [[UMAX9]] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[UMIN8]] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP0]], i64 31999) +; CHECK-NEXT: [[SMAX10:%.*]] = add 
nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i64 [[SMAX10]], [[TMP0]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP8]], 23
 ; CHECK-NEXT:    [[IDENT_CHECK_NOT:%.*]] = icmp eq i32 [[XB]], 1
 ; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[MIN_ITERS_CHECK]], [[IDENT_CHECK_NOT]]
@@ -50,13 +43,11 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[TMP8]], -8
-; CHECK-NEXT:    [[TMP18:%.*]] = mul nuw i64 [[N_VEC]], [[TMP1]]
-; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[TMP18]], [[TMP0]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[N_VEC]], [[TMP0]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw i64 [[INDEX]], [[TMP1]]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[TMP19]], [[TMP0]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 [[INDEX]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 16
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META0:![0-9]+]]
@@ -75,7 +66,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]]
-; CHECK:       for.body.preheader13:
+; CHECK:       for.body.preheader14:
 ; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
From d29798767cc10a0609b5f1f9f75f032b6760cf50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?G=C3=A1bor=20Spaits?=
Date: Mon, 29 Sep 2025 00:06:20 +0200
Subject: [PATCH 062/878] [InstCombine] Transform `vector.reduce.add` and `splat` into multiplication (#161020)

Fixes #160066

Whenever we have a vector with all the same elements, created with
`insertelement` and `shufflevector`, and we sum the vector, we have a
multiplication.
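A rough sketch of the fold (illustrative names only: `Builder` and
`ReduceArg` are placeholders, and the committed code constructs the mul
instruction directly rather than through IRBuilder):

    // reduce.add over a fixed-width splat of %x is just %x * NumElements.
    if (Value *Splat = getSplatValue(ReduceArg)) {
      if (auto *VTy = dyn_cast<FixedVectorType>(ReduceArg->getType()))
        return Builder.CreateMul(
            Splat, ConstantInt::get(Splat->getType(), VTy->getNumElements()));
    }

Scalable vectors are intentionally left alone, since their element count is
not a compile-time constant; the `negative_scalable_vector` test below covers
that.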
---
 .../InstCombine/InstCombineCalls.cpp          |  12 ++
 .../InstCombine/vector-reductions.ll          | 171 ++++++++++++++++++
 2 files changed, 183 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c13c6cc9913ae..cf6d0ecab4f69 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/KnownFPClass.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TypeSize.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
@@ -3781,6 +3782,17 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         return replaceInstUsesWith(CI, Res);
       }
     }
+
+    // vector.reduce.add.vNiM(splat(%x)) -> mul(%x, N)
+    if (Value *Splat = getSplatValue(Arg)) {
+      ElementCount VecToReduceCount =
+          cast<VectorType>(Arg->getType())->getElementCount();
+      if (VecToReduceCount.isFixed()) {
+        unsigned VectorSize = VecToReduceCount.getFixedValue();
+        return BinaryOperator::CreateMul(
+            Splat, ConstantInt::get(Splat->getType(), VectorSize));
+      }
+    }
   }
   [[fallthrough]];
 }
diff --git a/llvm/test/Transforms/InstCombine/vector-reductions.ll b/llvm/test/Transforms/InstCombine/vector-reductions.ll
index 10f4aca72dbc7..f1e0dd9bd06d7 100644
--- a/llvm/test/Transforms/InstCombine/vector-reductions.ll
+++ b/llvm/test/Transforms/InstCombine/vector-reductions.ll
@@ -308,3 +308,174 @@ define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) {
   %r = sub i32 %r0, %r1
   ret i32 %r
 }
+
+define i32 @constant_multiplied_4xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi32(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+  %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer
+  %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+  ret i32 %4
+}
+
+define i32 @constant_multiplied_3xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_3xi32(
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = insertelement <3 x i32> poison, i32 %0, i64 0
+  %3 = shufflevector <3 x i32> %2, <3 x i32> poison, <3 x i32> zeroinitializer
+  %4 = tail call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %3)
+  ret i32 %4
+}
+
+define i64 @constant_multiplied_4xi64(i64 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi64(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0:%.*]], 2
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = insertelement <4 x i64> poison, i64 %0, i64 0
+  %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <4 x i32> zeroinitializer
+  %4 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
+  ret i64 %4
+}
+
+define i32 @constant_multiplied_8xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_8xi32(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 3
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+  %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <8 x i32> zeroinitializer
+  %4 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
+  ret i32 %4
+}
+
+
+define i32 @constant_multiplied_16xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_16xi32(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 4
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+  %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> zeroinitializer
+  %4 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
+  ret i32 %4
+}
+
+
+define i32 @constant_multiplied_4xi32_at_idx1(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi32_at_idx1(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = insertelement <4 x i32> poison, i32 %0, i64 1
+  %3 = shufflevector <4 x i32> %2, <4 x i32> poison,
+                     <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+  ret i32 %4
+}
+
+define i32 @negative_constant_multiplied_4xi32(i32 %0) {
+; CHECK-LABEL: @negative_constant_multiplied_4xi32(
+; CHECK-NEXT:    ret i32 poison
+;
+  %2 = insertelement <4 x i32> poison, i32 %0, i64 1
+  %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer
+  %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+  ret i32 %4
+}
+
+define i32 @constant_multiplied_6xi32(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_6xi32(
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP0:%.*]], 6
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+  %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <6 x i32> zeroinitializer
+  %4 = tail call i32 @llvm.vector.reduce.add.v6i32(<6 x i32> %3)
+  ret i32 %4
+}
+
+define i64 @constant_multiplied_6xi64(i64 %0) {
+; CHECK-LABEL: @constant_multiplied_6xi64(
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP0:%.*]], 6
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
+  %2 = insertelement <4 x i64> poison, i64 %0, i64 0
+  %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <6 x i32> zeroinitializer
+  %4 = tail call i64 @llvm.vector.reduce.add.v6i64(<6 x i64> %3)
+  ret i64 %4
+}
+
+define i1 @constant_multiplied_8xi1(i1 %0) {
+; CHECK-LABEL: @constant_multiplied_8xi1(
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i1> poison, i1 [[TMP0:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP4]])
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i8 [[TMP5]] to i1
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %2 = insertelement <8 x i1> poison, i1 %0, i32 0
+  %3 = shufflevector <8 x i1> %2, <8 x i1> poison, <8 x i32> zeroinitializer
+  %4 = tail call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> %3)
+  ret i1 %4
+}
+
+define i2 @constant_multiplied_4xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_4xi2(
+; CHECK-NEXT:    ret i2 0
+;
+  %2 = insertelement <4 x i2> poison, i2 %0, i32 0
+  %3 = shufflevector <4 x i2> %2, <4 x i2> poison, <4 x i32> zeroinitializer
+  %4 = tail call i2 @llvm.vector.reduce.add.v4i2(<4 x i2> %3)
+  ret i2 %4
+}
+
+define i2 @constant_multiplied_5xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_5xi2(
+; CHECK-NEXT:    ret i2 [[TMP0:%.*]]
+;
+  %2 = insertelement <5 x i2> poison, i2 %0, i64 0
+  %3 = shufflevector <5 x i2> %2, <5 x i2> poison, <5 x i32> zeroinitializer
+  %4 = tail call i2 @llvm.vector.reduce.add.v5i2(<5 x i2> %3)
+  ret i2 %4
+}
+
+define i2 @constant_multiplied_6xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_6xi2(
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i2 [[TMP0:%.*]], 1
+; CHECK-NEXT:    ret i2 [[TMP2]]
+;
+  %2 = insertelement <6 x i2> poison, i2 %0, i64 0
+  %3 = shufflevector <6 x i2> %2, <6 x i2> poison, <6 x i32> zeroinitializer
+  %4 = tail call i2 @llvm.vector.reduce.add.v6i2(<6 x i2> %3)
+  ret i2 %4
+}
+
+define i2 @constant_multiplied_7xi2(i2 %0) {
+; CHECK-LABEL: @constant_multiplied_7xi2(
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i2 0, [[TMP0:%.*]]
+; CHECK-NEXT:    ret i2 [[TMP2]]
+;
+  %2 = insertelement <7 x i2> poison, i2 %0, i64 0
+  %3 = shufflevector <7 x i2> %2, <7 x i2> poison, <7 x i32> zeroinitializer
+  %4 = tail call i2 @llvm.vector.reduce.add.v7i2(<7 x i2> %3)
+  ret i2 %4
+}
+
+define i32 @negative_scalable_vector(i32 %0) {
+; CHECK-LABEL: @negative_scalable_vector(
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP0:%.*]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP3]])
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %2 = insertelement <vscale x 4 x i32> poison, i32 %0, i64 0
+  %3 = shufflevector <vscale x 4 x i32> %2, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %4 = tail call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %3)
+  ret i32 %4
+}
From 2b67f5e0b444976057f097a6e4872177d2a1cc8b Mon Sep 17 00:00:00 2001
From: Aadesh Premkumar
Date: Mon, 29 Sep 2025 05:04:42 +0530
Subject: [PATCH 063/878] [SPIRV] Addition of image_store.ll and signed_arithmetic_overflow.ll (#152289)

--Test for signed arithmetic overflow intrinsics, which is expected to
fail for now.
--Test checking that no duplicate image types are emitted.

---------

Co-authored-by: Michal Paszkowski
---
 llvm/test/CodeGen/SPIRV/image_store.ll        | 22 ++++++++++++++
 .../signed_arithmetic_overflow.ll             | 30 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/image_store.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll

diff --git a/llvm/test/CodeGen/SPIRV/image_store.ll b/llvm/test/CodeGen/SPIRV/image_store.ll
new file mode 100644
index 0000000000000..a70651c974f36
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/image_store.ll
@@ -0,0 +1,22 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Image types may be represented in two ways while translating to SPIR-V:
+; - OpenCL form, for example, '%opencl.image2d_ro_t',
+; - SPIR-V form, for example, '%spirv.Image._void_1_0_0_0_0_0_0',
+; but it is still one type which should be translated to one SPIR-V type.
+;
+; The test checks that the code below is successfully translated and only one
+; SPIR-V type for images is generated (no duplicate OpTypeImage instructions).
+
+; CHECK: %[[#]] = OpTypeImage %[[#]] 2D
+; CHECK-NOT: %[[#]] = OpTypeImage %[[#]] 2D
+
+declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(ptr addrspace(1), ptr addrspace(2), <2 x float>, float)
+
+define spir_kernel void @read_image(ptr addrspace(1) %srcimg, ptr addrspace(2) %sampler){
+entry:
+  %spirvimg.addr = alloca target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), align 8
+  %val = call <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(ptr addrspace(1) %srcimg, ptr addrspace(2) %sampler, <2 x float> zeroinitializer, float 0.0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll
new file mode 100644
index 0000000000000..52f939faf0a9f
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll
@@ -0,0 +1,30 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -filetype=obj -o - | spirv-val %}
+; XFAIL: *
+; @llvm.sadd.with.overflow and @llvm.ssub.with.overflow have not been implemented.
+
+define spir_func void @test_sadd_overflow(ptr %out_result, ptr %out_overflow, i32 %a, i32 %b) {
+entry:
+  %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+  %val = extractvalue { i32, i1 } %res, 0
+  %ofl = extractvalue { i32, i1 } %res, 1
+  store i32 %val, ptr %out_result
+  %zext_ofl = zext i1 %ofl to i8
+  store i8 %zext_ofl, ptr %out_overflow
+  ret void
+}
+
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
+
+define spir_func void @test_ssub_overflow(ptr %out_result, ptr %out_overflow, i32 %a, i32 %b) {
+entry:
+  %res = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
+  %val = extractvalue { i32, i1 } %res, 0
+  %ofl = extractvalue { i32, i1 } %res, 1
+  store i32 %val, ptr %out_result
+  %zext_ofl = zext i1 %ofl to i8
+  store i8 %zext_ofl, ptr %out_overflow
+  ret void
+}
+
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32)
From cac0635ee9e947b5f90130df2f471aa4b722e04b Mon Sep 17 00:00:00 2001
From: Ebin-McW
Date: Mon, 29 Sep 2025 05:21:49 +0530
Subject: [PATCH 064/878] [SPIRV] Frexp intrinsic implementation (#157436)

- Makes use of the OpenCL extended instruction frexp.
- Creates a variable and passes it to the OpExtInst instruction.
---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 53 +++++++-
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp  |  3 +
 .../CodeGen/SPIRV/llvm-intrinsics/frexp.ll    | 114 ++++++++++++++++++
 3 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 5266e204bb32f..1aadd9df189a8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -314,7 +314,8 @@ class SPIRVInstructionSelector : public InstructionSelector {
                     MachineInstr &I) const;
   bool selectModf(Register ResVReg, const SPIRVType *ResType,
                   MachineInstr &I) const;
-
+  bool selectFrexp(Register ResVReg, const SPIRVType *ResType,
+                   MachineInstr &I) const;
   // Utilities
   std::pair<Register, bool> buildI32Constant(uint32_t Val, MachineInstr &I,
@@ -835,6 +836,9 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
   case TargetOpcode::G_USUBSAT:
     return selectExtInst(ResVReg, ResType, I, CL::u_sub_sat);
 
+  case TargetOpcode::G_FFREXP:
+    return selectFrexp(ResVReg, ResType, I);
+
   case TargetOpcode::G_UADDO:
     return selectOverflowArith(ResVReg, ResType, I,
                                ResType->getOpcode() == SPIRV::OpTypeVector
@@ -1119,6 +1123,53 @@ bool SPIRVInstructionSelector::selectExtInstForLRound(
   return false;
 }
 
+bool SPIRVInstructionSelector::selectFrexp(Register ResVReg,
+                                           const SPIRVType *ResType,
+                                           MachineInstr &I) const {
+  ExtInstList ExtInsts = {{SPIRV::InstructionSet::OpenCL_std, CL::frexp},
+                          {SPIRV::InstructionSet::GLSL_std_450, GL::Frexp}};
+  for (const auto &Ex : ExtInsts) {
+    SPIRV::InstructionSet::InstructionSet Set = Ex.first;
+    uint32_t Opcode = Ex.second;
+    if (!STI.canUseExtInstSet(Set))
+      continue;
+
+    MachineIRBuilder MIRBuilder(I);
+    SPIRVType *PointeeTy = GR.getSPIRVTypeForVReg(I.getOperand(1).getReg());
+    const SPIRVType *PointerType = GR.getOrCreateSPIRVPointerType(
+        PointeeTy, MIRBuilder, SPIRV::StorageClass::Function);
+    Register PointerVReg =
+        createVirtualRegister(PointerType, &GR, MRI, MRI->getMF());
+
+    auto It = getOpVariableMBBIt(I);
+    auto MIB = BuildMI(*It->getParent(), It, It->getDebugLoc(),
+                       TII.get(SPIRV::OpVariable))
+                   .addDef(PointerVReg)
+                   .addUse(GR.getSPIRVTypeID(PointerType))
+                   .addImm(static_cast<uint32_t>(SPIRV::StorageClass::Function))
+                   .constrainAllUses(TII, TRI, RBI);
+
+    MIB = MIB &
+          BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
+              .addDef(ResVReg)
+              .addUse(GR.getSPIRVTypeID(ResType))
+              .addImm(static_cast<uint32_t>(Ex.first))
+              .addImm(Opcode)
+              .add(I.getOperand(2))
+              .addUse(PointerVReg)
+              .constrainAllUses(TII, TRI, RBI);
+
+    MIB = MIB &
+          BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad))
+              .addDef(I.getOperand(1).getReg())
+              .addUse(GR.getSPIRVTypeID(PointeeTy))
+              .addUse(PointerVReg)
+              .constrainAllUses(TII, TRI, RBI);
+    return MIB;
+  }
+  return false;
+}
+
 bool SPIRVInstructionSelector::selectOpWithSrcs(Register ResVReg,
                                                 const SPIRVType *ResType,
                                                 MachineInstr &I,
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 27bb54c2d2e31..b4fc8dabbd4df 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -290,6 +290,9 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
   // Control-flow. In some cases (e.g. constants) s1 may be promoted to s32.
getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s32}); + getActionDefinitionsBuilder(G_FFREXP).legalForCartesianProduct( + allFloatScalarsAndVectors, {s32, v2s32, v3s32, v4s32, v8s32, v16s32}); + // TODO: Review the target OpenCL and GLSL Extended Instruction Set specs to // tighten these requirements. Many of these math functions are only legal on // specific bitwidths, so they are not selectable for diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll new file mode 100644 index 0000000000000..f6434e94a9d79 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll @@ -0,0 +1,114 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" +; CHECK-DAG: %[[#float_32_type:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#int_32_type:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#fn_ptr_type_i32:]] = OpTypePointer Function %[[#int_32_type]] +; CHECK-DAG: %[[#const_negzero:]] = OpConstant %[[#float_32_type]] -0 +; CHECK-DAG: %[[#vec2_float_type:]] = OpTypeVector %[[#float_32_type]] 2 +; CHECK-DAG: %[[#vec2_int_type:]] = OpTypeVector %[[#int_32_type]] 2 +; CHECK-DAG: %[[#fn_ptr_type_vec2_i32:]] = OpTypePointer Function %[[#vec2_int_type]] +; CHECK-DAG: %[[#vec2_null:]] = OpConstantNull %[[#vec2_float_type]] +; CHECK-DAG: %[[#scalar_null:]] = OpConstantNull %[[#float_32_type]] +; CHECK-DAG: %[[#const_composite1:]] = OpConstantComposite %[[#vec2_float_type]] %[[#scalar_null]] %[[#const_negzero]] +; CHECK-DAG: %[[#vec4_float_type:]] = OpTypeVector %[[#float_32_type]] 4 +; CHECK-DAG: %[[#vec4_int_type:]] = OpTypeVector %[[#int_32_type]] 4 +; CHECK-DAG: %[[#fn_ptr_type_vec4_i32:]] = OpTypePointer Function %[[#vec4_int_type]] +; CHECK-DAG: %[[#const_composite2:]] = OpConstantComposite %[[#vec4_float_type]] %[[#const_16:]] %[[#const_neg32:]] %[[#const_0:]] %[[#const_9999:]] +; CHECK-DAG: %[[#float_64_type:]] = OpTypeFloat 64 +; CHECK-DAG: %[[#vec2_double_type:]] = OpTypeVector %[[#float_64_type]] 2 + +; CHECK: %[[#]] = OpFunctionParameter %[[#float_32_type]] +; CHECK: %[[#var1:]] = OpVariable %[[#fn_ptr_type_i32]] Function +; CHECK: %[[#extinst1:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#const_negzero]] %[[#var1]] +; CHECK: %[[#exp_part_var:]] = OpLoad %[[#int_32_type]] %[[#var1]] +; CHECK: OpReturnValue %[[#exp_part_var]] +define i32 @frexp_negzero(float %x) { + %ret = call { float, i32 } @llvm.frexp.f32.i32(float -0.0) + %f_part = extractvalue { float, i32 } %ret, 0 + %exp_part = extractvalue { float, i32 } %ret, 1 + ret i32 %exp_part +} + +; CHECK: %[[#x_var4:]] = OpFunctionParameter %[[#float_32_type]] +; CHECK: %[[#var10:]] = OpVariable %[[#fn_ptr_type_i32]] Function +; CHECK: %[[#extinst10:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#x_var4]] %[[#var10]] +; CHECK: %[[#exp_part_var2:]] = OpLoad %[[#int_32_type]] %[[#var10]] +; CHECK: OpReturnValue %[[#exp_part_var2]] +define i32 @frexp_frexp_get_int(float %x) { + %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x) + %f_part = extractvalue { float, i32 } %frexp0, 0 + %exp_part = extractvalue { float, i32 } %frexp0, 1 + ret i32 %exp_part +} + +; CHECK: %[[#var3:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function +; CHECK: %[[#extinst3:]] = OpExtInst %[[#vec2_float_type]] %[[#extinst_id]] frexp %[[#vec2_null]] %[[#var3]] +; CHECK: %[[#f_part_var2:]] 
= OpLoad %[[#vec2_int_type]] %[[#var3]]
+; CHECK: OpReturnValue %[[#extinst3]]
+define <2 x float> @frexp_zero_vector() {
+  %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> zeroinitializer)
+  %f_part = extractvalue { <2 x float>, <2 x i32> } %ret, 0
+  %exp_part = extractvalue { <2 x float>, <2 x i32> } %ret, 1
+  ret <2 x float> %f_part
+}
+
+; CHECK: %[[#var4:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function
+; CHECK: %[[#extinst4:]] = OpExtInst %[[#vec2_float_type]] %[[#extinst_id]] frexp %[[#const_composite1]] %[[#var4]]
+; CHECK: %[[#f_part_var3:]] = OpLoad %[[#vec2_int_type]] %[[#var4]]
+; CHECK: OpReturnValue %[[#extinst4]]
+define <2 x float> @frexp_zero_negzero_vector() {
+  %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> <float 0.0, float -0.0>)
+  %f_part = extractvalue { <2 x float>, <2 x i32> } %ret, 0
+  %exp_part = extractvalue { <2 x float>, <2 x i32> } %ret, 1
+  ret <2 x float> %f_part
+}
+
+; CHECK: %[[#var5:]] = OpVariable %[[#fn_ptr_type_vec4_i32]] Function
+; CHECK: %[[#extinst5:]] = OpExtInst %[[#vec4_float_type]] %[[#extinst_id]] frexp %[[#const_composite2]] %[[#var5]]
+; CHECK: %[[#f_part_var4:]] = OpLoad %[[#vec4_int_type]] %[[#var5]]
+; CHECK: OpReturnValue %[[#extinst5]]
+define <4 x float> @frexp_nonsplat_vector() {
+  %ret = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> <float 16.0, float -32.0, float 0.0, float 9999.0>)
+  %f_part = extractvalue { <4 x float>, <4 x i32> } %ret, 0
+  %exp_part = extractvalue { <4 x float>, <4 x i32> } %ret, 1
+  ret <4 x float> %f_part
+}
+
+; CHECK: %[[#x_var2:]] = OpFunctionParameter %[[#float_32_type]]
+; CHECK: %[[#var6:]] = OpVariable %[[#fn_ptr_type_i32]] Function
+; CHECK: %[[#var7:]] = OpVariable %[[#fn_ptr_type_i32]] Function
+; CHECK: %[[#extinst6:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#x_var2]] %[[#var6]]
+; CHECK: %[[#load1:]] = OpLoad %[[#int_32_type]] %[[#var6]]
+; CHECK: %[[#extinst7:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#extinst6]] %[[#var7]]
+; CHECK: %[[#f_part_var5:]] = OpLoad %[[#int_32_type]] %[[#var7]]
+; CHECK: OpReturnValue %[[#extinst7]]
+define float @frexp_frexp(float %x) {
+  %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x)
+  %frexp0_f_part = extractvalue { float, i32 } %frexp0, 0
+  %frexp0_exp_part = extractvalue { float, i32 } %frexp0, 1
+  %frexp1 = call { float, i32 } @llvm.frexp.f32.i32(float %frexp0_f_part)
+  %frexp1_f_part = extractvalue { float, i32 } %frexp1, 0
+  %frexp1_exp_part = extractvalue { float, i32 } %frexp1, 1
+  ret float %frexp1_f_part
+}
+
+; CHECK: %[[#x_var3:]] = OpFunctionParameter %[[#vec2_double_type]]
+; CHECK: %[[#var9:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function
+; CHECK: %[[#extinst9:]] = OpExtInst %[[#vec2_double_type]] %[[#extinst_id]] frexp %[[#x_var3]] %[[#var9]]
+; CHECK: %[[#f_part_var6:]] = OpLoad %[[#vec2_int_type]] %[[#var9]]
+; CHECK: OpReturnValue %[[#extinst9]]
+define <2 x double> @frexp_frexp_vector(<2 x double> %x) {
+  %frexp0 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %x)
+  %f_part = extractvalue { <2 x double>, <2 x i32> } %frexp0, 0
+  %exp_part = extractvalue { <2 x double>, <2 x i32> } %frexp0, 1
+  ret <2 x double> %f_part
+}
+
+declare { float, i32 } @llvm.frexp.f32.i32(float)
+declare { double, i32 } @llvm.frexp.f64.i32(double)
+declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>)
+declare { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float>)
+declare { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double>)
+declare { float, i8 } @llvm.frexp.f32.i8(float)
From 12f42e5ed9286711287e17ec35ea9bbdeb2aa92e Mon Sep 17 00:00:00 2001
From: wanglei
Date: Mon, 29 Sep 2025 09:02:35 +0800
Subject: [PATCH 065/878] [LoongArch] Add option for merge base offset pass

Add `loongarch-enable-merge-offset` option to allow disabling the
`MergeBaseOffset` pass when using optimization.

Reviewers: SixWeining, heiher

Reviewed By: SixWeining, heiher

Pull Request: https://github.com/llvm/llvm-project/pull/161063
---
 .../LoongArch/LoongArchTargetMachine.cpp      |  7 +++++-
 .../CodeGen/LoongArch/merge-offset-option.ll  | 24 +++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/LoongArch/merge-offset-option.ll

diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index d0a8ababe8e58..c5e26c106b5df 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -57,6 +57,11 @@ static cl::opt<bool>
                            cl::desc("Enable the loop data prefetch pass"),
                            cl::init(false));
 
+static cl::opt<bool>
+    EnableMergeBaseOffset("loongarch-enable-merge-offset",
+                          cl::desc("Enable the merge base offset pass"),
+                          cl::init(true), cl::Hidden);
+
 static Reloc::Model getEffectiveRelocModel(const Triple &TT,
                                            std::optional<Reloc::Model> RM) {
   return RM.value_or(Reloc::Static);
@@ -214,7 +219,7 @@ void LoongArchPassConfig::addMachineSSAOptimization() {
 
 void LoongArchPassConfig::addPreRegAlloc() {
   addPass(createLoongArchPreRAExpandPseudoPass());
-  if (TM->getOptLevel() != CodeGenOptLevel::None)
+  if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMergeBaseOffset)
     addPass(createLoongArchMergeBaseOffsetOptPass());
 }
 
diff --git a/llvm/test/CodeGen/LoongArch/merge-offset-option.ll b/llvm/test/CodeGen/LoongArch/merge-offset-option.ll
new file mode 100644
index 0000000000000..e5351a6589cf7
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/merge-offset-option.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 -mattr=+d --relocation-model=static -O1 \
+; RUN:   < %s | FileCheck %s --check-prefix=MERGE
+; RUN: llc --mtriple=loongarch64 -mattr=+d --relocation-model=static -O1 \
+; RUN:   --loongarch-enable-merge-offset=false < %s | FileCheck %s --check-prefix=NO_MERGE
+
+@g = dso_local global i32 zeroinitializer, align 4
+
+define void @foo() nounwind {
+; MERGE-LABEL: foo:
+; MERGE:       # %bb.0:
+; MERGE-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; MERGE-NEXT:    ld.w $zero, $a0, %pc_lo12(g)
+; MERGE-NEXT:    ret
+;
+; NO_MERGE-LABEL: foo:
+; NO_MERGE:       # %bb.0:
+; NO_MERGE-NEXT:    pcalau12i $a0, %pc_hi20(g)
+; NO_MERGE-NEXT:    addi.d $a0, $a0, %pc_lo12(g)
+; NO_MERGE-NEXT:    ld.w $zero, $a0, 0
+; NO_MERGE-NEXT:    ret
+  %v = load volatile i32, ptr @g
+  ret void
+}
From abffc542ff876d9e74d64ee5cb83ec405ec2e70e Mon Sep 17 00:00:00 2001
From: Phoebe Wang
Date: Mon, 29 Sep 2025 12:30:46 +0800
Subject: [PATCH 066/878] [X86][MemFold] Allow masked load folding if masks are equal (#161074)

Inspired by #160920#issuecomment-3341816198
---
 llvm/lib/Target/X86/X86InstrAVX512.td   |  2 ++
 llvm/lib/Target/X86/X86InstrInfo.cpp    | 34 ++++++++++++++++++++++++-
 llvm/test/CodeGen/X86/avx512-mask-op.ll |  6 ++---
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index b8f299965faa3..2371ed4ed14a1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3238,6 +3238,7 @@ multiclass
avx512_load opc, string OpcodeStr, string Name, (_.VT _.RC:$src1), (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.RR]>; + let mayLoad = 1, canFoldAsLoad = 1 in def rmk : AVX512PI opc, string OpcodeStr, string Name, (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.RM]>; } + let mayLoad = 1, canFoldAsLoad = 1 in def rmkz : AVX512PI= 3) { + Register MaskReg; + const MachineOperand &Op1 = LoadMI.getOperand(1); + const MachineOperand &Op2 = LoadMI.getOperand(2); + + auto IsVKWMClass = [](const TargetRegisterClass *RC) { + return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass || + RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass || + RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass; + }; + + if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI))) + MaskReg = Op1.getReg(); + else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI))) + MaskReg = Op2.getReg(); + + if (MaskReg) { + bool HasSameMask = false; + for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) { + const MachineOperand &Op = MI.getOperand(I); + if (Op.isReg() && Op.getReg() == MaskReg) { + HasSameMask = true; + break; + } + } + if (!HasSameMask) + return nullptr; + } + } + // TODO: Support the case where LoadMI loads a wide register, but MI // only uses a subreg. for (auto Op : Ops) { @@ -8121,7 +8154,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( } // If loading from a FrameIndex, fold directly from the FrameIndex. - unsigned NumOps = LoadMI.getDesc().getNumOperands(); int FrameIndex; if (isLoadFromStackSlot(LoadMI, FrameIndex)) { if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 8aa898f3ec576..da0cef0e4e99b 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -2119,8 +2119,7 @@ define void @ktest_1(<8 x double> %in, ptr %base) { ; KNL-LABEL: ktest_1: ; KNL: ## %bb.0: ; KNL-NEXT: vcmpgtpd (%rdi), %zmm0, %k1 -; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} -; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} +; KNL-NEXT: vcmpltpd 8(%rdi), %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al ; KNL-NEXT: je LBB44_2 @@ -2152,8 +2151,7 @@ define void @ktest_1(<8 x double> %in, ptr %base) { ; AVX512BW-LABEL: ktest_1: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: vcmpgtpd (%rdi), %zmm0, %k1 -; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: vcmpltpd 8(%rdi), %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al ; AVX512BW-NEXT: je LBB44_2 From 23d3caf854f59fcf6cc73a25240cd9f58ac64b6d Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Mon, 29 Sep 2025 07:58:03 +0300 Subject: [PATCH 067/878] [libc][math] Refactor exp10m1f implementation to header-only in src/__support/math folder. 
(#159897) Part of #147386 in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- libc/shared/math.h | 1 + libc/shared/math/exp10m1f.h | 23 ++ libc/src/__support/math/CMakeLists.txt | 17 ++ libc/src/__support/math/exp10m1f.h | 234 ++++++++++++++++++ libc/src/math/generic/CMakeLists.txt | 11 +- libc/src/math/generic/exp10m1f.cpp | 209 +--------------- libc/test/shared/CMakeLists.txt | 1 + libc/test/shared/shared_math_test.cpp | 1 + .../llvm-project-overlay/libc/BUILD.bazel | 18 +- 9 files changed, 297 insertions(+), 218 deletions(-) create mode 100644 libc/shared/math/exp10m1f.h create mode 100644 libc/src/__support/math/exp10m1f.h diff --git a/libc/shared/math.h b/libc/shared/math.h index 9ba898ea6dac9..cccd6a375930e 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -45,6 +45,7 @@ #include "math/exp10.h" #include "math/exp10f.h" #include "math/exp10f16.h" +#include "math/exp10m1f.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp10m1f.h b/libc/shared/math/exp10m1f.h new file mode 100644 index 0000000000000..9093705ce801b --- /dev/null +++ b/libc/shared/math/exp10m1f.h @@ -0,0 +1,23 @@ +//===-- Shared exp10m1f function --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10M1F_H +#define LLVM_LIBC_SHARED_MATH_EXP10M1F_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp10m1f.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10m1f; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP10M1F_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 12ffa2ab456e7..84c1b15498672 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -481,6 +481,23 @@ add_header_library( libc.src.__support.FPUtil.generic.sqrt ) +add_header_library( + exp10m1f + HDRS + exp10m1f.h + DEPENDS + .exp10f_utils + libc.src.errno.errno + libc.src.__support.common + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization +) + add_header_library( erff HDRS diff --git a/libc/src/__support/math/exp10m1f.h b/libc/src/__support/math/exp10m1f.h new file mode 100644 index 0000000000000..9fe4ff774ec68 --- /dev/null +++ b/libc/src/__support/math/exp10m1f.h @@ -0,0 +1,234 @@ +//===-- Implementation header for exp10m1f ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F_H
+
+#include "exp10f_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+namespace exp10m1f_internal {
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+static constexpr size_t N_EXCEPTS_LO = 11;
+
+static constexpr fputil::ExceptValues<float, N_EXCEPTS_LO> EXP10M1F_EXCEPTS_LO =
+    {{
+        // x = 0x1.0fe54ep-11, exp10m1f(x) = 0x1.3937eep-10 (RZ)
+        {0x3a07'f2a7U, 0x3a9c'9bf7U, 1U, 0U, 1U},
+        // x = 0x1.80e6eap-11, exp10m1f(x) = 0x1.bb8272p-10 (RZ)
+        {0x3a40'7375U, 0x3add'c139U, 1U, 0U, 1U},
+        // x = -0x1.2a33bcp-51, exp10m1f(x) = -0x1.57515ep-50 (RZ)
+        {0xa615'19deU, 0xa6ab'a8afU, 0U, 1U, 0U},
+        // x = -0x0p+0, exp10m1f(x) = -0x0p+0 (RZ)
+        {0x8000'0000U, 0x8000'0000U, 0U, 0U, 0U},
+        // x = -0x1.b59e08p-31, exp10m1f(x) = -0x1.f7d356p-30 (RZ)
+        {0xb05a'cf04U, 0xb0fb'e9abU, 0U, 1U, 1U},
+        // x = -0x1.bf342p-12, exp10m1f(x) = -0x1.014e02p-10 (RZ)
+        {0xb9df'9a10U, 0xba80'a701U, 0U, 1U, 0U},
+        // x = -0x1.6207fp-11, exp10m1f(x) = -0x1.9746cap-10 (RZ)
+        {0xba31'03f8U, 0xbacb'a365U, 0U, 1U, 1U},
+        // x = -0x1.bd0c66p-11, exp10m1f(x) = -0x1.ffe168p-10 (RZ)
+        {0xba5e'8633U, 0xbaff'f0b4U, 0U, 1U, 1U},
+        // x = -0x1.ffd84cp-10, exp10m1f(x) = -0x1.25faf2p-8 (RZ)
+        {0xbaff'ec26U, 0xbb92'fd79U, 0U, 1U, 0U},
+        // x = -0x1.a74172p-9, exp10m1f(x) = -0x1.e57be2p-8 (RZ)
+        {0xbb53'a0b9U, 0xbbf2'bdf1U, 0U, 1U, 1U},
+        // x = -0x1.cb694cp-9, exp10m1f(x) = -0x1.0764e4p-7 (RZ)
+        {0xbb65'b4a6U, 0xbc03'b272U, 0U, 1U, 0U},
+    }};
+
+static constexpr size_t N_EXCEPTS_HI = 19;
+
+static constexpr fputil::ExceptValues<float, N_EXCEPTS_HI> EXP10M1F_EXCEPTS_HI =
+    {{
+        // (input, RZ output, RU offset, RD offset, RN offset)
+        // x = 0x1.8d31eep-8, exp10m1f(x) = 0x1.cc7e4cp-7 (RZ)
+        {0x3bc6'98f7U, 0x3c66'3f26U, 1U, 0U, 1U},
+        // x = 0x1.915fcep-8, exp10m1f(x) = 0x1.d15f72p-7 (RZ)
+        {0x3bc8'afe7U, 0x3c68'afb9U, 1U, 0U, 0U},
+        // x = 0x1.bcf982p-8, exp10m1f(x) = 0x1.022928p-6 (RZ)
+        {0x3bde'7cc1U, 0x3c81'1494U, 1U, 0U, 1U},
+        // x = 0x1.99ff0ap-7, exp10m1f(x) = 0x1.dee416p-6 (RZ)
+        {0x3c4c'ff85U, 0x3cef'720bU, 1U, 0U, 0U},
+        // x = 0x1.75ea14p-6, exp10m1f(x) = 0x1.b9ff16p-5 (RZ)
+        {0x3cba'f50aU, 0x3d5c'ff8bU, 1U, 0U, 0U},
+        // x = 0x1.f81b64p-6, exp10m1f(x) = 0x1.2cb6bcp-4 (RZ)
+        {0x3cfc'0db2U, 0x3d96'5b5eU, 1U, 0U, 0U},
+        // x = 0x1.fafecp+3, exp10m1f(x) = 0x1.8c880ap+52 (RZ)
+        {0x417d'7f60U, 0x59c6'4405U, 1U, 0U, 0U},
+        // x = -0x1.3bf094p-8, exp10m1f(x) = -0x1.69ba4ap-7 (RZ)
+        {0xbb9d'f84aU, 0xbc34'dd25U, 0U, 1U, 0U},
+        // x = -0x1.4558bcp-8, exp10m1f(x) = -0x1.746fb8p-7 (RZ)
+        {0xbba2'ac5eU, 0xbc3a'37dcU, 0U, 1U, 1U},
+        // x = -0x1.4bb43p-8, exp10m1f(x) = -0x1.7babe4p-7 (RZ)
+        {0xbba5'da18U, 0xbc3d'd5f2U, 0U, 1U, 1U},
+        // x = -0x1.776cc8p-8, exp10m1f(x) = -0x1.ad62c4p-7 (RZ)
+        {0xbbbb'b664U, 0xbc56'b162U, 0U, 1U, 0U},
+        // x = -0x1.f024cp-8, exp10m1f(x) = -0x1.1b20d6p-6 (RZ)
+        {0xbbf8'1260U, 0xbc8d'906bU, 0U, 1U, 1U},
+        // x = -0x1.f510eep-8, exp10m1f(x) = -0x1.1de9aap-6 (RZ)
+        {0xbbfa'8877U, 0xbc8e'f4d5U, 0U, 1U, 0U},
+        // x = -0x1.0b43c4p-7, exp10m1f(x) = -0x1.30d418p-6 (RZ)
+        {0xbc05'a1e2U, 0xbc98'6a0cU, 0U, 1U, 0U},
+        // x = -0x1.245ee4p-7, exp10m1f(x) = -0x1.4d2b86p-6 (RZ)
+        {0xbc12'2f72U, 0xbca6'95c3U, 0U, 1U, 0U},
+        // x = -0x1.f9f2dap-7, exp10m1f(x) = -0x1.1e2186p-5 (RZ)
+        {0xbc7c'f96dU, 0xbd0f'10c3U, 0U, 1U, 0U},
+        // x = -0x1.08e42p-6, exp10m1f(x) = -0x1.2b5c4p-5 (RZ)
+        {0xbc84'7210U, 0xbd15'ae20U, 0U, 1U, 1U},
+        // x = -0x1.0cdc44p-5, exp10m1f(x) = -0x1.2a2152p-4 (RZ)
+        {0xbd06'6e22U, 0xbd95'10a9U, 0U, 1U, 1U},
+        // x = -0x1.ca4322p-5, exp10m1f(x) = -0x1.ef073p-4 (RZ)
+        {0xbd65'2191U, 0xbdf7'8398U, 0U, 1U, 1U},
+    }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+} // namespace exp10m1f_internal
+
+LIBC_INLINE static constexpr float exp10m1f(float x) {
+  using namespace exp10m1f_internal;
+  using FPBits = fputil::FPBits<float>;
+  FPBits xbits(x);
+
+  uint32_t x_u = xbits.uintval();
+  uint32_t x_abs = x_u & 0x7fff'ffffU;
+
+  // When x >= log10(2^128), or x is nan
+  if (LIBC_UNLIKELY(xbits.is_pos() && x_u >= 0x421a'209bU)) {
+    if (xbits.is_finite()) {
+      int rounding = fputil::quick_get_round();
+      if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
+        return FPBits::max_normal().get_val();
+
+      fputil::set_errno_if_required(ERANGE);
+      fputil::raise_except_if_required(FE_OVERFLOW);
+    }
+
+    // x >= log10(2^128) and 10^x - 1 rounds to +inf, or x is +inf or nan
+    return x + FPBits::inf().get_val();
+  }
+
+  // When |x| <= log10(2) * 2^(-6)
+  if (LIBC_UNLIKELY(x_abs <= 0x3b9a'209bU)) {
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+    if (auto r = EXP10M1F_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+      return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+    double dx = x;
+    double dx_sq = dx * dx;
+    double c0 = dx * Exp10Base::COEFFS[0];
+    double c1 =
+        fputil::multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
+    double c2 =
+        fputil::multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
+    // 10^dx - 1 ~ (1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5) - 1
+    //           = COEFFS[0] * dx + ... + COEFFS[4] * dx^5
+    return static_cast<float>(fputil::polyeval(dx_sq, c0, c1, c2));
+  }
+
+  // When x <= log10(2^-25), or x is nan
+  if (LIBC_UNLIKELY(x_u >= 0xc0f0d2f1)) {
+    // exp10m1(-inf) = -1
+    if (xbits.is_inf())
+      return -1.0f;
+    // exp10m1(nan) = nan
+    if (xbits.is_nan())
+      return x;
+
+    int rounding = fputil::quick_get_round();
+    if (rounding == FE_UPWARD || rounding == FE_TOWARDZERO ||
+        (rounding == FE_TONEAREST && x_u == 0xc0f0d2f1))
+      return -0x1.ffff'fep-1f; // -1.0f + 0x1.0p-24f
+
+    fputil::set_errno_if_required(ERANGE);
+    fputil::raise_except_if_required(FE_UNDERFLOW);
+    return -1.0f;
+  }
+
+  // Exact outputs when x = 1, 2, ..., 10.
+  // Quick check mask: 0x800f'ffffU = ~(bits of 1.0f | ... | bits of 10.0f)
+  if (LIBC_UNLIKELY((x_u & 0x800f'ffffU) == 0)) {
+    switch (x_u) {
+    case 0x3f800000U: // x = 1.0f
+      return 9.0f;
+    case 0x40000000U: // x = 2.0f
+      return 99.0f;
+    case 0x40400000U: // x = 3.0f
+      return 999.0f;
+    case 0x40800000U: // x = 4.0f
+      return 9'999.0f;
+    case 0x40a00000U: // x = 5.0f
+      return 99'999.0f;
+    case 0x40c00000U: // x = 6.0f
+      return 999'999.0f;
+    case 0x40e00000U: // x = 7.0f
+      return 9'999'999.0f;
+    case 0x41000000U: { // x = 8.0f
+      int rounding = fputil::quick_get_round();
+      if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
+        return 100'000'000.0f;
+      return 99'999'992.0f;
+    }
+    case 0x41100000U: { // x = 9.0f
+      int rounding = fputil::quick_get_round();
+      if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
+        return 1'000'000'000.0f;
+      return 999'999'936.0f;
+    }
+    case 0x41200000U: { // x = 10.0f
+      int rounding = fputil::quick_get_round();
+      if (rounding == FE_UPWARD || rounding == FE_TONEAREST)
+        return 10'000'000'000.0f;
+      return 9'999'998'976.0f;
+    }
+    }
+  }
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  if (auto r = EXP10M1F_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  // Range reduction: 10^x = 2^(mid + hi) * 10^lo
+  //   rr = (2^(mid + hi), lo)
+  auto rr = exp_b_range_reduc<Exp10Base>(x);
+
+  // The low part is approximated by a degree-5 minimax polynomial.
+  //   10^lo ~ 1 + COEFFS[0] * lo + ... + COEFFS[4] * lo^5
+  double lo_sq = rr.lo * rr.lo;
+  double c0 = fputil::multiply_add(rr.lo, Exp10Base::COEFFS[0], 1.0);
+  double c1 =
+      fputil::multiply_add(rr.lo, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]);
+  double c2 =
+      fputil::multiply_add(rr.lo, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]);
+  double exp10_lo = fputil::polyeval(lo_sq, c0, c1, c2);
+  // 10^x - 1 = 2^(mid + hi) * 10^lo - 1
+  //          ~ mh * exp10_lo - 1
+  return static_cast<float>(fputil::multiply_add(exp10_lo, rr.mh, -1.0));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 7cd34fab53480..8074a3925626c 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1593,16 +1593,7 @@ add_entrypoint_object(
   HDRS
     ../exp10m1f.h
   DEPENDS
-    libc.src.errno.errno
-    libc.src.__support.common
-    libc.src.__support.FPUtil.except_value_utils
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.FPUtil.rounding_mode
-    libc.src.__support.macros.optimization
-    libc.src.__support.math.exp10f_utils
+    libc.src.__support.math.exp10m1f
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/exp10m1f.cpp b/libc/src/math/generic/exp10m1f.cpp
index 8589e3fb6639d..87980b7753b40 100644
--- a/libc/src/math/generic/exp10m1f.cpp
+++ b/libc/src/math/generic/exp10m1f.cpp
@@ -7,215 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/exp10m1f.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
-#include "src/__support/libc_errno.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h"
-#include
"src/__support/math/exp10f_utils.h" +#include "src/__support/math/exp10m1f.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr size_t N_EXCEPTS_LO = 11; - -static constexpr fputil::ExceptValues EXP10M1F_EXCEPTS_LO = - {{ - // x = 0x1.0fe54ep-11, exp10m1f(x) = 0x1.3937eep-10 (RZ) - {0x3a07'f2a7U, 0x3a9c'9bf7U, 1U, 0U, 1U}, - // x = 0x1.80e6eap-11, exp10m1f(x) = 0x1.bb8272p-10 (RZ) - {0x3a40'7375U, 0x3add'c139U, 1U, 0U, 1U}, - // x = -0x1.2a33bcp-51, exp10m1f(x) = -0x1.57515ep-50 (RZ) - {0xa615'19deU, 0xa6ab'a8afU, 0U, 1U, 0U}, - // x = -0x0p+0, exp10m1f(x) = -0x0p+0 (RZ) - {0x8000'0000U, 0x8000'0000U, 0U, 0U, 0U}, - // x = -0x1.b59e08p-31, exp10m1f(x) = -0x1.f7d356p-30 (RZ) - {0xb05a'cf04U, 0xb0fb'e9abU, 0U, 1U, 1U}, - // x = -0x1.bf342p-12, exp10m1f(x) = -0x1.014e02p-10 (RZ) - {0xb9df'9a10U, 0xba80'a701U, 0U, 1U, 0U}, - // x = -0x1.6207fp-11, exp10m1f(x) = -0x1.9746cap-10 (RZ) - {0xba31'03f8U, 0xbacb'a365U, 0U, 1U, 1U}, - // x = -0x1.bd0c66p-11, exp10m1f(x) = -0x1.ffe168p-10 (RZ) - {0xba5e'8633U, 0xbaff'f0b4U, 0U, 1U, 1U}, - // x = -0x1.ffd84cp-10, exp10m1f(x) = -0x1.25faf2p-8 (RZ) - {0xbaff'ec26U, 0xbb92'fd79U, 0U, 1U, 0U}, - // x = -0x1.a74172p-9, exp10m1f(x) = -0x1.e57be2p-8 (RZ) - {0xbb53'a0b9U, 0xbbf2'bdf1U, 0U, 1U, 1U}, - // x = -0x1.cb694cp-9, exp10m1f(x) = -0x1.0764e4p-7 (RZ) - {0xbb65'b4a6U, 0xbc03'b272U, 0U, 1U, 0U}, - }}; - -static constexpr size_t N_EXCEPTS_HI = 19; - -static constexpr fputil::ExceptValues EXP10M1F_EXCEPTS_HI = - {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.8d31eep-8, exp10m1f(x) = 0x1.cc7e4cp-7 (RZ) - {0x3bc6'98f7U, 0x3c66'3f26U, 1U, 0U, 1U}, - // x = 0x1.915fcep-8, exp10m1f(x) = 0x1.d15f72p-7 (RZ) - {0x3bc8'afe7U, 0x3c68'afb9U, 1U, 0U, 0U}, - // x = 0x1.bcf982p-8, exp10m1f(x) = 0x1.022928p-6 (RZ) - {0x3bde'7cc1U, 0x3c81'1494U, 1U, 0U, 1U}, - // x = 0x1.99ff0ap-7, exp10m1f(x) = 0x1.dee416p-6 (RZ) - {0x3c4c'ff85U, 0x3cef'720bU, 1U, 0U, 0U}, - // x = 0x1.75ea14p-6, exp10m1f(x) = 0x1.b9ff16p-5 (RZ) - {0x3cba'f50aU, 0x3d5c'ff8bU, 1U, 0U, 0U}, - // x = 0x1.f81b64p-6, exp10m1f(x) = 0x1.2cb6bcp-4 (RZ) - {0x3cfc'0db2U, 0x3d96'5b5eU, 1U, 0U, 0U}, - // x = 0x1.fafecp+3, exp10m1f(x) = 0x1.8c880ap+52 (RZ) - {0x417d'7f60U, 0x59c6'4405U, 1U, 0U, 0U}, - // x = -0x1.3bf094p-8, exp10m1f(x) = -0x1.69ba4ap-7 (RZ) - {0xbb9d'f84aU, 0xbc34'dd25U, 0U, 1U, 0U}, - // x = -0x1.4558bcp-8, exp10m1f(x) = -0x1.746fb8p-7 (RZ) - {0xbba2'ac5eU, 0xbc3a'37dcU, 0U, 1U, 1U}, - // x = -0x1.4bb43p-8, exp10m1f(x) = -0x1.7babe4p-7 (RZ) - {0xbba5'da18U, 0xbc3d'd5f2U, 0U, 1U, 1U}, - // x = -0x1.776cc8p-8, exp10m1f(x) = -0x1.ad62c4p-7 (RZ) - {0xbbbb'b664U, 0xbc56'b162U, 0U, 1U, 0U}, - // x = -0x1.f024cp-8, exp10m1f(x) = -0x1.1b20d6p-6 (RZ) - {0xbbf8'1260U, 0xbc8d'906bU, 0U, 1U, 1U}, - // x = -0x1.f510eep-8, exp10m1f(x) = -0x1.1de9aap-6 (RZ) - {0xbbfa'8877U, 0xbc8e'f4d5U, 0U, 1U, 0U}, - // x = -0x1.0b43c4p-7, exp10m1f(x) = -0x1.30d418p-6 (RZ) - {0xbc05'a1e2U, 0xbc98'6a0cU, 0U, 1U, 0U}, - // x = -0x1.245ee4p-7, exp10m1f(x) = -0x1.4d2b86p-6 (RZ) - {0xbc12'2f72U, 0xbca6'95c3U, 0U, 1U, 0U}, - // x = -0x1.f9f2dap-7, exp10m1f(x) = -0x1.1e2186p-5 (RZ) - {0xbc7c'f96dU, 0xbd0f'10c3U, 0U, 1U, 0U}, - // x = -0x1.08e42p-6, exp10m1f(x) = -0x1.2b5c4p-5 (RZ) - {0xbc84'7210U, 0xbd15'ae20U, 0U, 1U, 1U}, - // x = -0x1.0cdc44p-5, exp10m1f(x) = -0x1.2a2152p-4 (RZ) - {0xbd06'6e22U, 0xbd95'10a9U, 0U, 1U, 1U}, - // x = -0x1.ca4322p-5, exp10m1f(x) = -0x1.ef073p-4 (RZ) - {0xbd65'2191U, 0xbdf7'8398U, 0U, 1U, 1U}, - }}; -#endif // 
!LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -LLVM_LIBC_FUNCTION(float, exp10m1f, (float x)) { - using FPBits = fputil::FPBits; - FPBits xbits(x); - - uint32_t x_u = xbits.uintval(); - uint32_t x_abs = x_u & 0x7fff'ffffU; - - // When x >= log10(2^128), or x is nan - if (LIBC_UNLIKELY(xbits.is_pos() && x_u >= 0x421a'209bU)) { - if (xbits.is_finite()) { - int rounding = fputil::quick_get_round(); - if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - } - - // x >= log10(2^128) and 10^x - 1 rounds to +inf, or x is +inf or nan - return x + FPBits::inf().get_val(); - } - - // When |x| <= log10(2) * 2^(-6) - if (LIBC_UNLIKELY(x_abs <= 0x3b9a'209bU)) { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP10M1F_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - double dx = x; - double dx_sq = dx * dx; - double c0 = dx * Exp10Base::COEFFS[0]; - double c1 = - fputil::multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]); - double c2 = - fputil::multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]); - // 10^dx - 1 ~ (1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5) - 1 - // = COEFFS[0] * dx + ... + COEFFS[4] * dx^5 - return static_cast(fputil::polyeval(dx_sq, c0, c1, c2)); - } - - // When x <= log10(2^-25), or x is nan - if (LIBC_UNLIKELY(x_u >= 0xc0f0d2f1)) { - // exp10m1(-inf) = -1 - if (xbits.is_inf()) - return -1.0f; - // exp10m1(nan) = nan - if (xbits.is_nan()) - return x; - - int rounding = fputil::quick_get_round(); - if (rounding == FE_UPWARD || rounding == FE_TOWARDZERO || - (rounding == FE_TONEAREST && x_u == 0xc0f0d2f1)) - return -0x1.ffff'fep-1f; // -1.0f + 0x1.0p-24f - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW); - return -1.0f; - } - - // Exact outputs when x = 1, 2, ..., 10. - // Quick check mask: 0x800f'ffffU = ~(bits of 1.0f | ... | bits of 10.0f) - if (LIBC_UNLIKELY((x_u & 0x800f'ffffU) == 0)) { - switch (x_u) { - case 0x3f800000U: // x = 1.0f - return 9.0f; - case 0x40000000U: // x = 2.0f - return 99.0f; - case 0x40400000U: // x = 3.0f - return 999.0f; - case 0x40800000U: // x = 4.0f - return 9'999.0f; - case 0x40a00000U: // x = 5.0f - return 99'999.0f; - case 0x40c00000U: // x = 6.0f - return 999'999.0f; - case 0x40e00000U: // x = 7.0f - return 9'999'999.0f; - case 0x41000000U: { // x = 8.0f - int rounding = fputil::quick_get_round(); - if (rounding == FE_UPWARD || rounding == FE_TONEAREST) - return 100'000'000.0f; - return 99'999'992.0f; - } - case 0x41100000U: { // x = 9.0f - int rounding = fputil::quick_get_round(); - if (rounding == FE_UPWARD || rounding == FE_TONEAREST) - return 1'000'000'000.0f; - return 999'999'936.0f; - } - case 0x41200000U: { // x = 10.0f - int rounding = fputil::quick_get_round(); - if (rounding == FE_UPWARD || rounding == FE_TONEAREST) - return 10'000'000'000.0f; - return 9'999'998'976.0f; - } - } - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP10M1F_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // Range reduction: 10^x = 2^(mid + hi) * 10^lo - // rr = (2^(mid + hi), lo) - auto rr = exp_b_range_reduc(x); - - // The low part is approximated by a degree-5 minimax polynomial. - // 10^lo ~ 1 + COEFFS[0] * lo + ... 
+ COEFFS[4] * lo^5 - double lo_sq = rr.lo * rr.lo; - double c0 = fputil::multiply_add(rr.lo, Exp10Base::COEFFS[0], 1.0); - double c1 = - fputil::multiply_add(rr.lo, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]); - double c2 = - fputil::multiply_add(rr.lo, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]); - double exp10_lo = fputil::polyeval(lo_sq, c0, c1, c2); - // 10^x - 1 = 2^(mid + hi) * 10^lo - 1 - // ~ mh * exp10_lo - 1 - return static_cast(fputil::multiply_add(exp10_lo, rr.mh, -1.0)); -} +LLVM_LIBC_FUNCTION(float, exp10m1f, (float x)) { return math::exp10m1f(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index 9f3e9838d6b78..13a0aae5d4c67 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -36,6 +36,7 @@ add_fp_unittest( libc.src.__support.math.cospif libc.src.__support.math.cospif16 libc.src.__support.math.dsqrtl + libc.src.__support.math.exp10m1f libc.src.__support.math.erff libc.src.__support.math.exp libc.src.__support.math.exp10 diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index 655e7fb48230e..25bf5ad8ae411 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -57,6 +57,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat) { EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::cosf(0.0f)); EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::coshf(0.0f)); EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::cospif(0.0f)); + EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::exp10m1f(0.0f)); EXPECT_FP_EQ(0x0p+0f, LIBC_NAMESPACE::shared::erff(0.0f)); EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::exp10f(0.0f)); EXPECT_FP_EQ(0x1p+0f, LIBC_NAMESPACE::shared::expf(0.0f)); diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 9d02ff9f459ae..8d9e80393bf20 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2657,6 +2657,22 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_exp10m1f", + hdrs = ["src/__support/math/exp10m1f.h"], + deps = [ + ":__support_fputil_except_value_utils", + ":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_macros_optimization", + ":__support_math_exp10f_utils", + ":errno", + ], +) + libc_support_library( name = "__support_math_erff", hdrs = ["src/__support/math/erff.h"], @@ -3613,7 +3629,7 @@ libc_math_function( libc_math_function( name = "exp10m1f", additional_deps = [ - ":__support_math_exp10f_utils", + ":__support_math_exp10m1f", ], ) From eb420fd669523fa4d8875d43b76326048c52698b Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Mon, 29 Sep 2025 07:02:36 +0200 Subject: [PATCH 068/878] [mlir][bzl] Fix missing dep (#161126) Flagged with `-Wprivate-header`. 
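For context: with Bazel's `layering_check` feature, Clang module maps make each `cc_library`'s headers private to its direct dependents, so a source file that reaches a header only through a transitive dependency fails with `-Wprivate-header`. A minimal sketch of the failure mode and the fix, using hypothetical target names (the real fixes in the hunks below add the missing `:InferIntRangeInterface`, `:BytecodeOpInterface`, and `:SideEffectInterfaces` edges):

```starlark
# Hypothetical BUILD excerpt, for illustration only -- not from this patch.
cc_library(
    name = "Bar",
    hdrs = ["include/Bar/BarInterface.h"],
)

cc_library(
    name = "FooTransforms",
    # FooTransforms.cpp does: #include "Bar/BarInterface.h"
    srcs = ["lib/Foo/FooTransforms.cpp"],
    deps = [
        ":FooDialect",  # pulls in ":Bar" only transitively; layering_check rejects the include
        ":Bar",         # the fix: declare the direct dependency explicitly
    ],
)
```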
---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 2a0cc30b2267c..422c29fc9c4d5 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6091,6 +6091,7 @@ cc_library(
         ":DialectUtils",
         ":GPUDialect",
         ":IR",
+        ":InferIntRangeInterface",
         ":LLVMDialect",
         ":NVVMOpsIncGen",
         ":NVVMRequiresSMTraitsIncGen",
@@ -6295,6 +6296,7 @@ cc_library(
         ":BytecodeOpInterface",
         ":GPUDialect",
         ":IR",
+        ":InferIntRangeInterface",
         ":LLVMDialect",
         ":ROCDLOpsIncGen",
         ":SideEffectInterfaces",
@@ -6685,6 +6687,7 @@ cc_library(
         ":IR",
         ":InferTypeOpInterface",
         ":SMTIncGen",
+        ":SideEffectInterfaces",
         ":Support",
         "//llvm:Support",
     ],
@@ -11831,6 +11834,7 @@ cc_library(
     srcs = glob(["lib/Dialect/Transform/PDLExtension/*.cpp"]),
    hdrs = glob(["include/mlir/Dialect/Transform/PDLExtension/*.h"]),
     deps = [
+        ":BytecodeOpInterface",
         ":IR",
         ":PDLDialect",
         ":PDLInterpDialect",
@@ -11945,6 +11949,7 @@ cc_library(
     srcs = glob(["lib/Dialect/Transform/IRDLExtension/*.cpp"]),
     hdrs = glob(["include/mlir/Dialect/Transform/IRDLExtension/*.h"]),
     deps = [
+        ":BytecodeOpInterface",
         ":IR",
         ":IRDLDialect",
         ":IRDLInterfacesIncGen",
@@ -11986,7 +11991,9 @@ cc_library(
     srcs = glob(["lib/Dialect/Transform/DebugExtension/*.cpp"]),
     hdrs = glob(["include/mlir/Dialect/Transform/DebugExtension/*.h"]),
     deps = [
+        ":BytecodeOpInterface",
         ":IR",
+        ":SideEffectInterfaces",
         ":Support",
         ":TransformDebugExtensionOpsIncGen",
         ":TransformDialect",
@@ -12023,6 +12030,7 @@ cc_library(
     srcs = glob(["lib/Dialect/Transform/LoopExtension/*.cpp"]),
     hdrs = glob(["include/mlir/Dialect/Transform/LoopExtension/*.h"]),
     deps = [
+        ":BytecodeOpInterface",
         ":IR",
         ":LoopLikeInterface",
         ":Rewrite",
@@ -13071,6 +13079,7 @@ cc_library(
         ":MPIOpsIncGen",
         ":MPITypesIncGen",
         ":MemRefDialect",
+        ":SideEffectInterfaces",
         "//llvm:Support",
     ],
 )

From 250854d3762deecdee9e0eae0330b124c6f6a3fa Mon Sep 17 00:00:00 2001
From: Mend Renovate
Date: Mon, 29 Sep 2025 06:26:48 +0100
Subject: [PATCH 069/878] [Github] Update GHA Dependencies (#161107)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR contains the following updates:

| Package | Type | Update | Change | Pending |
|---|---|---|---|---|
| [EnricoMi/publish-unit-test-result-action](https://redirect.github.com/EnricoMi/publish-unit-test-result-action) | action | digest | `170bf24` -> `3a74b29` | |
| [actions/attest-build-provenance](https://redirect.github.com/actions/attest-build-provenance) | action | minor | `v1.0.0` -> `v1.4.4` | |
| [actions/checkout](https://redirect.github.com/actions/checkout) | action | minor | `v4.1.1` -> `v4.3.0` | |
| [actions/github-script](https://redirect.github.com/actions/github-script) | action | minor | `v7.0.1` -> `v7.1.0` | |
| [actions/setup-node](https://redirect.github.com/actions/setup-node) | action | minor | `v4.2.0` -> `v4.4.0` | |
| [actions/setup-python](https://redirect.github.com/actions/setup-python) | action | minor | `v5.4.0` -> `v5.6.0` | |
| actions/setup-python | action | digest | `39cd149` -> `2e3e4b1` | |
| [actions/upload-artifact](https://redirect.github.com/actions/upload-artifact) | action | patch | `v4.6.0` -> `v4.6.2` | |
| [actions/upload-artifact](https://redirect.github.com/actions/upload-artifact) | action | minor | `v4.3.3` -> `v4.6.2` | |
| [actions/upload-artifact](https://redirect.github.com/actions/upload-artifact) | action | patch | `4.6.0` -> `4.6.2` | |
| [actions/upload-artifact](https://redirect.github.com/actions/upload-artifact) | action | minor | `v4.3.0` -> `v4.6.2` | |
| [aminya/setup-cpp](https://redirect.github.com/aminya/setup-cpp) | action | minor | `v1.1.1` -> `v1.7.1` | |
| [docker/login-action](https://redirect.github.com/docker/login-action) | action | minor | `v3.3.0` -> `v3.5.0` | |
| [github/codeql-action](https://redirect.github.com/github/codeql-action) | action | minor | `v2.20.6` -> `v2.28.1` | |
| [github/codeql-action](https://redirect.github.com/github/codeql-action) | action | patch | `v3.30.3` -> `v3.30.4` | `v3.30.5` |
| [hendrikmuhs/ccache-action](https://redirect.github.com/hendrikmuhs/ccache-action) | action | patch | `v1.2.17` -> `v1.2.19` | |
| llvm/actions | action | digest | `22e9f90` -> `a1ea791` | |
| [ossf/scorecard-action](https://redirect.github.com/ossf/scorecard-action) | action | patch | `v2.4.1` -> `v2.4.2` | |
| [pypa/gh-action-pypi-publish](https://redirect.github.com/pypa/gh-action-pypi-publish) | action | minor | `v1.12.4` -> `v1.13.0` | |
| [python](https://redirect.github.com/actions/python-versions) | uses-with | minor | `3.12` -> `3.13` | |
| [python](https://redirect.github.com/actions/python-versions) | uses-with | minor | `3.11` -> `3.13` | |
| [python](https://redirect.github.com/actions/python-versions) | uses-with | minor | `3.10` -> `3.13` | |

---

> [!WARNING]
> Some dependencies could not be looked up. Check the Dependency Dashboard for more information.

---

### Release Notes
actions/attest-build-provenance (actions/attest-build-provenance) ### [`v1.4.4`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.4.4) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.4.3...v1.4.4) ##### What's Changed - Bump predicate action from 1.1.3 to 1.1.4 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​310](https://redirect.github.com/actions/attest-build-provenance/pull/310) - Bump [@​actions/core](https://redirect.github.com/actions/core) from 1.10.1 to 1.11.1 by [@​dependabot](https://redirect.github.com/dependabot) in [#​275](https://redirect.github.com/actions/attest-build-provenance/pull/275) - Bump [@​actions/attest](https://redirect.github.com/actions/attest) from 1.4.2 to 1.5.0 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​309](https://redirect.github.com/actions/attest-build-provenance/pull/309) - Fix SLSA provenance bug related to `workflow_ref` OIDC token claims containing the "@​" symbol in the tag name ([actions/toolkit#1863](https://redirect.github.com/actions/toolkit/pull/1863)) **Full Changelog**: ### [`v1.4.3`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.4.3) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.4.2...v1.4.3) ##### What's Changed - Bump predicate from 1.1.2 to 1.1.3 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​226](https://redirect.github.com/actions/attest-build-provenance/pull/226) - Bump [@​actions/attest](https://redirect.github.com/actions/attest) from 1.3.1 to 1.4.1 by [@​dependabot](https://redirect.github.com/dependabot) in [#​212](https://redirect.github.com/actions/attest-build-provenance/pull/212) - Bump [@​actions/attest](https://redirect.github.com/actions/attest) from 1.4.1 to 1.4.2 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​225](https://redirect.github.com/actions/attest-build-provenance/pull/225) - Fix bug w/ customized OIDC issuer URL for enterprise accounts ([#​222](https://redirect.github.com/actions/attest-build-provenance/issues/222)) **Full Changelog**: ### [`v1.4.2`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.4.2) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.4.1...v1.4.2) ##### What's Changed - Bump actions/attest from 1.4.0 to 1.4.1 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​209](https://redirect.github.com/actions/attest-build-provenance/pull/209) - Includes bug fix for issue with authenticated proxies ([actions/toolkit#1798](https://redirect.github.com/actions/toolkit/issues/1798)) **Full Changelog**: ### [`v1.4.1`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.4.1) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.4.0...v1.4.1) ##### What's Changed - Update predicate action to 1.1.2 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​197](https://redirect.github.com/actions/attest-build-provenance/pull/197) - Dynamic construction of oidc issuer by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​195](https://redirect.github.com/actions/attest-build-provenance/pull/195) **Full Changelog**: ### [`v1.4.0`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.4.0) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.3.3...v1.4.0) ##### What's Changed - Bump predicate action from 1.1.0 to 1.1.1 by 
[@​bdehamer](https://redirect.github.com/bdehamer) in [#​182](https://redirect.github.com/actions/attest-build-provenance/pull/182) - Fix for JWKS proxy bug - Bump actions/attest from 1.3.3 to 1.4.0 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​183](https://redirect.github.com/actions/attest-build-provenance/pull/183) - Add `show-summary` input - Format summary output as list **Full Changelog**: ### [`v1.3.3`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.3.3) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.3.2...v1.3.3) ##### What's Changed - Bump actions/attest from 1.3.2 to 1.3.3 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​152](https://redirect.github.com/actions/attest-build-provenance/pull/152) - Bugfix for properly handling glob exclusion patterns in `subject-path` input **Full Changelog**: ### [`v1.3.2`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.3.2) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.3.1...v1.3.2) ##### What's Changed - Bump actions/attest from 1.3.1 to 1.3.2 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​123](https://redirect.github.com/actions/attest-build-provenance/pull/123) - Increase timeout for OCI operations **Full Changelog**: ### [`v1.3.1`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.3.1) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.3.0...v1.3.1) ##### What's Changed - Bump actions/attest from 1.3.0 to 1.3.1 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​117](https://redirect.github.com/actions/attest-build-provenance/pull/117) - Bugfix when detecting support for the referrers API with OCI registries **Full Changelog**: ### [`v1.3.0`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.3.0) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.2.0...v1.3.0) ##### What's Changed - Bump actions/attest-build-provenance/predicate from 1.0.0 to 1.1.0 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​116](https://redirect.github.com/actions/attest-build-provenance/pull/116) - Switch to new GH provenance [build type](https://actions.github.io/buildtypes/workflow/v1) - Bump actions/attest from 1.2.0 to 1.3.0 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​116](https://redirect.github.com/actions/attest-build-provenance/pull/116) - Dynamic construction of GitHub API URLs based on GITHUB\_SERVER\_URL - Improved handling of Rekor 409 responses - Bugfix - detection of registries with support for the OCI referrers API **Full Changelog**: ### [`v1.2.0`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.2.0) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.1.2...v1.2.0) ##### What's Changed - Bump actions/attest from 1.1.2 to 1.2.0 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​101](https://redirect.github.com/actions/attest-build-provenance/pull/101) - Batch processing w/ exponential backoff - Bugfix when pushing attestation to OCI registry **Full Changelog**: ### [`v1.1.2`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.1.2) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.1.1...v1.1.2) ##### What's Changed - Bump actions/attest from 1.1.1 to 1.1.2 by 
[@​bdehamer](https://redirect.github.com/bdehamer) in [#​79](https://redirect.github.com/actions/attest-build-provenance/pull/79) - Downcase subject name for OCI images - Fix accept header when retrieving image manifest - Support variants of the Docker Hub registry name **Full Changelog**: ### [`v1.1.1`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.1.1) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.1.0...v1.1.1) ##### What's Changed - Bump actions/attest from v1.1.0 to v1.1.1 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​67](https://redirect.github.com/actions/attest-build-provenance/pull/67) - Bump [@​sigstore/sign](https://redirect.github.com/sigstore/sign) from 2.3.0 to 2.3.1 - Bump [@​sigstore/oci](https://redirect.github.com/sigstore/oci) from 0.3.0 to 0.3.2 - Include more detail in error logging - Send API errors to GHA debug log - Fix bug preventing failed API requests from being retried **Full Changelog**: ### [`v1.1.0`](https://redirect.github.com/actions/attest-build-provenance/releases/tag/v1.1.0) [Compare Source](https://redirect.github.com/actions/attest-build-provenance/compare/v1.0.0...v1.1.0) ##### What's Changed - Bump actions/attest to v1.1.0 by [@​bdehamer](https://redirect.github.com/bdehamer) in [#​65](https://redirect.github.com/actions/attest-build-provenance/pull/65) - adds list support for `subjectPath` input - limit attestation subject count - ensure subject globs match only files **Full Changelog**:
actions/checkout (actions/checkout) ### [`v4.3.0`](https://redirect.github.com/actions/checkout/releases/tag/v4.3.0) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.2.2...v4.3.0) ##### What's Changed - docs: update README.md by [@​motss](https://redirect.github.com/motss) in [https://github.com/actions/checkout/pull/1971](https://redirect.github.com/actions/checkout/pull/1971) - Add internal repos for checking out multiple repositories by [@​mouismail](https://redirect.github.com/mouismail) in [https://github.com/actions/checkout/pull/1977](https://redirect.github.com/actions/checkout/pull/1977) - Documentation update - add recommended permissions to Readme by [@​benwells](https://redirect.github.com/benwells) in [https://github.com/actions/checkout/pull/2043](https://redirect.github.com/actions/checkout/pull/2043) - Adjust positioning of user email note and permissions heading by [@​joshmgross](https://redirect.github.com/joshmgross) in [https://github.com/actions/checkout/pull/2044](https://redirect.github.com/actions/checkout/pull/2044) - Update README.md by [@​nebuk89](https://redirect.github.com/nebuk89) in [https://github.com/actions/checkout/pull/2194](https://redirect.github.com/actions/checkout/pull/2194) - Update CODEOWNERS for actions by [@​TingluoHuang](https://redirect.github.com/TingluoHuang) in [https://github.com/actions/checkout/pull/2224](https://redirect.github.com/actions/checkout/pull/2224) - Update package dependencies by [@​salmanmkc](https://redirect.github.com/salmanmkc) in [https://github.com/actions/checkout/pull/2236](https://redirect.github.com/actions/checkout/pull/2236) - Prepare release v4.3.0 by [@​salmanmkc](https://redirect.github.com/salmanmkc) in [https://github.com/actions/checkout/pull/2237](https://redirect.github.com/actions/checkout/pull/2237) ##### New Contributors - [@​motss](https://redirect.github.com/motss) made their first contribution in [https://github.com/actions/checkout/pull/1971](https://redirect.github.com/actions/checkout/pull/1971) - [@​mouismail](https://redirect.github.com/mouismail) made their first contribution in [https://github.com/actions/checkout/pull/1977](https://redirect.github.com/actions/checkout/pull/1977) - [@​benwells](https://redirect.github.com/benwells) made their first contribution in [https://github.com/actions/checkout/pull/2043](https://redirect.github.com/actions/checkout/pull/2043) - [@​nebuk89](https://redirect.github.com/nebuk89) made their first contribution in [https://github.com/actions/checkout/pull/2194](https://redirect.github.com/actions/checkout/pull/2194) - [@​salmanmkc](https://redirect.github.com/salmanmkc) made their first contribution in [https://github.com/actions/checkout/pull/2236](https://redirect.github.com/actions/checkout/pull/2236) **Full Changelog**: https://github.com/actions/checkout/compare/v4...v4.3.0 ### [`v4.2.2`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v422) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.2.1...v4.2.2) - `url-helper.ts` now leverages well-known environment variables by [@​jww3](https://redirect.github.com/jww3) in [#​1941](https://redirect.github.com/actions/checkout/pull/1941) - Expand unit test coverage for `isGhes` by [@​jww3](https://redirect.github.com/jww3) in [#​1946](https://redirect.github.com/actions/checkout/pull/1946) ### [`v4.2.1`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v421) [Compare 
Source](https://redirect.github.com/actions/checkout/compare/v4.2.0...v4.2.1) - Check out other refs/\* by commit if provided, fall back to ref by [@​orhantoy](https://redirect.github.com/orhantoy) in [#​1924](https://redirect.github.com/actions/checkout/pull/1924) ### [`v4.2.0`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v420) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.1.7...v4.2.0) - Add Ref and Commit outputs by [@​lucacome](https://redirect.github.com/lucacome) in [#​1180](https://redirect.github.com/actions/checkout/pull/1180) - Dependency updates by [@​dependabot-](https://redirect.github.com/dependabot-) [#​1777](https://redirect.github.com/actions/checkout/pull/1777), [#​1872](https://redirect.github.com/actions/checkout/pull/1872) ### [`v4.1.7`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v417) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.1.6...v4.1.7) - Bump the minor-npm-dependencies group across 1 directory with 4 updates by [@​dependabot](https://redirect.github.com/dependabot) in [#​1739](https://redirect.github.com/actions/checkout/pull/1739) - Bump actions/checkout from 3 to 4 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1697](https://redirect.github.com/actions/checkout/pull/1697) - Check out other refs/\* by commit by [@​orhantoy](https://redirect.github.com/orhantoy) in [#​1774](https://redirect.github.com/actions/checkout/pull/1774) - Pin actions/checkout's own workflows to a known, good, stable version. by [@​jww3](https://redirect.github.com/jww3) in [#​1776](https://redirect.github.com/actions/checkout/pull/1776) ### [`v4.1.6`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v416) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.1.5...v4.1.6) - Check platform to set archive extension appropriately by [@​cory-miller](https://redirect.github.com/cory-miller) in [#​1732](https://redirect.github.com/actions/checkout/pull/1732) ### [`v4.1.5`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v415) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.1.4...v4.1.5) - Update NPM dependencies by [@​cory-miller](https://redirect.github.com/cory-miller) in [#​1703](https://redirect.github.com/actions/checkout/pull/1703) - Bump github/codeql-action from 2 to 3 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1694](https://redirect.github.com/actions/checkout/pull/1694) - Bump actions/setup-node from 1 to 4 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1696](https://redirect.github.com/actions/checkout/pull/1696) - Bump actions/upload-artifact from 2 to 4 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1695](https://redirect.github.com/actions/checkout/pull/1695) - README: Suggest `user.email` to be `41898282+github-actions[bot]@​users.noreply.github.com` by [@​cory-miller](https://redirect.github.com/cory-miller) in [#​1707](https://redirect.github.com/actions/checkout/pull/1707) ### [`v4.1.4`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v414) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.1.3...v4.1.4) - Disable `extensions.worktreeConfig` when disabling `sparse-checkout` by [@​jww3](https://redirect.github.com/jww3) in [#​1692](https://redirect.github.com/actions/checkout/pull/1692) - Add dependabot config by [@​cory-miller](https://redirect.github.com/cory-miller) in 
[#​1688](https://redirect.github.com/actions/checkout/pull/1688) - Bump the minor-actions-dependencies group with 2 updates by [@​dependabot](https://redirect.github.com/dependabot) in [#​1693](https://redirect.github.com/actions/checkout/pull/1693) - Bump word-wrap from 1.2.3 to 1.2.5 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1643](https://redirect.github.com/actions/checkout/pull/1643) ### [`v4.1.3`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v413) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.1.2...v4.1.3) - Check git version before attempting to disable `sparse-checkout` by [@​jww3](https://redirect.github.com/jww3) in [#​1656](https://redirect.github.com/actions/checkout/pull/1656) - Add SSH user parameter by [@​cory-miller](https://redirect.github.com/cory-miller) in [#​1685](https://redirect.github.com/actions/checkout/pull/1685) - Update `actions/checkout` version in `update-main-version.yml` by [@​jww3](https://redirect.github.com/jww3) in [#​1650](https://redirect.github.com/actions/checkout/pull/1650) ### [`v4.1.2`](https://redirect.github.com/actions/checkout/blob/HEAD/CHANGELOG.md#v412) [Compare Source](https://redirect.github.com/actions/checkout/compare/v4.1.1...v4.1.2) - Fix: Disable sparse checkout whenever `sparse-checkout` option is not present [@​dscho](https://redirect.github.com/dscho) in [#​1598](https://redirect.github.com/actions/checkout/pull/1598)
actions/github-script (actions/github-script) ### [`v7.1.0`](https://redirect.github.com/actions/github-script/releases/tag/v7.1.0) [Compare Source](https://redirect.github.com/actions/github-script/compare/v7.0.1...v7.1.0) #### What's Changed - Upgrade husky to v9 by [@​benelan](https://redirect.github.com/benelan) in [#​482](https://redirect.github.com/actions/github-script/pull/482) - Add workflow file for publishing releases to immutable action package by [@​Jcambass](https://redirect.github.com/Jcambass) in [#​485](https://redirect.github.com/actions/github-script/pull/485) - Upgrade IA Publish by [@​Jcambass](https://redirect.github.com/Jcambass) in [#​486](https://redirect.github.com/actions/github-script/pull/486) - Fix workflow status badges by [@​joshmgross](https://redirect.github.com/joshmgross) in [#​497](https://redirect.github.com/actions/github-script/pull/497) - Update usage of `actions/upload-artifact` by [@​joshmgross](https://redirect.github.com/joshmgross) in [#​512](https://redirect.github.com/actions/github-script/pull/512) - Clear up package name confusion by [@​joshmgross](https://redirect.github.com/joshmgross) in [#​514](https://redirect.github.com/actions/github-script/pull/514) - Update dependencies with `npm audit fix` by [@​joshmgross](https://redirect.github.com/joshmgross) in [#​515](https://redirect.github.com/actions/github-script/pull/515) - Specify that the used script is JavaScript by [@​timotk](https://redirect.github.com/timotk) in [#​478](https://redirect.github.com/actions/github-script/pull/478) - chore: Add Dependabot for NPM and Actions by [@​nschonni](https://redirect.github.com/nschonni) in [#​472](https://redirect.github.com/actions/github-script/pull/472) - Define `permissions` in workflows and update actions by [@​joshmgross](https://redirect.github.com/joshmgross) in [#​531](https://redirect.github.com/actions/github-script/pull/531) - chore: Add Dependabot for .github/actions/install-dependencies by [@​nschonni](https://redirect.github.com/nschonni) in [#​532](https://redirect.github.com/actions/github-script/pull/532) - chore: Remove .vscode settings by [@​nschonni](https://redirect.github.com/nschonni) in [#​533](https://redirect.github.com/actions/github-script/pull/533) - ci: Use github/setup-licensed by [@​nschonni](https://redirect.github.com/nschonni) in [#​473](https://redirect.github.com/actions/github-script/pull/473) - make octokit instance available as octokit on top of github, to make it easier to seamlessly copy examples from GitHub rest api or octokit documentations by [@​iamstarkov](https://redirect.github.com/iamstarkov) in [#​508](https://redirect.github.com/actions/github-script/pull/508) - Remove `octokit` README updates for v7 by [@​joshmgross](https://redirect.github.com/joshmgross) in [#​557](https://redirect.github.com/actions/github-script/pull/557) - docs: add "exec" usage examples by [@​neilime](https://redirect.github.com/neilime) in [#​546](https://redirect.github.com/actions/github-script/pull/546) - Bump ruby/setup-ruby from 1.213.0 to 1.222.0 by [@​dependabot](https://redirect.github.com/dependabot)\[bot] in [#​563](https://redirect.github.com/actions/github-script/pull/563) - Bump ruby/setup-ruby from 1.222.0 to 1.229.0 by [@​dependabot](https://redirect.github.com/dependabot)\[bot] in [#​575](https://redirect.github.com/actions/github-script/pull/575) - Clearly document passing inputs to the `script` by [@​joshmgross](https://redirect.github.com/joshmgross) in 
[#​603](https://redirect.github.com/actions/github-script/pull/603) - Update README.md by [@​nebuk89](https://redirect.github.com/nebuk89) in [#​610](https://redirect.github.com/actions/github-script/pull/610) #### New Contributors - [@​benelan](https://redirect.github.com/benelan) made their first contribution in [#​482](https://redirect.github.com/actions/github-script/pull/482) - [@​Jcambass](https://redirect.github.com/Jcambass) made their first contribution in [#​485](https://redirect.github.com/actions/github-script/pull/485) - [@​timotk](https://redirect.github.com/timotk) made their first contribution in [#​478](https://redirect.github.com/actions/github-script/pull/478) - [@​iamstarkov](https://redirect.github.com/iamstarkov) made their first contribution in [#​508](https://redirect.github.com/actions/github-script/pull/508) - [@​neilime](https://redirect.github.com/neilime) made their first contribution in [#​546](https://redirect.github.com/actions/github-script/pull/546) - [@​nebuk89](https://redirect.github.com/nebuk89) made their first contribution in [#​610](https://redirect.github.com/actions/github-script/pull/610) **Full Changelog**:
actions/setup-node (actions/setup-node) ### [`v4.4.0`](https://redirect.github.com/actions/setup-node/releases/tag/v4.4.0) [Compare Source](https://redirect.github.com/actions/setup-node/compare/v4.3.0...v4.4.0) ##### What's Changed ##### Bug fixes: - Make eslint-compact matcher compatible with Stylelint by [@​FloEdelmann](https://redirect.github.com/FloEdelmann) in [#​98](https://redirect.github.com/actions/setup-node/pull/98) - Add support for indented eslint output by [@​fregante](https://redirect.github.com/fregante) in [#​1245](https://redirect.github.com/actions/setup-node/pull/1245) ##### Enhancement: - Support private mirrors by [@​marco-ippolito](https://redirect.github.com/marco-ippolito) in [#​1240](https://redirect.github.com/actions/setup-node/pull/1240) ##### Dependency update: - Upgrade [@​action/cache](https://redirect.github.com/action/cache) from 4.0.2 to 4.0.3 by [@​aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in [#​1262](https://redirect.github.com/actions/setup-node/pull/1262) ##### New Contributors - [@​FloEdelmann](https://redirect.github.com/FloEdelmann) made their first contribution in [#​98](https://redirect.github.com/actions/setup-node/pull/98) - [@​fregante](https://redirect.github.com/fregante) made their first contribution in [#​1245](https://redirect.github.com/actions/setup-node/pull/1245) - [@​marco-ippolito](https://redirect.github.com/marco-ippolito) made their first contribution in [#​1240](https://redirect.github.com/actions/setup-node/pull/1240) **Full Changelog**:  ### [`v4.3.0`](https://redirect.github.com/actions/setup-node/releases/tag/v4.3.0) [Compare Source](https://redirect.github.com/actions/setup-node/compare/v4.2.0...v4.3.0) #### What's Changed ##### Dependency updates - Upgrade [@​actions/glob](https://redirect.github.com/actions/glob) from 0.4.0 to 0.5.0 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1200](https://redirect.github.com/actions/setup-node/pull/1200) - Upgrade [@​action/cache](https://redirect.github.com/action/cache) from 4.0.0 to 4.0.2 by [@​gowridurgad](https://redirect.github.com/gowridurgad) in [#​1251](https://redirect.github.com/actions/setup-node/pull/1251) - Upgrade [@​vercel/ncc](https://redirect.github.com/vercel/ncc) from 0.38.1 to 0.38.3 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1203](https://redirect.github.com/actions/setup-node/pull/1203) - Upgrade [@​actions/tool-cache](https://redirect.github.com/actions/tool-cache) from 2.0.1 to 2.0.2 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1220](https://redirect.github.com/actions/setup-node/pull/1220) #### New Contributors - [@​gowridurgad](https://redirect.github.com/gowridurgad) made their first contribution in [#​1251](https://redirect.github.com/actions/setup-node/pull/1251) **Full Changelog**:
actions/setup-python (actions/setup-python) ### [`v5.6.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.6.0) [Compare Source](https://redirect.github.com/actions/setup-python/compare/v5.5.0...v5.6.0) ##### What's Changed - Workflow updates related to Ubuntu 20.04 by [@​aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in [#​1065](https://redirect.github.com/actions/setup-python/pull/1065) - Fix for Candidate Not Iterable Error by [@​aparnajyothi-y](https://redirect.github.com/aparnajyothi-y) in [#​1082](https://redirect.github.com/actions/setup-python/pull/1082) - Upgrade semver and [@​types/semver](https://redirect.github.com/types/semver) by [@​dependabot](https://redirect.github.com/dependabot) in [#​1091](https://redirect.github.com/actions/setup-python/pull/1091) - Upgrade prettier from 2.8.8 to 3.5.3 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1046](https://redirect.github.com/actions/setup-python/pull/1046) - Upgrade ts-jest from 29.1.2 to 29.3.2 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1081](https://redirect.github.com/actions/setup-python/pull/1081) **Full Changelog**: ### [`v5.5.0`](https://redirect.github.com/actions/setup-python/releases/tag/v5.5.0) [Compare Source](https://redirect.github.com/actions/setup-python/compare/v5.4.0...v5.5.0) #### What's Changed ##### Enhancements: - Support free threaded Python versions like '3.13t' by [@​colesbury](https://redirect.github.com/colesbury) in [#​973](https://redirect.github.com/actions/setup-python/pull/973) - Enhance Workflows: Include ubuntu-arm runners, Add e2e Testing for free threaded and Upgrade [@​action/cache](https://redirect.github.com/action/cache) from 4.0.0 to 4.0.3 by [@​priya-kinthali](https://redirect.github.com/priya-kinthali) in [#​1056](https://redirect.github.com/actions/setup-python/pull/1056) - Add support for .tool-versions file in setup-python by [@​mahabaleshwars](https://redirect.github.com/mahabaleshwars) in [#​1043](https://redirect.github.com/actions/setup-python/pull/1043) ##### Bug fixes: - Fix architecture for pypy on Linux ARM64 by [@​mayeut](https://redirect.github.com/mayeut) in [#​1011](https://redirect.github.com/actions/setup-python/pull/1011) This update maps arm64 to aarch64 for Linux ARM64 PyPy installations. ##### Dependency updates: - Upgrade [@​vercel/ncc](https://redirect.github.com/vercel/ncc) from 0.38.1 to 0.38.3 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1016](https://redirect.github.com/actions/setup-python/pull/1016) - Upgrade [@​actions/glob](https://redirect.github.com/actions/glob) from 0.4.0 to 0.5.0 by [@​dependabot](https://redirect.github.com/dependabot) in [#​1015](https://redirect.github.com/actions/setup-python/pull/1015) #### New Contributors - [@​colesbury](https://redirect.github.com/colesbury) made their first contribution in [#​973](https://redirect.github.com/actions/setup-python/pull/973) - [@​mahabaleshwars](https://redirect.github.com/mahabaleshwars) made their first contribution in [#​1043](https://redirect.github.com/actions/setup-python/pull/1043) **Full Changelog**:
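To make the free-threaded selector from v5.5.0 above concrete, here is a hedged sketch of a workflow (hypothetical job, not part of this dependency bump): the trailing `t` in `python-version` requests a free-threaded CPython build.

```yaml
# Hypothetical workflow, for illustration only.
name: free-threaded-smoke-test
on: push
jobs:
  smoke:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/setup-python@v5
        with:
          python-version: '3.13t'  # 't' suffix selects the free-threaded (no-GIL) build
      - run: python -VV           # prints the version string, including the free-threading tag
```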
actions/upload-artifact (actions/upload-artifact) ### [`v4.6.2`](https://redirect.github.com/actions/upload-artifact/releases/tag/v4.6.2) [Compare Source](https://redirect.github.com/actions/upload-artifact/compare/v4.6.1...v4.6.2) #### What's Changed - Update to use artifact 2.3.2 package & prepare for new upload-artifact release by [@​salmanmkc](https://redirect.github.com/salmanmkc) in [#​685](https://redirect.github.com/actions/upload-artifact/pull/685) #### New Contributors - [@​salmanmkc](https://redirect.github.com/salmanmkc) made their first contribution in [#​685](https://redirect.github.com/actions/upload-artifact/pull/685) **Full Changelog**: ### [`v4.6.1`](https://redirect.github.com/actions/upload-artifact/releases/tag/v4.6.1) [Compare Source](https://redirect.github.com/actions/upload-artifact/compare/v4.6.0...v4.6.1) #### What's Changed - Update to use artifact 2.2.2 package by [@​yacaovsnc](https://redirect.github.com/yacaovsnc) in [#​673](https://redirect.github.com/actions/upload-artifact/pull/673) **Full Changelog**:
aminya/setup-cpp (aminya/setup-cpp) ### [`v1.7.1`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.7.1) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.7.0...v1.7.1) ##### What's Changed - fix: prefer complete Window LLVM package [@​aminya](https://redirect.github.com/aminya) in [#​425](https://redirect.github.com/aminya/setup-cpp/pull/425) - fix: add LLVM 20.1.7 by [@​aminya](https://redirect.github.com/aminya) in [#​424](https://redirect.github.com/aminya/setup-cpp/pull/424) - fix: add mingw 15.1-r2 by [@​aminya](https://redirect.github.com/aminya) in [#​424](https://redirect.github.com/aminya/setup-cpp/pull/424) - fix: install gcovr via apt on Ubuntu by default by [@​aminya](https://redirect.github.com/aminya) in [#​424](https://redirect.github.com/aminya/setup-cpp/pull/424) - feat: add tar tool by [@​aminya](https://redirect.github.com/aminya) in [#​425](https://redirect.github.com/aminya/setup-cpp/pull/425) **Full Changelog**: ### [`v1.7.0`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.7.0) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.6.2...v1.7.0) ##### What's Changed - feat: update default LLVM to v20 by [@​aminya](https://redirect.github.com/aminya) in [#​387](https://redirect.github.com/aminya/setup-cpp/pull/387) - feat: default to GCC 15 on Windows and MacOS by [@​aminya](https://redirect.github.com/aminya) in [#​387](https://redirect.github.com/aminya/setup-cpp/pull/387) - fix: update cmake, task, powershell, meson, doxygen by [@​aminya](https://redirect.github.com/aminya) in [#​414](https://redirect.github.com/aminya/setup-cpp/pull/414) **Full Changelog**: ### [`v1.6.2`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.6.2) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.6.1...v1.6.2) **Full Changelog**: ### [`v1.6.1`](https://redirect.github.com/aminya/setup-cpp/compare/v1.6.0...v1.6.1) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.6.0...v1.6.1) ### [`v1.6.0`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.6.0) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.5.4...v1.6.0) ##### What's Changed - feat: add apt-fast as an installable tool by [@​aminya](https://redirect.github.com/aminya) in [#​401](https://redirect.github.com/aminya/setup-cpp/pull/401) - fix: add apt-fast optimizations by [@​aminya](https://redirect.github.com/aminya) in [#​402](https://redirect.github.com/aminya/setup-cpp/pull/402) **Full Changelog**: ### [`v1.5.4`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.5.4) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.5.3...v1.5.4) ##### What's Changed - fix: avoid rc sourcing loops + fix: always add guards for sourcing rc files by [@​aminya](https://redirect.github.com/aminya) in [#​397](https://redirect.github.com/aminya/setup-cpp/pull/397) - [fix: add missing git option for actions](https://redirect.github.com/aminya/setup-cpp/commit/d0235b0adb97722c83c6f48ccfad4c98c083c0e4) by [@​aminya](https://redirect.github.com/aminya) - [fix: ignore setup-cpp cli installation errors](https://redirect.github.com/aminya/setup-cpp/commit/d10f4b6db061e4bd794e409852bc38e33cc5e4a6) by [@​aminya](https://redirect.github.com/aminya) - [fix: fix addition of git to PATH on Windows](https://redirect.github.com/aminya/setup-cpp/commit/75890615f7a4d1f3833f0f9008be852b0a1b256a) by [@​aminya](https://redirect.github.com/aminya) - [fix: fix add-apt-repository in 
Debian](https://redirect.github.com/aminya/setup-cpp/commit/55f022dea4b4667ba75821264c21fc8121bd3f06) by [@​aminya](https://redirect.github.com/aminya) - [fix: fix llvm add-apt-repository for debian](https://redirect.github.com/aminya/setup-cpp/commit/05bd2b5297d0c94ccbd895c9e7f35a5b88dbdee8) by [@​aminya](https://redirect.github.com/aminya) **Full Changelog**: ### [`v1.5.3`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.5.3) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.5.2...v1.5.3) - fix: remove exports map from package by [@​aminya](https://redirect.github.com/aminya) in [7f46810eeda56](https://redirect.github.com/aminya/setup-cpp/commit/6370aaa0252a93c71dcc4cf49397f46810eeda56) **Full Changelog**: ### [`v1.5.2`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.5.2) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.5.1...v1.5.2) - fix: fix CLI shabang not working - independent lib by [@​aminya](https://redirect.github.com/aminya) in [c88b4364ef50](https://redirect.github.com/aminya/setup-cpp/commit/95a7de4f2eceb0baf03a70a1edb7c88b4364ef50) **Full Changelog**: ### [`v1.5.1`](https://redirect.github.com/aminya/setup-cpp/compare/v1.5.0...v1.5.1) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.5.0...v1.5.1) ### [`v1.5.0`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.5.0) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.4.0...v1.5.0) ##### What's Changed - feat: allow using setup-cpp as a library by [@​aminya](https://redirect.github.com/aminya) in [#​386](https://redirect.github.com/aminya/setup-cpp/pull/386) - fix: pin vcpkg on Alpine Arm64 by [@​aminya](https://redirect.github.com/aminya) in [#​389](https://redirect.github.com/aminya/setup-cpp/pull/389) - fix: do not add LLVM libraries to dyld by default by [@​aminya](https://redirect.github.com/aminya) in [#​388](https://redirect.github.com/aminya/setup-cpp/pull/388) **Full Changelog**: ### [`v1.4.0`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.4.0) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.3.0...v1.4.0) ##### What's Changed - 🎉 feat: support Alpine + add setup-alpine package by [@​aminya](https://redirect.github.com/aminya) in [#​379](https://redirect.github.com/aminya/setup-cpp/pull/379) - ci: add docker tags with the base platform versions by [@​aminya](https://redirect.github.com/aminya) in [#​381](https://redirect.github.com/aminya/setup-cpp/pull/381) (e.g. `setup-cpp-ubuntu:20.04`) - fix: detect externally managed Python to avoid warnings by [@​aminya](https://redirect.github.com/aminya) in [#​379](https://redirect.github.com/aminya/setup-cpp/pull/379) ##### Alpine Images (amd64 and arm64) Setup-cpp now provides prebuilt images for Alpine with support for base tools, and compilers `llvm`, `gcc`, and `mingw` available for `amd64` and `arm64` architectures. 
Base image with `cmake, ninja, task, vcpkg, python, make, cppcheck, gcovr, doxygen, ccache, conan, meson, cmakelang` for Alpine: ```dockerfile FROM aminya/setup-cpp-alpine:3.21 AS builder ``` Image with `llvm` and the base tools: ```dockerfile FROM aminya/setup-cpp-alpine-llvm:3.21 AS builder ``` Image with `gcc` and the base tools: ```dockerfile FROM aminya/setup-cpp-alpine-gcc:3.21 AS builder ``` Image with `mingw` and the base tools: ```dockerfile FROM aminya/setup-cpp-alpine-mingw:3.21 AS builder ``` **Full Changelog**: ### [`v1.3.0`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.3.0) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.2.2...v1.3.0) ##### What's Changed - feat: add Ubuntu 20 and 24 docker builds by [@​aminya](https://redirect.github.com/aminya) in [#​375](https://redirect.github.com/aminya/setup-cpp/pull/375) - fix: fix python installation on Ubuntu 20 by [@​aminya](https://redirect.github.com/aminya) in [#​375](https://redirect.github.com/aminya/setup-cpp/pull/375) - chore(deps): update devdependencies by [@​renovate](https://redirect.github.com/renovate) in [#​376](https://redirect.github.com/aminya/setup-cpp/pull/376) ##### Breaking changes for Ubuntu Docker images The `latest` tag for `setup-cpp` on Docker now points to Ubuntu `24.04`. Please pin the specific version if needed: Base image with `cmake, ninja, task, vcpkg, python, make, cppcheck, gcovr, doxygen, ccache, conan, meson, cmakelang` for Ubuntu 24.04: ```dockerfile FROM aminya/setup-cpp-ubuntu:24.04-1.3.0 AS builder ``` Image with `llvm` and the base tools: ```dockerfile FROM aminya/setup-cpp-ubuntu-llvm:24.04-1.3.0 AS builder ``` Image with `gcc` and the base tools: ```dockerfile FROM aminya/setup-cpp-ubuntu-gcc:24.04-1.3.0 AS builder ``` Image with `mingw` and the base tools: ```dockerfile FROM aminya/setup-cpp-ubuntu-mingw:24.04-1.3.0 AS builder ``` There are also the variants for Ubuntu `22.04` ```dockerfile FROM aminya/setup-cpp-ubuntu:22.04-1.3.0 AS builder FROM aminya/setup-cpp-ubuntu-llvm:22.04-1.3.0 AS builder FROM aminya/setup-cpp-ubuntu-gcc:22.04-1.3.0 AS builder FROM aminya/setup-cpp-ubuntu-mingw:22.04-1.3.0 AS builder ``` And for Ubuntu `20.04`: ```dockerfile FROM aminya/setup-cpp-ubuntu:20.04-1.3.0 AS builder FROM aminya/setup-cpp-ubuntu-llvm:20.04-1.3.0 AS builder FROM aminya/setup-cpp-ubuntu-gcc:20.04-1.3.0 AS builder FROM aminya/setup-cpp-ubuntu-mingw:20.04-1.3.0 AS builder ``` Note that `nala` is no longer included in the setup-cpp images by default. You can install it manually via `setup-cpp --nala true` in your Docker image if you rely on it. 
**Full Changelog**: ### [`v1.2.2`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.2.2) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.2.1...v1.2.2) ##### What's Changed - 🎉 ci: tests and executables for Linux arm by [@​aminya](https://redirect.github.com/aminya) in [#​363](https://redirect.github.com/aminya/setup-cpp/pull/363) - fix apt installation fallbacks to latest by [@​aminya](https://redirect.github.com/aminya) in [#​363](https://redirect.github.com/aminya/setup-cpp/pull/363) - fix old LLVM on latest Ubuntu arm by [@​aminya](https://redirect.github.com/aminya) in [#​363](https://redirect.github.com/aminya/setup-cpp/pull/363) - feat: install sccache on latest ubuntu arm by [@​aminya](https://redirect.github.com/aminya) in [#​373](https://redirect.github.com/aminya/setup-cpp/pull/373) - fix Doxygen on Linux Arm by [@​aminya](https://redirect.github.com/aminya) in [#​363](https://redirect.github.com/aminya/setup-cpp/pull/363) **Full Changelog**: ### [`v1.2.1`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.2.1) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.2.0...v1.2.1) ##### What's Changed - 🎉 feat: add multi-architecture Docker images for setup-cpp by [@​aminya](https://redirect.github.com/aminya) in [#​361](https://redirect.github.com/aminya/setup-cpp/pull/361) - 🎉 feat: install LLVM via brew on Mac if possible by [@​aminya](https://redirect.github.com/aminya) in [#​367](https://redirect.github.com/aminya/setup-cpp/pull/367) and [#​364](https://redirect.github.com/aminya/setup-cpp/pull/364) - 🎉 fix: avoid already installed warnings for brew by [@​aminya](https://redirect.github.com/aminya) in [#​369](https://redirect.github.com/aminya/setup-cpp/pull/369) - feat: add git as an installable tool by [@​aminya](https://redirect.github.com/aminya) in [#​362](https://redirect.github.com/aminya/setup-cpp/pull/362) - fix: add polyfill for crypto.randomuuid for Node 12 by [@​aminya](https://redirect.github.com/aminya) in [#​368](https://redirect.github.com/aminya/setup-cpp/pull/368) and [#​370](https://redirect.github.com/aminya/setup-cpp/pull/370) **Full Changelog**: ### [`v1.2.0`](https://redirect.github.com/aminya/setup-cpp/releases/tag/v1.2.0) [Compare Source](https://redirect.github.com/aminya/setup-cpp/compare/v1.1.1...v1.2.0) ##### What's Changed Note: superseded by v1.2.1 - feat: add multi-architecture Docker images for setup-cpp by [@​aminya](https://redirect.github.com/aminya) in [#​361](https://redirect.github.com/aminya/setup-cpp/pull/361) - feat: install LLVM via brew on Mac if possible by [@​aminya](https://redirect.github.com/aminya) in [#​367](https://redirect.github.com/aminya/setup-cpp/pull/367) and [#​364](https://redirect.github.com/aminya/setup-cpp/pull/364) - feat: add git as an installable tool by [@​aminya](https://redirect.github.com/aminya) in [#​362](https://redirect.github.com/aminya/setup-cpp/pull/362) - fix: avoid already installed warnings for brew by [@​aminya](https://redirect.github.com/aminya) in [#​369](https://redirect.github.com/aminya/setup-cpp/pull/369) - fix: add polyfill for crypto.randomuuid for Node 12 by [@​aminya](https://redirect.github.com/aminya) in [#​368](https://redirect.github.com/aminya/setup-cpp/pull/368) and [#​370](https://redirect.github.com/aminya/setup-cpp/pull/370) **Full Changelog**:
docker/login-action (docker/login-action) ### [`v3.5.0`](https://redirect.github.com/docker/login-action/releases/tag/v3.5.0) [Compare Source](https://redirect.github.com/docker/login-action/compare/v3.4.0...v3.5.0) - Support dual-stack endpoints for AWS ECR by [@​Spacefish](https://redirect.github.com/Spacefish) [@​crazy-max](https://redirect.github.com/crazy-max) in [#​874](https://redirect.github.com/docker/login-action/pull/874) [#​876](https://redirect.github.com/docker/login-action/pull/876) - Bump [@​aws-sdk/client-ecr](https://redirect.github.com/aws-sdk/client-ecr) to 3.859.0 in [#​860](https://redirect.github.com/docker/login-action/pull/860) [#​878](https://redirect.github.com/docker/login-action/pull/878) - Bump [@​aws-sdk/client-ecr-public](https://redirect.github.com/aws-sdk/client-ecr-public) to 3.859.0 in [#​860](https://redirect.github.com/docker/login-action/pull/860) [#​878](https://redirect.github.com/docker/login-action/pull/878) - Bump [@​docker/actions-toolkit](https://redirect.github.com/docker/actions-toolkit) from 0.57.0 to 0.62.1 in [#​870](https://redirect.github.com/docker/login-action/pull/870) - Bump form-data from 2.5.1 to 2.5.5 in [#​875](https://redirect.github.com/docker/login-action/pull/875) **Full Changelog**: ### [`v3.4.0`](https://redirect.github.com/docker/login-action/releases/tag/v3.4.0) [Compare Source](https://redirect.github.com/docker/login-action/compare/v3.3.0...v3.4.0) - Bump [@​actions/core](https://redirect.github.com/actions/core) from 1.10.1 to 1.11.1 in [#​791](https://redirect.github.com/docker/login-action/pull/791) - Bump [@​aws-sdk/client-ecr](https://redirect.github.com/aws-sdk/client-ecr) to 3.766.0 in [#​789](https://redirect.github.com/docker/login-action/pull/789) [#​856](https://redirect.github.com/docker/login-action/pull/856) - Bump [@​aws-sdk/client-ecr-public](https://redirect.github.com/aws-sdk/client-ecr-public) to 3.758.0 in [#​789](https://redirect.github.com/docker/login-action/pull/789) [#​856](https://redirect.github.com/docker/login-action/pull/856) - Bump [@​docker/actions-toolkit](https://redirect.github.com/docker/actions-toolkit) from 0.35.0 to 0.57.0 in [#​801](https://redirect.github.com/docker/login-action/pull/801) [#​806](https://redirect.github.com/docker/login-action/pull/806) [#​858](https://redirect.github.com/docker/login-action/pull/858) - Bump cross-spawn from 7.0.3 to 7.0.6 in [#​814](https://redirect.github.com/docker/login-action/pull/814) - Bump https-proxy-agent from 7.0.5 to 7.0.6 in [#​823](https://redirect.github.com/docker/login-action/pull/823) - Bump path-to-regexp from 6.2.2 to 6.3.0 in [#​777](https://redirect.github.com/docker/login-action/pull/777) **Full Changelog**:
github/codeql-action (github/codeql-action) ### [`v2.28.1`](https://redirect.github.com/github/codeql-action/releases/tag/v2.28.1) [Compare Source](https://redirect.github.com/github/codeql-action/compare/v2.28.0...v2.28.1) ### CodeQL Action Changelog See the [releases page](https://redirect.github.com/github/codeql-action/releases) for the relevant changes to the CodeQL CLI and language packs. **This is the last planned release of the `v2`. To continue getting updates for the CodeQL Action, please switch to `v3`.** #### 2.28.1 - 10 Jan 2025 - CodeQL Action v2 is now deprecated, and is no longer updated or supported. For better performance, improved security, and new features, upgrade to v3. For more information, see [this changelog post](https://github.blog/changelog/2025-01-10-code-scanning-codeql-action-v2-is-now-deprecated/). [#​2677](https://redirect.github.com/github/codeql-action/pull/2677) - Update default CodeQL bundle version to 2.20.1. [#​2678](https://redirect.github.com/github/codeql-action/pull/2678) See the full [CHANGELOG.md](https://redirect.github.com/github/codeql-action/blob/v2.28.1/CHANGELOG.md) for more information. ### [`v2.28.0`](https://redirect.github.com/github/codeql-action/releases/tag/v2.28.0) [Compare Source](https://redirect.github.com/github/codeql-action/compare/v2.27.9...v2.28.0) ### CodeQL Action Changelog See the [releases page](https://redirect.github.com/github/codeql-action/releases) for the relevant changes to the CodeQL CLI and language packs. Note that the only difference between `v2` and `v3` of the CodeQL Action is the node version they support, with `v3` running on node 20 while we continue to release `v2` to support running on node 16. For example `3.22.11` was the first `v3` release and is functionally identical to `2.22.11`. This approach ensures an easy way to track exactly which features are included in different versions, indicated by the minor and patch version numbers. **This is the last planned release of the `v2`. To continue getting updates for the CodeQL Action, please switch to `v3`.** #### 2.28.0 - 20 Dec 2024 - Bump the minimum CodeQL bundle version to 2.15.5. [#​2655](https://redirect.github.com/github/codeql-action/pull/2655) - Don't fail in the unusual case that a file is on the search path. [#​2660](https://redirect.github.com/github/codeql-action/pull/2660). See the full [CHANGELOG.md](https://redirect.github.com/github/codeql-action/blob/v2.28.0/CHANGELOG.md) for more information. ### [`v2.27.9`](https://redirect.github.com/github/codeql-action/releases/tag/v2.27.9) [Compare Source](https://redirect.github.com/github/codeql-action/compare/v2.27.8...v2.27.9) ### CodeQL Action Changelog See the [releases page](https://redirect.github.com/github/codeql-action/releases) for the relevant changes to the CodeQL CLI and language packs. Note that the only difference between `v2` and `v3` of the CodeQL Action is the node version they support, with `v3` running on node 20 while we continue to release `v2` to support running on node 16. For example `3.22.11` was the first `v3` release and is functionally identical to `2.22.11`. This approach ensures an easy way to track exactly which features are included in different versions, indicated by the minor and patch version numbers. #### 2.27.9 - 12 Dec 2024 No user facing changes. See the full [CHANGELOG.md](https://redirect.github.com/github/codeql-action/blob/v2.27.9/CHANGELOG.md) for more information. 
### [`v2.27.8`](https://redirect.github.com/github/codeql-action/compare/v2.27.7...v2.27.8) [Compare Source](https://redirect.github.com/github/codeql-action/compare/v2.27.7...v2.27.8) ### [`v2.27.7`](https://redirect.github.com/github/codeql-action/releases/tag/v2.27.7) [Compare Source](https://redirect.github.com/github/codeql-action/compare/v2.27.6...v2.27.7) ### CodeQL Action Changelog See the [releases page](https://redirect.github.com/github/codeql-action/releases) for the relevant changes to the CodeQL CLI and language packs. Note that the only difference between `v2` and `v3` of the CodeQL Action is the node version they support, with `v3` running on node 20 while we continue to release `v2` to support running on node 16. For example `3.22.11` was the first `v3` release and is functionally identical to `2.22.11`. This approach ensures an easy way to track exactly which features are included in different versions, indicated by the minor and patch version numbers. #### 2.27.7 - 10 Dec 2024 - We are rolling out a change in December 2024 that will extract the CodeQL bundle directly to the toolcache to improve performance. [#​2631](https://redirect.github.com/github/codeql-action/pull/2631) - Update default CodeQL bundle version to 2.20.0. [#​2636](https://redirect.github.com/github/codeql-action/pull/2636) See the full [CHANGELOG.md](https://redirect.github.com/github/codeql-action/blob/v2.27.7/CHANGELOG.md) for more information. ### [`v2.27.6`](https://redirect.github.com/github/codeql-action/releases/tag/v2
--- ### Configuration 📅 **Schedule**: Branch creation - Between 12:00 AM and 12:59 AM, only on Monday ( * 0 * * 1 ) (UTC), Automerge - At any time (no schedule defined). 🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied. ♻ **Rebasing**: Whenever PR becomes conflicted, or you tick the rebase/retry checkbox. 👻 **Immortal**: This PR will be recreated if closed unmerged. Get [config help](https://redirect.github.com/renovatebot/renovate/discussions) if that's undesired. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/llvm/llvm-project). --- .github/workflows/build-ci-container-windows.yml | 2 +- .github/workflows/build-ci-container.yml | 2 +- .github/workflows/build-metrics-container.yml | 2 +- .github/workflows/check-ci.yml | 2 +- .github/workflows/ci-post-commit-analyzer.yml | 4 ++-- .github/workflows/commit-access-review.yml | 2 +- .github/workflows/docs.yml | 6 +++--- .github/workflows/email-check.yaml | 2 +- .github/workflows/gha-codeql.yml | 4 ++-- .github/workflows/hlsl-test-all.yaml | 2 +- .github/workflows/issue-write.yml | 2 +- .github/workflows/libc-fullbuild-tests.yml | 2 +- .github/workflows/libc-overlay-tests.yml | 2 +- .github/workflows/libclang-abi-tests.yml | 4 ++-- .github/workflows/libclang-python-tests.yml | 4 ++-- .github/workflows/libcxx-build-and-test.yaml | 8 ++++---- .github/workflows/libcxx-build-containers.yml | 2 +- .github/workflows/libcxx-check-generated-files.yml | 2 +- .github/workflows/libcxx-run-benchmarks.yml | 2 +- .github/workflows/llvm-bugs.yml | 2 +- .github/workflows/llvm-tests.yml | 6 +++--- .github/workflows/mlir-spirv-tests.yml | 2 +- .github/workflows/pr-code-format.yml | 8 ++++---- .github/workflows/pr-code-lint.yml | 10 +++++----- .github/workflows/pr-request-release-note.yml | 2 +- .github/workflows/premerge.yaml | 6 +++--- .github/workflows/release-asset-audit.yml | 2 +- .../workflows/release-binaries-save-stage/action.yml | 4 ++-- .../workflows/release-binaries-setup-stage/action.yml | 2 +- .github/workflows/release-binaries.yml | 10 +++++----- .github/workflows/release-documentation.yml | 4 ++-- .github/workflows/release-doxygen.yml | 2 +- .github/workflows/release-lit.yml | 6 +++--- .github/workflows/release-sources.yml | 4 ++-- .github/workflows/scorecard.yml | 6 +++--- .github/workflows/spirv-tests.yml | 2 +- .../unprivileged-download-artifact/action.yml | 2 +- 37 files changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml index 167e7cf06b3b2..14c349b1b2fe5 100644 --- a/.github/workflows/build-ci-container-windows.yml +++ b/.github/workflows/build-ci-container-windows.yml @@ -44,7 +44,7 @@ jobs: run: | docker save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container path: ${{ steps.vars.outputs.container-filename }} diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml index 67f35fd30701f..01f1b8dc4f990 100644 --- a/.github/workflows/build-ci-container.yml +++ b/.github/workflows/build-ci-container.yml @@ -64,7 +64,7 @@ jobs: podman save ${{ 
steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-agent-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container-${{ matrix.arch }} path: "*.tar" diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml index cadcaa9a42e8f..69b571575f40c 100644 --- a/.github/workflows/build-metrics-container.yml +++ b/.github/workflows/build-metrics-container.yml @@ -49,7 +49,7 @@ jobs: run: | podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload Container Image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container path: ${{ steps.vars.outputs.container-filename }} diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index 7e8c15696e344..f18a69c192ee9 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -26,7 +26,7 @@ jobs: with: sparse-checkout: .ci - name: Setup Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: 3.13 cache: 'pip' diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml index 7d37b900d7909..49cf4100dd71c 100644 --- a/.github/workflows/ci-post-commit-analyzer.yml +++ b/.github/workflows/ci-post-commit-analyzer.yml @@ -44,7 +44,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: # A full build of llvm, clang, lld, and lldb takes about 250MB # of ccache space. 
There's not much reason to have more than this, @@ -87,7 +87,7 @@ jobs: scan-build --generate-index-only build/analyzer-results - name: Upload Results - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: analyzer-results diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml index a7be81b0e2da5..734dc212fa648 100644 --- a/.github/workflows/commit-access-review.yml +++ b/.github/workflows/commit-access-review.yml @@ -28,7 +28,7 @@ jobs: python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN - name: Upload Triage List - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: triagers path: triagers.log diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 8cdd39c164cca..b5f3413fe3b6b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -95,9 +95,9 @@ jobs: workflow: - '.github/workflows/docs.yml' - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.11' + python-version: '3.13' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies @@ -209,7 +209,7 @@ jobs: mkdir built-docs/flang cp -r flang-build/docs/* built-docs/flang/ - name: Upload docs - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: docs-output path: built-docs/ diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml index 9390fba4d4e3b..981c6fa62cb19 100644 --- a/.github/workflows/email-check.yaml +++ b/.github/workflows/email-check.yaml @@ -39,7 +39,7 @@ jobs: [{"body" : "$COMMENT"}] EOF - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index efb8143877c4e..63388ebc706bd 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3 + uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3 + uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index 72cbbe2b7dded..f954528abade1 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@170bf24d20d201b842d7a52403b73ed297e6645b # v2 + uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2 if: always() && runner.os == 
'macOS' with: comment_mode: off diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index db9389b6afe53..26cd60c070251 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -40,7 +40,7 @@ jobs: - name: 'Comment on PR' if: steps.download-artifact.outputs.artifact-id != '' - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index 8967cd0949c11..3a048aeb9405b 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -61,7 +61,7 @@ jobs: # Do not use direct GHAC access even though it is supported by sccache. GHAC rejects # frequent small object writes. - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 1G key: libc_fullbuild_${{ matrix.c_compiler }} diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml index 7154946ac5c3d..df9a20dce8eae 100644 --- a/.github/workflows/libc-overlay-tests.yml +++ b/.github/workflows/libc-overlay-tests.yml @@ -51,7 +51,7 @@ jobs: # Do not use direct GHAC access even though it is supported by sccache. GHAC rejects # frequent small object writes. - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 1G key: libc_overlay_build_${{ matrix.os }}_${{ matrix.compiler.c_compiler }} diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index d53a2f306afa2..5ccf976848197 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -131,7 +131,7 @@ jobs: sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi done - name: Upload ABI file - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: ${{ matrix.name }} path: '*${{ matrix.ref }}.abi' @@ -165,7 +165,7 @@ jobs: done - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index e168928325561..8fb8cec3b4f00 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -34,11 +34,11 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.python-version }} - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 2G key: spirv-ubuntu-24.04 
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 2e6ff7f91b6fc..5fe2ffbf58b43 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -60,7 +60,7 @@ jobs: env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -105,7 +105,7 @@ jobs: env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() # Upload artifacts even if the build or test suite fails with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -169,7 +169,7 @@ jobs: env: CC: clang-22 CXX: clang++-22 - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: ${{ matrix.config }}-results @@ -223,7 +223,7 @@ jobs: source .venv/bin/activate python -m pip install psutil bash libcxx/utils/ci/run-buildbot ${{ matrix.config }} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() # Upload artifacts even if the build or test suite fails with: name: macos-${{ matrix.config }}-results diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index cbaa8e0f65129..312cb47fc3d93 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 + uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-check-generated-files.yml b/.github/workflows/libcxx-check-generated-files.yml index f338bd6952779..d34b6a79556d1 100644 --- a/.github/workflows/libcxx-check-generated-files.yml +++ b/.github/workflows/libcxx-check-generated-files.yml @@ -15,7 +15,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: clangformat: 17.0.1 ninja: true diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 17a97df029ba5..0379a0a1f857d 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.10' + python-version: '3.13' - name: Extract information from the PR id: vars diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 5470662c97628..7d42abfadde7b 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: 
actions/setup-node@1d0ff469b7ec7b3cb9d8673fde0c81c44821de2a # v4.2.0 + - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 with: node-version: 18 check-latest: true diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml index ea80e229512d5..c4701c7283da0 100644 --- a/.github/workflows/llvm-tests.yml +++ b/.github/workflows/llvm-tests.yml @@ -128,14 +128,14 @@ jobs: # Remove symbol versioning from dumps, so we can compare across major versions. sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi - name: Upload ABI file - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: ${{ matrix.name }} path: ${{ matrix.ref }}.abi - name: Upload symbol list file if: matrix.name == 'build-baseline' - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: symbol-list path: llvm.symbols @@ -179,7 +179,7 @@ jobs: abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/mlir-spirv-tests.yml b/.github/workflows/mlir-spirv-tests.yml index 78952ccad2642..5bb16c739cdde 100644 --- a/.github/workflows/mlir-spirv-tests.yml +++ b/.github/workflows/mlir-spirv-tests.yml @@ -30,7 +30,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 2G key: spirv-mlir-ubuntu-24.04 diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 61c8680cd72a1..1e0dc7045c1cc 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -43,14 +43,14 @@ jobs: # of a release cycle (x.1.0) or the last version of a release cycle, or # if there have been relevant clang-format backports. 
- name: Install clang-format - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: clangformat: 21.1.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.11' + python-version: '3.13' cache: 'pip' cache-dependency-path: 'llvm/utils/git/requirements_formatting.txt' @@ -72,7 +72,7 @@ jobs: --end-rev HEAD \ --changed-files "$CHANGED_FILES" - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml index daefc9baacce7..776ec4af9d2dc 100644 --- a/.github/workflows/pr-code-lint.yml +++ b/.github/workflows/pr-code-lint.yml @@ -27,7 +27,7 @@ jobs: cancel-in-progress: true steps: - name: Fetch LLVM sources - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 with: fetch-depth: 2 @@ -51,14 +51,14 @@ jobs: # of a release cycle (x.1.0) or the last version of a release cycle, or # if there have been relevant clang-format backports. - name: Install clang-tidy - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: clang-tidy: 21.1.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.12' + python-version: '3.13' - name: Install Python dependencies run: python3 -m pip install -r llvm/utils/git/requirements_linting.txt @@ -107,7 +107,7 @@ jobs: --changed-files "$CHANGED_FILES" - name: Upload results - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml index f0197d71d6aa9..8162a8984ee5f 100644 --- a/.github/workflows/pr-request-release-note.yml +++ b/.github/workflows/pr-request-release-note.yml @@ -41,7 +41,7 @@ jobs: request-release-note \ --pr-number ${{ github.event.pull_request.number}} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 63ab4a8356971..a9c107e4a5f08 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -76,7 +76,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: Premerge Artifacts (Linux) path: artifacts/ @@ -130,7 +130,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 
v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: Premerge Artifacts (Windows) path: artifacts/ @@ -151,7 +151,7 @@ jobs: with: fetch-depth: 2 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: "2000M" - name: Install Ninja diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml index 6546540a1b547..8b24948b568eb 100644 --- a/.github/workflows/release-asset-audit.yml +++ b/.github/workflows/release-asset-audit.yml @@ -38,7 +38,7 @@ jobs: if: >- github.event_name != 'pull_request' && failure() - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 + uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 with: github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} script: | diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml index f08088c7bc56f..84ccf98c23a82 100644 --- a/.github/workflows/release-binaries-save-stage/action.yml +++ b/.github/workflows/release-binaries-save-stage/action.yml @@ -30,14 +30,14 @@ runs: tar -C ${{ inputs.build-prefix }} -c build/ | zstd -T0 -c > build.tar.zst - name: Upload Stage 1 Source - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ runner.os }}-${{ runner.arch }}-${{ github.job }}-source path: llvm-project.tar.zst retention-days: 2 - name: Upload Stage 1 Build Dir - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ runner.os}}-${{ runner.arch }}-${{ github.job }}-build path: build.tar.zst diff --git a/.github/workflows/release-binaries-setup-stage/action.yml b/.github/workflows/release-binaries-setup-stage/action.yml index 8f45e22886b6e..475a25fa6b772 100644 --- a/.github/workflows/release-binaries-setup-stage/action.yml +++ b/.github/workflows/release-binaries-setup-stage/action.yml @@ -22,7 +22,7 @@ runs: using: "composite" steps: - name: Install Ninja - uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main + uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main - name: Setup Windows if: startsWith(runner.os, 'Windows') diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8f422a0147748..99602dbb8b8c7 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -68,9 +68,9 @@ jobs: steps: # It's good practice to use setup-python, but this is also required on macos-14 # due to https://github.com/actions/runner-images/issues/10385 - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f + - uses: actions/setup-python@2e3e4b15a884dc73a63f962bff250a855150a234 with: - python-version: '3.12' + python-version: '3.13' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -250,7 +250,7 @@ jobs: release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'` mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} . 
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ runner.os }}-${{ runner.arch }}-release-binary # Due to path differences on Windows when running in bash vs running on node, @@ -301,7 +301,7 @@ jobs: - name: Attest Build Provenance id: provenance - uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0 + uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 with: subject-path: ${{ needs.prepare.outputs.release-binary-filename }} @@ -310,7 +310,7 @@ jobs: mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl - name: Upload Build Provenance - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index 712ff1831170e..d3d375d3a6df9 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -37,7 +37,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' @@ -59,7 +59,7 @@ jobs: ./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen - name: Create Release Notes Artifact - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: release-notes path: docs-build/html-export/ diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml index 17c677413f744..79e509e5e6a8b 100644 --- a/.github/workflows/release-doxygen.yml +++ b/.github/workflows/release-doxygen.yml @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml index 60ec64462bc31..8b1ce04e12c4f 100644 --- a/.github/workflows/release-lit.yml +++ b/.github/workflows/release-lit.yml @@ -45,7 +45,7 @@ jobs: ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions - name: Setup Cpp - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: compiler: llvm-16.0.6 cmake: true @@ -66,14 +66,14 @@ jobs: python3 setup.py sdist bdist_wheel - name: Upload lit to test.pypi.org - uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: password: ${{ secrets.LLVM_LIT_TEST_PYPI_API_TOKEN }} 
repository-url: https://test.pypi.org/legacy/ packages-dir: llvm/utils/lit/dist/ - name: Upload lit to pypi.org - uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: password: ${{ secrets.LLVM_LIT_PYPI_API_TOKEN }} packages-dir: llvm/utils/lit/dist/ diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml index 14cc4c4e9b94f..2278b96dbe242 100644 --- a/.github/workflows/release-sources.yml +++ b/.github/workflows/release-sources.yml @@ -92,14 +92,14 @@ jobs: - name: Attest Build Provenance if: github.event_name != 'pull_request' id: provenance - uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0 + uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 with: subject-path: "*.xz" - if: github.event_name != 'pull_request' run: | mv ${{ steps.provenance.outputs.bundle-path }} . - name: Create Tarball Artifacts - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: path: | *.xz diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 40db5504294ef..c07df338cf989 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1 + uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 with: results_file: results.sarif results_format: sarif @@ -49,7 +49,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: SARIF file path: results.sarif @@ -57,6 +57,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@80f993039571a6de66594ecaa432875a6942e8e0 # v2.20.6 + uses: github/codeql-action/upload-sarif@b8d3b6e8af63cde30bdc382c0bc28114f4346c88 # v2.28.1 with: sarif_file: results.sarif diff --git a/.github/workflows/spirv-tests.yml b/.github/workflows/spirv-tests.yml index 8708fb06d9eb8..69374ae563306 100644 --- a/.github/workflows/spirv-tests.yml +++ b/.github/workflows/spirv-tests.yml @@ -26,7 +26,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 2G key: spirv-ubuntu-24.04 diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml index 9d8fb59a67c0e..5b50d7ce3d3fb 100644 --- a/.github/workflows/unprivileged-download-artifact/action.yml +++ b/.github/workflows/unprivileged-download-artifact/action.yml @@ -27,7 +27,7 @@ outputs: runs: using: "composite" steps: - - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 + - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 id: artifact-url with: script: | From 98563d850d542411d95f5f01871762952981ba81 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 29 Sep 2025 05:32:08 +0000 Subject: [PATCH 070/878] [Github] Explicitly Annotate Versions These two actions were missing version annotations. This confuses renovate (slightly) and is also non-standard compared to all the other workflows. --- .github/workflows/hlsl-test-all.yaml | 2 +- .github/workflows/release-binaries.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index f954528abade1..dcb852312d41a 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2 + uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 99602dbb8b8c7..cba48e4d0c70a 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -68,7 +68,7 @@ jobs: steps: # It's good practice to use setup-python, but this is also required on macos-14 # due to https://github.com/actions/runner-images/issues/10385 - - uses: actions/setup-python@2e3e4b15a884dc73a63f962bff250a855150a234 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: '3.13' From cd4c5280c73d42fd41819cc8927d00787ade88e0 Mon Sep 17 00:00:00 2001 From: "Walter J.T.V" <81811777+eZWALT@users.noreply.github.com> Date: Mon, 29 Sep 2025 07:48:18 +0200 Subject: [PATCH 071/878] [Clang][OpenMP][LoopTransformations] Implement "#pragma omp fuse" loop transformation directive and "looprange" clause (#139293) This change implements the fuse directive, `#pragma omp fuse`, as specified in the OpenMP 6.0, along with the `looprange` clause in clang. 
This change also adds minimal stubs so flang keeps compiling (a full implementation in flang of this directive is still pending). --------- Co-authored-by: Roger Ferrer Ibanez --- clang/bindings/python/clang/cindex.py | 3 + clang/docs/OpenMPSupport.rst | 2 + clang/docs/ReleaseNotes.rst | 1 + clang/include/clang-c/Index.h | 4 + clang/include/clang/AST/OpenMPClause.h | 74 + clang/include/clang/AST/RecursiveASTVisitor.h | 11 + clang/include/clang/AST/StmtOpenMP.h | 196 +- .../clang/Basic/DiagnosticSemaKinds.td | 12 + clang/include/clang/Basic/OpenMPKinds.h | 7 + clang/include/clang/Basic/StmtNodes.td | 4 + clang/include/clang/Parse/Parser.h | 3 + clang/include/clang/Sema/SemaOpenMP.h | 89 +- .../include/clang/Serialization/ASTBitCodes.h | 1 + clang/lib/AST/OpenMPClause.cpp | 35 + clang/lib/AST/StmtOpenMP.cpp | 73 +- clang/lib/AST/StmtPrinter.cpp | 5 + clang/lib/AST/StmtProfile.cpp | 16 + clang/lib/Basic/OpenMPKinds.cpp | 11 +- clang/lib/CodeGen/CGStmt.cpp | 3 + clang/lib/CodeGen/CGStmtOpenMP.cpp | 42 +- clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/Parse/ParseOpenMP.cpp | 36 + clang/lib/Sema/SemaExceptionSpec.cpp | 1 + clang/lib/Sema/SemaOpenMP.cpp | 830 +++++- clang/lib/Sema/TreeTransform.h | 44 + clang/lib/Serialization/ASTReader.cpp | 11 + clang/lib/Serialization/ASTReaderStmt.cpp | 17 + clang/lib/Serialization/ASTWriter.cpp | 8 + clang/lib/Serialization/ASTWriterStmt.cpp | 12 + clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 1 + clang/test/OpenMP/fuse_ast_print.cpp | 397 +++ clang/test/OpenMP/fuse_codegen.cpp | 2328 +++++++++++++++++ clang/test/OpenMP/fuse_messages.cpp | 209 ++ clang/tools/libclang/CIndex.cpp | 19 + clang/tools/libclang/CXCursor.cpp | 3 + flang/include/flang/Lower/OpenMP/Clauses.h | 1 + flang/include/flang/Parser/dump-parse-tree.h | 1 + flang/include/flang/Parser/parse-tree.h | 9 + flang/lib/Lower/OpenMP/Clauses.cpp | 5 + flang/lib/Parser/openmp-parsers.cpp | 5 + flang/lib/Parser/unparse.cpp | 7 + flang/lib/Semantics/check-omp-structure.cpp | 6 + llvm/include/llvm/Frontend/OpenMP/ClauseT.h | 13 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 9 + .../runtime/test/transform/fuse/foreach.cpp | 191 ++ openmp/runtime/test/transform/fuse/intfor.c | 50 + .../runtime/test/transform/fuse/iterfor.cpp | 194 ++ .../fuse/parallel-wsloop-collapse-foreach.cpp | 207 ++ .../fuse/parallel-wsloop-collapse-intfor.c | 45 + 49 files changed, 5198 insertions(+), 54 deletions(-) create mode 100644 clang/test/OpenMP/fuse_ast_print.cpp create mode 100644 clang/test/OpenMP/fuse_codegen.cpp create mode 100644 clang/test/OpenMP/fuse_messages.cpp create mode 100644 openmp/runtime/test/transform/fuse/foreach.cpp create mode 100644 openmp/runtime/test/transform/fuse/intfor.c create mode 100644 openmp/runtime/test/transform/fuse/iterfor.cpp create mode 100644 openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp create mode 100644 openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index c44e646a30f17..80140d2787608 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -1446,6 +1446,9 @@ def is_unexposed(self): # OpenMP stripe directive. OMP_STRIPE_DIRECTIVE = 310 + # OpenMP fuse directive. + OMP_FUSE_DIRECTIVE = 311 + # OpenACC Compute Construct. 
OPEN_ACC_COMPUTE_DIRECTIVE = 320 diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 6108e54a17390..68ca7bedddb06 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -482,6 +482,8 @@ implementation. +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | loop transformation apply clause | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop fuse transformation | :good:`done` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | workdistribute construct | | :none:`in progress` | @skc7, @mjklemm | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | task_iteration | :none:`unclaimed` | :none:`unclaimed` | | diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 98c889c08b329..270b5d336eba7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -599,6 +599,7 @@ OpenMP Support - Added support for ``defaultmap`` directive implicit-behavior ``storage``. - Added support for ``defaultmap`` directive implicit-behavior ``private``. - Added parsing and semantic analysis support for ``groupprivate`` directive. +- Added support for 'omp fuse' directive. Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index be038d9165fc6..f13d9c9307b40 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2162,6 +2162,10 @@ enum CXCursorKind { */ CXCursor_OMPStripeDirective = 310, + /** OpenMP fuse directive + */ + CXCursor_OMPFuseDirective = 311, + /** OpenACC Compute Construct. */ CXCursor_OpenACCComputeConstruct = 320, diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index 42b426815920d..68d220a77b18c 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -1149,6 +1149,80 @@ class OMPFullClause final : public OMPNoChildClause { static OMPFullClause *CreateEmpty(const ASTContext &C); }; +/// This class represents the 'looprange' clause in the +/// '#pragma omp fuse' directive +/// +/// \code {c} +/// #pragma omp fuse looprange(1,2) +/// { +/// for(int i = 0; i < 64; ++i) +/// for(int j = 0; j < 256; j+=2) +/// for(int k = 127; k >= 0; --k) +/// \endcode +class OMPLoopRangeClause final : public OMPClause { + friend class OMPClauseReader; + /// Location of '(' + SourceLocation LParenLoc; + + /// Location of first and count expressions + SourceLocation FirstLoc, CountLoc; + + /// Number of looprange arguments (always 2: first, count) + enum { FirstExpr, CountExpr, NumArgs }; + Stmt *Args[NumArgs] = {nullptr, nullptr}; + + /// Set looprange 'first' expression + void setFirst(Expr *E) { Args[FirstExpr] = E; } + + /// Set looprange 'count' expression + void setCount(Expr *E) { Args[CountExpr] = E; } + + /// Build an empty clause for deserialization. 
+ explicit OMPLoopRangeClause() + : OMPClause(llvm::omp::OMPC_looprange, {}, {}) {} + +public: + /// Build a 'looprange' clause AST node. + static OMPLoopRangeClause * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation FirstLoc, SourceLocation CountLoc, + SourceLocation EndLoc, Expr *First, Expr *Count); + + /// Build an empty 'looprange' clause node. + static OMPLoopRangeClause *CreateEmpty(const ASTContext &C); + + // Location getters/setters + SourceLocation getLParenLoc() const { return LParenLoc; } + SourceLocation getFirstLoc() const { return FirstLoc; } + SourceLocation getCountLoc() const { return CountLoc; } + + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + void setFirstLoc(SourceLocation Loc) { FirstLoc = Loc; } + void setCountLoc(SourceLocation Loc) { CountLoc = Loc; } + + /// Get looprange 'first' expression + Expr *getFirst() const { return cast_or_null(Args[FirstExpr]); } + + /// Get looprange 'count' expression + Expr *getCount() const { return cast_or_null(Args[CountExpr]); } + + child_range children() { return child_range(Args, Args + NumArgs); } + const_child_range children() const { + return const_child_range(Args, Args + NumArgs); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == llvm::omp::OMPC_looprange; + } +}; + /// Representation of the 'partial' clause of the '#pragma omp unroll' /// directive. /// diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index af1a073cc4a5a..7a2881f6124f3 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -3177,6 +3177,9 @@ DEF_TRAVERSE_STMT(OMPUnrollDirective, DEF_TRAVERSE_STMT(OMPReverseDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) +DEF_TRAVERSE_STMT(OMPFuseDirective, + { TRY_TO(TraverseOMPExecutableDirective(S)); }) + DEF_TRAVERSE_STMT(OMPInterchangeDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) @@ -3494,6 +3497,14 @@ bool RecursiveASTVisitor::VisitOMPFullClause(OMPFullClause *C) { return true; } +template +bool RecursiveASTVisitor::VisitOMPLoopRangeClause( + OMPLoopRangeClause *C) { + TRY_TO(TraverseStmt(C->getFirst())); + TRY_TO(TraverseStmt(C->getCount())); + return true; +} + template bool RecursiveASTVisitor::VisitOMPPartialClause(OMPPartialClause *C) { TRY_TO(TraverseStmt(C->getFactor())); diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index d9f87f1e49b40..bc6aeaa8d143c 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -21,6 +21,7 @@ #include "clang/AST/StmtCXX.h" #include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/SourceLocation.h" +#include "llvm/Support/Casting.h" namespace clang { @@ -677,6 +678,10 @@ class OMPParallelDirective : public OMPExecutableDirective { } }; +// Forward declaration of a generic loop transformation. Used in the declaration +// of OMPLoopBasedDirective. +class OMPLoopTransformationDirective; + /// The base class for all loop-based directives, including loop transformation /// directives. 
class OMPLoopBasedDirective : public OMPExecutableDirective { @@ -889,24 +894,23 @@ class OMPLoopBasedDirective : public OMPExecutableDirective { /// Calls the specified callback function for all the loops in \p CurStmt, /// from the outermost to the innermost. - static bool doForAllLoops( - Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, - llvm::function_ref Callback, - llvm::function_ref - OnTransformationCallback); + static bool + doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops, + unsigned NumLoops, + llvm::function_ref Callback, + llvm::function_ref + OnTransformationCallback); static bool doForAllLoops(const Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, llvm::function_ref Callback, - llvm::function_ref< - void(const OMPCanonicalLoopNestTransformationDirective *)> + llvm::function_ref OnTransformationCallback) { auto &&NewCallback = [Callback](unsigned Cnt, Stmt *CurStmt) { return Callback(Cnt, CurStmt); }; auto &&NewTransformCb = - [OnTransformationCallback]( - OMPCanonicalLoopNestTransformationDirective *A) { + [OnTransformationCallback](OMPLoopTransformationDirective *A) { OnTransformationCallback(A); }; return doForAllLoops(const_cast(CurStmt), TryImperfectlyNestedLoops, @@ -919,7 +923,7 @@ class OMPLoopBasedDirective : public OMPExecutableDirective { doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, llvm::function_ref Callback) { - auto &&TransformCb = [](OMPCanonicalLoopNestTransformationDirective *) {}; + auto &&TransformCb = [](OMPLoopTransformationDirective *) {}; return doForAllLoops(CurStmt, TryImperfectlyNestedLoops, NumLoops, Callback, TransformCb); } @@ -957,9 +961,11 @@ class OMPLoopBasedDirective : public OMPExecutableDirective { }; /// Common class of data shared between -/// OMPCanonicalLoopNestTransformationDirective and transformations over -/// canonical loop sequences. +/// OMPCanonicalLoopNestTransformationDirective and +/// OMPCanonicalLoopSequenceTransformationDirective class OMPLoopTransformationDirective { + friend class ASTStmtReader; + /// Number of (top-level) generated loops. /// This value is 1 for most transformations as they only map one loop nest /// into another. @@ -969,15 +975,39 @@ class OMPLoopTransformationDirective { /// generate more than one loop nest, so the value would be >= 1. unsigned NumGeneratedTopLevelLoops = 1; + /// We need this because we cannot easily make OMPLoopTransformationDirective + /// a proper Stmt. + Stmt *S = nullptr; + protected: void setNumGeneratedTopLevelLoops(unsigned N) { NumGeneratedTopLevelLoops = N; } + explicit OMPLoopTransformationDirective(Stmt *S) : S(S) {} + public: unsigned getNumGeneratedTopLevelLoops() const { return NumGeneratedTopLevelLoops; } + + /// Returns the specific directive related to this loop transformation. + Stmt *getDirective() const { return S; } + + /// Get the de-sugared statements after the loop transformation. + /// + /// Might be nullptr if either the directive generates no loops and is handled + /// directly in CodeGen, or resolving a template-dependence context is + /// required. + Stmt *getTransformedStmt() const; + + /// Return preinits statement. + Stmt *getPreInits() const; + + static bool classof(const Stmt *T) { + return isa(T); + } }; /// The base class for all transformation directives of canonical loop nests. 
@@ -990,7 +1020,8 @@ class OMPCanonicalLoopNestTransformationDirective explicit OMPCanonicalLoopNestTransformationDirective( StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc, SourceLocation EndLoc, unsigned NumAssociatedLoops) - : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops) {} + : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops), + OMPLoopTransformationDirective(this) {} public: /// Return the number of associated (consumed) loops. @@ -5928,6 +5959,112 @@ class OMPInterchangeDirective final } }; +/// The base class for all transformation directives of canonical loop +/// sequences (currently only 'fuse') +class OMPCanonicalLoopSequenceTransformationDirective + : public OMPExecutableDirective, + public OMPLoopTransformationDirective { + friend class ASTStmtReader; + +protected: + explicit OMPCanonicalLoopSequenceTransformationDirective( + StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc, + SourceLocation EndLoc) + : OMPExecutableDirective(SC, Kind, StartLoc, EndLoc), + OMPLoopTransformationDirective(this) {} + +public: + /// Get the de-sugared statements after the loop transformation. + /// + /// Might be nullptr if either the directive generates no loops and is handled + /// directly in CodeGen, or resolving a template-dependence context is + /// required. + Stmt *getTransformedStmt() const; + + /// Return preinits statement. + Stmt *getPreInits() const; + + static bool classof(const Stmt *T) { + Stmt::StmtClass C = T->getStmtClass(); + return C == OMPFuseDirectiveClass; + } +}; + +/// Represents the '#pragma omp fuse' loop transformation directive +/// +/// \code{c} +/// #pragma omp fuse +/// { +/// for(int i = 0; i < m1; ++i) {...} +/// for(int j = 0; j < m2; ++j) {...} +/// ... +/// } +/// \endcode +class OMPFuseDirective final + : public OMPCanonicalLoopSequenceTransformationDirective { + friend class ASTStmtReader; + friend class OMPExecutableDirective; + + // Offsets of child members. + enum { + PreInitsOffset = 0, + TransformedStmtOffset, + }; + + explicit OMPFuseDirective(SourceLocation StartLoc, SourceLocation EndLoc) + : OMPCanonicalLoopSequenceTransformationDirective( + OMPFuseDirectiveClass, llvm::omp::OMPD_fuse, StartLoc, EndLoc) {} + + void setPreInits(Stmt *PreInits) { + Data->getChildren()[PreInitsOffset] = PreInits; + } + + void setTransformedStmt(Stmt *S) { + Data->getChildren()[TransformedStmtOffset] = S; + } + +public: + /// Create a new AST node representation for #pragma omp fuse' + /// + /// \param C Context of the AST + /// \param StartLoc Location of the introducer (e.g the 'omp' token) + /// \param EndLoc Location of the directive's end (e.g the tok::eod) + /// \param Clauses The directive's clauses + /// \param NumLoops Total number of loops in the canonical loop sequence. + /// \param NumGeneratedTopLevelLoops Number of top-level generated loops. + // Typically 1 but looprange clause can + // change this. 
+ /// \param AssociatedStmt The outermost associated loop + /// \param TransformedStmt The loop nest after fusion, or nullptr in + /// dependent + /// \param PreInits Helper preinits statements for the loop nest + static OMPFuseDirective * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + ArrayRef Clauses, unsigned NumGeneratedTopLevelLoops, + Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits); + + /// Build an empty '#pragma omp fuse' AST node for deserialization + /// + /// \param C Context of the AST + /// \param NumClauses Number of clauses to allocate + /// \param NumLoops Number of top level loops to allocate + static OMPFuseDirective *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + + /// Gets the associated loops after the transformation. This is the de-sugared + /// replacement or nulltpr in dependent contexts. + Stmt *getTransformedStmt() const { + return Data->getChildren()[TransformedStmtOffset]; + } + + /// Return preinits statement. + Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; } + + static bool classof(const Stmt *T) { + return T->getStmtClass() == OMPFuseDirectiveClass; + } +}; + /// This represents '#pragma omp scan' directive. /// /// \code @@ -6596,4 +6733,37 @@ class OMPAssumeDirective final : public OMPExecutableDirective { } // end namespace clang +namespace llvm { +// Allow a Stmt* be casted correctly to an OMPLoopTransformationDirective*. +// The default routines would just use a C-style cast which won't work well +// for the multiple inheritance here. We have to use a static cast from the +// corresponding subclass. +template <> +struct CastInfo + : public NullableValueCastFailed, + public DefaultDoCastIfPossible< + clang::OMPLoopTransformationDirective *, clang::Stmt *, + CastInfo> { + static bool isPossible(const clang::Stmt *T) { + return clang::OMPLoopTransformationDirective::classof(T); + } + + static clang::OMPLoopTransformationDirective *doCast(clang::Stmt *T) { + if (auto *D = + dyn_cast(T)) + return static_cast(D); + if (auto *D = + dyn_cast(T)) + return static_cast(D); + llvm_unreachable("unexpected type"); + } +}; +template <> +struct CastInfo + : public ConstStrippingForwardingCast< + clang::OMPLoopTransformationDirective, const clang::Stmt *, + CastInfo> {}; + +} // namespace llvm + #endif diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dc4c6d3de27c1..b8d031ed28d06 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11761,6 +11761,18 @@ def note_omp_implicit_dsa : Note< "implicitly determined as %0">; def err_omp_loop_var_dsa : Error< "loop iteration variable in the associated loop of 'omp %1' directive may not be %0, predetermined as %2">; +def err_omp_not_a_loop_sequence + : Error<"statement after '#pragma omp %0' must be a loop sequence " + "containing canonical loops or loop-generating constructs">; +def err_omp_empty_loop_sequence + : Error<"loop sequence after '#pragma omp %0' must contain at least 1 " + "canonical loop or loop-generating construct">; +def err_omp_invalid_looprange + : Error<"looprange clause selects loops from %1 to %2 but this exceeds the " + "number of loops (%3) in the loop sequence">; +def warn_omp_redundant_fusion : Warning<"looprange clause selects a single " + "loop, resulting in redundant fusion">, + InGroup; def err_omp_not_for : Error< "%select{statement after '#pragma omp %1' must be a for loop|" 
"expected %2 for loops after '#pragma omp %1'%select{|, but found only %4}3}0">; diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 4c988e0dacb57..ed89a31e2684b 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -391,6 +391,13 @@ bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind); bool isOpenMPCanonicalLoopNestTransformationDirective( OpenMPDirectiveKind DKind); +/// Checks if the specified directive is a loop transformation directive that +/// applies to a canonical loop sequence. +/// \param DKind Specified directive. +/// \return True iff the directive is a loop transformation. +bool isOpenMPCanonicalLoopSequenceTransformationDirective( + OpenMPDirectiveKind DKind); + /// Checks if the specified directive is a loop transformation directive. /// \param DKind Specified directive. /// \return True iff the directive is a loop transformation. diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index dd1a24405fae7..bf3686bb372d5 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -238,6 +238,10 @@ def OMPUnrollDirective : StmtNode; def OMPReverseDirective : StmtNode; def OMPInterchangeDirective : StmtNode; +def OMPCanonicalLoopSequenceTransformationDirective + : StmtNode; +def OMPFuseDirective + : StmtNode; def OMPForDirective : StmtNode; def OMPForSimdDirective : StmtNode; def OMPSectionsDirective : StmtNode; diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 30edd303e1824..e301cf1080977 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -6767,6 +6767,9 @@ class Parser : public CodeCompletionHandler { OpenMPClauseKind Kind, bool ParseOnly); + /// Parses the 'looprange' clause of a '#pragma omp fuse' directive. + OMPClause *ParseOpenMPLoopRangeClause(); + /// Parses the 'sizes' clause of a '#pragma omp tile' directive. OMPClause *ParseOpenMPSizesClause(); diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index c0fd7a6d63611..daf58b18a03cb 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -463,6 +463,13 @@ class SemaOpenMP : public SemaBase { Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc); + + /// Called on well-formed '#pragma omp fuse' after parsing of its + /// clauses and the associated statement. + StmtResult ActOnOpenMPFuseDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp for' after parsing /// of the associated statement. StmtResult @@ -921,6 +928,12 @@ class SemaOpenMP : public SemaBase { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + + /// Called on well-form 'looprange' clause after parsing its arguments. + OMPClause * + ActOnOpenMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation FirstLoc, + SourceLocation CountLoc, SourceLocation EndLoc); /// Called on well-formed 'ordered' clause. 
OMPClause * ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc, @@ -1485,7 +1498,81 @@ class SemaOpenMP : public SemaBase { bool checkTransformableLoopNest( OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, SmallVectorImpl &LoopHelpers, - Stmt *&Body, SmallVectorImpl> &OriginalInits); + Stmt *&Body, SmallVectorImpl> &OriginalInits); + + /// Holds the result of the analysis of a (possibly canonical) loop. + struct LoopAnalysis { + /// The analyzed loop or loop transformation. + Stmt *AStmt = nullptr; + /// Loop analyses results. + OMPLoopBasedDirective::HelperExprs HelperExprs; + /// The for-statement of the loop. TheForStmt equals AStmt only when the + /// latter is a canonical loop (i.e. not a loop transformation). + Stmt *TheForStmt = nullptr; + /// Initialization statements before transformations. + SmallVector OriginalInits; + /// Initialization statements required after transformation of this loop. + SmallVector TransformsPreInits; + + explicit LoopAnalysis(Stmt *S) : AStmt(S) {} + + bool isRegularLoop() const { return isRegularLoop(AStmt); } + bool isLoopTransformation() const { return isLoopTransformation(AStmt); } + + // Convenience functions used when building LoopSequenceAnalysis. + static bool isRegularLoop(Stmt *S) { + return isa(S); + } + static bool isLoopTransformation(Stmt *S) { + return isa(S); + } + }; + + /// Holds the result of the analysis of a (possibly canonical) loop sequence. + struct LoopSequenceAnalysis { + /// Number of top level canonical loops. + unsigned LoopSeqSize = 0; + /// For each loop results of the analysis. + SmallVector Loops; + /// Additional code required before entering the transformed loop sequence. + SmallVector LoopSequencePreInits; + + // Convenience function used when building the LoopSequenceAnalysis. + static bool isLoopSequenceDerivation(Stmt *S) { + return LoopAnalysis::isRegularLoop(S) || + LoopAnalysis::isLoopTransformation(S); + } + }; + + /// The main recursive process of `checkTransformableLoopSequence` that + /// performs grammatical parsing of a canonical loop sequence. It extracts + /// key information, such as the number of top-level loops, loop statements, + /// helper expressions, and other relevant loop-related data, all in a single + /// execution to avoid redundant traversals. This analysis flattens inner + /// Loop Sequences + /// + /// \param LoopSeqStmt The AST of the original statement. + /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt + /// \param Context + /// \param Kind The loop transformation directive kind. + /// \return Whether the original statement is both syntactically and + /// semantically correct according to OpenMP 6.0 canonical loop + /// sequence definition. + bool analyzeLoopSequence(Stmt *LoopSeqStmt, LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context, OpenMPDirectiveKind Kind); + + /// Validates and checks whether a loop sequence can be transformed according + /// to the given directive, providing necessary setup and initialization + /// (Driver function) before recursion using `analyzeLoopSequence`. + /// + /// \param Kind The loop transformation directive kind. 
+ /// \param AStmt The AST of the original statement + /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt + /// \param Context + /// \return Whether there was an absence of errors or not + bool checkTransformableLoopSequence(OpenMPDirectiveKind Kind, Stmt *AStmt, + LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context); /// Helper to keep information about the current `omp begin/end declare /// variant` nesting. diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 441047d64f48c..99864c7373908 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1951,6 +1951,7 @@ enum StmtCode { STMT_OMP_UNROLL_DIRECTIVE, STMT_OMP_REVERSE_DIRECTIVE, STMT_OMP_INTERCHANGE_DIRECTIVE, + STMT_OMP_FUSE_DIRECTIVE, STMT_OMP_FOR_DIRECTIVE, STMT_OMP_FOR_SIMD_DIRECTIVE, STMT_OMP_SECTIONS_DIRECTIVE, diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 55b93e1eb4034..2ce4419940e52 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1024,6 +1024,26 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) { return new (C) OMPPartialClause(); } +OMPLoopRangeClause * +OMPLoopRangeClause::Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation FirstLoc, + SourceLocation CountLoc, SourceLocation EndLoc, + Expr *First, Expr *Count) { + OMPLoopRangeClause *Clause = CreateEmpty(C); + Clause->setLocStart(StartLoc); + Clause->setLParenLoc(LParenLoc); + Clause->setFirstLoc(FirstLoc); + Clause->setCountLoc(CountLoc); + Clause->setLocEnd(EndLoc); + Clause->setFirst(First); + Clause->setCount(Count); + return Clause; +} + +OMPLoopRangeClause *OMPLoopRangeClause::CreateEmpty(const ASTContext &C) { + return new (C) OMPLoopRangeClause(); +} + OMPAllocateClause *OMPAllocateClause::Create( const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, @@ -1964,6 +1984,21 @@ void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) { } } +void OMPClausePrinter::VisitOMPLoopRangeClause(OMPLoopRangeClause *Node) { + OS << "looprange"; + + Expr *First = Node->getFirst(); + Expr *Count = Node->getCount(); + + if (First && Count) { + OS << "("; + First->printPretty(OS, nullptr, Policy, 0); + OS << ","; + Count->printPretty(OS, nullptr, Policy, 0); + OS << ")"; + } +} + void OMPClausePrinter::VisitOMPAllocatorClause(OMPAllocatorClause *Node) { OS << "allocator("; Node->getAllocator()->printPretty(OS, nullptr, Policy, 0); diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 1f6586f95a9f8..a5b0cd3786a28 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -125,13 +125,12 @@ OMPLoopBasedDirective::tryToFindNextInnerLoop(Stmt *CurStmt, bool OMPLoopBasedDirective::doForAllLoops( Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, llvm::function_ref Callback, - llvm::function_ref + llvm::function_ref OnTransformationCallback) { CurStmt = CurStmt->IgnoreContainers(); for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) { while (true) { - auto *Dir = - dyn_cast(CurStmt); + auto *Dir = dyn_cast(CurStmt); if (!Dir) break; @@ -371,6 +370,22 @@ OMPForDirective *OMPForDirective::Create( return Dir; } +Stmt *OMPLoopTransformationDirective::getTransformedStmt() const { + if (auto *D = dyn_cast(S)) + return D->getTransformedStmt(); + if (auto *D = dyn_cast(S)) 
+ return D->getTransformedStmt(); + llvm_unreachable("unexpected object type"); +} + +Stmt *OMPLoopTransformationDirective::getPreInits() const { + if (auto *D = dyn_cast(S)) + return D->getPreInits(); + if (auto *D = dyn_cast(S)) + return D->getPreInits(); + llvm_unreachable("unexpected object type"); +} + Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const { switch (getStmtClass()) { #define STMT(CLASS, PARENT) @@ -380,7 +395,7 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const { return static_cast(this)->getTransformedStmt(); #include "clang/AST/StmtNodes.inc" default: - llvm_unreachable("Not a loop transformation"); + llvm_unreachable("Not a loop transformation for canonical loop nests"); } } @@ -393,7 +408,34 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getPreInits() const { return static_cast(this)->getPreInits(); #include "clang/AST/StmtNodes.inc" default: - llvm_unreachable("Not a loop transformation"); + llvm_unreachable("Not a loop transformation for canonical loop nests"); + } +} + +Stmt * +OMPCanonicalLoopSequenceTransformationDirective::getTransformedStmt() const { + switch (getStmtClass()) { +#define STMT(CLASS, PARENT) +#define ABSTRACT_STMT(CLASS) +#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + case Stmt::CLASS##Class: \ + return static_cast(this)->getTransformedStmt(); +#include "clang/AST/StmtNodes.inc" + default: + llvm_unreachable("Not a loop transformation for canonical loop sequences"); + } +} + +Stmt *OMPCanonicalLoopSequenceTransformationDirective::getPreInits() const { + switch (getStmtClass()) { +#define STMT(CLASS, PARENT) +#define ABSTRACT_STMT(CLASS) +#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + case Stmt::CLASS##Class: \ + return static_cast(this)->getPreInits(); +#include "clang/AST/StmtNodes.inc" + default: + llvm_unreachable("Not a loop transformation for canonical loop sequences"); } } @@ -510,6 +552,27 @@ OMPInterchangeDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses, SourceLocation(), SourceLocation(), NumLoops); } +OMPFuseDirective *OMPFuseDirective::Create( + const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + ArrayRef Clauses, unsigned NumGeneratedTopLevelLoops, + Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits) { + + OMPFuseDirective *Dir = createDirective( + C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc); + Dir->setTransformedStmt(TransformedStmt); + Dir->setPreInits(PreInits); + Dir->setNumGeneratedTopLevelLoops(NumGeneratedTopLevelLoops); + return Dir; +} + +OMPFuseDirective *OMPFuseDirective::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + OMPFuseDirective *Dir = createEmptyDirective( + C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1, + SourceLocation(), SourceLocation()); + return Dir; +} + OMPForSimdDirective * OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, unsigned CollapsedNum, diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 2c9c3581a2962..586c3000f105c 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -795,6 +795,11 @@ void StmtPrinter::VisitOMPInterchangeDirective(OMPInterchangeDirective *Node) { PrintOMPExecutableDirective(Node); } +void StmtPrinter::VisitOMPFuseDirective(OMPFuseDirective *Node) { + Indent() << "#pragma omp fuse"; + PrintOMPExecutableDirective(Node); +} + void 
StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) { Indent() << "#pragma omp for"; PrintOMPExecutableDirective(Node); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 8b3af94e1a8bc..589a156a2b6ea 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -510,6 +510,13 @@ void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) { Profiler->VisitExpr(Factor); } +void OMPClauseProfiler::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) { + if (const Expr *First = C->getFirst()) + Profiler->VisitExpr(First); + if (const Expr *Count = C->getCount()) + Profiler->VisitExpr(Count); +} + void OMPClauseProfiler::VisitOMPAllocatorClause(const OMPAllocatorClause *C) { if (C->getAllocator()) Profiler->VisitStmt(C->getAllocator()); @@ -1025,6 +1032,15 @@ void StmtProfiler::VisitOMPInterchangeDirective( VisitOMPCanonicalLoopNestTransformationDirective(S); } +void StmtProfiler::VisitOMPCanonicalLoopSequenceTransformationDirective( + const OMPCanonicalLoopSequenceTransformationDirective *S) { + VisitOMPExecutableDirective(S); +} + +void StmtProfiler::VisitOMPFuseDirective(const OMPFuseDirective *S) { + VisitOMPCanonicalLoopSequenceTransformationDirective(S); +} + void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) { VisitOMPLoopDirective(S); } diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 387026e2d712f..64b2bff063340 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -282,6 +282,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str, case OMPC_affinity: case OMPC_when: case OMPC_append_args: + case OMPC_looprange: break; default: break; @@ -627,6 +628,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_affinity: case OMPC_when: case OMPC_append_args: + case OMPC_looprange: break; default: break; @@ -755,9 +757,14 @@ bool clang::isOpenMPCanonicalLoopNestTransformationDirective( DKind == OMPD_interchange || DKind == OMPD_stripe; } +bool clang::isOpenMPCanonicalLoopSequenceTransformationDirective( + OpenMPDirectiveKind DKind) { + return DKind == OMPD_fuse; +} + bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) { - // FIXME: There will be more cases when we implement 'fuse'. 
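// Illustration of the predicate split (hedged expectations, not generated
// output): after the change just below, the generic query becomes the union
// of the two transformation families, e.g.
//
//   isOpenMPCanonicalLoopNestTransformationDirective(OMPD_tile)     -> true
//   isOpenMPCanonicalLoopNestTransformationDirective(OMPD_fuse)     -> false
//   isOpenMPCanonicalLoopSequenceTransformationDirective(OMPD_fuse) -> true
//   isOpenMPLoopTransformationDirective(OMPD_fuse)                  -> true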
- return isOpenMPCanonicalLoopNestTransformationDirective(DKind); + return isOpenMPCanonicalLoopNestTransformationDirective(DKind) || + isOpenMPCanonicalLoopSequenceTransformationDirective(DKind); } bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) { diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index e62bc7617da3d..92636f27fd4e5 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -234,6 +234,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OMPInterchangeDirectiveClass: EmitOMPInterchangeDirective(cast(*S)); break; + case Stmt::OMPFuseDirectiveClass: + EmitOMPFuseDirective(cast(*S)); + break; case Stmt::OMPForDirectiveClass: EmitOMPForDirective(cast(*S)); break; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index ba9c7c60144e6..efc06a276267a 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -201,6 +201,24 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { } else { llvm_unreachable("Unknown loop-based directive kind."); } + doEmitPreinits(PreInits); + PreCondVars.restore(CGF); + } + + void + emitPreInitStmt(CodeGenFunction &CGF, + const OMPCanonicalLoopSequenceTransformationDirective &S) { + const Stmt *PreInits; + if (const auto *Fuse = dyn_cast(&S)) { + PreInits = Fuse->getPreInits(); + } else { + llvm_unreachable( + "Unknown canonical loop sequence transform directive kind."); + } + doEmitPreinits(PreInits); + } + + void doEmitPreinits(const Stmt *PreInits) { if (PreInits) { // CompoundStmts and DeclStmts are used as lists of PreInit statements and // declarations. Since declarations must be visible in the the following @@ -222,7 +240,6 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { CGF.EmitStmt(S); } } - PreCondVars.restore(CGF); } public: @@ -230,6 +247,11 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { : CodeGenFunction::RunCleanupsScope(CGF) { emitPreInitStmt(CGF, S); } + OMPLoopScope(CodeGenFunction &CGF, + const OMPCanonicalLoopSequenceTransformationDirective &S) + : CodeGenFunction::RunCleanupsScope(CGF) { + emitPreInitStmt(CGF, S); + } }; class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope { @@ -1929,6 +1951,15 @@ class OMPTransformDirectiveScopeRAII { CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP); CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI); } + if (const auto *Dir = + dyn_cast(S)) { + // For simplicity we reuse the loop scope similarly to what we do with + // OMPCanonicalLoopNestTransformationDirective do by being a subclass + // of OMPLoopBasedDirective. 
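// Hedged sketch of what the scope below makes codegen see for a complete
// fusion of two loops; identifiers are plain C++ stand-ins for the .omp.*
// helper variables that SemaOpenMP builds later in this patch:
//
//   int ni0 = /*trip count 0*/, ni1 = /*trip count 1*/; // preinits
//   int fusemax = ni0 > ni1 ? ni0 : ni1;                // .omp.fuse.max
//   for (int idx = 0; idx < fusemax; ++idx) {           // transformed stmt
//     if (idx < ni0) { /* iv0 = lb0 + st0 * idx; body(0); */ }
//     if (idx < ni1) { /* iv1 = lb1 + st1 * idx; body(1); */ }
//   }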
+ Scope = new OMPLoopScope(CGF, *Dir); + CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP); + CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI); + } } ~OMPTransformDirectiveScopeRAII() { if (!Scope) @@ -1956,8 +1987,7 @@ static void emitBody(CodeGenFunction &CGF, const Stmt *S, const Stmt *NextLoop, return; } if (SimplifiedS == NextLoop) { - if (auto *Dir = - dyn_cast(SimplifiedS)) + if (auto *Dir = dyn_cast(SimplifiedS)) SimplifiedS = Dir->getTransformedStmt(); if (const auto *CanonLoop = dyn_cast(SimplifiedS)) SimplifiedS = CanonLoop->getLoopStmt(); @@ -2952,6 +2982,12 @@ void CodeGenFunction::EmitOMPInterchangeDirective( EmitStmt(S.getTransformedStmt()); } +void CodeGenFunction::EmitOMPFuseDirective(const OMPFuseDirective &S) { + // Emit the de-sugared statement + OMPTransformDirectiveScopeRAII FuseScope(*this, &S); + EmitStmt(S.getTransformedStmt()); +} + void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) { bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder; diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 727487b46054f..f0565c1de04c4 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3861,6 +3861,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitOMPUnrollDirective(const OMPUnrollDirective &S); void EmitOMPReverseDirective(const OMPReverseDirective &S); void EmitOMPInterchangeDirective(const OMPInterchangeDirective &S); + void EmitOMPFuseDirective(const OMPFuseDirective &S); void EmitOMPForDirective(const OMPForDirective &S); void EmitOMPForSimdDirective(const OMPForSimdDirective &S); void EmitOMPScopeDirective(const OMPScopeDirective &S); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 02f3f109b2562..04f29c83dd457 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2968,6 +2968,39 @@ OMPClause *Parser::ParseOpenMPSizesClause() { OpenLoc, CloseLoc); } +OMPClause *Parser::ParseOpenMPLoopRangeClause() { + SourceLocation ClauseNameLoc = ConsumeToken(); + SourceLocation FirstLoc, CountLoc; + + BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end); + if (T.consumeOpen()) { + Diag(Tok, diag::err_expected) << tok::l_paren; + return nullptr; + } + + FirstLoc = Tok.getLocation(); + ExprResult FirstVal = ParseConstantExpression(); + if (!FirstVal.isUsable()) { + T.skipToEnd(); + return nullptr; + } + + ExpectAndConsume(tok::comma); + + CountLoc = Tok.getLocation(); + ExprResult CountVal = ParseConstantExpression(); + if (!CountVal.isUsable()) { + T.skipToEnd(); + return nullptr; + } + + T.consumeClose(); + + return Actions.OpenMP().ActOnOpenMPLoopRangeClause( + FirstVal.get(), CountVal.get(), ClauseNameLoc, T.getOpenLocation(), + FirstLoc, CountLoc, T.getCloseLocation()); +} + OMPClause *Parser::ParseOpenMPPermutationClause() { SourceLocation ClauseNameLoc, OpenLoc, CloseLoc; SmallVector ArgExprs; @@ -3473,6 +3506,9 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, } Clause = ParseOpenMPClause(CKind, WrongDirective); break; + case OMPC_looprange: + Clause = ParseOpenMPLoopRangeClause(); + break; default: break; } diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 552c92996dc2e..a0483c3027199 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1493,6 +1493,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OMPUnrollDirectiveClass: case 
Stmt::OMPReverseDirectiveClass: case Stmt::OMPInterchangeDirectiveClass: + case Stmt::OMPFuseDirectiveClass: case Stmt::OMPSingleDirectiveClass: case Stmt::OMPTargetDataDirectiveClass: case Stmt::OMPTargetDirectiveClass: diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 48e06d1dc7579..f5feed6206494 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -4569,6 +4569,7 @@ void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, case OMPD_unroll: case OMPD_reverse: case OMPD_interchange: + case OMPD_fuse: case OMPD_assume: break; default: @@ -6410,6 +6411,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPInterchangeDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); break; + case OMPD_fuse: + Res = + ActOnOpenMPFuseDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); + break; case OMPD_for: Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); @@ -9488,7 +9493,9 @@ static bool checkOpenMPIterationSpace( // sharing attributes. VarsWithImplicitDSA.erase(LCDecl); - assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars"); + assert((isOpenMPLoopDirective(DKind) || + isOpenMPCanonicalLoopSequenceTransformationDirective(DKind)) && + "DSA for non-loop vars"); // Check test-expr. HasErrors |= ISC.checkAndSetCond(For ? For->getCond() : CXXFor->getCond()); @@ -9916,7 +9923,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, unsigned NumLoops = std::max(OrderedLoopCount, NestedLoopCount); SmallVector IterSpaces(NumLoops); if (!OMPLoopBasedDirective::doForAllLoops( - AStmt->IgnoreContainers(!isOpenMPLoopTransformationDirective(DKind)), + AStmt->IgnoreContainers( + !isOpenMPCanonicalLoopNestTransformationDirective(DKind)), SupportsNonPerfectlyNested, NumLoops, [DKind, &SemaRef, &DSA, NumLoops, NestedLoopCount, CollapseLoopCountExpr, OrderedLoopCountExpr, &VarsWithImplicitDSA, @@ -9938,8 +9946,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, } return false; }, - [&SemaRef, - &Captures](OMPCanonicalLoopNestTransformationDirective *Transform) { + [&SemaRef, &Captures](OMPLoopTransformationDirective *Transform) { Stmt *DependentPreInits = Transform->getPreInits(); if (!DependentPreInits) return; @@ -9954,7 +9961,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, auto *D = cast(C); DeclRefExpr *Ref = buildDeclRefExpr( SemaRef, D, D->getType().getNonReferenceType(), - Transform->getBeginLoc()); + cast(Transform->getDirective()) + ->getBeginLoc()); Captures[Ref] = Ref; } } @@ -14404,10 +14412,34 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } +/// Updates OriginalInits by checking Transform against loop transformation +/// directives and appending their pre-inits if a match is found. 
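/// A self-contained illustration (not Clang code) of the StmtNodes.inc
/// X-macro pattern the switch below relies on: the class list lives in one
/// generated file and every 'case' is macro-expanded from it, so a newly
/// added directive class cannot be silently missed.
///
///   #define FOR_EACH_DIRECTIVE(X) X(Fuse) X(Interchange)
///   enum class Kind {
///   #define X(N) N,
///     FOR_EACH_DIRECTIVE(X)
///   #undef X
///   };
///   const char *name(Kind K) {
///     switch (K) {
///   #define X(N) case Kind::N: return #N;
///       FOR_EACH_DIRECTIVE(X)
///   #undef X
///     }
///     return "unknown";
///   }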
+static void updatePreInits(OMPLoopTransformationDirective *Transform, + SmallVectorImpl &PreInits) { + Stmt *Dir = Transform->getDirective(); + switch (Dir->getStmtClass()) { +#define STMT(CLASS, PARENT) +#define ABSTRACT_STMT(CLASS) +#define COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) \ + case Stmt::CLASS##Class: \ + appendFlattenedStmtList(PreInits, \ + static_cast(Dir)->getPreInits()); \ + break; +#define OMPCANONICALLOOPNESTTRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) +#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) +#include "clang/AST/StmtNodes.inc" +#undef COMMON_OMP_LOOP_TRANSFORMATION + default: + llvm_unreachable("Not a loop transformation"); + } +} + bool SemaOpenMP::checkTransformableLoopNest( OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, SmallVectorImpl &LoopHelpers, - Stmt *&Body, SmallVectorImpl> &OriginalInits) { + Stmt *&Body, SmallVectorImpl> &OriginalInits) { OriginalInits.emplace_back(); bool Result = OMPLoopBasedDirective::doForAllLoops( AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops, @@ -14433,29 +14465,268 @@ bool SemaOpenMP::checkTransformableLoopNest( OriginalInits.emplace_back(); return false; }, - [&OriginalInits](OMPLoopBasedDirective *Transform) { - Stmt *DependentPreInits; - if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else - llvm_unreachable("Unhandled loop transformation"); - - appendFlattenedStmtList(OriginalInits.back(), DependentPreInits); + [&OriginalInits](OMPLoopTransformationDirective *Transform) { + updatePreInits(Transform, OriginalInits.back()); }); assert(OriginalInits.back().empty() && "No preinit after innermost loop"); OriginalInits.pop_back(); return Result; } -/// Add preinit statements that need to be propageted from the selected loop. +/// Counts the total number of OpenMP canonical nested loops, including the +/// outermost loop (the original loop). PRECONDITION of this visitor is that it +/// must be invoked from the original loop to be analyzed. The traversal stops +/// for Decl's and Expr's given that they may contain inner loops that must not +/// be counted. +/// +/// Example AST structure for the code: +/// +/// int main() { +/// #pragma omp fuse +/// { +/// for (int i = 0; i < 100; i++) { <-- Outer loop +/// []() { +/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (1) +/// }; +/// for(int j = 0; j < 5; ++j) {} <-- Inner loop +/// } +/// for (int r = 0; i < 100; i++) { <-- Outer loop +/// struct LocalClass { +/// void bar() { +/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (2) +/// } +/// }; +/// for(int k = 0; k < 10; ++k) {} <-- Inner loop +/// {x = 5; for(k = 0; k < 10; ++k) x += k; x}; <-- NOT A LOOP (3) +/// } +/// } +/// } +/// (1) because in a different function (here: a lambda) +/// (2) because in a different function (here: class method) +/// (3) because considered to be intervening-code of non-perfectly nested loop +/// Result: Loop 'i' contains 2 loops, Loop 'r' also contains 2 loops. 
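/// Typical use, mirroring AnalyzeRegularLoop further down in this patch
/// (hedged sketch; 'OuterLoop' stands for whatever Stmt* points at the
/// outermost for-statement):
///
///   NestedLoopCounterVisitor NLCV;
///   NLCV.TraverseStmt(OuterLoop);
///   unsigned Count = NLCV.getNestedLoopCount(); // counts OuterLoop itself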
+class NestedLoopCounterVisitor final : public DynamicRecursiveASTVisitor { +private: + unsigned NestedLoopCount = 0; + +public: + explicit NestedLoopCounterVisitor() = default; + + unsigned getNestedLoopCount() const { return NestedLoopCount; } + + bool VisitForStmt(ForStmt *FS) override { + ++NestedLoopCount; + return true; + } + + bool VisitCXXForRangeStmt(CXXForRangeStmt *FRS) override { + ++NestedLoopCount; + return true; + } + + bool TraverseStmt(Stmt *S) override { + if (!S) + return true; + + // Skip traversal of all expressions, including special cases like + // LambdaExpr, StmtExpr, BlockExpr, and RequiresExpr. These expressions + // may contain inner statements (and even loops), but they are not part + // of the syntactic body of the surrounding loop structure. + // Therefore must not be counted. + if (isa(S)) + return true; + + // Only recurse into CompoundStmt (block {}) and loop bodies. + if (isa(S)) { + return DynamicRecursiveASTVisitor::TraverseStmt(S); + } + + // Stop traversal of the rest of statements, that break perfect + // loop nesting, such as control flow (IfStmt, SwitchStmt...). + return true; + } + + bool TraverseDecl(Decl *D) override { + // Stop in the case of finding a declaration, it is not important + // in order to find nested loops (Possible CXXRecordDecl, RecordDecl, + // FunctionDecl...). + return true; + } +}; + +bool SemaOpenMP::analyzeLoopSequence(Stmt *LoopSeqStmt, + LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context, + OpenMPDirectiveKind Kind) { + VarsWithInheritedDSAType TmpDSA; + // Helper Lambda to handle storing initialization and body statements for + // both ForStmt and CXXForRangeStmt. + auto StoreLoopStatements = [](LoopAnalysis &Analysis, Stmt *LoopStmt) { + if (auto *For = dyn_cast(LoopStmt)) { + Analysis.OriginalInits.push_back(For->getInit()); + Analysis.TheForStmt = For; + } else { + auto *CXXFor = cast(LoopStmt); + Analysis.OriginalInits.push_back(CXXFor->getBeginStmt()); + Analysis.TheForStmt = CXXFor; + } + }; + + // Helper lambda functions to encapsulate the processing of different + // derivations of the canonical loop sequence grammar + // Modularized code for handling loop generation and transformations. + auto AnalyzeLoopGeneration = [&](Stmt *Child) { + auto *LoopTransform = cast(Child); + Stmt *TransformedStmt = LoopTransform->getTransformedStmt(); + unsigned NumGeneratedTopLevelLoops = + LoopTransform->getNumGeneratedTopLevelLoops(); + // Handle the case where transformed statement is not available due to + // dependent contexts + if (!TransformedStmt) { + if (NumGeneratedTopLevelLoops > 0) { + SeqAnalysis.LoopSeqSize += NumGeneratedTopLevelLoops; + return true; + } + // Unroll full (0 loops produced) + Diag(Child->getBeginLoc(), diag::err_omp_not_for) + << 0 << getOpenMPDirectiveName(Kind); + return false; + } + // Handle loop transformations with multiple loop nests + // Unroll full + if (!NumGeneratedTopLevelLoops) { + Diag(Child->getBeginLoc(), diag::err_omp_not_for) + << 0 << getOpenMPDirectiveName(Kind); + return false; + } + // Loop transformatons such as split or loopranged fuse + if (NumGeneratedTopLevelLoops > 1) { + // Get the preinits related to this loop sequence generating + // loop transformation (i.e loopranged fuse, split...) 
+ // These preinits differ slightly from regular inits/pre-inits related + // to single loop generating loop transformations (interchange, unroll) + // given that they are not bounded to a particular loop nest + // so they need to be treated independently + updatePreInits(LoopTransform, SeqAnalysis.LoopSequencePreInits); + return analyzeLoopSequence(TransformedStmt, SeqAnalysis, Context, Kind); + } + // Vast majority: (Tile, Unroll, Stripe, Reverse, Interchange, Fuse all) + // Process the transformed loop statement + LoopAnalysis &NewTransformedSingleLoop = + SeqAnalysis.Loops.emplace_back(Child); + unsigned IsCanonical = checkOpenMPLoop( + Kind, nullptr, nullptr, TransformedStmt, SemaRef, *DSAStack, TmpDSA, + NewTransformedSingleLoop.HelperExprs); + + if (!IsCanonical) + return false; + + StoreLoopStatements(NewTransformedSingleLoop, TransformedStmt); + updatePreInits(LoopTransform, NewTransformedSingleLoop.TransformsPreInits); + + SeqAnalysis.LoopSeqSize++; + return true; + }; + + // Modularized code for handling regular canonical loops. + auto AnalyzeRegularLoop = [&](Stmt *Child) { + LoopAnalysis &NewRegularLoop = SeqAnalysis.Loops.emplace_back(Child); + unsigned IsCanonical = + checkOpenMPLoop(Kind, nullptr, nullptr, Child, SemaRef, *DSAStack, + TmpDSA, NewRegularLoop.HelperExprs); + + if (!IsCanonical) + return false; + + StoreLoopStatements(NewRegularLoop, Child); + NestedLoopCounterVisitor NLCV; + NLCV.TraverseStmt(Child); + return true; + }; + + // High level grammar validation. + for (Stmt *Child : LoopSeqStmt->children()) { + if (!Child) + continue; + // Skip over non-loop-sequence statements. + if (!LoopSequenceAnalysis::isLoopSequenceDerivation(Child)) { + Child = Child->IgnoreContainers(); + // Ignore empty compound statement. + if (!Child) + continue; + // In the case of a nested loop sequence ignoring containers would not + // be enough, a recurisve transversal of the loop sequence is required. + if (isa(Child)) { + if (!analyzeLoopSequence(Child, SeqAnalysis, Context, Kind)) + return false; + // Already been treated, skip this children + continue; + } + } + // Regular loop sequence handling. + if (LoopSequenceAnalysis::isLoopSequenceDerivation(Child)) { + if (LoopAnalysis::isLoopTransformation(Child)) { + if (!AnalyzeLoopGeneration(Child)) + return false; + // AnalyzeLoopGeneration updates SeqAnalysis.LoopSeqSize accordingly. + } else { + if (!AnalyzeRegularLoop(Child)) + return false; + SeqAnalysis.LoopSeqSize++; + } + } else { + // Report error for invalid statement inside canonical loop sequence. + Diag(Child->getBeginLoc(), diag::err_omp_not_for) + << 0 << getOpenMPDirectiveName(Kind); + return false; + } + } + return true; +} + +bool SemaOpenMP::checkTransformableLoopSequence( + OpenMPDirectiveKind Kind, Stmt *AStmt, LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context) { + // Following OpenMP 6.0 API Specification, a Canonical Loop Sequence follows + // the grammar: + // + // canonical-loop-sequence: + // { + // loop-sequence+ + // } + // where loop-sequence can be any of the following: + // 1. canonical-loop-sequence + // 2. loop-nest + // 3. loop-sequence-generating-construct (i.e OMPLoopTransformationDirective) + // + // To recognise and traverse this structure the helper function + // analyzeLoopSequence serves as the recurisve entry point + // and tries to match the input AST to the canonical loop sequence grammar + // structure. 
This function will perform both a semantic and syntactical + // analysis of the given statement according to OpenMP 6.0 definition of + // the aforementioned canonical loop sequence. + + // We expect an outer compound statement. + if (!isa(AStmt)) { + Diag(AStmt->getBeginLoc(), diag::err_omp_not_a_loop_sequence) + << getOpenMPDirectiveName(Kind); + return false; + } + + // Recursive entry point to process the main loop sequence + if (!analyzeLoopSequence(AStmt, SeqAnalysis, Context, Kind)) + return false; + + // Diagnose an empty loop sequence. + if (!SeqAnalysis.LoopSeqSize) { + Diag(AStmt->getBeginLoc(), diag::err_omp_empty_loop_sequence) + << getOpenMPDirectiveName(Kind); + return false; + } + return true; +} + +/// Add preinit statements that need to be propagated from the selected loop. static void addLoopPreInits(ASTContext &Context, OMPLoopBasedDirective::HelperExprs &LoopHelper, Stmt *LoopStmt, ArrayRef OriginalInit, @@ -14540,7 +14811,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef Clauses, // Verify and diagnose loop nest. SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; - SmallVector, 4> OriginalInits; + SmallVector, 4> OriginalInits; if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -14817,7 +15088,7 @@ StmtResult SemaOpenMP::ActOnOpenMPStripeDirective(ArrayRef Clauses, // Verify and diagnose loop nest. SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; - SmallVector, 4> OriginalInits; + SmallVector, 4> OriginalInits; if (!checkTransformableLoopNest(OMPD_stripe, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15078,7 +15349,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef Clauses, Stmt *Body = nullptr; SmallVector LoopHelpers( NumLoops); - SmallVector, NumLoops + 1> OriginalInits; + SmallVector, NumLoops + 1> OriginalInits; if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15348,7 +15619,7 @@ StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt, Stmt *Body = nullptr; SmallVector LoopHelpers( NumLoops); - SmallVector, NumLoops + 1> OriginalInits; + SmallVector, NumLoops + 1> OriginalInits; if (!checkTransformableLoopNest(OMPD_reverse, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15540,7 +15811,7 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective( // Verify and diagnose loop nest. 
SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; - SmallVector, 2> OriginalInits; + SmallVector, 2> OriginalInits; if (!checkTransformableLoopNest(OMPD_interchange, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15716,6 +15987,484 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective( buildPreInits(Context, PreInits)); } +StmtResult SemaOpenMP::ActOnOpenMPFuseDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + + ASTContext &Context = getASTContext(); + DeclContext *CurrContext = SemaRef.CurContext; + Scope *CurScope = SemaRef.getCurScope(); + CaptureVars CopyTransformer(SemaRef); + + // Ensure the structured block is not empty + if (!AStmt) + return StmtError(); + + // Defer transformation in dependent contexts + // The NumLoopNests argument is set to a placeholder 1 (even though + // using looprange fuse could yield up to 3 top level loop nests) + // because a dependent context could prevent determining its true value + if (CurrContext->isDependentContext()) + return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses, + /* NumLoops */ 1, AStmt, nullptr, nullptr); + + // Validate that the potential loop sequence is transformable for fusion + // Also collect the HelperExprs, Loop Stmts, Inits, and Number of loops + LoopSequenceAnalysis SeqAnalysis; + if (!checkTransformableLoopSequence(OMPD_fuse, AStmt, SeqAnalysis, Context)) + return StmtError(); + + // SeqAnalysis.LoopSeqSize exists mostly to handle dependent contexts, + // otherwise it must be the same as SeqAnalysis.Loops.size(). + assert(SeqAnalysis.LoopSeqSize == SeqAnalysis.Loops.size() && + "Inconsistent size of the loop sequence and the number of loops " + "found in the sequence"); + + // Handle clauses, which can be any of the following: [looprange, apply] + const auto *LRC = + OMPExecutableDirective::getSingleClause(Clauses); + + // The clause arguments are invalidated if any error arises + // such as non-constant or non-positive arguments + if (LRC && (!LRC->getFirst() || !LRC->getCount())) + return StmtError(); + + // Delayed semantic check of LoopRange constraint + // Evaluates the loop range arguments and returns the first and count values + auto EvaluateLoopRangeArguments = [&Context](Expr *First, Expr *Count, + uint64_t &FirstVal, + uint64_t &CountVal) { + llvm::APSInt FirstInt = First->EvaluateKnownConstInt(Context); + llvm::APSInt CountInt = Count->EvaluateKnownConstInt(Context); + FirstVal = FirstInt.getZExtValue(); + CountVal = CountInt.getZExtValue(); + }; + + // OpenMP [6.0, Restrictions] + // first + count - 1 must not evaluate to a value greater than the + // loop sequence length of the associated canonical loop sequence. + auto ValidLoopRange = [](uint64_t FirstVal, uint64_t CountVal, + unsigned NumLoops) -> bool { + return FirstVal + CountVal - 1 <= NumLoops; + }; + uint64_t FirstVal = 1, CountVal = 0, LastVal = SeqAnalysis.LoopSeqSize; + + // Validates the loop range after evaluating the semantic information + // and ensures that the range is valid for the given loop sequence size. + // Expressions are evaluated at compile time to obtain constant values. 
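  // Worked example of the checks below (hedged): for a sequence of 4 loops,
  // looprange(2, 2) yields FirstVal = 2, CountVal = 2, LastVal = 3; the
  // range is valid because 2 + 2 - 1 = 3 <= 4, loops 2..3 are fused into
  // one, and NumGeneratedTopLevelLoops = 4 - 2 + 1 = 3. looprange(4, 2) is
  // rejected since 4 + 2 - 1 = 5 > 4, while looprange(3, 1) merely triggers
  // the redundant-fusion warning.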
+ if (LRC) { + EvaluateLoopRangeArguments(LRC->getFirst(), LRC->getCount(), FirstVal, + CountVal); + if (CountVal == 1) + SemaRef.Diag(LRC->getCountLoc(), diag::warn_omp_redundant_fusion) + << getOpenMPDirectiveName(OMPD_fuse); + + if (!ValidLoopRange(FirstVal, CountVal, SeqAnalysis.LoopSeqSize)) { + SemaRef.Diag(LRC->getFirstLoc(), diag::err_omp_invalid_looprange) + << getOpenMPDirectiveName(OMPD_fuse) << FirstVal + << (FirstVal + CountVal - 1) << SeqAnalysis.LoopSeqSize; + return StmtError(); + } + + LastVal = FirstVal + CountVal - 1; + } + + // Complete fusion generates a single canonical loop nest + // However looprange clause may generate several loop nests + unsigned NumGeneratedTopLevelLoops = + LRC ? SeqAnalysis.LoopSeqSize - CountVal + 1 : 1; + + // Emit a warning for redundant loop fusion when the sequence contains only + // one loop. + if (SeqAnalysis.LoopSeqSize == 1) + SemaRef.Diag(AStmt->getBeginLoc(), diag::warn_omp_redundant_fusion) + << getOpenMPDirectiveName(OMPD_fuse); + + // Select the type with the largest bit width among all induction variables + QualType IVType = + SeqAnalysis.Loops[FirstVal - 1].HelperExprs.IterationVarRef->getType(); + for (unsigned I : llvm::seq(FirstVal, LastVal)) { + QualType CurrentIVType = + SeqAnalysis.Loops[I].HelperExprs.IterationVarRef->getType(); + if (Context.getTypeSize(CurrentIVType) > Context.getTypeSize(IVType)) { + IVType = CurrentIVType; + } + } + uint64_t IVBitWidth = Context.getIntWidth(IVType); + + // Create pre-init declarations for all loops lower bounds, upper bounds, + // strides and num-iterations for every top level loop in the fusion + SmallVector LBVarDecls; + SmallVector STVarDecls; + SmallVector NIVarDecls; + SmallVector UBVarDecls; + SmallVector IVVarDecls; + + // Helper lambda to create variables for bounds, strides, and other + // expressions. Generates both the variable declaration and the corresponding + // initialization statement. + auto CreateHelperVarAndStmt = + [&, &SemaRef = SemaRef](Expr *ExprToCopy, const std::string &BaseName, + unsigned I, bool NeedsNewVD = false) { + Expr *TransformedExpr = + AssertSuccess(CopyTransformer.TransformExpr(ExprToCopy)); + if (!TransformedExpr) + return std::pair(nullptr, StmtError()); + + auto Name = (Twine(".omp.") + BaseName + std::to_string(I)).str(); + + VarDecl *VD; + if (NeedsNewVD) { + VD = buildVarDecl(SemaRef, SourceLocation(), IVType, Name); + SemaRef.AddInitializerToDecl(VD, TransformedExpr, false); + } else { + // Create a unique variable name + DeclRefExpr *DRE = cast(TransformedExpr); + VD = cast(DRE->getDecl()); + VD->setDeclName(&SemaRef.PP.getIdentifierTable().get(Name)); + } + // Create the corresponding declaration statement + StmtResult DeclStmt = new (Context) class DeclStmt( + DeclGroupRef(VD), SourceLocation(), SourceLocation()); + return std::make_pair(VD, DeclStmt); + }; + + // PreInits hold a sequence of variable declarations that must be executed + // before the fused loop begins. These include bounds, strides, and other + // helper variables required for the transformation. Other loop transforms + // also contain their own preinits + SmallVector PreInits; + + // Update the general preinits using the preinits generated by loop sequence + // generating loop transformations. These preinits differ slightly from + // single-loop transformation preinits, as they can be detached from a + // specific loop inside multiple generated loop nests. 
This happens + // because certain helper variables, like '.omp.fuse.max', are introduced to + // handle fused iteration spaces and may not be directly tied to a single + // original loop. The preinit structure must ensure that hidden variables + // like '.omp.fuse.max' are still properly handled. + // Transformations that apply this concept: Loopranged Fuse, Split + llvm::append_range(PreInits, SeqAnalysis.LoopSequencePreInits); + + // Process each single loop to generate and collect declarations + // and statements for all helper expressions related to + // particular single loop nests + + // Also In the case of the fused loops, we keep track of their original + // inits by appending them to their preinits statement, and in the case of + // transformations, also append their preinits (which contain the original + // loop initialization statement or other statements) + + // Firstly we need to set TransformIndex to match the begining of the + // looprange section + unsigned int TransformIndex = 0; + for (unsigned I : llvm::seq(FirstVal - 1)) { + if (SeqAnalysis.Loops[I].isLoopTransformation()) + ++TransformIndex; + } + + for (unsigned int I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) { + if (SeqAnalysis.Loops[I].isRegularLoop()) { + addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs, + SeqAnalysis.Loops[I].TheForStmt, + SeqAnalysis.Loops[I].OriginalInits, PreInits); + } else if (SeqAnalysis.Loops[I].isLoopTransformation()) { + // For transformed loops, insert both pre-inits and original inits. + // Order matters: pre-inits may define variables used in the original + // inits such as upper bounds... + SmallVector &TransformPreInit = + SeqAnalysis.Loops[TransformIndex++].TransformsPreInits; + llvm::append_range(PreInits, TransformPreInit); + + addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs, + SeqAnalysis.Loops[I].TheForStmt, + SeqAnalysis.Loops[I].OriginalInits, PreInits); + } + auto [UBVD, UBDStmt] = + CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.UB, "ub", J); + auto [LBVD, LBDStmt] = + CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.LB, "lb", J); + auto [STVD, STDStmt] = + CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.ST, "st", J); + auto [NIVD, NIDStmt] = CreateHelperVarAndStmt( + SeqAnalysis.Loops[I].HelperExprs.NumIterations, "ni", J, true); + auto [IVVD, IVDStmt] = CreateHelperVarAndStmt( + SeqAnalysis.Loops[I].HelperExprs.IterationVarRef, "iv", J); + + assert(LBVD && STVD && NIVD && IVVD && + "OpenMP Fuse Helper variables creation failed"); + + UBVarDecls.push_back(UBVD); + LBVarDecls.push_back(LBVD); + STVarDecls.push_back(STVD); + NIVarDecls.push_back(NIVD); + IVVarDecls.push_back(IVVD); + + PreInits.push_back(LBDStmt.get()); + PreInits.push_back(STDStmt.get()); + PreInits.push_back(NIDStmt.get()); + PreInits.push_back(IVDStmt.get()); + } + + auto MakeVarDeclRef = [&SemaRef = this->SemaRef](VarDecl *VD) { + return buildDeclRefExpr(SemaRef, VD, VD->getType(), VD->getLocation(), + false); + }; + + // Following up the creation of the final fused loop will be performed + // which has the following shape (considering the selected loops): + // + // for (fuse.index = 0; fuse.index < max(ni0, ni1..., nik); ++fuse.index) { + // if (fuse.index < ni0){ + // iv0 = lb0 + st0 * fuse.index; + // original.index0 = iv0 + // body(0); + // } + // if (fuse.index < ni1){ + // iv1 = lb1 + st1 * fuse.index; + // original.index1 = iv1 + // body(1); + // } + // + // ... 
+ // + // if (fuse.index < nik){ + // ivk = lbk + stk * fuse.index; + // original.indexk = ivk + // body(k); Expr *InitVal = IntegerLiteral::Create(Context, + // llvm::APInt(IVWidth, 0), + // } + + // 1. Create the initialized fuse index + StringRef IndexName = ".omp.fuse.index"; + Expr *InitVal = IntegerLiteral::Create(Context, llvm::APInt(IVBitWidth, 0), + IVType, SourceLocation()); + VarDecl *IndexDecl = + buildVarDecl(SemaRef, {}, IVType, IndexName, nullptr, nullptr); + SemaRef.AddInitializerToDecl(IndexDecl, InitVal, false); + StmtResult InitStmt = new (Context) + DeclStmt(DeclGroupRef(IndexDecl), SourceLocation(), SourceLocation()); + + if (!InitStmt.isUsable()) + return StmtError(); + + auto MakeIVRef = [&SemaRef = this->SemaRef, IndexDecl, IVType, + Loc = InitVal->getExprLoc()]() { + return buildDeclRefExpr(SemaRef, IndexDecl, IVType, Loc, false); + }; + + // 2. Iteratively compute the max number of logical iterations Max(NI_1, NI_2, + // ..., NI_k) + // + // This loop accumulates the maximum value across multiple expressions, + // ensuring each step constructs a unique AST node for correctness. By using + // intermediate temporary variables and conditional operators, we maintain + // distinct nodes and avoid duplicating subtrees, For instance, max(a,b,c): + // omp.temp0 = max(a, b) + // omp.temp1 = max(omp.temp0, c) + // omp.fuse.max = max(omp.temp1, omp.temp0) + + ExprResult MaxExpr; + // I is the range of loops in the sequence that we fuse. + for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) { + DeclRefExpr *NIRef = MakeVarDeclRef(NIVarDecls[J]); + QualType NITy = NIRef->getType(); + + if (MaxExpr.isUnset()) { + // Initialize MaxExpr with the first NI expression + MaxExpr = NIRef; + } else { + // Create a new acummulator variable t_i = MaxExpr + std::string TempName = (Twine(".omp.temp.") + Twine(J)).str(); + VarDecl *TempDecl = + buildVarDecl(SemaRef, {}, NITy, TempName, nullptr, nullptr); + TempDecl->setInit(MaxExpr.get()); + DeclRefExpr *TempRef = + buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false); + DeclRefExpr *TempRef2 = + buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false); + // Add a DeclStmt to PreInits to ensure the variable is declared. + StmtResult TempStmt = new (Context) + DeclStmt(DeclGroupRef(TempDecl), SourceLocation(), SourceLocation()); + + if (!TempStmt.isUsable()) + return StmtError(); + PreInits.push_back(TempStmt.get()); + + // Build MaxExpr <-(MaxExpr > NIRef ? MaxExpr : NIRef) + ExprResult Comparison = + SemaRef.BuildBinOp(nullptr, SourceLocation(), BO_GT, TempRef, NIRef); + // Handle any errors in Comparison creation + if (!Comparison.isUsable()) + return StmtError(); + + DeclRefExpr *NIRef2 = MakeVarDeclRef(NIVarDecls[J]); + // Update MaxExpr using a conditional expression to hold the max value + MaxExpr = new (Context) ConditionalOperator( + Comparison.get(), SourceLocation(), TempRef2, SourceLocation(), + NIRef2->getExprStmt(), NITy, VK_LValue, OK_Ordinary); + + if (!MaxExpr.isUsable()) + return StmtError(); + } + } + if (!MaxExpr.isUsable()) + return StmtError(); + + // 3. 
Declare the max variable + const std::string MaxName = Twine(".omp.fuse.max").str(); + VarDecl *MaxDecl = + buildVarDecl(SemaRef, {}, IVType, MaxName, nullptr, nullptr); + MaxDecl->setInit(MaxExpr.get()); + DeclRefExpr *MaxRef = buildDeclRefExpr(SemaRef, MaxDecl, IVType, {}, false); + StmtResult MaxStmt = new (Context) + DeclStmt(DeclGroupRef(MaxDecl), SourceLocation(), SourceLocation()); + + if (MaxStmt.isInvalid()) + return StmtError(); + PreInits.push_back(MaxStmt.get()); + + // 4. Create condition Expr: index < n_max + ExprResult CondExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT, + MakeIVRef(), MaxRef); + if (!CondExpr.isUsable()) + return StmtError(); + + // 5. Increment Expr: ++index + ExprResult IncrExpr = + SemaRef.BuildUnaryOp(CurScope, SourceLocation(), UO_PreInc, MakeIVRef()); + if (!IncrExpr.isUsable()) + return StmtError(); + + // 6. Build the Fused Loop Body + // The final fused loop iterates over the maximum logical range. Inside the + // loop, each original loop's index is calculated dynamically, and its body + // is executed conditionally. + // + // Each sub-loop's body is guarded by a conditional statement to ensure + // it executes only within its logical iteration range: + // + // if (fuse.index < ni_k){ + // iv_k = lb_k + st_k * fuse.index; + // original.index = iv_k + // body(k); + // } + + CompoundStmt *FusedBody = nullptr; + SmallVector FusedBodyStmts; + for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) { + // Assingment of the original sub-loop index to compute the logical index + // IV_k = LB_k + omp.fuse.index * ST_k + ExprResult IdxExpr = + SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Mul, + MakeVarDeclRef(STVarDecls[J]), MakeIVRef()); + if (!IdxExpr.isUsable()) + return StmtError(); + IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Add, + MakeVarDeclRef(LBVarDecls[J]), IdxExpr.get()); + + if (!IdxExpr.isUsable()) + return StmtError(); + IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Assign, + MakeVarDeclRef(IVVarDecls[J]), IdxExpr.get()); + if (!IdxExpr.isUsable()) + return StmtError(); + + // Update the original i_k = IV_k + SmallVector BodyStmts; + BodyStmts.push_back(IdxExpr.get()); + llvm::append_range(BodyStmts, SeqAnalysis.Loops[I].HelperExprs.Updates); + + // If the loop is a CXXForRangeStmt then the iterator variable is needed + if (auto *SourceCXXFor = + dyn_cast(SeqAnalysis.Loops[I].TheForStmt)) + BodyStmts.push_back(SourceCXXFor->getLoopVarStmt()); + + Stmt *Body = + (isa(SeqAnalysis.Loops[I].TheForStmt)) + ? cast(SeqAnalysis.Loops[I].TheForStmt)->getBody() + : cast(SeqAnalysis.Loops[I].TheForStmt)->getBody(); + BodyStmts.push_back(Body); + + CompoundStmt *CombinedBody = + CompoundStmt::Create(Context, BodyStmts, FPOptionsOverride(), + SourceLocation(), SourceLocation()); + ExprResult Condition = + SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT, MakeIVRef(), + MakeVarDeclRef(NIVarDecls[J])); + + if (!Condition.isUsable()) + return StmtError(); + + IfStmt *IfStatement = IfStmt::Create( + Context, SourceLocation(), IfStatementKind::Ordinary, nullptr, nullptr, + Condition.get(), SourceLocation(), SourceLocation(), CombinedBody, + SourceLocation(), nullptr); + + FusedBodyStmts.push_back(IfStatement); + } + FusedBody = CompoundStmt::Create(Context, FusedBodyStmts, FPOptionsOverride(), + SourceLocation(), SourceLocation()); + + // 7. 
Construct the final fused loop + ForStmt *FusedForStmt = new (Context) + ForStmt(Context, InitStmt.get(), CondExpr.get(), nullptr, IncrExpr.get(), + FusedBody, InitStmt.get()->getBeginLoc(), SourceLocation(), + IncrExpr.get()->getEndLoc()); + + // In the case of looprange, the result of fuse won't simply + // be a single loop (ForStmt), but rather a loop sequence + // (CompoundStmt) of 3 parts: the pre-fusion loops, the fused loop + // and the post-fusion loops, preserving its original order. + // + // Note: If looprange clause produces a single fused loop nest then + // this compound statement wrapper is unnecessary (Therefore this + // treatment is skipped) + + Stmt *FusionStmt = FusedForStmt; + if (LRC && CountVal != SeqAnalysis.LoopSeqSize) { + SmallVector FinalLoops; + + // Reset the transform index + TransformIndex = 0; + + // Collect all non-fused loops before and after the fused region. + // Pre-fusion and post-fusion loops are inserted in order exploiting their + // symmetry, along with their corresponding transformation pre-inits if + // needed. The fused loop is added between the two regions. + for (unsigned I : llvm::seq(SeqAnalysis.LoopSeqSize)) { + if (I >= FirstVal - 1 && I < FirstVal + CountVal - 1) { + // Update the Transformation counter to skip already treated + // loop transformations + if (!SeqAnalysis.Loops[I].isLoopTransformation()) + ++TransformIndex; + continue; + } + + // No need to handle: + // Regular loops: they are kept intact as-is. + // Loop-sequence-generating transformations: already handled earlier. + // Only TransformSingleLoop requires inserting pre-inits here + if (SeqAnalysis.Loops[I].isRegularLoop()) { + const auto &TransformPreInit = + SeqAnalysis.Loops[TransformIndex++].TransformsPreInits; + if (!TransformPreInit.empty()) + llvm::append_range(PreInits, TransformPreInit); + } + + FinalLoops.push_back(SeqAnalysis.Loops[I].TheForStmt); + } + + FinalLoops.insert(FinalLoops.begin() + (FirstVal - 1), FusedForStmt); + FusionStmt = CompoundStmt::Create(Context, FinalLoops, FPOptionsOverride(), + SourceLocation(), SourceLocation()); + } + return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses, + NumGeneratedTopLevelLoops, AStmt, FusionStmt, + buildPreInits(Context, PreInits)); +} + OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, SourceLocation StartLoc, @@ -16887,6 +17636,31 @@ OMPClause *SemaOpenMP::ActOnOpenMPPartialClause(Expr *FactorExpr, FactorExpr); } +OMPClause *SemaOpenMP::ActOnOpenMPLoopRangeClause( + Expr *First, Expr *Count, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation FirstLoc, SourceLocation CountLoc, SourceLocation EndLoc) { + + // OpenMP [6.0, Restrictions] + // First and Count must be integer expressions with positive value + ExprResult FirstVal = + VerifyPositiveIntegerConstantInClause(First, OMPC_looprange); + if (FirstVal.isInvalid()) + First = nullptr; + + ExprResult CountVal = + VerifyPositiveIntegerConstantInClause(Count, OMPC_looprange); + if (CountVal.isInvalid()) + Count = nullptr; + + // OpenMP [6.0, Restrictions] + // first + count - 1 must not evaluate to a value greater than the + // loop sequence length of the associated canonical loop sequence. 
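  // For example (hedged): looprange(0, 2) or looprange(2, -1) are rejected
  // right here by VerifyPositiveIntegerConstantInClause, whereas
  // looprange(3, 2) over a 3-loop sequence builds a well-formed clause and
  // is only diagnosed in ActOnOpenMPFuseDirective, once the length of the
  // associated loop sequence is known.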
+ // This check must be performed afterwards due to the delayed + // parsing and computation of the associated loop sequence + return OMPLoopRangeClause::Create(getASTContext(), StartLoc, LParenLoc, + FirstLoc, CountLoc, EndLoc, First, Count); +} + OMPClause *SemaOpenMP::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 021407842aa6d..6967301483361 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1783,6 +1783,14 @@ class TreeTransform { LParenLoc, EndLoc); } + OMPClause * + RebuildOMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation FirstLoc, + SourceLocation CountLoc, SourceLocation EndLoc) { + return getSema().OpenMP().ActOnOpenMPLoopRangeClause( + First, Count, StartLoc, LParenLoc, FirstLoc, CountLoc, EndLoc); + } + /// Build a new OpenMP 'allocator' clause. /// /// By default, performs semantic analysis to build the new OpenMP clause. @@ -9607,6 +9615,17 @@ StmtResult TreeTransform::TransformOMPInterchangeDirective( return Res; } +template +StmtResult +TreeTransform::TransformOMPFuseDirective(OMPFuseDirective *D) { + DeclarationNameInfo DirName; + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc()); + StmtResult Res = getDerived().TransformOMPExecutableDirective(D); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); + return Res; +} + template StmtResult TreeTransform::TransformOMPForDirective(OMPForDirective *D) { @@ -10500,6 +10519,31 @@ TreeTransform::TransformOMPPartialClause(OMPPartialClause *C) { C->getEndLoc()); } +template +OMPClause * +TreeTransform::TransformOMPLoopRangeClause(OMPLoopRangeClause *C) { + ExprResult F = getDerived().TransformExpr(C->getFirst()); + if (F.isInvalid()) + return nullptr; + + ExprResult Cn = getDerived().TransformExpr(C->getCount()); + if (Cn.isInvalid()) + return nullptr; + + Expr *First = F.get(); + Expr *Count = Cn.get(); + + bool Changed = (First != C->getFirst()) || (Count != C->getCount()); + + // If no changes and AlwaysRebuild() is false, return the original clause + if (!Changed && !getDerived().AlwaysRebuild()) + return C; + + return RebuildOMPLoopRangeClause(First, Count, C->getBeginLoc(), + C->getLParenLoc(), C->getFirstLoc(), + C->getCountLoc(), C->getEndLoc()); +} + template OMPClause * TreeTransform::TransformOMPCollapseClause(OMPCollapseClause *C) { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 9ee8a0fb0f060..c05e428a6fb39 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11215,6 +11215,9 @@ OMPClause *OMPClauseReader::readClause() { case llvm::omp::OMPC_partial: C = OMPPartialClause::CreateEmpty(Context); break; + case llvm::omp::OMPC_looprange: + C = OMPLoopRangeClause::CreateEmpty(Context); + break; case llvm::omp::OMPC_allocator: C = new (Context) OMPAllocatorClause(); break; @@ -11618,6 +11621,14 @@ void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) { C->setLParenLoc(Record.readSourceLocation()); } +void OMPClauseReader::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) { + C->setFirst(Record.readSubExpr()); + C->setCount(Record.readSubExpr()); + C->setLParenLoc(Record.readSourceLocation()); + C->setFirstLoc(Record.readSourceLocation()); + C->setCountLoc(Record.readSourceLocation()); +} + void 
 void OMPClauseReader::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
   C->setAllocator(Record.readExpr());
   C->setLParenLoc(Record.readSourceLocation());
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 213c2c2148f64..70b898a53fcbd 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2469,10 +2469,21 @@ void ASTStmtReader::VisitOMPReverseDirective(OMPReverseDirective *D) {
   VisitOMPCanonicalLoopNestTransformationDirective(D);
 }
 
+void ASTStmtReader::VisitOMPCanonicalLoopSequenceTransformationDirective(
+    OMPCanonicalLoopSequenceTransformationDirective *D) {
+  VisitStmt(D);
+  VisitOMPExecutableDirective(D);
+  D->setNumGeneratedTopLevelLoops(Record.readUInt32());
+}
+
 void ASTStmtReader::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
   VisitOMPCanonicalLoopNestTransformationDirective(D);
 }
 
+void ASTStmtReader::VisitOMPFuseDirective(OMPFuseDirective *D) {
+  VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
 void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) {
   VisitOMPLoopDirective(D);
   D->setHasCancel(Record.readBool());
@@ -3615,6 +3626,12 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
       break;
     }
 
+    case STMT_OMP_FUSE_DIRECTIVE: {
+      unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
+      S = OMPFuseDirective::CreateEmpty(Context, NumClauses);
+      break;
+    }
+
     case STMT_OMP_INTERCHANGE_DIRECTIVE: {
       unsigned NumLoops = Record[ASTStmtReader::NumStmtFields];
       unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1];
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 09859da171fcd..cdf95ba1c4ba5 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -7882,6 +7882,14 @@ void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) {
   Record.AddSourceLocation(C->getLParenLoc());
 }
 
+void OMPClauseWriter::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) {
+  Record.AddStmt(C->getFirst());
+  Record.AddStmt(C->getCount());
+  Record.AddSourceLocation(C->getLParenLoc());
+  Record.AddSourceLocation(C->getFirstLoc());
+  Record.AddSourceLocation(C->getCountLoc());
+}
+
 void OMPClauseWriter::VisitOMPAllocatorClause(OMPAllocatorClause *C) {
   Record.AddStmt(C->getAllocator());
   Record.AddSourceLocation(C->getLParenLoc());
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 21c04ddbc2c7a..ebda91e3819c3 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2487,6 +2487,18 @@ void ASTStmtWriter::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) {
   Code = serialization::STMT_OMP_INTERCHANGE_DIRECTIVE;
 }
 
+void ASTStmtWriter::VisitOMPCanonicalLoopSequenceTransformationDirective(
+    OMPCanonicalLoopSequenceTransformationDirective *D) {
+  VisitStmt(D);
+  VisitOMPExecutableDirective(D);
+  Record.writeUInt32(D->getNumGeneratedTopLevelLoops());
+}
+
+void ASTStmtWriter::VisitOMPFuseDirective(OMPFuseDirective *D) {
+  VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+  Code = serialization::STMT_OMP_FUSE_DIRECTIVE;
+}
+
 void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) {
   VisitOMPLoopDirective(D);
   Record.writeBool(D->hasCancel());
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 785cdfa15bf04..4e472b7fc38b0 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1814,6 +1814,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::OMPStripeDirectiveClass:
     case Stmt::OMPTileDirectiveClass:
     case Stmt::OMPInterchangeDirectiveClass:
+    case Stmt::OMPFuseDirectiveClass:
     case Stmt::OMPInteropDirectiveClass:
     case Stmt::OMPDispatchDirectiveClass:
     case Stmt::OMPMaskedDirectiveClass:
diff --git a/clang/test/OpenMP/fuse_ast_print.cpp b/clang/test/OpenMP/fuse_ast_print.cpp
new file mode 100644
index 0000000000000..283f5883c907d
--- /dev/null
+++ b/clang/test/OpenMP/fuse_ast_print.cpp
@@ -0,0 +1,397 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-dump %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -std=c++20 -fopenmp-version=60 -include-pch %t -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+// Placeholder for loop body code
+extern "C" void body(...);
+
+// PRINT-LABEL: void foo1(
+// DUMP-LABEL: FunctionDecl {{.*}} foo1
+void foo1() {
+    // PRINT: #pragma omp fuse
+    // DUMP: OMPFuseDirective
+    #pragma omp fuse
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: for (int i = 0; i < 10; i += 2)
+        // DUMP: ForStmt
+        for (int i = 0; i < 10; i += 2)
+            // PRINT: body(i)
+            // DUMP: CallExpr
+            body(i);
+        // PRINT: for (int j = 10; j > 0; --j)
+        // DUMP: ForStmt
+        for (int j = 10; j > 0; --j)
+            // PRINT: body(j)
+            // DUMP: CallExpr
+            body(j);
+        // PRINT: for (int k = 0; k <= 10; ++k)
+        // DUMP: ForStmt
+        for (int k = 0; k <= 10; ++k)
+            // PRINT: body(k)
+            // DUMP: CallExpr
+            body(k);
+
+    }
+
+}
+
+// PRINT-LABEL: void foo2(
+// DUMP-LABEL: FunctionDecl {{.*}} foo2
+void foo2() {
+    // PRINT: #pragma omp unroll partial(4)
+    // DUMP: OMPUnrollDirective
+    // DUMP-NEXT: OMPPartialClause
+    // DUMP-NEXT: ConstantExpr
+    // DUMP-NEXT: value: Int 4
+    // DUMP-NEXT: IntegerLiteral {{.*}} 4
+    #pragma omp unroll partial(4)
+    // PRINT: #pragma omp fuse
+    // DUMP-NEXT: OMPFuseDirective
+    #pragma omp fuse
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: for (int i = 0; i < 10; i += 2)
+        // DUMP: ForStmt
+        for (int i = 0; i < 10; i += 2)
+            // PRINT: body(i)
+            // DUMP: CallExpr
+            body(i);
+        // PRINT: for (int j = 10; j > 0; --j)
+        // DUMP: ForStmt
+        for (int j = 10; j > 0; --j)
+            // PRINT: body(j)
+            // DUMP: CallExpr
+            body(j);
+    }
+
+}
+
+//PRINT-LABEL: void foo3(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo3
+template <int Factor1, int Factor2>
+void foo3() {
+    // PRINT: #pragma omp fuse
+    // DUMP: OMPFuseDirective
+    #pragma omp fuse
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: #pragma omp unroll partial(Factor1)
+        // DUMP: OMPUnrollDirective
+        #pragma omp unroll partial(Factor1)
+        // PRINT: for (int i = 0; i < 12; i += 1)
+        // DUMP: ForStmt
+        for (int i = 0; i < 12; i += 1)
+            // PRINT: body(i)
+            // DUMP: CallExpr
+            body(i);
+        // PRINT: #pragma omp unroll partial(Factor2)
+        // DUMP: OMPUnrollDirective
+        #pragma omp unroll partial(Factor2)
+        // PRINT: for (int k = 0; k <= 10; ++k)
+        // DUMP: ForStmt
+        for (int k = 0; k <= 10; ++k)
+            // PRINT: body(k)
+            // DUMP: CallExpr
+            body(k);
+
+    }
+}
+
+// Also test instantiating the template.
+void tfoo3() {
+    foo3<4,2>();
+}
+
+//PRINT-LABEL: void foo4(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo4
+template <typename T, T Step>
+void foo4(int start, int end) {
+    // PRINT: #pragma omp fuse
+    // DUMP: OMPFuseDirective
+    #pragma omp fuse
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: for (T i = start; i < end; i += Step)
+        // DUMP: ForStmt
+        for (T i = start; i < end; i += Step)
+            // PRINT: body(i)
+            // DUMP: CallExpr
+            body(i);
+
+        // PRINT: for (T j = end; j > start; j -= Step)
+        // DUMP: ForStmt
+        for (T j = end; j > start; j -= Step) {
+            // PRINT: body(j)
+            // DUMP: CallExpr
+            body(j);
+        }
+
+    }
+}
+
+// Also test instantiating the template.
+void tfoo4() {
+    foo4<int, 4>(0, 64);
+}
+
+
+
+// PRINT-LABEL: void foo5(
+// DUMP-LABEL: FunctionDecl {{.*}} foo5
+void foo5() {
+    double arr[128], arr2[128];
+    // PRINT: #pragma omp fuse
+    // DUMP: OMPFuseDirective
+    #pragma omp fuse
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT-NEXT: for (auto &&a : arr)
+        // DUMP-NEXT: CXXForRangeStmt
+        for (auto &&a: arr)
+            // PRINT: body(a)
+            // DUMP: CallExpr
+            body(a);
+        // PRINT: for (double v = 42; auto &&b : arr)
+        // DUMP: CXXForRangeStmt
+        for (double v = 42; auto &&b: arr)
+            // PRINT: body(b, v);
+            // DUMP: CallExpr
+            body(b, v);
+        // PRINT: for (auto &&c : arr2)
+        // DUMP: CXXForRangeStmt
+        for (auto &&c: arr2)
+            // PRINT: body(c)
+            // DUMP: CallExpr
+            body(c);
+
+    }
+
+}
+
+// PRINT-LABEL: void foo6(
+// DUMP-LABEL: FunctionDecl {{.*}} foo6
+void foo6() {
+    // PRINT: #pragma omp fuse
+    // DUMP: OMPFuseDirective
+    #pragma omp fuse
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: #pragma omp fuse
+        // DUMP: OMPFuseDirective
+        #pragma omp fuse
+        // PRINT: {
+        // DUMP: CompoundStmt
+        {
+            // PRINT: for (int i = 0; i <= 10; ++i)
+            // DUMP: ForStmt
+            for (int i = 0; i <= 10; ++i)
+                body(i);
+            // PRINT: for (int j = 0; j < 100; ++j)
+            // DUMP: ForStmt
+            for(int j = 0; j < 100; ++j)
+                body(j);
+        }
+        // PRINT: #pragma omp unroll partial(4)
+        // DUMP: OMPUnrollDirective
+        #pragma omp unroll partial(4)
+        // PRINT: for (int k = 0; k < 250; ++k)
+        // DUMP: ForStmt
+        for (int k = 0; k < 250; ++k)
+            body(k);
+    }
+}
+
+// PRINT-LABEL: void foo7(
+// DUMP-LABEL: FunctionDecl {{.*}} foo7
+void foo7() {
+    // PRINT: #pragma omp fuse
+    // DUMP: OMPFuseDirective
+    #pragma omp fuse
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: {
+        // DUMP: CompoundStmt
+        {
+            // PRINT: {
+            // DUMP: CompoundStmt
+            {
+                // PRINT: for (int i = 0; i < 10; i += 2)
+                // DUMP: ForStmt
+                for (int i = 0; i < 10; i += 2)
+                    // PRINT: body(i)
+                    // DUMP: CallExpr
+                    body(i);
+                // PRINT: for (int j = 10; j > 0; --j)
+                // DUMP: ForStmt
+                for (int j = 10; j > 0; --j)
+                    // PRINT: body(j)
+                    // DUMP: CallExpr
+                    body(j);
+            }
+        }
+        // PRINT: {
+        // DUMP: CompoundStmt
+        {
+            // PRINT: {
+            // DUMP: CompoundStmt
+            {
+                // PRINT: {
+                // DUMP: CompoundStmt
+                {
+                    // PRINT: for (int k = 0; k <= 10; ++k)
+                    // DUMP: ForStmt
+                    for (int k = 0; k <= 10; ++k)
+                        // PRINT: body(k)
+                        // DUMP: CallExpr
+                        body(k);
+                }
+            }
+        }
+    }
+
+}
+
+// PRINT-LABEL: void foo8(
+// DUMP-LABEL: FunctionDecl {{.*}} foo8
+void foo8() {
+    // PRINT: #pragma omp fuse looprange(2,2)
+    // DUMP: OMPFuseDirective
+    // DUMP: OMPLooprangeClause
+    #pragma omp fuse looprange(2,2)
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: for (int i = 0; i < 10; i += 2)
+        // DUMP: ForStmt
+        for (int i = 0; i < 10; i += 2)
+            // PRINT: body(i)
+            // DUMP: CallExpr
+            body(i);
+        // PRINT: for (int j = 10; j > 0; --j)
+        // DUMP: ForStmt
+        for (int j = 10; j > 0; --j)
+            // PRINT: body(j)
+            // DUMP: CallExpr
+            body(j);
+        // PRINT: for (int k = 0; k <= 10; ++k)
+        // DUMP: ForStmt
+        for (int k = 0; k <= 10; ++k)
+            // PRINT: body(k)
+            // DUMP: CallExpr
+            body(k);
+
+    }
+
+}
+
+//PRINT-LABEL: void foo9(
+//DUMP-LABEL: FunctionTemplateDecl {{.*}} foo9
+//DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} F
+//DUMP-LABEL: NonTypeTemplateParmDecl {{.*}} C
+template <int F, int C>
+void foo9() {
+    // PRINT: #pragma omp fuse looprange(F,C)
+    // DUMP: OMPFuseDirective
+    // DUMP: OMPLooprangeClause
+    #pragma omp fuse looprange(F,C)
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: for (int i = 0; i < 10; i += 2)
+        // DUMP: ForStmt
+        for (int i = 0; i < 10; i += 2)
+            // PRINT: body(i)
+            // DUMP: CallExpr
+            body(i);
+        // PRINT: for (int j = 10; j > 0; --j)
+        // DUMP: ForStmt
+        for (int j = 10; j > 0; --j)
+            // PRINT: body(j)
+            // DUMP: CallExpr
+            body(j);
+
+    }
+}
+
+// Also test instantiating the template.
+void tfoo9() {
+    foo9<1, 2>();
+}
+
+// PRINT-LABEL: void foo10(
+// DUMP-LABEL: FunctionDecl {{.*}} foo10
+void foo10() {
+    // PRINT: #pragma omp fuse looprange(2,2)
+    // DUMP: OMPFuseDirective
+    // DUMP: OMPLooprangeClause
+    #pragma omp fuse looprange(2,2)
+    // PRINT: {
+    // DUMP: CompoundStmt
+    {
+        // PRINT: for (int i = 0; i < 10; i += 2)
+        // DUMP: ForStmt
+        for (int i = 0; i < 10; i += 2)
+            // PRINT: body(i)
+            // DUMP: CallExpr
+            body(i);
+        // PRINT: for (int ii = 0; ii < 10; ii += 2)
+        // DUMP: ForStmt
+        for (int ii = 0; ii < 10; ii += 2)
+            // PRINT: body(ii)
+            // DUMP: CallExpr
+            body(ii);
+        // PRINT: #pragma omp fuse looprange(2,2)
+        // DUMP: OMPFuseDirective
+        // DUMP: OMPLooprangeClause
+        #pragma omp fuse looprange(2,2)
+        {
+            // PRINT: for (int j = 10; j > 0; --j)
+            // DUMP: ForStmt
+            for (int j = 10; j > 0; --j)
+                // PRINT: body(j)
+                // DUMP: CallExpr
+                body(j);
+            // PRINT: for (int jj = 10; jj > 0; --jj)
+            // DUMP: ForStmt
+            for (int jj = 10; jj > 0; --jj)
+                // PRINT: body(jj)
+                // DUMP: CallExpr
+                body(jj);
+            // PRINT: for (int k = 0; k <= 10; ++k)
+            // DUMP: ForStmt
+            for (int k = 0; k <= 10; ++k)
+                // PRINT: body(k)
+                // DUMP: CallExpr
+                body(k);
+            // PRINT: for (int kk = 0; kk <= 10; ++kk)
+            // DUMP: ForStmt
+            for (int kk = 0; kk <= 10; ++kk)
+                // PRINT: body(kk)
+                // DUMP: CallExpr
+                body(kk);
+        }
+    }
+
+}
+
+#endif
diff --git a/clang/test/OpenMP/fuse_codegen.cpp b/clang/test/OpenMP/fuse_codegen.cpp
new file mode 100644
index 0000000000000..742c280ed0172
--- /dev/null
+++ b/clang/test/OpenMP/fuse_codegen.cpp
@@ -0,0 +1,2328 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 5
+// expected-no-diagnostics
+
+// Check code generation
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -std=c++20 -fclang-abi-compat=latest -fopenmp -fopenmp-version=60 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+
+#ifndef HEADER
+#define HEADER
+
+// Placeholder for loop body code.
+extern "C" void body(...) {}
+
+extern "C" void foo1(int start1, int end1, int step1, int start2, int end2, int step2) {
+    int i,j;
+    #pragma omp fuse
+    {
+        for(i = start1; i < end1; i += step1) body(i);
+        for(j = start2; j < end2; j += step2) body(j);
+    }
+
+}
+
+template <typename T>
+void foo2(T start, T end, T step){
+    T i,j,k;
+    #pragma omp fuse
+    {
+        for(i = start; i < end; i += step) body(i);
+        for(j = end; j > start; j -= step) body(j);
+        for(k = start+step; k < end+step; k += step) body(k);
+    }
+}
+
+extern "C" void tfoo2() {
+    foo2(0, 64, 4);
+}
+
+extern "C" void foo3() {
+    double arr[256];
+    #pragma omp fuse
+    {
+        #pragma omp fuse
+        {
+            for(int i = 0; i < 128; ++i) body(i);
+            for(int j = 0; j < 256; j+=2) body(j);
+        }
+        for(int c = 42; auto &&v: arr) body(c,v);
+        for(int cc = 37; auto &&vv: arr) body(cc, vv);
+    }
+}
+
+extern "C" void foo4() {
+    double arr[256];
+
+    #pragma omp fuse looprange(2,2)
+    {
+        for(int i = 0; i < 128; ++i) body(i);
+        for(int j = 0; j < 256; j+=2) body(j);
+        for(int k = 0; k < 64; ++k) body(k);
+        for(int c = 42; auto &&v: arr) body(c,v);
+    }
+}
+
+// This exemplifies the usage of loop transformations that generate
+// more than one top-level canonical loop nest (e.g. split, loop-ranged
+// fuse, ...).
+extern "C" void foo5() {
+    double arr[256];
+    #pragma omp fuse looprange(2,2)
+    {
+        #pragma omp fuse looprange(2,2)
+        {
+            for(int i = 0; i < 128; ++i) body(i);
+            for(int j = 0; j < 256; j+=2) body(j);
+            for(int k = 0; k < 512; ++k) body(k);
+        }
+        for(int c = 42; auto &&v: arr) body(c,v);
+        for(int cc = 37; auto &&vv: arr) body(cc, vv);
+    }
+}
+
+
+#endif
+// CHECK1-LABEL: define dso_local void @body(
+// CHECK1-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo1(
+// CHECK1-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: 
[[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]] +// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1 +// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]] +// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]] +// CHECK1-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1 +// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1 +// 
CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]] +// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK1: [[COND_TRUE]]: +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: br label %[[COND_END:.*]] +// CHECK1: [[COND_FALSE]]: +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: br label %[[COND_END]] +// CHECK1: [[COND_END]]: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: br label %[[FOR_COND:.*]] +// CHECK1: [[FOR_COND]]: +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK1-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]] +// CHECK1-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK1: [[FOR_BODY]]: +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]] +// CHECK1-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK1: [[IF_THEN]]: +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4 +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]] +// CHECK1-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]] +// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]] +// CHECK1-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]] +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: call void (...) 
@body(i32 noundef [[TMP33]]) +// CHECK1-NEXT: br label %[[IF_END]] +// CHECK1: [[IF_END]]: +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]] +// CHECK1-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]] +// CHECK1: [[IF_THEN22]]: +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4 +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4 +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]] +// CHECK1-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]] +// CHECK1-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4 +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4 +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]] +// CHECK1-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]] +// CHECK1-NEXT: store i32 [[ADD26]], ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP42]]) +// CHECK1-NEXT: br label %[[IF_END27]] +// CHECK1: [[IF_END27]]: +// CHECK1-NEXT: br label %[[FOR_INC:.*]] +// CHECK1: [[FOR_INC]]: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1 +// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK1: [[FOR_END]]: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define dso_local void @tfoo2( +// CHECK1-SAME: ) #[[ATTR0]] { +// CHECK1-NEXT: [[ENTRY:.*:]] +// CHECK1-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4) +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_( +// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat { +// CHECK1-NEXT: [[ENTRY:.*:]] +// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 
4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]] +// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]] +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]] +// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1 +// CHECK1-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP9]], ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4 +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4 +// CHECK1-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]] +// CHECK1-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1 +// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]] +// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]] +// CHECK1-NEXT: [[SUB14:%.*]] = 
sub i32 [[DIV13]], 1 +// CHECK1-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4 +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK1-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1 +// CHECK1-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK1-NEXT: store i32 [[ADD16]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK1-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK1-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4 +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK1-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4 +// CHECK1-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4 +// CHECK1-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]] +// CHECK1-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1 +// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4 +// CHECK1-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]] +// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4 +// CHECK1-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]] +// CHECK1-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1 +// CHECK1-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4 +// CHECK1-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4 +// CHECK1-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1 +// CHECK1-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4 +// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]] +// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK1: [[COND_TRUE]]: +// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: br label %[[COND_END:.*]] +// CHECK1: [[COND_FALSE]]: +// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: br label %[[COND_END]] +// CHECK1: [[COND_END]]: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4 +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4 +// CHECK1-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]] +// CHECK1-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label 
%[[COND_FALSE31:.*]] +// CHECK1: [[COND_TRUE30]]: +// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4 +// CHECK1-NEXT: br label %[[COND_END32:.*]] +// CHECK1: [[COND_FALSE31]]: +// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4 +// CHECK1-NEXT: br label %[[COND_END32]] +// CHECK1: [[COND_END32]]: +// CHECK1-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ] +// CHECK1-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: br label %[[FOR_COND:.*]] +// CHECK1: [[FOR_COND]]: +// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK1-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]] +// CHECK1-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK1: [[FOR_BODY]]: +// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]] +// CHECK1-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK1: [[IF_THEN]]: +// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4 +// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4 +// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]] +// CHECK1-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]] +// CHECK1-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4 +// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4 +// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK1-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]] +// CHECK1-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]] +// CHECK1-NEXT: store i32 [[ADD38]], ptr [[I]], align 4 +// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP49]]) +// CHECK1-NEXT: br label %[[IF_END]] +// CHECK1: [[IF_END]]: +// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]] +// CHECK1-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]] +// CHECK1: [[IF_THEN40]]: +// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4 +// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4 +// CHECK1-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]] +// CHECK1-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]] +// CHECK1-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4 +// CHECK1-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK1-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4 +// CHECK1-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK1-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]] +// CHECK1-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]] +// CHECK1-NEXT: store i32 [[SUB44]], ptr [[J]], align 4 +// CHECK1-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4 +// CHECK1-NEXT: call void (...) 
@body(i32 noundef [[TMP58]]) +// CHECK1-NEXT: br label %[[IF_END45]] +// CHECK1: [[IF_END45]]: +// CHECK1-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4 +// CHECK1-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]] +// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]] +// CHECK1: [[IF_THEN47]]: +// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4 +// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4 +// CHECK1-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]] +// CHECK1-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]] +// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4 +// CHECK1-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK1-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4 +// CHECK1-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4 +// CHECK1-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]] +// CHECK1-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]] +// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4 +// CHECK1-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4 +// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP67]]) +// CHECK1-NEXT: br label %[[IF_END52]] +// CHECK1: [[IF_END52]]: +// CHECK1-NEXT: br label %[[FOR_INC:.*]] +// CHECK1: [[FOR_INC]]: +// CHECK1-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1 +// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]] +// CHECK1: [[FOR_END]]: +// CHECK1-NEXT: ret void +// +// +// CHECK1-LABEL: define dso_local void @foo3( +// CHECK1-SAME: ) #[[ATTR0]] { +// CHECK1-NEXT: [[ENTRY:.*:]] +// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16 +// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: 
[[DOTOMP_LB116:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__END222:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8 +// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8 +// CHECK1-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4 +// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[J]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4 +// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]] +// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK1: [[COND_TRUE]]: +// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK1-NEXT: br label %[[COND_END:.*]] +// CHECK1: [[COND_FALSE]]: +// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK1-NEXT: br label %[[COND_END]] +// CHECK1: [[COND_END]]: +// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ] +// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4 +// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64 +// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8 +// CHECK1-NEXT: store i32 42, ptr [[C]], align 4 +// CHECK1-NEXT: store ptr 
[[ARR]], ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0 +// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256 +// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8 +// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8 +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8 +// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64 +// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64 +// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] +// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8 +// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1 +// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1 +// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1 +// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1 +// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8 +// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8 +// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8 +// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8 +// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1 +// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8 +// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4 +// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0 +// CHECK1-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256 +// CHECK1-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8 +// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8 +// CHECK1-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0 +// CHECK1-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8 +// CHECK1-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8 +// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8 +// CHECK1-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8 +// CHECK1-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64 +// CHECK1-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64 +// CHECK1-NEXT: 
[[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]] +// CHECK1-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8 +// CHECK1-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1 +// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1 +// CHECK1-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1 +// CHECK1-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1 +// CHECK1-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8 +// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8 +// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8 +// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8 +// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1 +// CHECK1-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8 +// CHECK1-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8 +// CHECK1-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8 +// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8 +// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8 +// CHECK1-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]] +// CHECK1-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]] +// CHECK1: [[COND_TRUE42]]: +// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8 +// CHECK1-NEXT: br label %[[COND_END44:.*]] +// CHECK1: [[COND_FALSE43]]: +// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8 +// CHECK1-NEXT: br label %[[COND_END44]] +// CHECK1: [[COND_END44]]: +// CHECK1-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ] +// CHECK1-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8 +// CHECK1-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8 +// CHECK1-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8 +// CHECK1-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]] +// CHECK1-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]] +// CHECK1: [[COND_TRUE48]]: +// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8 +// CHECK1-NEXT: br label %[[COND_END50:.*]] +// CHECK1: [[COND_FALSE49]]: +// CHECK1-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8 +// CHECK1-NEXT: br label %[[COND_END50]] +// CHECK1: [[COND_END50]]: +// CHECK1-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ] +// CHECK1-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8 +// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8 +// CHECK1-NEXT: br label %[[FOR_COND:.*]] +// CHECK1: [[FOR_COND]]: +// CHECK1-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8 +// CHECK1-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8 +// CHECK1-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]] +// CHECK1-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK1: [[FOR_BODY]]: +// CHECK1-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8 +// CHECK1-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8 +// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]] +// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]] +// CHECK1: [[IF_THEN]]: +// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4 +// CHECK1-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64 +// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4 +// CHECK1-NEXT: 
[[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK1-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK1-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK1-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK1-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK1-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK1-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN62]]:
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK1-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK1-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK1-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK1-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK1-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK1: [[IF_THEN68]]:
+// CHECK1-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK1-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK1-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK1-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK1-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK1-NEXT: br label %[[IF_END73]]
+// CHECK1: [[IF_END73]]:
+// CHECK1-NEXT: br label %[[IF_END74]]
+// CHECK1: [[IF_END74]]:
+// CHECK1-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK1-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK1: [[IF_THEN76]]:
+// CHECK1-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK1-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK1-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK1-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK1-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK1-NEXT: br label %[[IF_END81]]
+// CHECK1: [[IF_END81]]:
+// CHECK1-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK1-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK1-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK1: [[IF_THEN83]]:
+// CHECK1-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK1-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK1-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK1-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK1-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK1-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK1-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK1-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK1-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK1-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK1-NEXT: br label %[[IF_END88]]
+// CHECK1: [[IF_END88]]:
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo4(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK1-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK1: [[FOR_COND2]]:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK1-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK1: [[FOR_BODY4]]:
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK1-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK1-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK1-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK1: [[IF_THEN9]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK1-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK1-NEXT: br label %[[IF_END14]]
+// CHECK1: [[IF_END14]]:
+// CHECK1-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK1: [[FOR_INC15]]:
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK1-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK1: [[FOR_END17]]:
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK1: [[FOR_COND19]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK1-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK1: [[FOR_BODY21]]:
+// CHECK1-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK1-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK1: [[FOR_INC22]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND19]]
+// CHECK1: [[FOR_END23]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo5(
+// CHECK1-SAME: ) #[[ATTR0]] {
+// CHECK1-NEXT: [[ENTRY:.*:]]
+// CHECK1-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK1-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK1-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK1-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK1: [[COND_TRUE]]:
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK1-NEXT: br label %[[COND_END:.*]]
+// CHECK1: [[COND_FALSE]]:
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: br label %[[COND_END]]
+// CHECK1: [[COND_END]]:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK1-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK1-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK1-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK1-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK1-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK1-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK1-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK1-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK1-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK1-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK1-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]]
+// CHECK1-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]]
+// CHECK1: [[COND_TRUE24]]:
+// CHECK1-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8
+// CHECK1-NEXT: br label %[[COND_END26:.*]]
+// CHECK1: [[COND_FALSE25]]:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: br label %[[COND_END26]]
+// CHECK1: [[COND_END26]]:
+// CHECK1-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ]
+// CHECK1-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND:.*]]
+// CHECK1: [[FOR_COND]]:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128
+// CHECK1-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK1: [[FOR_BODY]]:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP21]])
+// CHECK1-NEXT: br label %[[FOR_INC:.*]]
+// CHECK1: [[FOR_INC]]:
+// CHECK1-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
+// CHECK1-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK1-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK1: [[FOR_END]]:
+// CHECK1-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30:.*]]
+// CHECK1: [[FOR_COND30]]:
+// CHECK1-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8
+// CHECK1-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]]
+// CHECK1-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]]
+// CHECK1: [[FOR_BODY32]]:
+// CHECK1-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK1-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]]
+// CHECK1-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]]
+// CHECK1: [[IF_THEN]]:
+// CHECK1-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK1-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64
+// CHECK1-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK1-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64
+// CHECK1-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]]
+// CHECK1-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]]
+// CHECK1-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32
+// CHECK1-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK1-NEXT: [[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1
+// CHECK1-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]]
+// CHECK1-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK1-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]]
+// CHECK1-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]]
+// CHECK1: [[IF_THEN41]]:
+// CHECK1-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK1-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK1-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]]
+// CHECK1-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]]
+// CHECK1-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK1-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2
+// CHECK1-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]]
+// CHECK1-NEXT: store i32 [[ADD45]], ptr [[J]], align 4
+// CHECK1-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP37]])
+// CHECK1-NEXT: br label %[[IF_END]]
+// CHECK1: [[IF_END]]:
+// CHECK1-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK1-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]]
+// CHECK1-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]]
+// CHECK1: [[IF_THEN47]]:
+// CHECK1-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK1-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK1-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK1-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK1-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]]
+// CHECK1-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK1-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1
+// CHECK1-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]]
+// CHECK1-NEXT: store i32 [[ADD51]], ptr [[K]], align 4
+// CHECK1-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP44]])
+// CHECK1-NEXT: br label %[[IF_END52]]
+// CHECK1: [[IF_END52]]:
+// CHECK1-NEXT: br label %[[IF_END53]]
+// CHECK1: [[IF_END53]]:
+// CHECK1-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK1-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]]
+// CHECK1-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]]
+// CHECK1: [[IF_THEN55]]:
+// CHECK1-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK1-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK1-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]]
+// CHECK1-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]]
+// CHECK1-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK1-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK1-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1
+// CHECK1-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]]
+// CHECK1-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: store ptr [[TMP52]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4
+// CHECK1-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]])
+// CHECK1-NEXT: br label %[[IF_END60]]
+// CHECK1: [[IF_END60]]:
+// CHECK1-NEXT: br label %[[FOR_INC61:.*]]
+// CHECK1: [[FOR_INC61]]:
+// CHECK1-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1
+// CHECK1-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK1: [[FOR_END63]]:
+// CHECK1-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK1-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256
+// CHECK1-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70:.*]]
+// CHECK1: [[FOR_COND70]]:
+// CHECK1-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8
+// CHECK1-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]]
+// CHECK1-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]]
+// CHECK1: [[FOR_BODY72]]:
+// CHECK1-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK1-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK1-NEXT: [[TMP64:%.*]] = load double, ptr [[TMP63]], align 8
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]])
+// CHECK1-NEXT: br label %[[FOR_INC73:.*]]
+// CHECK1: [[FOR_INC73]]:
+// CHECK1-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1
+// CHECK1-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8
+// CHECK1-NEXT: br label %[[FOR_COND70]]
+// CHECK1: [[FOR_END74]]:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @body(
+// CHECK2-SAME: ...) #[[ATTR0:[0-9]+]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo1(
+// CHECK2-SAME: i32 noundef [[START1:%.*]], i32 noundef [[END1:%.*]], i32 noundef [[STEP1:%.*]], i32 noundef [[START2:%.*]], i32 noundef [[END2:%.*]], i32 noundef [[STEP2:%.*]]) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[START1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP1_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[START2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[END2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[STEP2_ADDR:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: store i32 [[START1]], ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END1]], ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP1]], ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[START2]], ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[END2]], ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[STEP2]], ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP1_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
+// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
+// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[START2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[END2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP2_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]]
+// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1
+// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4
+// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1
+// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP18]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], [[TMP20]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP21]], %[[COND_TRUE]] ], [ [[TMP22]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP16:%.*]] = icmp ult i32 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP16]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP17:%.*]] = icmp ult i32 [[TMP25]], [[TMP26]]
+// CHECK2-NEXT: br i1 [[CMP17]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP28]], [[TMP29]]
+// CHECK2-NEXT: [[ADD18:%.*]] = add i32 [[TMP27]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL19:%.*]] = mul i32 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: [[ADD20:%.*]] = add i32 [[TMP30]], [[MUL19]]
+// CHECK2-NEXT: store i32 [[ADD20]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP33]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP21:%.*]] = icmp ult i32 [[TMP34]], [[TMP35]]
+// CHECK2-NEXT: br i1 [[CMP21]], label %[[IF_THEN22:.*]], label %[[IF_END27:.*]]
+// CHECK2: [[IF_THEN22]]:
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL23:%.*]] = mul i32 [[TMP37]], [[TMP38]]
+// CHECK2-NEXT: [[ADD24:%.*]] = add i32 [[TMP36]], [[MUL23]]
+// CHECK2-NEXT: store i32 [[ADD24]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4
+// CHECK2-NEXT: [[MUL25:%.*]] = mul i32 [[TMP40]], [[TMP41]]
+// CHECK2-NEXT: [[ADD26:%.*]] = add i32 [[TMP39]], [[MUL25]]
+// CHECK2-NEXT: store i32 [[ADD26]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP42]])
+// CHECK2-NEXT: br label %[[IF_END27]]
+// CHECK2: [[IF_END27]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP43]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo3(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE221:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END222:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN225:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_27:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_29:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_30:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_140:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX46:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX52:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64
+// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8
+// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1
+// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1
+// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1
+// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8
+// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1
+// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY23:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP15]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR24:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY23]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR24]], ptr [[__END222]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY26:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP16]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY26]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load ptr, ptr [[__RANGE221]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY28:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP17]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY28]], ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load ptr, ptr [[__END222]], align 8
+// CHECK2-NEXT: store ptr [[TMP18]], ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_29]], align 8
+// CHECK2-NEXT: [[TMP20:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST31:%.*]] = ptrtoint ptr [[TMP19]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST32:%.*]] = ptrtoint ptr [[TMP20]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB33:%.*]] = sub i64 [[SUB_PTR_LHS_CAST31]], [[SUB_PTR_RHS_CAST32]]
+// CHECK2-NEXT: [[SUB_PTR_DIV34:%.*]] = sdiv exact i64 [[SUB_PTR_SUB33]], 8
+// CHECK2-NEXT: [[SUB35:%.*]] = sub nsw i64 [[SUB_PTR_DIV34]], 1
+// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[SUB35]], 1
+// CHECK2-NEXT: [[DIV37:%.*]] = sdiv i64 [[ADD36]], 1
+// CHECK2-NEXT: [[SUB38:%.*]] = sub nsw i64 [[DIV37]], 1
+// CHECK2-NEXT: store i64 [[SUB38]], ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_30]], align 8
+// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i64 [[TMP21]], 1
+// CHECK2-NEXT: store i64 [[ADD39]], ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[TMP22:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: store i64 [[TMP22]], ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP41:%.*]] = icmp sgt i64 [[TMP23]], [[TMP24]]
+// CHECK2-NEXT: br i1 [[CMP41]], label %[[COND_TRUE42:.*]], label %[[COND_FALSE43:.*]]
+// CHECK2: [[COND_TRUE42]]:
+// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_TEMP_140]], align 8
+// CHECK2-NEXT: br label %[[COND_END44:.*]]
+// CHECK2: [[COND_FALSE43]]:
+// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: br label %[[COND_END44]]
+// CHECK2: [[COND_END44]]:
+// CHECK2-NEXT: [[COND45:%.*]] = phi i64 [ [[TMP25]], %[[COND_TRUE42]] ], [ [[TMP26]], %[[COND_FALSE43]] ]
+// CHECK2-NEXT: store i64 [[COND45]], ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP27:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP47:%.*]] = icmp sgt i64 [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP47]], label %[[COND_TRUE48:.*]], label %[[COND_FALSE49:.*]]
+// CHECK2: [[COND_TRUE48]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_TEMP_2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50:.*]]
+// CHECK2: [[COND_FALSE49]]:
+// CHECK2-NEXT: [[TMP30:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: br label %[[COND_END50]]
+// CHECK2: [[COND_END50]]:
+// CHECK2-NEXT: [[COND51:%.*]] = phi i64 [ [[TMP29]], %[[COND_TRUE48]] ], [ [[TMP30]], %[[COND_FALSE49]] ]
+// CHECK2-NEXT: store i64 [[COND51]], ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP31:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX46]], align 8
+// CHECK2-NEXT: [[CMP53:%.*]] = icmp slt i64 [[TMP31]], [[TMP32]]
+// CHECK2-NEXT: br i1 [[CMP53]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP34:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8
+// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP33]], [[TMP34]]
+// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN:.*]], label %[[IF_END74:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4
+// CHECK2-NEXT: [[CONV55:%.*]] = sext i32 [[TMP35]] to i64
+// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4
+// CHECK2-NEXT: [[CONV56:%.*]] = sext i32 [[TMP36]] to i64
+// CHECK2-NEXT: [[TMP37:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV56]], [[TMP37]]
+// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[CONV55]], [[MUL]]
+// CHECK2-NEXT: [[CONV58:%.*]] = trunc i64 [[ADD57]] to i32
+// CHECK2-NEXT: store i32 [[CONV58]], ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4
+// CHECK2-NEXT: [[MUL59:%.*]] = mul nsw i32 [[TMP38]], 1
+// CHECK2-NEXT: [[ADD60:%.*]] = add nsw i32 0, [[MUL59]]
+// CHECK2-NEXT: store i32 [[ADD60]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP61:%.*]] = icmp slt i32 [[TMP39]], [[TMP40]]
+// CHECK2-NEXT: br i1 [[CMP61]], label %[[IF_THEN62:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN62]]:
+// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL63:%.*]] = mul nsw i32 [[TMP42]], [[TMP43]]
+// CHECK2-NEXT: [[ADD64:%.*]] = add nsw i32 [[TMP41]], [[MUL63]]
+// CHECK2-NEXT: store i32 [[ADD64]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL65:%.*]] = mul nsw i32 [[TMP44]], 1
+// CHECK2-NEXT: [[ADD66:%.*]] = add nsw i32 0, [[MUL65]]
+// CHECK2-NEXT: store i32 [[ADD66]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP45]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP67:%.*]] = icmp slt i32 [[TMP46]], [[TMP47]]
+// CHECK2-NEXT: br i1 [[CMP67]], label %[[IF_THEN68:.*]], label %[[IF_END73:.*]]
+// CHECK2: [[IF_THEN68]]:
+// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL69:%.*]] = mul nsw i32 [[TMP49]], [[TMP50]]
+// CHECK2-NEXT: [[ADD70:%.*]] = add nsw i32 [[TMP48]], [[MUL69]]
+// CHECK2-NEXT: store i32 [[ADD70]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL71:%.*]] = mul nsw i32 [[TMP51]], 2
+// CHECK2-NEXT: [[ADD72:%.*]] = add nsw i32 0, [[MUL71]]
+// CHECK2-NEXT: store i32 [[ADD72]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP52]])
+// CHECK2-NEXT: br label %[[IF_END73]]
+// CHECK2: [[IF_END73]]:
+// CHECK2-NEXT: br label %[[IF_END74]]
+// CHECK2: [[IF_END74]]:
+// CHECK2-NEXT: [[TMP53:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP54:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8
+// CHECK2-NEXT: [[CMP75:%.*]] = icmp slt i64 [[TMP53]], [[TMP54]]
+// CHECK2-NEXT: br i1 [[CMP75]], label %[[IF_THEN76:.*]], label %[[IF_END81:.*]]
+// CHECK2: [[IF_THEN76]]:
+// CHECK2-NEXT: [[TMP55:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8
+// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8
+// CHECK2-NEXT: [[TMP57:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL77:%.*]] = mul nsw i64 [[TMP56]], [[TMP57]]
+// CHECK2-NEXT: [[ADD78:%.*]] = add nsw i64 [[TMP55]], [[MUL77]]
+// CHECK2-NEXT: store i64 [[ADD78]], ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8
+// CHECK2-NEXT: [[TMP59:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8
+// CHECK2-NEXT: [[MUL79:%.*]] = mul nsw i64 [[TMP59]], 1
+// CHECK2-NEXT: [[ADD_PTR80:%.*]] = getelementptr inbounds double, ptr [[TMP58]], i64 [[MUL79]]
+// CHECK2-NEXT: store ptr [[ADD_PTR80]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP60]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP62:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP63:%.*]] = load double, ptr [[TMP62]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP61]], double noundef [[TMP63]])
+// CHECK2-NEXT: br label %[[IF_END81]]
+// CHECK2: [[IF_END81]]:
+// CHECK2-NEXT: [[TMP64:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[TMP65:%.*]] = load i64, ptr [[DOTOMP_NI2]], align 8
+// CHECK2-NEXT: [[CMP82:%.*]] = icmp slt i64 [[TMP64]], [[TMP65]]
+// CHECK2-NEXT: br i1 [[CMP82]], label %[[IF_THEN83:.*]], label %[[IF_END88:.*]]
+// CHECK2: [[IF_THEN83]]:
+// CHECK2-NEXT: [[TMP66:%.*]] = load i64, ptr [[DOTOMP_LB2]], align 8
+// CHECK2-NEXT: [[TMP67:%.*]] = load i64, ptr [[DOTOMP_ST2]], align 8
+// CHECK2-NEXT: [[TMP68:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[MUL84:%.*]] = mul nsw i64 [[TMP67]], [[TMP68]]
+// CHECK2-NEXT: [[ADD85:%.*]] = add nsw i64 [[TMP66]], [[MUL84]]
+// CHECK2-NEXT: store i64 [[ADD85]], ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[TMP69:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_27]], align 8
+// CHECK2-NEXT: [[TMP70:%.*]] = load i64, ptr [[DOTOMP_IV2]], align 8
+// CHECK2-NEXT: [[MUL86:%.*]] = mul nsw i64 [[TMP70]], 1
+// CHECK2-NEXT: [[ADD_PTR87:%.*]] = getelementptr inbounds double, ptr [[TMP69]], i64 [[MUL86]]
+// CHECK2-NEXT: store ptr [[ADD_PTR87]], ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: [[TMP71:%.*]] = load ptr, ptr [[__BEGIN225]], align 8
+// CHECK2-NEXT: store ptr [[TMP71]], ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP72:%.*]] = load i32, ptr [[CC]], align 4
+// CHECK2-NEXT: [[TMP73:%.*]] = load ptr, ptr [[VV]], align 8
+// CHECK2-NEXT: [[TMP74:%.*]] = load double, ptr [[TMP73]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP72]], double noundef [[TMP74]])
+// CHECK2-NEXT: br label %[[IF_END88]]
+// CHECK2: [[IF_END88]]:
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP75:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP75]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTOMP_FUSE_INDEX52]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo4(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 64, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]]
+// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK2: [[COND_TRUE]]:
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: br label %[[COND_END:.*]]
+// CHECK2: [[COND_FALSE]]:
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: br label %[[COND_END]]
+// CHECK2: [[COND_END]]:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND:.*]]
+// CHECK2: [[FOR_COND]]:
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP5]], 128
+// CHECK2-NEXT: br i1 [[CMP1]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK2: [[FOR_BODY]]:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP6]])
+// CHECK2-NEXT: br label %[[FOR_INC:.*]]
+// CHECK2: [[FOR_INC]]:
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP7]], 1
+// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP6:![0-9]+]]
+// CHECK2: [[FOR_END]]:
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2:.*]]
+// CHECK2: [[FOR_COND2]]:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4
+// CHECK2-NEXT: [[CMP3:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
+// CHECK2-NEXT: br i1 [[CMP3]], label %[[FOR_BODY4:.*]], label %[[FOR_END17:.*]]
+// CHECK2: [[FOR_BODY4]]:
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP10]], [[TMP11]]
+// CHECK2-NEXT: br i1 [[CMP5]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
+// CHECK2: [[IF_THEN]]:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP13]], [[TMP14]]
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[MUL]]
+// CHECK2-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4
+// CHECK2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[TMP15]], 2
+// CHECK2-NEXT: [[ADD7:%.*]] = add nsw i32 0, [[MUL6]]
+// CHECK2-NEXT: store i32 [[ADD7]], ptr [[J]], align 4
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[J]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP16]])
+// CHECK2-NEXT: br label %[[IF_END]]
+// CHECK2: [[IF_END]]:
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[CMP8:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: br i1 [[CMP8]], label %[[IF_THEN9:.*]], label %[[IF_END14:.*]]
+// CHECK2: [[IF_THEN9]]:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[MUL10:%.*]] = mul nsw i32 [[TMP20]], [[TMP21]]
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP19]], [[MUL10]]
+// CHECK2-NEXT: store i32 [[ADD11]], ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4
+// CHECK2-NEXT: [[MUL12:%.*]] = mul nsw i32 [[TMP22]], 1
+// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i32 0, [[MUL12]]
+// CHECK2-NEXT: store i32 [[ADD13]], ptr [[K]], align 4
+// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[K]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP23]])
+// CHECK2-NEXT: br label %[[IF_END14]]
+// CHECK2: [[IF_END14]]:
+// CHECK2-NEXT: br label %[[FOR_INC15:.*]]
+// CHECK2: [[FOR_INC15]]:
+// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: [[INC16:%.*]] = add nsw i32 [[TMP24]], 1
+// CHECK2-NEXT: store i32 [[INC16]], ptr [[DOTOMP_FUSE_INDEX]], align 4
+// CHECK2-NEXT: br label %[[FOR_COND2]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK2: [[FOR_END17]]:
+// CHECK2-NEXT: store i32 42, ptr [[C]], align 4
+// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP25]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP26:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY18:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP26]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY18]], i64 256
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19:.*]]
+// CHECK2: [[FOR_COND19]]:
+// CHECK2-NEXT: [[TMP27:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP28:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: [[CMP20:%.*]] = icmp ne ptr [[TMP27]], [[TMP28]]
+// CHECK2-NEXT: br i1 [[CMP20]], label %[[FOR_BODY21:.*]], label %[[FOR_END23:.*]]
+// CHECK2: [[FOR_BODY21]]:
+// CHECK2-NEXT: [[TMP29:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: store ptr [[TMP29]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[C]], align 4
+// CHECK2-NEXT: [[TMP31:%.*]] = load ptr, ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP32:%.*]] = load double, ptr [[TMP31]], align 8
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP30]], double noundef [[TMP32]])
+// CHECK2-NEXT: br label %[[FOR_INC22:.*]]
+// CHECK2: [[FOR_INC22]]:
+// CHECK2-NEXT: [[TMP33:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP33]], i32 1
+// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: br label %[[FOR_COND19]]
+// CHECK2: [[FOR_END23]]:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo5(
+// CHECK2-SAME: ) #[[ATTR0]] {
+// CHECK2-NEXT: [[ENTRY:.*:]]
+// CHECK2-NEXT: [[ARR:%.*]] = alloca [256 x double], align 16
+// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_LB03:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_ST04:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_NI05:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV06:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[C:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_8:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_10:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_11:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_LB116:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_ST117:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_NI118:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_IV120:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_TEMP_121:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTOMP_FUSE_MAX22:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX29:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[CC:%.*]] = alloca i32, align 4
+// CHECK2-NEXT: [[__RANGE264:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN265:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END267:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[VV:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: store i32 0, ptr [[J]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4
+// CHECK2-NEXT: store i32 128, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[K]], align 4
+// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4
+// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4
+// CHECK2-NEXT: store i32 512, ptr [[DOTOMP_NI1]], align 4
+// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTOMP_TEMP_1]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP1]], [[TMP2]] +// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK2: [[COND_TRUE]]: +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK2-NEXT: br label %[[COND_END:.*]] +// CHECK2: [[COND_FALSE]]: +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK2-NEXT: br label %[[COND_END]] +// CHECK2: [[COND_END]]: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP3]], %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK2-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK2-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK2-NEXT: [[SUB2:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB03]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST04]], align 4 +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], 1 +// CHECK2-NEXT: [[CONV:%.*]] = sext i32 [[ADD]] to i64 +// CHECK2-NEXT: store i64 [[CONV]], ptr [[DOTOMP_NI05]], align 8 +// CHECK2-NEXT: store i32 42, ptr [[C]], align 4 +// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP8]], i64 0, i64 0 +// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 256 +// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8 +// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY7:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP9]], i64 0, i64 0 +// CHECK2-NEXT: store ptr [[ARRAYDECAY7]], ptr [[__BEGIN2]], align 8 +// CHECK2-NEXT: [[TMP10:%.*]] = load ptr, ptr [[__RANGE2]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY9:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP10]], i64 0, i64 0 +// CHECK2-NEXT: store ptr [[ARRAYDECAY9]], ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[__END2]], align 8 +// CHECK2-NEXT: store ptr [[TMP11]], ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK2-NEXT: [[TMP12:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_10]], align 8 +// CHECK2-NEXT: [[TMP13:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP12]] to i64 +// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP13]] to i64 +// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] +// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8 +// CHECK2-NEXT: [[SUB12:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1 +// CHECK2-NEXT: [[ADD13:%.*]] = add nsw i64 [[SUB12]], 1 +// CHECK2-NEXT: [[DIV14:%.*]] = sdiv i64 [[ADD13]], 1 +// CHECK2-NEXT: [[SUB15:%.*]] = sub nsw i64 [[DIV14]], 1 +// CHECK2-NEXT: store i64 [[SUB15]], ptr [[DOTCAPTURE_EXPR_11]], align 8 +// CHECK2-NEXT: store i64 0, ptr 
[[DOTOMP_LB116]], align 8 +// CHECK2-NEXT: store i64 1, ptr [[DOTOMP_ST117]], align 8 +// CHECK2-NEXT: [[TMP14:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_11]], align 8 +// CHECK2-NEXT: [[ADD19:%.*]] = add nsw i64 [[TMP14]], 1 +// CHECK2-NEXT: store i64 [[ADD19]], ptr [[DOTOMP_NI118]], align 8 +// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8 +// CHECK2-NEXT: store i64 [[TMP15]], ptr [[DOTOMP_TEMP_121]], align 8 +// CHECK2-NEXT: [[TMP16:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8 +// CHECK2-NEXT: [[TMP17:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8 +// CHECK2-NEXT: [[CMP23:%.*]] = icmp sgt i64 [[TMP16]], [[TMP17]] +// CHECK2-NEXT: br i1 [[CMP23]], label %[[COND_TRUE24:.*]], label %[[COND_FALSE25:.*]] +// CHECK2: [[COND_TRUE24]]: +// CHECK2-NEXT: [[TMP18:%.*]] = load i64, ptr [[DOTOMP_TEMP_121]], align 8 +// CHECK2-NEXT: br label %[[COND_END26:.*]] +// CHECK2: [[COND_FALSE25]]: +// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8 +// CHECK2-NEXT: br label %[[COND_END26]] +// CHECK2: [[COND_END26]]: +// CHECK2-NEXT: [[COND27:%.*]] = phi i64 [ [[TMP18]], %[[COND_TRUE24]] ], [ [[TMP19]], %[[COND_FALSE25]] ] +// CHECK2-NEXT: store i64 [[COND27]], ptr [[DOTOMP_FUSE_MAX22]], align 8 +// CHECK2-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK2-NEXT: br label %[[FOR_COND:.*]] +// CHECK2: [[FOR_COND]]: +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: [[CMP28:%.*]] = icmp slt i32 [[TMP20]], 128 +// CHECK2-NEXT: br i1 [[CMP28]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK2: [[FOR_BODY]]: +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP21]]) +// CHECK2-NEXT: br label %[[FOR_INC:.*]] +// CHECK2: [[FOR_INC]]: +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1 +// CHECK2-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]] +// CHECK2: [[FOR_END]]: +// CHECK2-NEXT: store i64 0, ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: br label %[[FOR_COND30:.*]] +// CHECK2: [[FOR_COND30]]: +// CHECK2-NEXT: [[TMP23:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: [[TMP24:%.*]] = load i64, ptr [[DOTOMP_FUSE_MAX22]], align 8 +// CHECK2-NEXT: [[CMP31:%.*]] = icmp slt i64 [[TMP23]], [[TMP24]] +// CHECK2-NEXT: br i1 [[CMP31]], label %[[FOR_BODY32:.*]], label %[[FOR_END63:.*]] +// CHECK2: [[FOR_BODY32]]: +// CHECK2-NEXT: [[TMP25:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: [[TMP26:%.*]] = load i64, ptr [[DOTOMP_NI05]], align 8 +// CHECK2-NEXT: [[CMP33:%.*]] = icmp slt i64 [[TMP25]], [[TMP26]] +// CHECK2-NEXT: br i1 [[CMP33]], label %[[IF_THEN:.*]], label %[[IF_END53:.*]] +// CHECK2: [[IF_THEN]]: +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB03]], align 4 +// CHECK2-NEXT: [[CONV34:%.*]] = sext i32 [[TMP27]] to i64 +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_ST04]], align 4 +// CHECK2-NEXT: [[CONV35:%.*]] = sext i32 [[TMP28]] to i64 +// CHECK2-NEXT: [[TMP29:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV35]], [[TMP29]] +// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i64 [[CONV34]], [[MUL]] +// CHECK2-NEXT: [[CONV37:%.*]] = trunc i64 [[ADD36]] to i32 +// CHECK2-NEXT: store i32 [[CONV37]], ptr [[DOTOMP_IV06]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_IV06]], align 4 +// CHECK2-NEXT: 
[[MUL38:%.*]] = mul nsw i32 [[TMP30]], 1 +// CHECK2-NEXT: [[ADD39:%.*]] = add nsw i32 0, [[MUL38]] +// CHECK2-NEXT: store i32 [[ADD39]], ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK2-NEXT: [[CMP40:%.*]] = icmp slt i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: br i1 [[CMP40]], label %[[IF_THEN41:.*]], label %[[IF_END:.*]] +// CHECK2: [[IF_THEN41]]: +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4 +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[MUL42:%.*]] = mul nsw i32 [[TMP34]], [[TMP35]] +// CHECK2-NEXT: [[ADD43:%.*]] = add nsw i32 [[TMP33]], [[MUL42]] +// CHECK2-NEXT: store i32 [[ADD43]], ptr [[DOTOMP_IV0]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4 +// CHECK2-NEXT: [[MUL44:%.*]] = mul nsw i32 [[TMP36]], 2 +// CHECK2-NEXT: [[ADD45:%.*]] = add nsw i32 0, [[MUL44]] +// CHECK2-NEXT: store i32 [[ADD45]], ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[J]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP37]]) +// CHECK2-NEXT: br label %[[IF_END]] +// CHECK2: [[IF_END]]: +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK2-NEXT: [[CMP46:%.*]] = icmp slt i32 [[TMP38]], [[TMP39]] +// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]] +// CHECK2: [[IF_THEN47]]: +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4 +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[MUL48:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]] +// CHECK2-NEXT: [[ADD49:%.*]] = add nsw i32 [[TMP40]], [[MUL48]] +// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV1]], align 4 +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4 +// CHECK2-NEXT: [[MUL50:%.*]] = mul nsw i32 [[TMP43]], 1 +// CHECK2-NEXT: [[ADD51:%.*]] = add nsw i32 0, [[MUL50]] +// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[K]], align 4 +// CHECK2-NEXT: call void (...) 
@body(i32 noundef [[TMP44]]) +// CHECK2-NEXT: br label %[[IF_END52]] +// CHECK2: [[IF_END52]]: +// CHECK2-NEXT: br label %[[IF_END53]] +// CHECK2: [[IF_END53]]: +// CHECK2-NEXT: [[TMP45:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: [[TMP46:%.*]] = load i64, ptr [[DOTOMP_NI118]], align 8 +// CHECK2-NEXT: [[CMP54:%.*]] = icmp slt i64 [[TMP45]], [[TMP46]] +// CHECK2-NEXT: br i1 [[CMP54]], label %[[IF_THEN55:.*]], label %[[IF_END60:.*]] +// CHECK2: [[IF_THEN55]]: +// CHECK2-NEXT: [[TMP47:%.*]] = load i64, ptr [[DOTOMP_LB116]], align 8 +// CHECK2-NEXT: [[TMP48:%.*]] = load i64, ptr [[DOTOMP_ST117]], align 8 +// CHECK2-NEXT: [[TMP49:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: [[MUL56:%.*]] = mul nsw i64 [[TMP48]], [[TMP49]] +// CHECK2-NEXT: [[ADD57:%.*]] = add nsw i64 [[TMP47]], [[MUL56]] +// CHECK2-NEXT: store i64 [[ADD57]], ptr [[DOTOMP_IV120]], align 8 +// CHECK2-NEXT: [[TMP50:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_8]], align 8 +// CHECK2-NEXT: [[TMP51:%.*]] = load i64, ptr [[DOTOMP_IV120]], align 8 +// CHECK2-NEXT: [[MUL58:%.*]] = mul nsw i64 [[TMP51]], 1 +// CHECK2-NEXT: [[ADD_PTR59:%.*]] = getelementptr inbounds double, ptr [[TMP50]], i64 [[MUL58]] +// CHECK2-NEXT: store ptr [[ADD_PTR59]], ptr [[__BEGIN2]], align 8 +// CHECK2-NEXT: [[TMP52:%.*]] = load ptr, ptr [[__BEGIN2]], align 8 +// CHECK2-NEXT: store ptr [[TMP52]], ptr [[V]], align 8 +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[C]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = load ptr, ptr [[V]], align 8 +// CHECK2-NEXT: [[TMP55:%.*]] = load double, ptr [[TMP54]], align 8 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP53]], double noundef [[TMP55]]) +// CHECK2-NEXT: br label %[[IF_END60]] +// CHECK2: [[IF_END60]]: +// CHECK2-NEXT: br label %[[FOR_INC61:.*]] +// CHECK2: [[FOR_INC61]]: +// CHECK2-NEXT: [[TMP56:%.*]] = load i64, ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: [[INC62:%.*]] = add nsw i64 [[TMP56]], 1 +// CHECK2-NEXT: store i64 [[INC62]], ptr [[DOTOMP_FUSE_INDEX29]], align 8 +// CHECK2-NEXT: br label %[[FOR_COND30]], !llvm.loop [[LOOP9:![0-9]+]] +// CHECK2: [[FOR_END63]]: +// CHECK2-NEXT: store i32 37, ptr [[CC]], align 4 +// CHECK2-NEXT: store ptr [[ARR]], ptr [[__RANGE264]], align 8 +// CHECK2-NEXT: [[TMP57:%.*]] = load ptr, ptr [[__RANGE264]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY66:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP57]], i64 0, i64 0 +// CHECK2-NEXT: store ptr [[ARRAYDECAY66]], ptr [[__BEGIN265]], align 8 +// CHECK2-NEXT: [[TMP58:%.*]] = load ptr, ptr [[__RANGE264]], align 8 +// CHECK2-NEXT: [[ARRAYDECAY68:%.*]] = getelementptr inbounds [256 x double], ptr [[TMP58]], i64 0, i64 0 +// CHECK2-NEXT: [[ADD_PTR69:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY68]], i64 256 +// CHECK2-NEXT: store ptr [[ADD_PTR69]], ptr [[__END267]], align 8 +// CHECK2-NEXT: br label %[[FOR_COND70:.*]] +// CHECK2: [[FOR_COND70]]: +// CHECK2-NEXT: [[TMP59:%.*]] = load ptr, ptr [[__BEGIN265]], align 8 +// CHECK2-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__END267]], align 8 +// CHECK2-NEXT: [[CMP71:%.*]] = icmp ne ptr [[TMP59]], [[TMP60]] +// CHECK2-NEXT: br i1 [[CMP71]], label %[[FOR_BODY72:.*]], label %[[FOR_END74:.*]] +// CHECK2: [[FOR_BODY72]]: +// CHECK2-NEXT: [[TMP61:%.*]] = load ptr, ptr [[__BEGIN265]], align 8 +// CHECK2-NEXT: store ptr [[TMP61]], ptr [[VV]], align 8 +// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[CC]], align 4 +// CHECK2-NEXT: [[TMP63:%.*]] = load ptr, ptr [[VV]], align 8 +// CHECK2-NEXT: [[TMP64:%.*]] = load double, ptr 
[[TMP63]], align 8 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP62]], double noundef [[TMP64]]) +// CHECK2-NEXT: br label %[[FOR_INC73:.*]] +// CHECK2: [[FOR_INC73]]: +// CHECK2-NEXT: [[TMP65:%.*]] = load ptr, ptr [[__BEGIN265]], align 8 +// CHECK2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw double, ptr [[TMP65]], i32 1 +// CHECK2-NEXT: store ptr [[INCDEC_PTR]], ptr [[__BEGIN265]], align 8 +// CHECK2-NEXT: br label %[[FOR_COND70]] +// CHECK2: [[FOR_END74]]: +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define dso_local void @tfoo2( +// CHECK2-SAME: ) #[[ATTR0]] { +// CHECK2-NEXT: [[ENTRY:.*:]] +// CHECK2-NEXT: call void @_Z4foo2IiEvT_S0_S0_(i32 noundef 0, i32 noundef 64, i32 noundef 4) +// CHECK2-NEXT: ret void +// +// +// CHECK2-LABEL: define linkonce_odr void @_Z4foo2IiEvT_S0_S0_( +// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] comdat { +// CHECK2-NEXT: [[ENTRY:.*:]] +// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[STEP_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[J:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[K:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_ST0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_NI0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV0:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_6:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_7:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP8:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_9:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_ST1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_NI1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_17:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_19:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTNEW_STEP21:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTCAPTURE_EXPR_22:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_LB2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_ST2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_NI2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_IV2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_TEMP_1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_TEMP_2:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_FUSE_MAX:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTOMP_FUSE_INDEX:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: store i32 [[START]], ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 
[[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4 +// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]] +// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1 +// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]] +// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]] +// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1 +// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB0]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST0]], align 4 +// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1 +// CHECK2-NEXT: store i32 [[ADD5]], ptr [[DOTOMP_NI0]], align 4 +// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP9]], ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP11]], ptr [[DOTCAPTURE_EXPR_7]], align 4 +// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: store i32 [[TMP12]], ptr [[DOTNEW_STEP8]], align 4 +// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_7]], align 4 +// CHECK2-NEXT: [[SUB10:%.*]] = sub i32 [[TMP13]], [[TMP14]] +// CHECK2-NEXT: [[SUB11:%.*]] = sub i32 [[SUB10]], 1 +// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK2-NEXT: [[ADD12:%.*]] = add i32 [[SUB11]], [[TMP15]] +// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK2-NEXT: [[DIV13:%.*]] = udiv i32 [[ADD12]], [[TMP16]] +// CHECK2-NEXT: [[SUB14:%.*]] = sub i32 [[DIV13]], 1 +// CHECK2-NEXT: store i32 [[SUB14]], ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB1]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST1]], align 4 +// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_9]], align 4 +// CHECK2-NEXT: [[ADD15:%.*]] = add i32 [[TMP17]], 1 +// CHECK2-NEXT: store i32 [[ADD15]], ptr [[DOTOMP_NI1]], align 4 +// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP18]], [[TMP19]] +// CHECK2-NEXT: store i32 [[ADD16]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[START_ADDR]], align 4 +// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP20]], [[TMP21]] +// CHECK2-NEXT: store i32 [[ADD18]], ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[END_ADDR]], align 4 +// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[STEP_ADDR]], align 4 +// CHECK2-NEXT: [[ADD20:%.*]] = add nsw i32 [[TMP22]], [[TMP23]] +// CHECK2-NEXT: store i32 [[ADD20]], ptr [[DOTCAPTURE_EXPR_19]], align 4 +// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[STEP_ADDR]], 
align 4 +// CHECK2-NEXT: store i32 [[TMP24]], ptr [[DOTNEW_STEP21]], align 4 +// CHECK2-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_19]], align 4 +// CHECK2-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[SUB23:%.*]] = sub i32 [[TMP25]], [[TMP26]] +// CHECK2-NEXT: [[SUB24:%.*]] = sub i32 [[SUB23]], 1 +// CHECK2-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4 +// CHECK2-NEXT: [[ADD25:%.*]] = add i32 [[SUB24]], [[TMP27]] +// CHECK2-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4 +// CHECK2-NEXT: [[DIV26:%.*]] = udiv i32 [[ADD25]], [[TMP28]] +// CHECK2-NEXT: [[SUB27:%.*]] = sub i32 [[DIV26]], 1 +// CHECK2-NEXT: store i32 [[SUB27]], ptr [[DOTCAPTURE_EXPR_22]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_LB2]], align 4 +// CHECK2-NEXT: store i32 1, ptr [[DOTOMP_ST2]], align 4 +// CHECK2-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_22]], align 4 +// CHECK2-NEXT: [[ADD28:%.*]] = add i32 [[TMP29]], 1 +// CHECK2-NEXT: store i32 [[ADD28]], ptr [[DOTOMP_NI2]], align 4 +// CHECK2-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK2-NEXT: store i32 [[TMP30]], ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK2-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK2-NEXT: [[TMP32:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK2-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP31]], [[TMP32]] +// CHECK2-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK2: [[COND_TRUE]]: +// CHECK2-NEXT: [[TMP33:%.*]] = load i32, ptr [[DOTOMP_TEMP_1]], align 4 +// CHECK2-NEXT: br label %[[COND_END:.*]] +// CHECK2: [[COND_FALSE]]: +// CHECK2-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK2-NEXT: br label %[[COND_END]] +// CHECK2: [[COND_END]]: +// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[TMP33]], %[[COND_TRUE]] ], [ [[TMP34]], %[[COND_FALSE]] ] +// CHECK2-NEXT: store i32 [[COND]], ptr [[DOTOMP_TEMP_2]], align 4 +// CHECK2-NEXT: [[TMP35:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4 +// CHECK2-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4 +// CHECK2-NEXT: [[CMP29:%.*]] = icmp ugt i32 [[TMP35]], [[TMP36]] +// CHECK2-NEXT: br i1 [[CMP29]], label %[[COND_TRUE30:.*]], label %[[COND_FALSE31:.*]] +// CHECK2: [[COND_TRUE30]]: +// CHECK2-NEXT: [[TMP37:%.*]] = load i32, ptr [[DOTOMP_TEMP_2]], align 4 +// CHECK2-NEXT: br label %[[COND_END32:.*]] +// CHECK2: [[COND_FALSE31]]: +// CHECK2-NEXT: [[TMP38:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4 +// CHECK2-NEXT: br label %[[COND_END32]] +// CHECK2: [[COND_END32]]: +// CHECK2-NEXT: [[COND33:%.*]] = phi i32 [ [[TMP37]], %[[COND_TRUE30]] ], [ [[TMP38]], %[[COND_FALSE31]] ] +// CHECK2-NEXT: store i32 [[COND33]], ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK2-NEXT: store i32 0, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: br label %[[FOR_COND:.*]] +// CHECK2: [[FOR_COND]]: +// CHECK2-NEXT: [[TMP39:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP40:%.*]] = load i32, ptr [[DOTOMP_FUSE_MAX]], align 4 +// CHECK2-NEXT: [[CMP34:%.*]] = icmp ult i32 [[TMP39]], [[TMP40]] +// CHECK2-NEXT: br i1 [[CMP34]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK2: [[FOR_BODY]]: +// CHECK2-NEXT: [[TMP41:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP42:%.*]] = load i32, ptr [[DOTOMP_NI0]], align 4 +// CHECK2-NEXT: [[CMP35:%.*]] = icmp ult i32 [[TMP41]], [[TMP42]] +// CHECK2-NEXT: br i1 [[CMP35]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// CHECK2: 
[[IF_THEN]]: +// CHECK2-NEXT: [[TMP43:%.*]] = load i32, ptr [[DOTOMP_LB0]], align 4 +// CHECK2-NEXT: [[TMP44:%.*]] = load i32, ptr [[DOTOMP_ST0]], align 4 +// CHECK2-NEXT: [[TMP45:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP44]], [[TMP45]] +// CHECK2-NEXT: [[ADD36:%.*]] = add i32 [[TMP43]], [[MUL]] +// CHECK2-NEXT: store i32 [[ADD36]], ptr [[DOTOMP_IV0]], align 4 +// CHECK2-NEXT: [[TMP46:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK2-NEXT: [[TMP47:%.*]] = load i32, ptr [[DOTOMP_IV0]], align 4 +// CHECK2-NEXT: [[TMP48:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4 +// CHECK2-NEXT: [[MUL37:%.*]] = mul i32 [[TMP47]], [[TMP48]] +// CHECK2-NEXT: [[ADD38:%.*]] = add i32 [[TMP46]], [[MUL37]] +// CHECK2-NEXT: store i32 [[ADD38]], ptr [[I]], align 4 +// CHECK2-NEXT: [[TMP49:%.*]] = load i32, ptr [[I]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP49]]) +// CHECK2-NEXT: br label %[[IF_END]] +// CHECK2: [[IF_END]]: +// CHECK2-NEXT: [[TMP50:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP51:%.*]] = load i32, ptr [[DOTOMP_NI1]], align 4 +// CHECK2-NEXT: [[CMP39:%.*]] = icmp ult i32 [[TMP50]], [[TMP51]] +// CHECK2-NEXT: br i1 [[CMP39]], label %[[IF_THEN40:.*]], label %[[IF_END45:.*]] +// CHECK2: [[IF_THEN40]]: +// CHECK2-NEXT: [[TMP52:%.*]] = load i32, ptr [[DOTOMP_LB1]], align 4 +// CHECK2-NEXT: [[TMP53:%.*]] = load i32, ptr [[DOTOMP_ST1]], align 4 +// CHECK2-NEXT: [[TMP54:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[MUL41:%.*]] = mul i32 [[TMP53]], [[TMP54]] +// CHECK2-NEXT: [[ADD42:%.*]] = add i32 [[TMP52]], [[MUL41]] +// CHECK2-NEXT: store i32 [[ADD42]], ptr [[DOTOMP_IV1]], align 4 +// CHECK2-NEXT: [[TMP55:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_6]], align 4 +// CHECK2-NEXT: [[TMP56:%.*]] = load i32, ptr [[DOTOMP_IV1]], align 4 +// CHECK2-NEXT: [[TMP57:%.*]] = load i32, ptr [[DOTNEW_STEP8]], align 4 +// CHECK2-NEXT: [[MUL43:%.*]] = mul i32 [[TMP56]], [[TMP57]] +// CHECK2-NEXT: [[SUB44:%.*]] = sub i32 [[TMP55]], [[MUL43]] +// CHECK2-NEXT: store i32 [[SUB44]], ptr [[J]], align 4 +// CHECK2-NEXT: [[TMP58:%.*]] = load i32, ptr [[J]], align 4 +// CHECK2-NEXT: call void (...) 
@body(i32 noundef [[TMP58]]) +// CHECK2-NEXT: br label %[[IF_END45]] +// CHECK2: [[IF_END45]]: +// CHECK2-NEXT: [[TMP59:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[TMP60:%.*]] = load i32, ptr [[DOTOMP_NI2]], align 4 +// CHECK2-NEXT: [[CMP46:%.*]] = icmp ult i32 [[TMP59]], [[TMP60]] +// CHECK2-NEXT: br i1 [[CMP46]], label %[[IF_THEN47:.*]], label %[[IF_END52:.*]] +// CHECK2: [[IF_THEN47]]: +// CHECK2-NEXT: [[TMP61:%.*]] = load i32, ptr [[DOTOMP_LB2]], align 4 +// CHECK2-NEXT: [[TMP62:%.*]] = load i32, ptr [[DOTOMP_ST2]], align 4 +// CHECK2-NEXT: [[TMP63:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[MUL48:%.*]] = mul i32 [[TMP62]], [[TMP63]] +// CHECK2-NEXT: [[ADD49:%.*]] = add i32 [[TMP61]], [[MUL48]] +// CHECK2-NEXT: store i32 [[ADD49]], ptr [[DOTOMP_IV2]], align 4 +// CHECK2-NEXT: [[TMP64:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_17]], align 4 +// CHECK2-NEXT: [[TMP65:%.*]] = load i32, ptr [[DOTOMP_IV2]], align 4 +// CHECK2-NEXT: [[TMP66:%.*]] = load i32, ptr [[DOTNEW_STEP21]], align 4 +// CHECK2-NEXT: [[MUL50:%.*]] = mul i32 [[TMP65]], [[TMP66]] +// CHECK2-NEXT: [[ADD51:%.*]] = add i32 [[TMP64]], [[MUL50]] +// CHECK2-NEXT: store i32 [[ADD51]], ptr [[K]], align 4 +// CHECK2-NEXT: [[TMP67:%.*]] = load i32, ptr [[K]], align 4 +// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP67]]) +// CHECK2-NEXT: br label %[[IF_END52]] +// CHECK2: [[IF_END52]]: +// CHECK2-NEXT: br label %[[FOR_INC:.*]] +// CHECK2: [[FOR_INC]]: +// CHECK2-NEXT: [[TMP68:%.*]] = load i32, ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP68]], 1 +// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTOMP_FUSE_INDEX]], align 4 +// CHECK2-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP10:![0-9]+]] +// CHECK2: [[FOR_END]]: +// CHECK2-NEXT: ret void +// +//. +// CHECK1: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +// CHECK1: [[META4]] = !{!"llvm.loop.mustprogress"} +// CHECK1: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +// CHECK1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +// CHECK1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +// CHECK1: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +// CHECK1: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +// CHECK1: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +//. +// CHECK2: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +// CHECK2: [[META4]] = !{!"llvm.loop.mustprogress"} +// CHECK2: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +// CHECK2: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +// CHECK2: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +// CHECK2: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +// CHECK2: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +// CHECK2: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +//. 
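For reference while reading the CHECK2 lines above: they encode the fusion lowering these tests pin down. Each loop in the sequence is normalized into a lower bound (.omp.lb), stride (.omp.st) and trip count (.omp.ni); a fused index (.omp.fuse.index) runs up to .omp.fuse.max, the maximum of the trip counts, and each original body is guarded by its own trip count before the induction variable is de-normalized. A minimal hand-written C sketch of that scheme, using foo4's two loops as the example; the helper names body_j/body_k and variable names are illustrative only, not code from the patch:

    #include <stdio.h>

    /* Illustrative stand-ins for the two fused loop bodies. */
    static void body_j(int j) { printf("j=%d\n", j); }
    static void body_k(int k) { printf("k=%d\n", k); }

    int main(void) {
      /* Per-loop descriptors, informally mirroring .omp.lb/.omp.st/.omp.ni
         in foo4: loop 0 is "for (j = 0; j < 256; j += 2)" (128 iterations),
         loop 1 is "for (k = 0; k < 64; ++k)" (64 iterations). */
      int lb0 = 0, st0 = 1, ni0 = 128;
      int lb1 = 0, st1 = 1, ni1 = 64;

      int fuse_max = ni0 > ni1 ? ni0 : ni1;   /* .omp.fuse.max */
      for (int fi = 0; fi < fuse_max; ++fi) { /* .omp.fuse.index */
        if (fi < ni0) {               /* guard: loop 0 still has iterations */
          int iv0 = lb0 + st0 * fi;   /* normalized induction variable */
          body_j(0 + iv0 * 2);        /* de-normalize: j = 0 + 2*iv0 */
        }
        if (fi < ni1) {               /* guard: loop 1 still has iterations */
          int iv1 = lb1 + st1 * fi;
          body_k(0 + iv1 * 1);        /* de-normalize: k = 0 + 1*iv1 */
        }
      }
      return 0;
    }

The max-plus-guard structure is what lets loops of different lengths be fused without changing any loop's iteration count, which is exactly what the branches on [[DOTOMP_NI0]]/[[DOTOMP_NI1]] in the checks verify.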
diff --git a/clang/test/OpenMP/fuse_messages.cpp b/clang/test/OpenMP/fuse_messages.cpp new file mode 100644 index 0000000000000..b86ce95cfe9bc --- /dev/null +++ b/clang/test/OpenMP/fuse_messages.cpp @@ -0,0 +1,209 @@ +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++20 -fopenmp -fopenmp-version=60 -fsyntax-only -Wuninitialized -verify %s + +void func() { + + // expected-error@+2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}} + #pragma omp fuse + ; + + // expected-error@+2 {{statement after '#pragma omp fuse' must be a for loop}} + #pragma omp fuse + {int bar = 0;} + + // expected-error@+4 {{statement after '#pragma omp fuse' must be a for loop}} + #pragma omp fuse + { + for(int i = 0; i < 10; ++i); + int x = 2; + } + + // expected-error@+2 {{statement after '#pragma omp fuse' must be a loop sequence containing canonical loops or loop-generating constructs}} + #pragma omp fuse + #pragma omp for + for (int i = 0; i < 7; ++i) + ; + + { + // expected-error@+2 {{expected statement}} + #pragma omp fuse + } + + // expected-warning@+1 {{extra tokens at the end of '#pragma omp fuse' are ignored}} + #pragma omp fuse foo + { + for (int i = 0; i < 7; ++i) + ; + for(int j = 0; j < 100; ++j); + + } + + + // expected-error@+1 {{unexpected OpenMP clause 'final' in directive '#pragma omp fuse'}} + #pragma omp fuse final(0) + { + for (int i = 0; i < 7; ++i) + ; + for(int j = 0; j < 100; ++j); + + } + + //expected-error@+3 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}} + #pragma omp fuse + { + for(int i = 0; i < 10; i*=2) { + ; + } + for(int j = 0; j < 100; ++j); + } + + //expected-error@+2 {{loop sequence after '#pragma omp fuse' must contain at least 1 canonical loop or loop-generating construct}} + #pragma omp fuse + {} + + //expected-error@+3 {{statement after '#pragma omp fuse' must be a for loop}} + #pragma omp fuse + { + #pragma omp unroll full + for(int i = 0; i < 10; ++i); + + for(int j = 0; j < 10; ++j); + } + + //expected-warning@+2 {{looprange clause selects a single loop, resulting in redundant fusion}} + #pragma omp fuse + { + for(int i = 0; i < 10; ++i); + } + + //expected-warning@+1 {{looprange clause selects a single loop, resulting in redundant fusion}} + #pragma omp fuse looprange(1, 1) + { + for(int i = 0; i < 10; ++i); + for(int j = 0; j < 100; ++j); + } + + //expected-error@+1 {{argument to 'looprange' clause must be a strictly positive integer value}} + #pragma omp fuse looprange(1, -1) + { + for(int i = 0; i < 10; ++i); + for(int j = 0; j < 100; ++j); + } + + //expected-error@+1 {{argument to 'looprange' clause must be a strictly positive integer value}} + #pragma omp fuse looprange(1, 0) + { + for(int i = 0; i < 10; ++i); + for(int j = 0; j < 100; ++j); + } + + const int x = 1; + constexpr int y = 4; + //expected-error@+1 {{looprange clause selects loops from 1 to 4 but this exceeds the number of loops (3) in the loop sequence}} + #pragma omp fuse looprange(x,y) + { + for(int i = 0; i < 10; ++i); + for(int j = 0; j < 100; ++j); + for(int k = 0; k < 50; ++k); + } + + //expected-error@+1 {{looprange clause selects loops from 1 to 420 but this exceeds the number of loops (3) in the loop sequence}} + #pragma omp fuse looprange(1,420) + { + for(int i = 0; i < 10; ++i); + for(int j = 0; j < 100; ++j); + for(int k = 0; k < 50; ++k); + } + + //expected-error@+1 {{looprange clause selects loops from 1 to 6 but this exceeds the number of loops (5) in the loop 
sequence}}
+    #pragma omp fuse looprange(1,6)
+    {
+        for(int i = 0; i < 10; ++i);
+        for(int j = 0; j < 100; ++j);
+        for(int k = 0; k < 50; ++k);
+        // This fusion results in 2 loops
+        #pragma omp fuse looprange(1,2)
+        {
+            for(int i = 0; i < 10; ++i);
+            for(int j = 0; j < 100; ++j);
+            for(int k = 0; k < 50; ++k);
+        }
+    }
+
+    //expected-error@+1 {{looprange clause selects loops from 2 to 4 but this exceeds the number of loops (3) in the loop sequence}}
+    #pragma omp fuse looprange(2,3)
+    {
+        #pragma omp unroll partial(2)
+        for(int i = 0; i < 10; ++i);
+
+        #pragma omp reverse
+        for(int j = 0; j < 10; ++j);
+
+        #pragma omp fuse
+        {
+            {
+                #pragma omp reverse
+                for(int j = 0; j < 10; ++j);
+            }
+            for(int k = 0; k < 50; ++k);
+        }
+    }
+}
+
+// In a template context, but expression itself not instantiation-dependent
+template <typename T>
+static void templated_func() {
+
+    //expected-warning@+1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+    #pragma omp fuse looprange(2,1)
+    {
+        for(int i = 0; i < 10; ++i);
+        for(int j = 0; j < 100; ++j);
+        for(int k = 0; k < 50; ++k);
+    }
+
+    //expected-error@+1 {{looprange clause selects loops from 3 to 5 but this exceeds the number of loops (3) in the loop sequence}}
+    #pragma omp fuse looprange(3,3)
+    {
+        for(int i = 0; i < 10; ++i);
+        for(int j = 0; j < 100; ++j);
+        for(int k = 0; k < 50; ++k);
+    }
+
+}
+
+template <int V>
+static void templated_func_value_dependent() {
+
+    //expected-warning@+1 {{looprange clause selects a single loop, resulting in redundant fusion}}
+    #pragma omp fuse looprange(V,1)
+    {
+        for(int i = 0; i < 10; ++i);
+        for(int j = 0; j < 100; ++j);
+        for(int k = 0; k < 50; ++k);
+    }
+}
+
+template <typename T>
+static void templated_func_type_dependent() {
+    constexpr T s = 1;
+
+    //expected-error@+1 {{argument to 'looprange' clause must be a strictly positive integer value}}
+    #pragma omp fuse looprange(s,s-1)
+    {
+        for(int i = 0; i < 10; ++i);
+        for(int j = 0; j < 100; ++j);
+        for(int k = 0; k < 50; ++k);
+    }
+}
+
+
+void template_inst() {
+    // expected-note@+1 {{in instantiation of function template specialization 'templated_func<int>' requested here}}
+    templated_func<int>();
+    // expected-note@+1 {{in instantiation of function template specialization 'templated_func_value_dependent<1>' requested here}}
+    templated_func_value_dependent<1>();
+    // expected-note@+1 {{in instantiation of function template specialization 'templated_func_type_dependent<int>' requested here}}
+    templated_func_type_dependent<int>();
+}
+
+
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 5aab74348967d..30e2be758cd39 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2148,6 +2148,9 @@ class EnqueueVisitor : public ConstStmtVisitor<EnqueueVisitor, void>,
   void VisitOMPUnrollDirective(const OMPUnrollDirective *D);
   void VisitOMPReverseDirective(const OMPReverseDirective *D);
   void VisitOMPInterchangeDirective(const OMPInterchangeDirective *D);
+  void VisitOMPCanonicalLoopSequenceTransformationDirective(
+      const OMPCanonicalLoopSequenceTransformationDirective *D);
+  void VisitOMPFuseDirective(const OMPFuseDirective *D);
   void VisitOMPForDirective(const OMPForDirective *D);
   void VisitOMPForSimdDirective(const OMPForSimdDirective *D);
   void VisitOMPSectionsDirective(const OMPSectionsDirective *D);
@@ -2353,6 +2356,11 @@ void OMPClauseEnqueue::VisitOMPPartialClause(const OMPPartialClause *C) {
   Visitor->AddStmt(C->getFactor());
 }
 
+void OMPClauseEnqueue::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) {
+  Visitor->AddStmt(C->getFirst());
+  Visitor->AddStmt(C->getCount());
+}
+
 void OMPClauseEnqueue::VisitOMPAllocatorClause(const OMPAllocatorClause *C) {
   Visitor->AddStmt(C->getAllocator());
 }
@@ -3317,6 +3325,15 @@ void EnqueueVisitor::VisitOMPInterchangeDirective(
   VisitOMPCanonicalLoopNestTransformationDirective(D);
 }
 
+void EnqueueVisitor::VisitOMPCanonicalLoopSequenceTransformationDirective(
+    const OMPCanonicalLoopSequenceTransformationDirective *D) {
+  VisitOMPExecutableDirective(D);
+}
+
+void EnqueueVisitor::VisitOMPFuseDirective(const OMPFuseDirective *D) {
+  VisitOMPCanonicalLoopSequenceTransformationDirective(D);
+}
+
 void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) {
   VisitOMPLoopDirective(D);
 }
@@ -6275,6 +6292,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
     return cxstring::createRef("OMPReverseDirective");
   case CXCursor_OMPInterchangeDirective:
     return cxstring::createRef("OMPInterchangeDirective");
+  case CXCursor_OMPFuseDirective:
+    return cxstring::createRef("OMPFuseDirective");
   case CXCursor_OMPForDirective:
     return cxstring::createRef("OMPForDirective");
   case CXCursor_OMPForSimdDirective:
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 3c4062410eac1..56f113c1dc309 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -687,6 +687,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::OMPInterchangeDirectiveClass:
     K = CXCursor_OMPInterchangeDirective;
     break;
+  case Stmt::OMPFuseDirectiveClass:
+    K = CXCursor_OMPFuseDirective;
+    break;
   case Stmt::OMPForDirectiveClass:
     K = CXCursor_OMPForDirective;
     break;
diff --git a/flang/include/flang/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h
index 5267a58c7e7f7..5cd196a7869a2 100644
--- a/flang/include/flang/Lower/OpenMP/Clauses.h
+++ b/flang/include/flang/Lower/OpenMP/Clauses.h
@@ -243,6 +243,7 @@ using Initializer = tomp::clause::InitializerT<TypeTy, IdTy, ExprTy>;
 using InReduction = tomp::clause::InReductionT<TypeTy, IdTy, ExprTy>;
 using IsDevicePtr = tomp::clause::IsDevicePtrT<TypeTy, IdTy, ExprTy>;
 using Lastprivate = tomp::clause::LastprivateT<TypeTy, IdTy, ExprTy>;
+using LoopRange = tomp::clause::LoopRangeT<TypeTy, IdTy, ExprTy>;
 using Linear = tomp::clause::LinearT<TypeTy, IdTy, ExprTy>;
 using Link = tomp::clause::LinkT<TypeTy, IdTy, ExprTy>;
 using Map = tomp::clause::MapT<TypeTy, IdTy, ExprTy>;
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index fadca0a3876f4..14885293fd5eb 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -613,6 +613,7 @@ class ParseTreeDumper {
   NODE_ENUM(OmpLinearModifier, Value)
   NODE(parser, OmpLocator)
   NODE(parser, OmpLocatorList)
+  NODE(parser, OmpLoopRangeClause)
   NODE(parser, OmpMapClause)
   NODE(OmpMapClause, Modifier)
   NODE(parser, OmpMapper)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 8b23189bc1e90..325ca9b4a227b 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4546,6 +4546,15 @@ struct OmpLinearClause {
   std::tuple<MODIFIERS(), OmpObjectList, /*PostModified=*/bool> t;
 };
 
+// Ref: [6.0:207-208]
+//
+// loop-range-clause ->
+//    LOOPRANGE(first, count)                     // since 6.0
+struct OmpLoopRangeClause {
+  TUPLE_CLASS_BOILERPLATE(OmpLoopRangeClause);
+  std::tuple<ScalarIntConstantExpr, ScalarIntConstantExpr> t;
+};
+
 // Ref: [4.5:216-219], [5.0:315-324], [5.1:347-355], [5.2:150-158]
 //
 // map-clause ->
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 48b90ccea2f2a..fac37a372caaf 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1036,6 +1036,11 @@
Link make(const parser::OmpClause::Link &inp,
           semantics::SemanticsContext &semaCtx) {
   return Link{/*List=*/makeObjects(inp.v, semaCtx)};
 }
+LoopRange make(const parser::OmpClause::Looprange &inp,
+               semantics::SemanticsContext &semaCtx) {
+  llvm_unreachable("Unimplemented: looprange");
+}
+
 Map make(const parser::OmpClause::Map &inp,
          semantics::SemanticsContext &semaCtx) {
   // inp.v -> parser::OmpMapClause
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index ea09fe04f07a0..9507021057476 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1023,6 +1023,9 @@ TYPE_PARSER(
         maybe(":"_tok >> nonemptyList(Parser<OmpLinearClause::Modifier>{})),
         /*PostModified=*/pure(true)))
 
+TYPE_PARSER(construct<OmpLoopRangeClause>(
+    scalarIntConstantExpr, "," >> scalarIntConstantExpr))
+
 // OpenMPv5.2 12.5.2 detach-clause -> DETACH (event-handle)
 TYPE_PARSER(construct<OmpDetachClause>(Parser<OmpObject>{}))
 
@@ -1207,6 +1210,8 @@ TYPE_PARSER( //
                   parenthesized(Parser<OmpLinearClause>{}))) ||
     "LINK" >> construct<OmpClause>(construct<OmpClause::Link>(
                   parenthesized(Parser<OmpObjectList>{}))) ||
+    "LOOPRANGE" >> construct<OmpClause>(construct<OmpClause::Looprange>(
+                       parenthesized(Parser<OmpLoopRangeClause>{}))) ||
     "MAP" >> construct<OmpClause>(construct<OmpClause::Map>(
                  parenthesized(Parser<OmpMapClause>{}))) ||
     "MATCH" >> construct<OmpClause>(construct<OmpClause::Match>(
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 0fbd347e91b18..0511f5bdf7478 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2345,6 +2345,13 @@ class UnparseVisitor {
       }
     }
   }
+  void Unparse(const OmpLoopRangeClause &x) {
+    Word("LOOPRANGE(");
+    Walk(std::get<0>(x.t));
+    Put(", ");
+    Walk(std::get<1>(x.t));
+    Put(")");
+  }
   void Unparse(const OmpReductionClause &x) {
     using Modifier = OmpReductionClause::Modifier;
     Walk(std::get<std::optional<std::list<Modifier>>>(x.t), ": ");
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index cc2dd0a705ab7..db030bbe1f023 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -3106,6 +3106,12 @@ CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse)
 CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen)
 CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen)
 
+void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) {
+  context_.Say(GetContext().clauseSource,
+      "LOOPRANGE clause is not implemented yet"_err_en_US,
+      ContextDirectiveAsFortran());
+}
+
 // Restrictions specific to each clause are implemented apart from the
 // generalized restrictions.
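As context for the ClauseT.h/OMP.td plumbing below: looprange(first, count) is 1-based and selects the count consecutive loops starting at position first of the enclosing loop sequence; only those loops are fused, and the diagnostics exercised in fuse_messages.cpp above follow from requiring first >= 1, count >= 1, and first + count - 1 not exceeding the sequence length. A sketch of the intended source-level usage (illustrative only, not taken from the patch; it assumes a compiler with OpenMP 6.0 fuse support, e.g. -fopenmp -fopenmp-version=60 as in the RUN lines above):

    #include <stdio.h>

    static void body(int v) { printf("%d\n", v); }

    int main(void) {
      /* looprange(2, 2) asks for loops 2 and 3 of this four-loop sequence
         to be fused; loops 1 and 4 stay ordinary sequential loops before
         and after the fused loop. */
    #pragma omp fuse looprange(2, 2)
      {
        for (int i = 0; i < 10; ++i)
          body(i);
        for (int j = 0; j < 100; ++j) /* fused */
          body(j);
        for (int k = 0; k < 50; ++k) /* fused */
          body(k);
        for (int l = 0; l < 20; ++l)
          body(l);
      }
      return 0;
    }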
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index 1ade9ce0c3a7d..db781b58944bc 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -1268,6 +1268,15 @@ struct WriteT {
   using EmptyTrait = std::true_type;
 };
 
+// V6: [6.4.7] Looprange clause
+template <typename T, typename I, typename E> struct LoopRangeT {
+  using Begin = E;
+  using End = E;
+
+  using TupleTrait = std::true_type;
+  std::tuple<Begin, End> t;
+};
+
 // ---
 
 template <typename T, typename I, typename E>
@@ -1300,8 +1309,8 @@ using TupleClausesT =
                  DoacrossT<T, I, E>, DynGroupprivateT<T, I, E>, FromT<T, I, E>,
                  GrainsizeT<T, I, E>, IfT<T, I, E>, InitT<T, I, E>,
                  InReductionT<T, I, E>, LastprivateT<T, I, E>, LinearT<T, I, E>,
-                 MapT<T, I, E>, NumTasksT<T, I, E>, OrderT<T, I, E>,
-                 ReductionT<T, I, E>, ScheduleT<T, I, E>,
+                 LoopRangeT<T, I, E>, MapT<T, I, E>, NumTasksT<T, I, E>,
+                 OrderT<T, I, E>, ReductionT<T, I, E>, ScheduleT<T, I, E>,
                  TaskReductionT<T, I, E>, ToT<T, I, E>>;
 
 template <typename T, typename I, typename E>
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index ffefa269712ef..38f95a11bf85f 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -284,6 +284,10 @@ def OMPC_Linear : Clause<[Spelling<"linear">]> {
 def OMPC_Link : Clause<[Spelling<"link">]> {
   let flangClass = "OmpObjectList";
 }
+def OMPC_LoopRange : Clause<[Spelling<"looprange">]> {
+  let clangClass = "OMPLoopRangeClause";
+  let flangClass = "OmpLoopRangeClause";
+}
 def OMPC_Map : Clause<[Spelling<"map">]> {
   let clangClass = "OMPMapClause";
   let flangClass = "OmpMapClause";
@@ -902,6 +906,11 @@ def OMP_Groupprivate : Directive<[Spelling<"groupprivate">]> {
   let category = CA_Declarative;
   let languages = [L_C, L_Fortran];
 }
+def OMP_Fuse : Directive<[Spelling<"fuse">]> {
+  let allowedOnceClauses = [VersionedClause<OMPC_LoopRange>];
+  let association = AS_Block;
+  let category = CA_Executable;
+}
 def OMP_Interchange : Directive<[Spelling<"interchange">]> {
   let allowedOnceClauses = [
     VersionedClause<OMPC_Permutation>,
diff --git a/openmp/runtime/test/transform/fuse/foreach.cpp b/openmp/runtime/test/transform/fuse/foreach.cpp
new file mode 100644
index 0000000000000..176465b201faa
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/foreach.cpp
@@ -0,0 +1,191 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("<anon>") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator copy ctor");
+    }
+
+    Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+      owner->print("iterator move ctor");
+    }
+
+    ~Iterator() { owner->print("iterator dtor"); }
+
+    const Iterator &operator=(const Iterator &that) {
+      owner->print("iterator copy assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    const Iterator &operator=(Iterator &&that) {
+      owner->print("iterator move assign");
+      this->owner = that.owner;
+      this->pos = that.pos;
+      return *this;
+    }
+
+    bool
operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + + void print(const char *msg) const { owner->print(msg); } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); +#pragma omp fuse + { + for (Reporter a{"C"}; auto &&v : Reporter("A")) + printf("v=%d\n", v); + for (Reporter aa{"D"}; auto &&vv : Reporter("B")) + printf("vv=%d\n", vv); + } + printf("done\n"); + return EXIT_SUCCESS; +} + +// CHECK: [C] ctor +// CHECK-NEXT: [A] ctor +// CHECK-NEXT: [A] end() +// CHECK-NEXT: [A] begin() +// CHECK-NEXT: [A] begin() +// CHECK-NEXT: [A] iterator distance: 3 +// CHECK-NEXT: [D] ctor +// CHECK-NEXT: [B] ctor +// CHECK-NEXT: [B] end() +// CHECK-NEXT: [B] begin() +// CHECK-NEXT: [B] begin() +// CHECK-NEXT: [B] iterator distance: 3 +// CHECK-NEXT: [A] iterator advance: 0 += 0 +// CHECK-NEXT: [A] iterator move assign +// CHECK-NEXT: [A] iterator deref: 0 +// CHECK-NEXT: v=0 +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [B] iterator advance: 0 += 0 +// CHECK-NEXT: [B] iterator move assign +// CHECK-NEXT: [B] iterator deref: 0 +// CHECK-NEXT: vv=0 +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [A] iterator advance: 0 += 1 +// CHECK-NEXT: [A] iterator move assign +// CHECK-NEXT: [A] iterator deref: 1 +// CHECK-NEXT: v=1 +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [B] iterator advance: 0 += 1 +// CHECK-NEXT: [B] iterator move assign +// CHECK-NEXT: [B] iterator deref: 1 +// CHECK-NEXT: vv=1 +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [A] iterator advance: 0 += 2 +// CHECK-NEXT: [A] iterator move assign +// CHECK-NEXT: [A] iterator deref: 2 +// CHECK-NEXT: v=2 +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [B] iterator advance: 0 += 2 +// CHECK-NEXT: [B] iterator move assign +// CHECK-NEXT: [B] iterator deref: 2 +// CHECK-NEXT: vv=2 +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] dtor +// CHECK-NEXT: [D] dtor +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [A] dtor +// CHECK-NEXT: [C] dtor +// CHECK-NEXT: done + +#endif diff --git a/openmp/runtime/test/transform/fuse/intfor.c b/openmp/runtime/test/transform/fuse/intfor.c new file mode 100644 index 0000000000000..b8171b4df7042 --- /dev/null +++ b/openmp/runtime/test/transform/fuse/intfor.c @@ -0,0 +1,50 @@ +// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + 
+#include +#include + +int main() { + printf("do\n"); +#pragma omp fuse + { + for (int i = 5; i <= 25; i += 5) + printf("i=%d\n", i); + for (int j = 10; j < 100; j += 10) + printf("j=%d\n", j); + for (int k = 10; k > 0; --k) + printf("k=%d\n", k); + } + printf("done\n"); + return EXIT_SUCCESS; +} +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: i=5 +// CHECK-NEXT: j=10 +// CHECK-NEXT: k=10 +// CHECK-NEXT: i=10 +// CHECK-NEXT: j=20 +// CHECK-NEXT: k=9 +// CHECK-NEXT: i=15 +// CHECK-NEXT: j=30 +// CHECK-NEXT: k=8 +// CHECK-NEXT: i=20 +// CHECK-NEXT: j=40 +// CHECK-NEXT: k=7 +// CHECK-NEXT: i=25 +// CHECK-NEXT: j=50 +// CHECK-NEXT: k=6 +// CHECK-NEXT: j=60 +// CHECK-NEXT: k=5 +// CHECK-NEXT: j=70 +// CHECK-NEXT: k=4 +// CHECK-NEXT: j=80 +// CHECK-NEXT: k=3 +// CHECK-NEXT: j=90 +// CHECK-NEXT: k=2 +// CHECK-NEXT: k=1 +// CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/fuse/iterfor.cpp b/openmp/runtime/test/transform/fuse/iterfor.cpp new file mode 100644 index 0000000000000..552484b2981c4 --- /dev/null +++ b/openmp/runtime/test/transform/fuse/iterfor.cpp @@ -0,0 +1,194 @@ +// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include +#include +#include + +struct Reporter { + const char *name; + + Reporter(const char *name) : name(name) { print("ctor"); } + + Reporter() : name("") { print("ctor"); } + + Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); } + + Reporter(Reporter &&that) : name(that.name) { print("move ctor"); } + + ~Reporter() { print("dtor"); } + + const Reporter &operator=(const Reporter &that) { + print("copy assign"); + this->name = that.name; + return *this; + } + + const Reporter &operator=(Reporter &&that) { + print("move assign"); + this->name = that.name; + return *this; + } + + struct Iterator { + const Reporter *owner; + int pos; + + Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {} + + Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) { + owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + bool operator!=(const Iterator &that) const { + owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos); + return this->pos != that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + }; + + Iterator begin() 
const {
+    print("begin()");
+    return Iterator(this, 2);
+  }
+
+  Iterator end() const {
+    print("end()");
+    return Iterator(this, -1);
+  }
+
+  void print(const char *msg, ...) const {
+    va_list args;
+    va_start(args, msg);
+    printf("[%s] ", name);
+    vprintf(msg, args);
+    printf("\n");
+    va_end(args);
+  }
+};
+
+int main() {
+  printf("do\n");
+  Reporter C("C");
+  Reporter D("D");
+#pragma omp fuse
+  {
+    for (auto it = C.begin(); it != C.end(); ++it)
+      printf("v=%d\n", *it);
+
+    for (auto it = D.begin(); it != D.end(); ++it)
+      printf("vv=%d\n", *it);
+  }
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK: [C] ctor
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] begin()
+// CHECK-NEXT: [C] end()
+// CHECK-NEXT: [C] iterator distance: 3
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] begin()
+// CHECK-NEXT: [D] end()
+// CHECK-NEXT: [D] iterator distance: 3
+// CHECK-NEXT: [C] iterator advance: 0 += 0
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 0
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 0
+// CHECK-NEXT: vv=0
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 1
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 1
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 1
+// CHECK-NEXT: vv=1
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator advance: 0 += 2
+// CHECK-NEXT: [C] iterator move assign
+// CHECK-NEXT: [C] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] iterator advance: 0 += 2
+// CHECK-NEXT: [D] iterator move assign
+// CHECK-NEXT: [D] iterator deref: 2
+// CHECK-NEXT: vv=2
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: done
+// CHECK-NEXT: [D] iterator dtor
+// CHECK-NEXT: [C] iterator dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [C] dtor
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 0000000000000..dcbbdf1b6734e
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,207 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+struct Reporter {
+  const char *name;
+
+  Reporter(const char *name) : name(name) { print("ctor"); }
+
+  Reporter() : name("") { print("ctor"); }
+
+  Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+  Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+  ~Reporter() { print("dtor"); }
+
+  const Reporter &operator=(const Reporter &that) {
+    print("copy assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  const Reporter &operator=(Reporter &&that) {
+    print("move assign");
+    this->name = that.name;
+    return *this;
+  }
+
+  struct Iterator {
+    const Reporter *owner;
+    int pos;
+
+    Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+    Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+      
owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + + void print(const char *msg) const { owner->print(msg); } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); +#pragma omp parallel for collapse(2) num_threads(1) + for (int i = 0; i < 3; ++i) +#pragma omp fuse + { + for (Reporter c{"init-stmt"}; auto &&v : Reporter("range")) + printf("i=%d v=%d\n", i, v); + for (int vv = 0; vv < 3; ++vv) + printf("i=%d vv=%d\n", i, vv); + } + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: [init-stmt] ctor +// CHECK-NEXT: [range] ctor +// CHECK-NEXT: [range] end() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] iterator distance: 3 +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=0 v=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=0 vv=0 +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=0 v=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=0 vv=1 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=0 v=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=0 vv=2 +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=1 v=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=1 vv=0 +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=1 v=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=1 vv=1 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: 
[range] iterator deref: 2
+// CHECK-NEXT: i=1 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=1 vv=2
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=0
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=1
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: i=2 vv=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
new file mode 100644
index 0000000000000..9630fec50bc20
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c
@@ -0,0 +1,45 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int main() {
+  printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+  for (int i = 0; i < 3; ++i)
+#pragma omp fuse
+    {
+      for (int j = 0; j < 3; ++j)
+        printf("i=%d j=%d\n", i, j);
+      for (int k = 0; k < 3; ++k)
+        printf("i=%d k=%d\n", i, k);
+    }
+  printf("done\n");
+  return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: i=0 k=0
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: i=0 k=1
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: i=0 k=2
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: i=1 k=0
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: i=1 k=1
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: i=1 k=2
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: i=2 k=0
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: i=2 k=1
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: i=2 k=2
+// CHECK-NEXT: done

From f27442b80acf94ccab3bf915ff0730a0d794e059 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 29 Sep 2025 07:24:57 +0100
Subject: [PATCH 072/878] [AArch64] Add missing bitcast patterns for bf16<->f16 converts. (#159816)

This fills in the missing patterns for bitcasting v4f16 to/from v4bf16,
and v8f16 to/from v8bf16. Clean up some formatting whilst here.
Fixes #159772 --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 20 ++++++++-- .../CodeGen/AArch64/bf16-vector-bitcast.ll | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 04b3c90c2e177..f788c7510f80c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -9907,8 +9907,14 @@ def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 (REV64v4i16 FPR64:$src))>; } -def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4bf16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), + (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4f16 FPR64:$src))), + (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -10236,8 +10242,14 @@ def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 (REV32v8i16 FPR128:$src))>; } -def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), + (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8bf16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8f16 FPR128:$src))), + (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll index 1c216e7357215..e371748a43b29 100644 --- a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll @@ -11,6 +11,16 @@ entry: ret <4 x i16> %1 } +define <4 x half> @v4bf16_to_v4f16(float, <4 x bfloat> %a) nounwind { +; CHECK-LABEL: v4bf16_to_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %1 = bitcast <4 x bfloat> %a to <4 x half> + ret <4 x half> %1 +} + define <2 x i32> @v4bf16_to_v2i32(float, <4 x bfloat> %a) nounwind { ; CHECK-LABEL: v4bf16_to_v2i32: ; CHECK: // %bb.0: // %entry @@ -82,6 +92,16 @@ entry: ret <4 x bfloat> %1 } +define <4 x bfloat> @v4f16_to_v4bf16(float, <4 x half> %a) nounwind { +; CHECK-LABEL: v4f16_to_v4bf16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %1 = bitcast <4 x half> %a to <4 x bfloat> + ret <4 x bfloat> %1 +} + define <4 x bfloat> @v2i32_to_v4bf16(float, <2 x i32> %a) nounwind { ; CHECK-LABEL: v2i32_to_v4bf16: ; CHECK: // %bb.0: // %entry @@ -152,6 +172,16 @@ entry: ret <8 x i16> %1 } +define <8 x half> @v8bf16_to_v8f16(float, <8 x bfloat> %a) nounwind { +; CHECK-LABEL: v8bf16_to_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %1 = bitcast <8 x bfloat> %a to <8 x half> + ret <8 x half> %1 +} + define <4 x i32> @v8bf16_to_v4i32(float, <8 x bfloat> %a) nounwind { ; CHECK-LABEL: v8bf16_to_v4i32: ; CHECK: // %bb.0: // %entry @@ -202,6 +232,16 @@ entry: ret <8 x bfloat> %1 } +define <8 x bfloat> 
@v8f16_to_v8bf16(float, <8 x half> %a) nounwind {
+; CHECK-LABEL: v8f16_to_v8bf16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+entry:
+  %1 = bitcast <8 x half> %a to <8 x bfloat>
+  ret <8 x bfloat> %1
+}
+
 define <8 x bfloat> @v4i32_to_v8bf16(float, <4 x i32> %a) nounwind {
 ; CHECK-LABEL: v4i32_to_v8bf16:
 ; CHECK: // %bb.0: // %entry

From b0a755b2bfac0a82383dcc31eca82d9e132f1afc Mon Sep 17 00:00:00 2001
From: paperchalice
Date: Mon, 29 Sep 2025 14:33:56 +0800
Subject: [PATCH 073/878] [TargetLowering] Remove NoSignedZerosFPMath uses (#160975)

Remove NoSignedZerosFPMath uses in the TargetLowering code; users
should always use instruction-level fast-math flags instead.
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp | 12 +-
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 341 +++++++++++++-----
 llvm/test/CodeGen/AMDGPU/v_mac.ll | 9 +-
 llvm/test/CodeGen/AMDGPU/v_mac_f16.ll | 14 +-
 llvm/test/CodeGen/PowerPC/scalar_cmp.ll | 226 ++++++++----
 llvm/test/CodeGen/X86/negative-sin.ll | 15 +-
 6 files changed, 432 insertions(+), 185 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dba5a8c0a7315..cc503d324e74b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7492,7 +7492,6 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
   // Pre-increment recursion depth for use in recursive calls.
   ++Depth;
   const SDNodeFlags Flags = Op->getFlags();
-  const TargetOptions &Options = DAG.getTarget().Options;
   EVT VT = Op.getValueType();
   unsigned Opcode = Op.getOpcode();
 
@@ -7572,7 +7571,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
     return DAG.getBuildVector(VT, DL, Ops);
   }
   case ISD::FADD: {
-    if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+    if (!Flags.hasNoSignedZeros())
       break;
 
     // After operation legalization, it might not be legal to create new FSUBs.
@@ -7617,7 +7616,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
   }
   case ISD::FSUB: {
     // We can't turn -(A-B) into B-A when we honor signed zeros.
-    if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+    if (!Flags.hasNoSignedZeros())
       break;
 
     SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
@@ -7678,7 +7677,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
   }
   case ISD::FMA:
   case ISD::FMAD: {
-    if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+    if (!Flags.hasNoSignedZeros())
       break;
 
     SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2);
@@ -8797,7 +8796,6 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
   EVT VT = Node->getValueType(0);
   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   bool IsMax = Opc == ISD::FMAXIMUMNUM;
-  const TargetOptions &Options = DAG.getTarget().Options;
   SDNodeFlags Flags = Node->getFlags();
 
   unsigned NewOp =
@@ -8858,8 +8856,8 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node,
   // TODO: We need quiet sNaN if strictfp.
 
   // Fixup signed zero behavior.
- if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() || - DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)) { + if (Flags.hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(LHS) || + DAG.isKnownNeverZeroFloat(RHS)) { return MinMax; } SDValue TestZero = diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index 92d3277d5d3e3..bb22144b815a1 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -4148,28 +4148,28 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; -------------------------------------------------------------------------------- define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { -; CI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_add_f32_e32 v3, 4.0, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v2, 4.0, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_add_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; VI-SAFE: ; %bb.0: @@ -4229,21 +4229,6 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: 
s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4302,6 +4287,105 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { +; CI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v2, -4.0, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: 
v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq <2 x i32> %c, zeroinitializer
+  %add = fadd nsz <2 x half> %x, <half 4.0, half 4.0>
+  %fneg = fneg <2 x half> %add
+  %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0>
+  ret <2 x half> %select
+}
+
 define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; CI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
 ; CI-SAFE: ; %bb.0:
@@ -4704,34 +4788,34 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 }
 
 define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %z) {
-; CI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
-; CI-SAFE: ; %bb.0:
-; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_mul_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v3, v3, v5 +; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v2, v2, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-SAFE: ; %bb.0: @@ -4793,27 +4877,6 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NSZ-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, v2, v4 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, v3, v5 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4873,6 +4936,112 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %z) { +; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_sub_f32_e32 v2, v2, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_sub_f32_e32 v3, v3, v5 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_fma_f16 v1, v4, -4.0, -v1 +; VI-NEXT: v_fma_f16 v2, v2, 
-4.0, -v3 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq <2 x i32> %c, zeroinitializer
+  %fmad = call nsz <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z)
+  %fneg = fneg <2 x half> %fmad
+  %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> <half 2.0, half 2.0>
+  ret <2 x half> %select
+}
+
 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #0
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll
index c12871536bafa..f5dc824aae35f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll
@@ -116,7 +116,7 @@ entry:
 ; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
   %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1
   %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2
@@ -125,7 +125,7 @@ entry:
   %b = load float, ptr addrspace(1) %b_ptr
   %c = load float, ptr addrspace(1) %c_ptr
 
-  %neg_a = fsub float 0.0, %a
+  %neg_a = fsub nsz float 0.0, %a
   %tmp0 = fmul float %neg_a, %b
   %tmp1 = fadd float %tmp0, %c
 
@@ -176,7 +176,7 @@ entry:
 ; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 entry:
   %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1
   %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2
@@ -185,7 +185,7 @@ entry:
   %b = load float, ptr addrspace(1) %b_ptr
   %c = load float, ptr addrspace(1) %c_ptr
 
-  %neg_b = fsub float 0.0, %b
+  %neg_b = fsub nsz float 0.0, %b
   %tmp0 = fmul float %a, %neg_b
   %tmp1 = fadd float %tmp0, %c
 
@@ -310,6 +310,5 @@ define float @v_mac_f32_dynamic_ftz(float %a, float %b, float %c) "denormal-fp-m
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
-attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
index bcc60b06db291..8da6f2348690a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -236,7 +236,7 @@ entry:
   %b.val = load half, ptr addrspace(1) %b
   %c.val = load half, ptr addrspace(1) %c
 
-  %a.neg = fsub half 0.0, %a.val
+  %a.neg = fsub nsz half 0.0, %a.val
   %t.val = fmul half %a.neg, %b.val
   %r.val = fadd half %t.val, %c.val
 
@@ -263,7 +263,7 @@ entry:
   %b.val = load half, ptr addrspace(1) %b
   %c.val = load half, ptr addrspace(1) %c
 
-  %b.neg = fsub half 0.0, %b.val
+  %b.neg = fsub nsz half 0.0, %b.val
   %t.val = fmul half %a.val, %b.neg
   %r.val = fadd half 
%t.val, %c.val
 
@@ -290,7 +290,7 @@ entry:
   %b.val = load half, ptr addrspace(1) %b
   %c.val = load half, ptr addrspace(1) %c
 
-  %c.neg = fsub half 0.0, %c.val
+  %c.neg = fsub nsz half 0.0, %c.val
   %t.val = fmul half %a.val, %b.val
   %r.val = fadd half %t.val, %c.neg
 
@@ -601,7 +601,7 @@ entry:
   %b.val = load <2 x half>, ptr addrspace(1) %b
   %c.val = load <2 x half>, ptr addrspace(1) %c
 
-  %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val
+  %a.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %a.val
   %t.val = fmul <2 x half> %a.neg, %b.val
   %r.val = fadd <2 x half> %t.val, %c.val
 
@@ -634,7 +634,7 @@ entry:
   %b.val = load <2 x half>, ptr addrspace(1) %b
   %c.val = load <2 x half>, ptr addrspace(1) %c
 
-  %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val
+  %b.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %b.val
   %t.val = fmul <2 x half> %a.val, %b.neg
   %r.val = fadd <2 x half> %t.val, %c.val
 
@@ -667,7 +667,7 @@ entry:
   %b.val = load <2 x half>, ptr addrspace(1) %b
   %c.val = load <2 x half>, ptr addrspace(1) %c
 
-  %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val
+  %c.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %c.val
   %t.val = fmul <2 x half> %a.val, %b.val
   %r.val = fadd <2 x half> %t.val, %c.neg
 
@@ -678,5 +678,5 @@ entry:
 declare void @llvm.amdgcn.s.barrier() #2
 
 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" }
-attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
 attributes #2 = { nounwind convergent }
diff --git a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
index aaabd76e163bb..fd0b494d57677 100644
--- a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
+++ b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll
@@ -20,18 +20,18 @@
 define float @select_oeq_float(float %a, float %b, float %c, float %d) {
 ; FAST-P8-LABEL: select_oeq_float:
 ; FAST-P8: # %bb.0: # %entry
-; FAST-P8-NEXT: xssubsp f0, f2, f1
-; FAST-P8-NEXT: xssubsp f1, f1, f2
-; FAST-P8-NEXT: fsel f1, f1, f3, f4
-; FAST-P8-NEXT: fsel f1, f0, f1, f4
+; FAST-P8-NEXT: xssubsp f0, f1, f2
+; FAST-P8-NEXT: xsnegdp f1, f0
+; FAST-P8-NEXT: fsel f0, f0, f3, f4
+; FAST-P8-NEXT: fsel f1, f1, f0, f4
 ; FAST-P8-NEXT: blr
 ;
 ; FAST-P9-LABEL: select_oeq_float:
 ; FAST-P9: # %bb.0: # %entry
-; FAST-P9-NEXT: xssubsp f0, f2, f1
-; FAST-P9-NEXT: xssubsp f1, f1, f2
-; FAST-P9-NEXT: fsel f1, f1, f3, f4
-; FAST-P9-NEXT: fsel f1, f0, f1, f4
+; FAST-P9-NEXT: xssubsp f0, f1, f2
+; FAST-P9-NEXT: xsnegdp f1, f0
+; FAST-P9-NEXT: fsel f0, f0, f3, f4
+; FAST-P9-NEXT: fsel f1, f1, f0, f4
 ; FAST-P9-NEXT: blr
 ;
 ; NO-FAST-P8-LABEL: select_oeq_float:
@@ -59,6 +59,48 @@ entry:
   ret float %cond
 }
 
+define float @select_oeq_float_nsz(float %a, float %b, float %c, float %d) {
+; FAST-P8-LABEL: select_oeq_float_nsz:
+; FAST-P8: # %bb.0: # %entry
+; FAST-P8-NEXT: xssubsp f0, f2, f1
+; FAST-P8-NEXT: xssubsp f1, f1, f2
+; FAST-P8-NEXT: fsel f1, f1, f3, f4
+; FAST-P8-NEXT: fsel f1, f0, f1, f4
+; FAST-P8-NEXT: blr
+;
+; FAST-P9-LABEL: select_oeq_float_nsz:
+; FAST-P9: # %bb.0: # %entry
+; FAST-P9-NEXT: xssubsp f0, f2, f1
+; FAST-P9-NEXT: xssubsp f1, f1, f2
+; FAST-P9-NEXT: fsel f1, f1, f3, f4
+; FAST-P9-NEXT: fsel f1, f0, f1, f4
+; FAST-P9-NEXT: blr
+;
+; NO-FAST-P8-LABEL: select_oeq_float_nsz:
+; NO-FAST-P8: # %bb.0: # %entry
+; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2
+; NO-FAST-P8-NEXT: beq cr0, .LBB1_2
+; NO-FAST-P8-NEXT: # %bb.1: # %entry
+; NO-FAST-P8-NEXT: fmr f3, f4
+; NO-FAST-P8-NEXT: .LBB1_2: # %entry
+; NO-FAST-P8-NEXT: fmr f1, f3
+; NO-FAST-P8-NEXT: blr
+;
+; NO-FAST-P9-LABEL: 
select_oeq_float_nsz: +; NO-FAST-P9: # %bb.0: # %entry +; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P9-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P9-NEXT: # %bb.1: # %entry +; NO-FAST-P9-NEXT: fmr f3, f4 +; NO-FAST-P9-NEXT: .LBB1_2: # %entry +; NO-FAST-P9-NEXT: fmr f1, f3 +; NO-FAST-P9-NEXT: blr +entry: + %cmp = fcmp nsz oeq float %a, %b + %cond = select i1 %cmp, float %c, float %d + ret float %cond +} + define double @select_oeq_double(double %a, double %b, double %c, double %d) { ; FAST-P8-LABEL: select_oeq_double: ; FAST-P8: # %bb.0: # %entry @@ -79,20 +121,20 @@ define double @select_oeq_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_oeq_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P8-NEXT: beq cr0, .LBB2_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB1_2: # %entry +; NO-FAST-P8-NEXT: .LBB2_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_oeq_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P9-NEXT: beq cr0, .LBB2_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB1_2: # %entry +; NO-FAST-P9-NEXT: .LBB2_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -182,13 +224,57 @@ entry: define float @select_one_float(float %a, float %b, float %c, float %d) { ; FAST-P8-LABEL: select_one_float: ; FAST-P8: # %bb.0: # %entry +; FAST-P8-NEXT: xssubsp f0, f1, f2 +; FAST-P8-NEXT: xsnegdp f1, f0 +; FAST-P8-NEXT: fsel f0, f0, f4, f3 +; FAST-P8-NEXT: fsel f1, f1, f0, f3 +; FAST-P8-NEXT: blr +; +; FAST-P9-LABEL: select_one_float: +; FAST-P9: # %bb.0: # %entry +; FAST-P9-NEXT: xssubsp f0, f1, f2 +; FAST-P9-NEXT: xsnegdp f1, f0 +; FAST-P9-NEXT: fsel f0, f0, f4, f3 +; FAST-P9-NEXT: fsel f1, f1, f0, f3 +; FAST-P9-NEXT: blr +; +; NO-FAST-P8-LABEL: select_one_float: +; NO-FAST-P8: # %bb.0: # %entry +; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P8-NEXT: # %bb.1: # %entry +; NO-FAST-P8-NEXT: fmr f3, f4 +; NO-FAST-P8-NEXT: .LBB5_2: # %entry +; NO-FAST-P8-NEXT: fmr f1, f3 +; NO-FAST-P8-NEXT: blr +; +; NO-FAST-P9-LABEL: select_one_float: +; NO-FAST-P9: # %bb.0: # %entry +; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P9-NEXT: # %bb.1: # %entry +; NO-FAST-P9-NEXT: fmr f3, f4 +; NO-FAST-P9-NEXT: .LBB5_2: # %entry +; NO-FAST-P9-NEXT: fmr f1, f3 +; NO-FAST-P9-NEXT: blr +entry: + %cmp = fcmp one float %a, %b + %cond = select i1 %cmp, float %c, float %d + ret float %cond +} + +define float @select_one_float_nsz(float %a, float %b, float %c, float %d) { +; FAST-P8-LABEL: select_one_float_nsz: +; FAST-P8: # %bb.0: # %entry ; FAST-P8-NEXT: xssubsp f0, f2, f1 ; FAST-P8-NEXT: xssubsp f1, f1, f2 ; FAST-P8-NEXT: fsel f1, f1, f4, f3 ; FAST-P8-NEXT: fsel f1, f0, f1, f3 ; FAST-P8-NEXT: blr ; -; FAST-P9-LABEL: select_one_float: +; FAST-P9-LABEL: select_one_float_nsz: ; FAST-P9: # %bb.0: # %entry ; FAST-P9-NEXT: xssubsp f0, f2, f1 ; FAST-P9-NEXT: xssubsp f1, f1, f2 @@ -196,29 +282,29 @@ define float @select_one_float(float %a, float %b, float %c, float %d) { ; FAST-P9-NEXT: fsel f1, f0, f1, f3 ; FAST-P9-NEXT: blr ; -; NO-FAST-P8-LABEL: select_one_float: +; NO-FAST-P8-LABEL: select_one_float_nsz: ; NO-FAST-P8: # 
%bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB4_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB6_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB4_2: # %entry +; NO-FAST-P8-NEXT: .LBB6_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; -; NO-FAST-P9-LABEL: select_one_float: +; NO-FAST-P9-LABEL: select_one_float_nsz: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB4_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB6_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB4_2: # %entry +; NO-FAST-P9-NEXT: .LBB6_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: - %cmp = fcmp one float %a, %b + %cmp = fcmp nsz one float %a, %b %cond = select i1 %cmp, float %c, float %d ret float %cond } @@ -244,10 +330,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB7_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB5_2: # %entry +; NO-FAST-P8-NEXT: .LBB7_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -255,10 +341,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB7_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB5_2: # %entry +; NO-FAST-P9-NEXT: .LBB7_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -362,10 +448,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB8_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB10_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB8_2: # %entry +; NO-FAST-P8-NEXT: .LBB10_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -373,10 +459,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB8_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB10_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB8_2: # %entry +; NO-FAST-P9-NEXT: .LBB10_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -402,10 +488,10 @@ define double @select_oge_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB9_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB11_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB9_2: # %entry +; NO-FAST-P8-NEXT: .LBB11_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -413,10 +499,10 @@ define double 
@select_oge_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB9_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB11_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB9_2: # %entry +; NO-FAST-P9-NEXT: .LBB11_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -503,20 +589,20 @@ define float @select_olt_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8-LABEL: select_olt_float: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: blt cr0, .LBB12_2 +; NO-FAST-P8-NEXT: blt cr0, .LBB14_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB12_2: # %entry +; NO-FAST-P8-NEXT: .LBB14_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_olt_float: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P9-NEXT: blt cr0, .LBB12_2 +; NO-FAST-P9-NEXT: blt cr0, .LBB14_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB12_2: # %entry +; NO-FAST-P9-NEXT: .LBB14_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -541,20 +627,20 @@ define double @select_olt_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_olt_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: blt cr0, .LBB13_2 +; NO-FAST-P8-NEXT: blt cr0, .LBB15_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB13_2: # %entry +; NO-FAST-P8-NEXT: .LBB15_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_olt_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: blt cr0, .LBB13_2 +; NO-FAST-P9-NEXT: blt cr0, .LBB15_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB13_2: # %entry +; NO-FAST-P9-NEXT: .LBB15_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -641,20 +727,20 @@ define float @select_ogt_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8-LABEL: select_ogt_float: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: bgt cr0, .LBB16_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB18_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB16_2: # %entry +; NO-FAST-P8-NEXT: .LBB18_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_ogt_float: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P9-NEXT: bgt cr0, .LBB16_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB18_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB16_2: # %entry +; NO-FAST-P9-NEXT: .LBB18_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -679,20 +765,20 @@ define double @select_ogt_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_ogt_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: bgt cr0, .LBB17_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB19_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB17_2: # %entry +; NO-FAST-P8-NEXT: .LBB19_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; 
NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_ogt_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: bgt cr0, .LBB17_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB19_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB17_2: # %entry +; NO-FAST-P9-NEXT: .LBB19_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -780,10 +866,10 @@ define float @select_ole_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB20_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB22_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB20_2: # %entry +; NO-FAST-P8-NEXT: .LBB22_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -791,10 +877,10 @@ define float @select_ole_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB20_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB22_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB20_2: # %entry +; NO-FAST-P9-NEXT: .LBB22_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -820,10 +906,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB21_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB23_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB21_2: # %entry +; NO-FAST-P8-NEXT: .LBB23_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -831,10 +917,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB21_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB23_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB21_2: # %entry +; NO-FAST-P9-NEXT: .LBB23_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -926,13 +1012,13 @@ define double @onecmp1(double %a, double %y, double %z) { ; NO-FAST-P8-NEXT: vspltisw v2, 1 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f0 -; NO-FAST-P8-NEXT: bc 12, lt, .LBB24_3 +; NO-FAST-P8-NEXT: bc 12, lt, .LBB26_3 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f1 -; NO-FAST-P8-NEXT: bc 12, un, .LBB24_3 +; NO-FAST-P8-NEXT: bc 12, un, .LBB26_3 ; NO-FAST-P8-NEXT: # %bb.2: # %entry ; NO-FAST-P8-NEXT: fmr f3, f2 -; NO-FAST-P8-NEXT: .LBB24_3: # %entry +; NO-FAST-P8-NEXT: .LBB26_3: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -941,13 +1027,13 @@ define double @onecmp1(double %a, double %y, double %z) { ; NO-FAST-P9-NEXT: vspltisw v2, 1 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f0 -; NO-FAST-P9-NEXT: bc 12, lt, .LBB24_3 +; NO-FAST-P9-NEXT: bc 12, lt, .LBB26_3 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f1 -; NO-FAST-P9-NEXT: bc 12, un, .LBB24_3 +; NO-FAST-P9-NEXT: bc 12, un, .LBB26_3 ; NO-FAST-P9-NEXT: # %bb.2: # %entry ; NO-FAST-P9-NEXT: fmr f3, f2 -; 
NO-FAST-P9-NEXT: .LBB24_3: # %entry
+; NO-FAST-P9-NEXT: .LBB26_3: # %entry
 ; NO-FAST-P9-NEXT: fmr f1, f3
 ; NO-FAST-P9-NEXT: blr
 entry:
@@ -978,10 +1064,10 @@ define double @onecmp2(double %a, double %y, double %z) {
 ; NO-FAST-P8-NEXT: vspltisw v2, 1
 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34
 ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P8-NEXT: bgt cr0, .LBB25_2
+; NO-FAST-P8-NEXT: bgt cr0, .LBB27_2
 ; NO-FAST-P8-NEXT: # %bb.1: # %entry
 ; NO-FAST-P8-NEXT: fmr f2, f3
-; NO-FAST-P8-NEXT: .LBB25_2: # %entry
+; NO-FAST-P8-NEXT: .LBB27_2: # %entry
 ; NO-FAST-P8-NEXT: fmr f1, f2
 ; NO-FAST-P8-NEXT: blr
 ;
@@ -990,10 +1076,10 @@ define double @onecmp2(double %a, double %y, double %z) {
 ; NO-FAST-P9-NEXT: vspltisw v2, 1
 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34
 ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P9-NEXT: bgt cr0, .LBB25_2
+; NO-FAST-P9-NEXT: bgt cr0, .LBB27_2
 ; NO-FAST-P9-NEXT: # %bb.1: # %entry
 ; NO-FAST-P9-NEXT: fmr f2, f3
-; NO-FAST-P9-NEXT: .LBB25_2: # %entry
+; NO-FAST-P9-NEXT: .LBB27_2: # %entry
 ; NO-FAST-P9-NEXT: fmr f1, f2
 ; NO-FAST-P9-NEXT: blr
 entry:
@@ -1028,10 +1114,10 @@ define double @onecmp3(double %a, double %y, double %z) {
 ; NO-FAST-P8-NEXT: vspltisw v2, 1
 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34
 ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P8-NEXT: beq cr0, .LBB26_2
+; NO-FAST-P8-NEXT: beq cr0, .LBB28_2
 ; NO-FAST-P8-NEXT: # %bb.1: # %entry
 ; NO-FAST-P8-NEXT: fmr f2, f3
-; NO-FAST-P8-NEXT: .LBB26_2: # %entry
+; NO-FAST-P8-NEXT: .LBB28_2: # %entry
 ; NO-FAST-P8-NEXT: fmr f1, f2
 ; NO-FAST-P8-NEXT: blr
 ;
@@ -1040,10 +1126,10 @@ define double @onecmp3(double %a, double %y, double %z) {
 ; NO-FAST-P9-NEXT: vspltisw v2, 1
 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34
 ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0
-; NO-FAST-P9-NEXT: beq cr0, .LBB26_2
+; NO-FAST-P9-NEXT: beq cr0, .LBB28_2
 ; NO-FAST-P9-NEXT: # %bb.1: # %entry
 ; NO-FAST-P9-NEXT: fmr f2, f3
-; NO-FAST-P9-NEXT: .LBB26_2: # %entry
+; NO-FAST-P9-NEXT: .LBB28_2: # %entry
 ; NO-FAST-P9-NEXT: fmr f1, f2
 ; NO-FAST-P9-NEXT: blr
 entry:
diff --git a/llvm/test/CodeGen/X86/negative-sin.ll b/llvm/test/CodeGen/X86/negative-sin.ll
index f24507d3a4f38..4836da2ad7797 100644
--- a/llvm/test/CodeGen/X86/negative-sin.ll
+++ b/llvm/test/CodeGen/X86/negative-sin.ll
@@ -82,18 +82,13 @@ define double @semi_strict2(double %e) nounwind {
   ret double %h
 }
 
-; FIXME:
-; Auto-upgrade function attribute to IR-level fast-math-flags.
-
-define double @fn_attr(double %e) nounwind #0 {
-; CHECK-LABEL: fn_attr:
+define double @nsz_flag(double %e) nounwind {
+; CHECK-LABEL: nsz_flag:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: jmp sin@PLT # TAILCALL
-  %f = fsub double 0.0, %e
-  %g = call double @sin(double %f) readonly
-  %h = fsub double 0.0, %g
+  %f = fsub nsz double 0.0, %e
+  %g = call nsz double @sin(double %f) readonly
+  %h = fsub nsz double 0.0, %g
   ret double %h
 }
-
-attributes #0 = { "unsafe-fp-math"="true" "no-signed-zeros-fp-math"="true" }
-

From 85d48b8fec371fe0979440abaf2d13718fba2ac3 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov
Date: Mon, 29 Sep 2025 03:46:06 -0300
Subject: [PATCH 074/878] [clang] treat deduced-as-dependent as undeduced for constant template parameters (#161099)

The AutoType's deduced-as-dependent mechanism is not really used for
constant template parameters, but this is currently harmless to ignore
because when dealing with such types, they will have been transformed,
turning them back to plain undeduced AutoTypes.

This should be NFC for current main users, but
https://github.com/llvm/llvm-project/pull/141776 will depend on this.
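
As a minimal illustration (a hypothetical example, not code taken from
this change), the case in question is a constant template parameter
whose type contains a not-yet-deduced AutoType:

  template <auto N>   // 'auto' here is an undeduced AutoType
  struct Constant {};

  Constant<42> c;     // checking the argument '42' deduces 'auto' as 'int'

Testing DeducedT->getDeducedType().isNull() instead of
!DeducedT->isDeduced() treats a type marked deduced-as-dependent the
same as a plain undeduced one at this point.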
---
 clang/lib/Sema/SemaTemplate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 3ebbb30ae483e..2bf1511c5cfa0 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -7102,7 +7102,7 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType,
 
   // If the parameter type somehow involves auto, deduce the type now.
   DeducedType *DeducedT = ParamType->getContainedDeducedType();
-  bool IsDeduced = DeducedT && !DeducedT->isDeduced();
+  bool IsDeduced = DeducedT && DeducedT->getDeducedType().isNull();
   if (IsDeduced) {
     // When checking a deduced template argument, deduce from its type even if
     // the type is dependent, in order to check the types of non-type template

From ab5bba580c581be586a23888ab7bbd39fc9ea4b2 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov
Date: Mon, 29 Sep 2025 03:46:36 -0300
Subject: [PATCH 075/878] [clang] fix code synthesis context depth limit logic
 (#161103)

This checks the instantiation depth limit whenever a code synthesis
context is pushed, not only when creating an InstantiatingTemplate RAII
object.

Also fix the note suggesting the user increase `-ftemplate-depth` so
that it is printed even in a SFINAE context.
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  6 +-
 clang/include/clang/Sema/Sema.h               |  4 +-
 clang/lib/Sema/SemaTemplateInstantiate.cpp    | 88 +++++++++----------
 .../invalid-requirement-requires-expr.cpp     |  3 +-
 .../instantiation-depth-subst-2.cpp           |  1 +
 .../instantiation-depth-subst.cpp             |  3 +-
 6 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b8d031ed28d06..b157cbb0b8069 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -5770,8 +5770,10 @@ def err_template_recursion_depth_exceeded : Error<
 def err_constraint_depends_on_self
     : Error<"satisfaction of constraint %0 depends on itself">, NoSFINAE;
 
-def note_template_recursion_depth : Note<
-  "use -ftemplate-depth=N to increase recursive template instantiation depth">;
+def note_template_recursion_depth
+    : Note<"use -ftemplate-depth=N to increase recursive template "
+           "instantiation depth">,
+      NoSFINAE;
 
 def err_template_instantiate_within_definition : Error<
   "%select{implicit|explicit}0 instantiation of template %1 within its"
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 2bd6be2a32cd5..f53aafdeb4f36 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13335,8 +13335,6 @@ class Sema final : public SemaBase {
     Sema &SemaRef;
     bool Invalid;
     bool AlreadyInstantiating;
-    bool CheckInstantiationDepth(SourceLocation PointOfInstantiation,
-                                 SourceRange InstantiationRange);
 
     InstantiatingTemplate(Sema &SemaRef,
                           CodeSynthesisContext::SynthesisKind Kind,
@@ -13529,7 +13527,7 @@ class Sema final : public SemaBase {
     ~ArgPackSubstIndexRAII() { Self.ArgPackSubstIndex = OldSubstIndex; }
   };
 
-  void pushCodeSynthesisContext(CodeSynthesisContext Ctx);
+  bool pushCodeSynthesisContext(CodeSynthesisContext Ctx);
   void popCodeSynthesisContext();
 
   void PrintContextStack(InstantiationContextDiagFuncRef DiagFunc) {
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 1ff94d7ae397f..f1c9c5c868159 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ 
-616,29 +616,30 @@ Sema::InstantiatingTemplate::InstantiatingTemplate( Invalid = true; return; } - Invalid = CheckInstantiationDepth(PointOfInstantiation, InstantiationRange); + + CodeSynthesisContext Inst; + Inst.Kind = Kind; + Inst.PointOfInstantiation = PointOfInstantiation; + Inst.Entity = Entity; + Inst.Template = Template; + Inst.TemplateArgs = TemplateArgs.data(); + Inst.NumTemplateArgs = TemplateArgs.size(); + Inst.DeductionInfo = DeductionInfo; + Inst.InstantiationRange = InstantiationRange; + Inst.InConstraintSubstitution = + Inst.Kind == CodeSynthesisContext::ConstraintSubstitution; + if (!SemaRef.CodeSynthesisContexts.empty()) + Inst.InConstraintSubstitution |= + SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution; + + Invalid = SemaRef.pushCodeSynthesisContext(Inst); if (!Invalid) { - CodeSynthesisContext Inst; - Inst.Kind = Kind; - Inst.PointOfInstantiation = PointOfInstantiation; - Inst.Entity = Entity; - Inst.Template = Template; - Inst.TemplateArgs = TemplateArgs.data(); - Inst.NumTemplateArgs = TemplateArgs.size(); - Inst.DeductionInfo = DeductionInfo; - Inst.InstantiationRange = InstantiationRange; - Inst.InConstraintSubstitution = - Inst.Kind == CodeSynthesisContext::ConstraintSubstitution; - if (!SemaRef.CodeSynthesisContexts.empty()) - Inst.InConstraintSubstitution |= - SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution; - - SemaRef.pushCodeSynthesisContext(Inst); - - AlreadyInstantiating = !Inst.Entity ? false : - !SemaRef.InstantiatingSpecializations - .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind}) - .second; + AlreadyInstantiating = + !Inst.Entity + ? false + : !SemaRef.InstantiatingSpecializations + .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind}) + .second; atTemplateBegin(SemaRef.TemplateInstCallbacks, SemaRef, Inst); } } @@ -834,18 +835,34 @@ Sema::InstantiatingTemplate::InstantiatingTemplate( : InstantiatingTemplate(SemaRef, CodeSynthesisContext::PartialOrderingTTP, ArgLoc, InstantiationRange, PArg) {} -void Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) { +bool Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) { Ctx.SavedInNonInstantiationSFINAEContext = InNonInstantiationSFINAEContext; InNonInstantiationSFINAEContext = false; - CodeSynthesisContexts.push_back(Ctx); - - if (!Ctx.isInstantiationRecord()) + if (!Ctx.isInstantiationRecord()) { ++NonInstantiationEntries; + } else { + assert(SemaRef.NonInstantiationEntries <= + SemaRef.CodeSynthesisContexts.size()); + if ((SemaRef.CodeSynthesisContexts.size() - + SemaRef.NonInstantiationEntries) > + SemaRef.getLangOpts().InstantiationDepth) { + SemaRef.Diag(Ctx.PointOfInstantiation, + diag::err_template_recursion_depth_exceeded) + << SemaRef.getLangOpts().InstantiationDepth << Ctx.InstantiationRange; + SemaRef.Diag(Ctx.PointOfInstantiation, + diag::note_template_recursion_depth) + << SemaRef.getLangOpts().InstantiationDepth; + return true; + } + } + + CodeSynthesisContexts.push_back(Ctx); // Check to see if we're low on stack space. We can't do anything about this // from here, but we can at least warn the user. 
StackHandler.warnOnStackNearlyExhausted(Ctx.PointOfInstantiation); + return false; } void Sema::popCodeSynthesisContext() { @@ -907,25 +924,6 @@ static std::string convertCallArgsToString(Sema &S, return Result; } -bool Sema::InstantiatingTemplate::CheckInstantiationDepth( - SourceLocation PointOfInstantiation, - SourceRange InstantiationRange) { - assert(SemaRef.NonInstantiationEntries <= - SemaRef.CodeSynthesisContexts.size()); - if ((SemaRef.CodeSynthesisContexts.size() - - SemaRef.NonInstantiationEntries) - <= SemaRef.getLangOpts().InstantiationDepth) - return false; - - SemaRef.Diag(PointOfInstantiation, - diag::err_template_recursion_depth_exceeded) - << SemaRef.getLangOpts().InstantiationDepth - << InstantiationRange; - SemaRef.Diag(PointOfInstantiation, diag::note_template_recursion_depth) - << SemaRef.getLangOpts().InstantiationDepth; - return true; -} - void Sema::PrintInstantiationStack(InstantiationContextDiagFuncRef DiagFunc) { // Determine which template instantiations to skip, if any. unsigned SkipStart = CodeSynthesisContexts.size(), SkipEnd = SkipStart; diff --git a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp index 097ada3caa135..436dfb9aac0a7 100644 --- a/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp +++ b/clang/test/SemaCXX/invalid-requirement-requires-expr.cpp @@ -17,8 +17,7 @@ constexpr bool A::far() { b.data_member; requires A::far(); // #Invalid // expected-error@#Invalid {{recursive template instantiation exceeded maximum depth}} - // expected-note@#Invalid {{in instantiation}} - // expected-note@#Invalid 2 {{while}} + // expected-note@#Invalid 3 {{while}} // expected-note@#Invalid {{contexts in backtrace}} // expected-note@#Invalid {{increase recursive template instantiation depth}} }; diff --git a/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp b/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp index 2b519e974a907..66fd1af0d1429 100644 --- a/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp +++ b/clang/test/SemaTemplate/instantiation-depth-subst-2.cpp @@ -2,5 +2,6 @@ template struct S { }; template S operator+(T, T); // expected-error {{instantiation exceeded maximum depth}} expected-note 2{{while substituting}} +// expected-note@-1 {{use -ftemplate-depth=N to increase recursive template instantiation depth}} S<0> s; int k = s + s; // expected-note {{while substituting}} diff --git a/clang/test/SemaTemplate/instantiation-depth-subst.cpp b/clang/test/SemaTemplate/instantiation-depth-subst.cpp index 062a8ed08bb64..17944bc3aaa40 100644 --- a/clang/test/SemaTemplate/instantiation-depth-subst.cpp +++ b/clang/test/SemaTemplate/instantiation-depth-subst.cpp @@ -3,7 +3,8 @@ // PR9793 template auto f(T t) -> decltype(f(t)); // \ // expected-error {{recursive template instantiation exceeded maximum depth of 2}} \ -// expected-note 2 {{while substituting}} +// expected-note 2 {{while substituting}} \ +// expected-note {{use -ftemplate-depth=N to increase recursive template instantiation depth}} struct S {}; int k = f(S{}); // expected-note {{while substituting}} From b3fe1b6db39e42e73131b483f4401f4bf3b05444 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Mon, 29 Sep 2025 15:01:05 +0800 Subject: [PATCH 076/878] [C++20] [Modules] Set the feature testing macro to 1 (#161034) See https://github.com/llvm/llvm-project/issues/71364 for details. 
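As a sketch of the user-visible effect (hypothetical usage, not part of the
patch), feature-test checks can now tell partial support apart from complete
support:

  #if defined(__cpp_modules) && __cpp_modules >= 201907L
    // complete C++20 modules support
  #elif defined(__cpp_modules)
    // partial support; Clang now defines __cpp_modules as 1
  #endif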
---
 clang/lib/Frontend/InitPreprocessor.cpp | 5 ++++-
 clang/test/Lexer/cxx-features.cpp       | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index edf0a091e087c..877ab02850667 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -742,7 +742,10 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts,
     Builder.defineMacro("__cpp_impl_coroutine", "201902L");
     Builder.defineMacro("__cpp_designated_initializers", "201707L");
     Builder.defineMacro("__cpp_impl_three_way_comparison", "201907L");
-    //Builder.defineMacro("__cpp_modules", "201907L");
+    // Intentionally to set __cpp_modules to 1.
+    // See https://github.com/llvm/llvm-project/issues/71364 for details.
+    // Builder.defineMacro("__cpp_modules", "201907L");
+    Builder.defineMacro("__cpp_modules", "1");
     Builder.defineMacro("__cpp_using_enum", "201907L");
   }
   // C++23 features.
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index ced5bcaf0db16..8eb9ea032879c 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -148,7 +148,7 @@
 
 // init_captures checked below
 
-#if check(modules, 0, 0, 0, 0, 0, 0, 0)
+#if check(modules, 0, 0, 0, 0, 1, 1, 1) // FIXME: 201907 in C++20
 #error "wrong value for __cpp_modules"
 #endif
 

From 52b59b5bc07ae3a05b7643119dc6b34099108bda Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 29 Sep 2025 09:08:15 +0200
Subject: [PATCH 077/878] [DropUnnecessaryAssumes] Make the ephemeral value
 check more precise (#160700)

The initial implementation used a very crude check where a value was
considered ephemeral if it had only one use. This is insufficient if
there are multiple assumes acting on the same value, or in more complex
cases like cyclic phis.

Generalize this to a more typical ephemeral value check, i.e. make sure
that all transitive users are in assumes, while stopping at
side-effecting instructions.
---
 .../Scalar/DropUnnecessaryAssumes.cpp         |  50 ++++++-
 .../DropUnnecessaryAssumes/basic.ll           | 136 ++++++++++++++++++
 2 files changed, 179 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
index c215228b480d2..89980d54ee897 100644
--- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
+++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/DropUnnecessaryAssumes.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -17,13 +18,48 @@ using namespace llvm;
 using namespace llvm::PatternMatch;
 
 static bool affectedValuesAreEphemeral(ArrayRef Affected) {
-  // If all the affected uses have only one use (part of the assume), then
-  // the assume does not provide useful information. Note that additional
-  // users may appear as a result of inlining and CSE, so we should only
-  // make this assumption late in the optimization pipeline.
-  // TODO: Handle dead cyclic usages.
-  // TODO: Handle multiple dead assumes on the same value.
-  return all_of(Affected, match_fn(m_OneUse(m_Value())));
+  // Check whether all the uses are ephemeral, i.e. recursively only used
+  // by assumes. In that case, the assume does not provide useful information.
+ // Note that additional users may appear as a result of inlining and CSE, + // so we should only make this assumption late in the optimization pipeline. + SmallSetVector Worklist; + auto AddUsers = [&](Value *V) { + for (User *U : V->users()) { + // Bail out if we need to inspect too many users. + if (Worklist.size() >= 32) + return false; + Worklist.insert(cast(U)); + } + return true; + }; + + for (Value *V : Affected) { + // Do not handle assumes on globals for now. The use list for them may + // contain uses in other functions. + if (!isa(V)) + return false; + + if (!AddUsers(V)) + return false; + } + + for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) { + Instruction *I = Worklist[Idx]; + + // Use in assume is ephemeral. + if (isa(I)) + continue; + + // Use in side-effecting instruction is non-ephemeral. + if (I->mayHaveSideEffects() || I->isTerminator()) + return false; + + // Otherwise, recursively look at the users. + if (!AddUsers(I)) + return false; + } + + return true; } PreservedAnalyses diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll index e2a9b4eea2c7d..8a6f60ba7a204 100644 --- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=drop-unnecessary-assumes < %s | FileCheck %s +declare void @use(i32 %x) +declare i32 @get() + define void @basic_dead(i32 %x) { ; CHECK-LABEL: define void @basic_dead( ; CHECK-SAME: i32 [[X:%.*]]) { @@ -180,3 +183,136 @@ define void @type_test(ptr %x) { call void @llvm.assume(i1 %test) ret void } + +define void @multiple_dead_conds(i32 %x) { +; CHECK-LABEL: define void @multiple_dead_conds( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: ret void +; + %cond1 = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond1) + %cond2 = icmp ne i32 %x, 64 + call void @llvm.assume(i1 %cond2) + ret void +} + +define void @multiple_dead_bundles(ptr %x) { +; CHECK-LABEL: define void @multiple_dead_bundles( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr %x, i64 8), "nonnull"(ptr %x)] + ret void +} + +; The assume is eliminated, but currently leaves behind a dead cycle. 
+define void @dead_cycle(i1 %loop.cond) { +; CHECK-LABEL: define void @dead_cycle( +; CHECK-SAME: i1 [[LOOP_COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %cond = icmp ne i32 %iv, 64 + call void @llvm.assume(i1 %cond) + %iv.next = add i32 %iv, 1 + br i1 %loop.cond, label %loop, label %exit + +exit: + ret void +} + +define void @use_in_side_effect(i32 %x) { +; CHECK-LABEL: define void @use_in_side_effect( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: call void @use(i32 [[X]]) +; CHECK-NEXT: ret void +; + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + call void @use(i32 %x) + ret void +} + +define void @indirect_use_in_side_effect(i32 %x) { +; CHECK-LABEL: define void @indirect_use_in_side_effect( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 +; CHECK-NEXT: call void @use(i32 [[ADD]]) +; CHECK-NEXT: ret void +; + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + %add = add i32 %x, 1 + call void @use(i32 %add) + ret void +} + +; The affected value itself has a side effect, but we can still drop the +; assume. +define void @affected_value_has_side_effect() { +; CHECK-LABEL: define void @affected_value_has_side_effect() { +; CHECK-NEXT: [[X:%.*]] = call i32 @get() +; CHECK-NEXT: ret void +; + %x = call i32 @get() + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + ret void +} + +define i32 @affected_value_has_side_effect_and_is_used() { +; CHECK-LABEL: define i32 @affected_value_has_side_effect_and_is_used() { +; CHECK-NEXT: [[X:%.*]] = call i32 @get() +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @get() + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + ret i32 %x +} + +@g = external global i8 +@g2 = external global i8 + +; Assumes on globals are currently not supported. +define void @assume_on_global() { +; CHECK-LABEL: define void @assume_on_global() { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g, i64 8) ] +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr @g, i64 8)] + ret void +} + +define void @assume_on_global_used_in_other_func() { +; CHECK-LABEL: define void @assume_on_global_used_in_other_func() { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g2, i64 8) ] +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr @g2, i64 8)] + ret void +} + +define ptr @other_func() { +; CHECK-LABEL: define ptr @other_func() { +; CHECK-NEXT: ret ptr @g2 +; + ret ptr @g2 +} From 07778490e37946a60297b6446184b2bd2fa41030 Mon Sep 17 00:00:00 2001 From: Jinjie Huang Date: Mon, 29 Sep 2025 15:11:36 +0800 Subject: [PATCH 078/878] [BOLT][DWARF] Fix debug info update issue with dwarf4 dwp (#155619) Fix the crash issue when updating debuginfo via DWARF4 DWP and improve efficiency. 
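The workflow, mirroring the RUN lines of the new tests (file names are
placeholders):

  llvm-dwp -e main.exe -o main.exe.dwp
  llvm-bolt main.exe -o main.exe.bolt -update-debug-sections

Before this fix, the llvm-bolt step could hit an assertion when the debug
info for the binary was packaged as a DWARF4 .dwp file.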
--- bolt/lib/Rewrite/DWARFRewriter.cpp | 4 +- bolt/test/AArch64/dwarf4-dwp-aarch64.s | 407 +++++++++++++++++++++++++ bolt/test/X86/dwarf4-dwp-x86.s | 405 ++++++++++++++++++++++++ 3 files changed, 813 insertions(+), 3 deletions(-) create mode 100755 bolt/test/AArch64/dwarf4-dwp-aarch64.s create mode 100755 bolt/test/X86/dwarf4-dwp-x86.s diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 6752489ad562a..5c89a424caa7f 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -504,9 +504,7 @@ static void emitDWOBuilder(const std::string &DWOName, } emitUnit(DWODIEBuilder, *Streamer, SplitCU); } else { - for (std::unique_ptr &CU : - SplitCU.getContext().dwo_compile_units()) - emitUnit(DWODIEBuilder, *Streamer, *CU); + emitUnit(DWODIEBuilder, *Streamer, SplitCU); // emit debug_types sections for dwarf4 for (DWARFUnit *CU : DWODIEBuilder.getDWARF4TUVector()) diff --git a/bolt/test/AArch64/dwarf4-dwp-aarch64.s b/bolt/test/AArch64/dwarf4-dwp-aarch64.s new file mode 100755 index 0000000000000..37507e100a62d --- /dev/null +++ b/bolt/test/AArch64/dwarf4-dwp-aarch64.s @@ -0,0 +1,407 @@ +## This test checks updating debuginfo via dwarf4 dwp file +# RUN: rm -rf %t && mkdir -p %t && cd %t +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown --split-dwarf-file=main.exe-main.dwo %t/main.s -o %t/main.o +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown --split-dwarf-file=main.exe-callee.dwo %t/callee.s -o %t/callee.o +# RUN: %clangxx %cxxflags -gdwarf-4 -gsplit-dwarf=split -Wl,-e,main %t/main.o %t/callee.o -o main.exe +# RUN: llvm-dwp -e %t/main.exe -o %t/main.exe.dwp +# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s + +# CHECK-NOT: Assertion + +#--- main.s + .file "main.cpp" + .globl main // -- Begin function main + .type main,@function +main: // @main +.Lfunc_begin0: + .file 1 "." "main.cpp" + .loc 1 2 0 // main.cpp:2:0 + .loc 1 2 21 prologue_end // main.cpp:2:21 + .loc 1 2 14 epilogue_begin is_stmt 0 // main.cpp:2:14 + ret +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 0 // DW_CHILDREN_no + .byte 16 // DW_AT_stmt_list + .byte 23 // DW_FORM_sec_offset + .byte 27 // DW_AT_comp_dir + .byte 14 // DW_FORM_strp + .ascii "\264B" // DW_AT_GNU_pubnames + .byte 25 // DW_FORM_flag_present + .ascii "\260B" // DW_AT_GNU_dwo_name + .byte 14 // DW_FORM_strp + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 17 // DW_AT_low_pc + .byte 1 // DW_FORM_addr + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .ascii "\263B" // DW_AT_GNU_addr_base + .byte 23 // DW_FORM_sec_offset + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .word .Ldebug_info_end0-.Ldebug_info_start0 // Length of Unit +.Ldebug_info_start0: + .hword 4 // DWARF version number + .word .debug_abbrev // Offset Into Abbrev. 
Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .word .Lline_table_start0 // DW_AT_stmt_list + .word .Lskel_string0 // DW_AT_comp_dir + // DW_AT_GNU_pubnames + .word .Lskel_string1 // DW_AT_GNU_dwo_name + .xword 1465063543908291764 // DW_AT_GNU_dwo_id + .xword .Lfunc_begin0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .word .Laddr_table_base0 // DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." // string offset=0 +.Lskel_string1: + .asciz "main.exe-main.dwo" // string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" // string offset=0 +.Linfo_string1: + .asciz "int" // string offset=5 +.Linfo_string2: + .byte 0 // string offset=9 +.Linfo_string3: + .asciz "main.cpp" // string offset=10 +.Linfo_string4: + .asciz "main.exe-main.dwo" // string offset=19 + .section .debug_str_offsets.dwo,"e",@progbits + .word 0 + .word 5 + .word 9 + .word 10 + .word 19 + .section .debug_info.dwo,"e",@progbits + .word .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 // Length of Unit +.Ldebug_info_dwo_start0: + .hword 4 // DWARF version number + .word 0 // Offset Into Abbrev. Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x22 DW_TAG_compile_unit + .byte 2 // DW_AT_producer + .hword 33 // DW_AT_language + .byte 3 // DW_AT_name + .byte 4 // DW_AT_GNU_dwo_name + .xword 1465063543908291764 // DW_AT_GNU_dwo_id + .byte 2 // Abbrev [2] 0x19:0xf DW_TAG_subprogram + .byte 0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .byte 1 // DW_AT_frame_base + .byte 109 + .byte 0 // DW_AT_name + .byte 1 // DW_AT_decl_file + .byte 2 // DW_AT_decl_line + .word 40 // DW_AT_type + // DW_AT_external + .byte 3 // Abbrev [3] 0x28:0x4 DW_TAG_base_type + .byte 1 // DW_AT_name + .byte 5 // DW_AT_encoding + .byte 4 // DW_AT_byte_size + .byte 0 // End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 1 // DW_CHILDREN_yes + .byte 37 // DW_AT_producer + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 19 // DW_AT_language + .byte 5 // DW_FORM_data2 + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\260B" // DW_AT_GNU_dwo_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 2 // Abbreviation Code + .byte 46 // DW_TAG_subprogram + .byte 0 // DW_CHILDREN_no + .byte 17 // DW_AT_low_pc + .ascii "\201>" // DW_FORM_GNU_addr_index + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .byte 64 // DW_AT_frame_base + .byte 24 // DW_FORM_exprloc + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 58 // DW_AT_decl_file + .byte 11 // DW_FORM_data1 + .byte 59 // DW_AT_decl_line + .byte 11 // DW_FORM_data1 + .byte 73 // DW_AT_type + .byte 19 // DW_FORM_ref4 + .byte 63 // DW_AT_external + .byte 25 // DW_FORM_flag_present + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 3 // Abbreviation Code + .byte 36 // DW_TAG_base_type + .byte 0 // DW_CHILDREN_no + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 62 // DW_AT_encoding + .byte 11 // DW_FORM_data1 + .byte 11 // DW_AT_byte_size + .byte 11 // DW_FORM_data1 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .xword .Lfunc_begin0 + .section 
.debug_gnu_pubnames,"",@progbits + .word .LpubNames_end0-.LpubNames_start0 // Length of Public Names Info +.LpubNames_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 25 // DIE offset + .byte 48 // Attributes: FUNCTION, EXTERNAL + .asciz "main" // External Name + .word 0 // End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .word .LpubTypes_end0-.LpubTypes_start0 // Length of Public Types Info +.LpubTypes_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 40 // DIE offset + .byte 144 // Attributes: TYPE, STATIC + .asciz "int" // External Name + .word 0 // End Mark +.LpubTypes_end0: + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z6calleei + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- callee.s + .file "callee.cpp" + .globl _Z6calleei // -- Begin function _Z6calleei + .type _Z6calleei,@function +_Z6calleei: // @_Z6calleei +.Lfunc_begin0: + .file 1 "." "callee.cpp" + .loc 1 1 0 // callee.cpp:1:0 + .loc 1 1 28 prologue_end // callee.cpp:1:28 + .loc 1 1 21 epilogue_begin is_stmt 0 // callee.cpp:1:21 + ret +.Lfunc_end0: + .size _Z6calleei, .Lfunc_end0-_Z6calleei + .section .debug_abbrev,"",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 0 // DW_CHILDREN_no + .byte 16 // DW_AT_stmt_list + .byte 23 // DW_FORM_sec_offset + .byte 27 // DW_AT_comp_dir + .byte 14 // DW_FORM_strp + .ascii "\264B" // DW_AT_GNU_pubnames + .byte 25 // DW_FORM_flag_present + .ascii "\260B" // DW_AT_GNU_dwo_name + .byte 14 // DW_FORM_strp + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 17 // DW_AT_low_pc + .byte 1 // DW_FORM_addr + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .ascii "\263B" // DW_AT_GNU_addr_base + .byte 23 // DW_FORM_sec_offset + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .word .Ldebug_info_end0-.Ldebug_info_start0 // Length of Unit +.Ldebug_info_start0: + .hword 4 // DWARF version number + .word .debug_abbrev // Offset Into Abbrev. Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .word .Lline_table_start0 // DW_AT_stmt_list + .word .Lskel_string0 // DW_AT_comp_dir + // DW_AT_GNU_pubnames + .word .Lskel_string1 // DW_AT_GNU_dwo_name + .xword 7650227797527095061 // DW_AT_GNU_dwo_id + .xword .Lfunc_begin0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .word .Laddr_table_base0 // DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." 
// string offset=0 +.Lskel_string1: + .asciz "main.exe-callee.dwo" // string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z6calleei" // string offset=0 +.Linfo_string1: + .asciz "callee" // string offset=11 +.Linfo_string2: + .asciz "int" // string offset=18 +.Linfo_string3: + .asciz "x" // string offset=22 +.Linfo_string4: + .byte 0 // string offset=24 +.Linfo_string5: + .asciz "callee.cpp" // string offset=25 +.Linfo_string6: + .asciz "main.exe-callee.dwo" // string offset=36 + .section .debug_str_offsets.dwo,"e",@progbits + .word 0 + .word 11 + .word 18 + .word 22 + .word 24 + .word 25 + .word 36 + .section .debug_info.dwo,"e",@progbits + .word .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 // Length of Unit +.Ldebug_info_dwo_start0: + .hword 4 // DWARF version number + .word 0 // Offset Into Abbrev. Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x2f DW_TAG_compile_unit + .byte 4 // DW_AT_producer + .hword 33 // DW_AT_language + .byte 5 // DW_AT_name + .byte 6 // DW_AT_GNU_dwo_name + .xword 7650227797527095061 // DW_AT_GNU_dwo_id + .byte 2 // Abbrev [2] 0x19:0x1c DW_TAG_subprogram + .byte 0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .byte 1 // DW_AT_frame_base + .byte 111 + .byte 0 // DW_AT_linkage_name + .byte 1 // DW_AT_name + .byte 1 // DW_AT_decl_file + .byte 1 // DW_AT_decl_line + .word 53 // DW_AT_type + // DW_AT_external + .byte 3 // Abbrev [3] 0x29:0xb DW_TAG_formal_parameter + .byte 2 // DW_AT_location + .byte 145 + .byte 12 + .byte 3 // DW_AT_name + .byte 1 // DW_AT_decl_file + .byte 1 // DW_AT_decl_line + .word 53 // DW_AT_type + .byte 0 // End Of Children Mark + .byte 4 // Abbrev [4] 0x35:0x4 DW_TAG_base_type + .byte 2 // DW_AT_name + .byte 5 // DW_AT_encoding + .byte 4 // DW_AT_byte_size + .byte 0 // End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 1 // DW_CHILDREN_yes + .byte 37 // DW_AT_producer + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 19 // DW_AT_language + .byte 5 // DW_FORM_data2 + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\260B" // DW_AT_GNU_dwo_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 2 // Abbreviation Code + .byte 46 // DW_TAG_subprogram + .byte 1 // DW_CHILDREN_yes + .byte 17 // DW_AT_low_pc + .ascii "\201>" // DW_FORM_GNU_addr_index + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .byte 64 // DW_AT_frame_base + .byte 24 // DW_FORM_exprloc + .byte 110 // DW_AT_linkage_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 58 // DW_AT_decl_file + .byte 11 // DW_FORM_data1 + .byte 59 // DW_AT_decl_line + .byte 11 // DW_FORM_data1 + .byte 73 // DW_AT_type + .byte 19 // DW_FORM_ref4 + .byte 63 // DW_AT_external + .byte 25 // DW_FORM_flag_present + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 3 // Abbreviation Code + .byte 5 // DW_TAG_formal_parameter + .byte 0 // DW_CHILDREN_no + .byte 2 // DW_AT_location + .byte 24 // DW_FORM_exprloc + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 58 // DW_AT_decl_file + .byte 11 // DW_FORM_data1 + .byte 59 // DW_AT_decl_line + .byte 11 // DW_FORM_data1 + .byte 73 // DW_AT_type + .byte 19 // DW_FORM_ref4 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 4 // Abbreviation Code + .byte 
36 // DW_TAG_base_type + .byte 0 // DW_CHILDREN_no + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 62 // DW_AT_encoding + .byte 11 // DW_FORM_data1 + .byte 11 // DW_AT_byte_size + .byte 11 // DW_FORM_data1 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .xword .Lfunc_begin0 + .section .debug_gnu_pubnames,"",@progbits + .word .LpubNames_end0-.LpubNames_start0 // Length of Public Names Info +.LpubNames_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 25 // DIE offset + .byte 48 // Attributes: FUNCTION, EXTERNAL + .asciz "callee" // External Name + .word 0 // End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .word .LpubTypes_end0-.LpubTypes_start0 // Length of Public Types Info +.LpubTypes_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 53 // DIE offset + .byte 144 // Attributes: TYPE, STATIC + .asciz "int" // External Name + .word 0 // End Mark +.LpubTypes_end0: + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/dwarf4-dwp-x86.s b/bolt/test/X86/dwarf4-dwp-x86.s new file mode 100755 index 0000000000000..6dde1678f3840 --- /dev/null +++ b/bolt/test/X86/dwarf4-dwp-x86.s @@ -0,0 +1,405 @@ +## This test checks updating debuginfo via dwarf4 dwp file +# RUN: rm -rf %t && mkdir -p %t && cd %t +# RUN: split-file %s %t +# RUN: %clangxx %cxxflags -g -gdwarf-4 -gsplit-dwarf %t/main.s %t/callee.s -o main.exe +# RUN: llvm-dwp -e %t/main.exe -o %t/main.exe.dwp +# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s + +# CHECK-NOT: Assertion + +#--- main.s + .file "main.cpp" + .globl main # -- Begin function main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 1 "." "main.cpp" + .loc 1 2 0 # main.cpp:2:0 + .loc 1 2 21 prologue_end # main.cpp:2:21 + .loc 1 2 14 epilogue_begin is_stmt 0 # main.cpp:2:14 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad 1465063543908291764 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.exe-main.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .byte 0 # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=10 +.Linfo_string4: + .asciz "main.exe-main.dwo" # string offset=19 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 10 + .long 19 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x22 DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_GNU_dwo_name + .quad 1465063543908291764 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 40 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x28:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .section .debug_gnu_pubnames,"",@progbits + .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info 
+.LpubNames_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 48 # Compilation Unit Length + .long 25 # DIE offset + .byte 48 # Attributes: FUNCTION, EXTERNAL + .asciz "main" # External Name + .long 0 # End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info +.LpubTypes_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 48 # Compilation Unit Length + .long 40 # DIE offset + .byte 144 # Attributes: TYPE, STATIC + .asciz "int" # External Name + .long 0 # End Mark +.LpubTypes_end0: + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z6calleei + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- callee.s + .file "callee.cpp" + .globl _Z6calleei # -- Begin function _Z6calleei + .type _Z6calleei,@function +_Z6calleei: # @_Z6calleei +.Lfunc_begin0: + .file 1 "." "callee.cpp" + .loc 1 1 0 # callee.cpp:1:0 + .loc 1 1 28 prologue_end # callee.cpp:1:28 + .loc 1 1 21 epilogue_begin is_stmt 0 # callee.cpp:1:21 + retq +.Lfunc_end0: + .size _Z6calleei, .Lfunc_end0-_Z6calleei + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad -8413212350243343807 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.exe-callee.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z6calleei" # string offset=0 +.Linfo_string1: + .asciz "callee" # string offset=11 +.Linfo_string2: + .asciz "int" # string offset=18 +.Linfo_string3: + .asciz "x" # string offset=22 +.Linfo_string4: + .byte 0 # string offset=24 +.Linfo_string5: + .asciz "callee.cpp" # string offset=25 +.Linfo_string6: + .asciz "main.exe-callee.dwo" # string offset=36 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 11 + .long 18 + .long 22 + .long 24 + .long 25 + .long 36 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x2f DW_TAG_compile_unit + .byte 4 # DW_AT_producer + .short 33 # DW_AT_language + .byte 5 # DW_AT_name + .byte 6 # DW_AT_GNU_dwo_name + .quad -8413212350243343807 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0x1c DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 53 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 124 + .byte 3 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 53 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x35:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .section .debug_gnu_pubnames,"",@progbits + .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info +.LpubNames_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 48 # Compilation Unit Length + .long 25 # DIE offset + .byte 48 # Attributes: FUNCTION, EXTERNAL + .asciz "callee" # External Name + .long 0 # End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info +.LpubTypes_start0: + .short 2 # DWARF Version + .long 
.Lcu_begin0 # Offset of Compilation Unit Info
+ .long 48 # Compilation Unit Length
+ .long 53 # DIE offset
+ .byte 144 # Attributes: TYPE, STATIC
+ .asciz "int" # External Name
+ .long 0 # End Mark
+.LpubTypes_end0:
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:

From fd8adf3ccf7dd388e60bced617731aa82b7c145a Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 29 Sep 2025 09:14:33 +0200
Subject: [PATCH 079/878] [IR] Use immarg for preallocated intrinsics (NFC)
 (#155835)

Mark the arguments as immarg to indicate that they require a constant
integer. This was previously enforced with a manual verifier check.
---
 llvm/include/llvm/IR/Intrinsics.td         |  8 ++++++--
 llvm/lib/IR/Verifier.cpp                   |  4 +---
 llvm/test/Verifier/preallocated-invalid.ll | 10 +++++++++-
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 3c4ed946b98d0..96da698538314 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -960,8 +960,12 @@ def int_instrprof_mcdc_tvbitmap_update
     : Intrinsic<[],
                 [llvm_ptr_ty, llvm_i64_ty, llvm_i32_ty, llvm_ptr_ty]>;
 
-def int_call_preallocated_setup : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty]>;
-def int_call_preallocated_arg : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty]>;
+def int_call_preallocated_setup
+    : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty],
+                            [ImmArg>]>;
+def int_call_preallocated_arg
+    : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty],
+                            [ImmArg>]>;
 def int_call_preallocated_teardown : DefaultAttrsIntrinsic<[], [llvm_token_ty]>;
 
 // This intrinsic is intentionally undocumented and users shouldn't call it;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index b2e76cc7a8a90..8c03d6f809d50 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5869,9 +5869,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
     break;
   }
   case Intrinsic::call_preallocated_setup: {
-    auto *NumArgs = dyn_cast(Call.getArgOperand(0));
-    Check(NumArgs != nullptr,
-          "llvm.call.preallocated.setup argument must be a constant");
+    auto *NumArgs = cast(Call.getArgOperand(0));
     bool FoundCall = false;
     for (User *U : Call.users()) {
       auto *UseCall = dyn_cast(U);
diff --git a/llvm/test/Verifier/preallocated-invalid.ll b/llvm/test/Verifier/preallocated-invalid.ll
index 38ed1067c497d..2c5aff231e1bd 100644
--- a/llvm/test/Verifier/preallocated-invalid.ll
+++ b/llvm/test/Verifier/preallocated-invalid.ll
@@ -65,13 +65,21 @@ define void @preallocated_one_call() {
   ret void
 }
 
-; CHECK: must be a constant
+; CHECK: immarg operand has non-immediate parameter
 define void @preallocated_setup_constant() {
   %ac = call i32 @blackbox()
   %cs = call token @llvm.call.preallocated.setup(i32 %ac)
   ret void
 }
 
+; CHECK: llvm.call.preallocated.alloc arg index must be a constant
+define void @preallocated_arg_constant() {
+  %ac = call i32 @blackbox()
+  %cs = call token @llvm.call.preallocated.setup(i32 3)
+  call token @llvm.call.preallocated.arg(token %cs, i32 %ac)
+  ret void
+}
+
 ; CHECK: must be between 0 and corresponding
 define void @preallocated_setup_arg_index_in_bounds() {
   %cs = call token @llvm.call.preallocated.setup(i32 2)

From 3a3a4fb922094c69c4c7bcf1dc1b8a079415f4b2 Mon Sep 17 00:00:00 2001
From: ZhaoQi
Date: Mon, 29 Sep 2025 15:27:21 +0800
Subject: [PATCH 080/878] [LoongArch][NFC] Pre-commit scalarize fp tests for
 #157824 (#160480)

Tests
for https://github.com/llvm/llvm-project/pull/157824 --- .../CodeGen/LoongArch/lasx/scalarize-fp.ll | 64 +++++++++++++++++++ .../CodeGen/LoongArch/lsx/scalarize-fp.ll | 63 ++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll diff --git a/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll new file mode 100644 index 0000000000000..c93a6582b9c69 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +define <8 x float> @fadd_elt0_v8f32(float %a) nounwind { +; CHECK-LABEL: fadd_elt0_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 +; CHECK-NEXT: xvldi $xr1, -1424 +; CHECK-NEXT: xvfadd.s $xr0, $xr0, $xr1 +; CHECK-NEXT: ret +entry: + %b = insertelement <8 x float> poison, float %a, i32 0 + %c = fadd <8 x float> %b, + ret <8 x float> %c +} + +define <4 x double> @fadd_elt0_v4f64(double %a) nounwind { +; CHECK-LABEL: fadd_elt0_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; CHECK-NEXT: xvldi $xr1, -912 +; CHECK-NEXT: xvfadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x double> poison, double %a, i32 0 + %c = fadd <4 x double> %b, + ret <4 x double> %c +} + +define <8 x float> @fsub_splat_v8f32(float %a, float %b) nounwind { +; CHECK-LABEL: fsub_splat_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1 +; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 +; CHECK-NEXT: vfsub.s $vr0, $vr0, $vr1 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68 +; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <8 x float> poison, float %a, i32 0 + %insb = insertelement <8 x float> poison, float %b, i32 0 + %va = shufflevector <8 x float> %insa, <8 x float> poison, <8 x i32> zeroinitializer + %vb = shufflevector <8 x float> %insb, <8 x float> poison, <8 x i32> zeroinitializer + %c = fsub <8 x float> %va, %vb + ret <8 x float> %c +} + +define <4 x double> @fsub_splat_v4f64(double %a) nounwind { +; CHECK-LABEL: fsub_splat_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; CHECK-NEXT: xvldi $xr1, -784 +; CHECK-NEXT: xvfadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <4 x double> poison, double %a, i32 0 + %insb = insertelement <4 x double> poison, double 1.0, i32 0 + %va = shufflevector <4 x double> %insa, <4 x double> poison, <4 x i32> zeroinitializer + %vb = shufflevector <4 x double> %insb, <4 x double> poison, <4 x i32> zeroinitializer + %c = fsub <4 x double> %va, %vb + ret <4 x double> %c +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll new file mode 100644 index 0000000000000..cc2d3d818b412 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s + +define <4 
x float> @fadd_elt0_v4f32(float %a) nounwind {
+; CHECK-LABEL: fadd_elt0_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vldi $vr1, -1424
+; CHECK-NEXT: vfadd.s $vr0, $vr0, $vr1
+; CHECK-NEXT: ret
+entry:
+ %b = insertelement <4 x float> poison, float %a, i32 0
+ %c = fadd <4 x float> %b,
+ ret <4 x float> %c
+}
+
+define <2 x double> @fadd_elt0_v2f64(double %a) nounwind {
+; CHECK-LABEL: fadd_elt0_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vldi $vr1, -912
+; CHECK-NEXT: vfadd.d $vr0, $vr0, $vr1
+; CHECK-NEXT: ret
+entry:
+ %b = insertelement <2 x double> poison, double %a, i32 0
+ %c = fadd <2 x double> %b,
+ ret <2 x double> %c
+}
+
+define <4 x float> @fsub_splat_v4f32(float %b) nounwind {
+; CHECK-LABEL: fsub_splat_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0
+; CHECK-NEXT: vldi $vr1, -1424
+; CHECK-NEXT: vfsub.s $vr0, $vr1, $vr0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: ret
+entry:
+ %insa = insertelement <4 x float> poison, float 1.0, i32 0
+ %insb = insertelement <4 x float> poison, float %b, i32 0
+ %va = shufflevector <4 x float> %insa, <4 x float> poison, <4 x i32> zeroinitializer
+ %vb = shufflevector <4 x float> %insb, <4 x float> poison, <4 x i32> zeroinitializer
+ %c = fsub <4 x float> %va, %vb
+ ret <4 x float> %c
+}
+
+define <2 x double> @fsub_splat_v2f64(double %a, double %b) nounwind {
+; CHECK-LABEL: fsub_splat_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def $f1_64 killed $f1_64 def $vr1
+; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0
+; CHECK-NEXT: vfsub.d $vr0, $vr0, $vr1
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: ret
+entry:
+ %insa = insertelement <2 x double> poison, double %a, i32 0
+ %insb = insertelement <2 x double> poison, double %b, i32 0
+ %va = shufflevector <2 x double> %insa, <2 x double> poison, <2 x i32> zeroinitializer
+ %vb = shufflevector <2 x double> %insb, <2 x double> poison, <2 x i32> zeroinitializer
+ %c = fsub <2 x double> %va, %vb
+ ret <2 x double> %c
+}

From f84b784dac61d419318ed6598412d197f1e05ee9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 08:29:04 +0100
Subject: [PATCH 081/878] [X86] LowerShiftByScalarImmediate - move shl(x,1) ->
 add(freeze(x),freeze(x)) to X86FixupInstTunings (#161007)

Avoid the shl(x,1) -> add(freeze(x),freeze(x)) fold if the shift-imm is
legal, and leave it to X86FixupInstTunings.

Helps avoid missed optimisations due to oneuse limits, avoids
unnecessary freezes and allows AVX512 to fold to mi memory folding
variants.
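As a sketch of the kind of code this affects (hypothetical example, not from
the patch), both forms below double every lane; the choice between the shift
and the add is now made per-subtarget in X86FixupInstTuning instead of being
hardwired during lowering:

  #include <immintrin.h>

  __m128i double_lanes(__m128i x) {
    // May be emitted as 'psllq $1, %xmm0' or as 'paddq %xmm0, %xmm0',
    // whichever the tuning pass prefers for the target CPU.
    return _mm_slli_epi64(x, 1);
  }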
Fixes #161006 --- llvm/lib/Target/X86/X86FixupInstTuning.cpp | 54 ++++++++ llvm/lib/Target/X86/X86ISelLowering.cpp | 16 +-- llvm/test/CodeGen/X86/combine-add.ll | 4 +- llvm/test/CodeGen/X86/combine-mul.ll | 2 +- llvm/test/CodeGen/X86/combine-sdiv.ll | 30 ++-- llvm/test/CodeGen/X86/known-signbits-shl.ll | 2 +- .../test/CodeGen/X86/masked_gather_scatter.ll | 33 ++--- llvm/test/CodeGen/X86/oddsubvector.ll | 12 +- llvm/test/CodeGen/X86/pr62286.ll | 38 +++--- llvm/test/CodeGen/X86/pr74736.ll | 20 +-- llvm/test/CodeGen/X86/shift-i512.ll | 6 +- .../CodeGen/X86/sse2-intrinsics-fast-isel.ll | 24 ++-- llvm/test/CodeGen/X86/vec_shift6.ll | 10 +- llvm/test/CodeGen/X86/vector-gep.ll | 128 +++++++++--------- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 10 +- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 8 +- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 6 +- llvm/test/CodeGen/X86/vector-mul.ll | 8 +- llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 2 +- llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 4 +- .../CodeGen/X86/vector-shuffle-combining.ll | 4 +- ...vector_splat-const-shift-of-constmasked.ll | 64 ++++----- 22 files changed, 258 insertions(+), 227 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 33dc0a232815c..a1d4e0bc62310 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction( return true; }; + // Is ADD(X,X) more efficient than SHL(X,1)? + auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool { + if (MI.getOperand(NumOperands - 1).getImm() != 1) + return false; + if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true)) + return false; + LLVM_DEBUG(dbgs() << "Replacing: " << MI); + { + MI.setDesc(TII->get(AddOpc)); + MI.removeOperand(NumOperands - 1); + MI.addOperand(MI.getOperand(NumOperands - 2)); + } + LLVM_DEBUG(dbgs() << " With: " << MI); + return false; + }; + switch (Opc) { case X86::BLENDPDrri: return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); @@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction( return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); case X86::VUNPCKHPSZrmkz: return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); + + case X86::PSLLWri: + return ProcessShiftLeftToAdd(X86::PADDWrr); + case X86::VPSLLWri: + return ProcessShiftLeftToAdd(X86::VPADDWrr); + case X86::VPSLLWYri: + return ProcessShiftLeftToAdd(X86::VPADDWYrr); + case X86::VPSLLWZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ128rr); + case X86::VPSLLWZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ256rr); + case X86::VPSLLWZri: + return ProcessShiftLeftToAdd(X86::VPADDWZrr); + case X86::PSLLDri: + return ProcessShiftLeftToAdd(X86::PADDDrr); + case X86::VPSLLDri: + return ProcessShiftLeftToAdd(X86::VPADDDrr); + case X86::VPSLLDYri: + return ProcessShiftLeftToAdd(X86::VPADDDYrr); + case X86::VPSLLDZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ128rr); + case X86::VPSLLDZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ256rr); + case X86::VPSLLDZri: + return ProcessShiftLeftToAdd(X86::VPADDDZrr); + case X86::PSLLQri: + return ProcessShiftLeftToAdd(X86::PADDQrr); + case X86::VPSLLQri: + return ProcessShiftLeftToAdd(X86::VPADDQrr); + case X86::VPSLLQYri: + return ProcessShiftLeftToAdd(X86::VPADDQYrr); + case X86::VPSLLQZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ128rr); + case X86::VPSLLQZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ256rr); + case X86::VPSLLQZri: + return 
ProcessShiftLeftToAdd(X86::VPADDQZrr); + default: return false; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index efeddd7c9bd4b..fcfeb661aa891 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30313,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl: (shl V, 1) -> (add (freeze V), (freeze V)) - if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { - // R may be undef at run-time, but (shl R, 1) must be an even number (LSB - // must be 0). (add undef, undef) however can be any value. To make this - // safe, we must freeze R to ensure that register allocation uses the same - // register for an undefined value. This ensures that the result will - // still be even and preserves the original semantics. - R = DAG.getFreeze(R); - return DAG.getNode(ISD::ADD, dl, VT, R, R); - } - + if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); - } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index ff9f995c4765b..51a8bf5b48415 100644 --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) { ; SSE-NEXT: psubd %xmm1, %xmm3 ; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: movdqu %xmm3, 16(%rsi) ; SSE-NEXT: movdqu %xmm2, (%rsi) diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index 8e4a50ea266c3..ae4d24f91ffc0 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_pow2c: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm2, %xmm2 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psllq $4, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 98187d61c1f84..6bcbfe1808933 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2187,13 +2187,13 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: paddw %xmm4, 
%xmm4 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7] +; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: paddw %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7] ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: paddb %xmm1, %xmm2 @@ -2201,15 +2201,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psraw $8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: paddw %xmm0, %xmm3 -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] -; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: psllw $7, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7] +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: psraw $8, %xmm2 ; SSE41-NEXT: psllw $7, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] @@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128] ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 -; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll index 473fecc307ed4..57d557dec11b9 100644 --- a/llvm/test/CodeGen/X86/known-signbits-shl.ll +++ 
b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: paddw %xmm0, %xmm2 +; X64-NEXT: paddw %xmm2, %xmm2 ; X64-NEXT: movdqa %xmm2, %xmm3 ; X64-NEXT: psraw $1, %xmm3 ; X64-NEXT: pcmpeqw %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 4e6f666fa05de..4cde581c10508 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: retq @@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 
; X64-SKX-SMALL-NEXT: retq @@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 +; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; X64-SKX-SMALL-NEXT: kmovw %k1, %k2 ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm2 ; X64-SKX-LARGE-NEXT: kmovw %k1, %k2 ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index f53983036a016..5df1867f73c8e 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) { define void @PR42833() { ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: +; SSE2-NEXT: movl b(%rip), %eax ; SSE2-NEXT: movdqa c+144(%rip), %xmm2 ; SSE2-NEXT: movdqa c+128(%rip), %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: addl b(%rip), %eax +; SSE2-NEXT: addl c+128(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 @@ -166,7 +166,7 @@ define void @PR42833() { ; SSE2-NEXT: psubd %xmm2, %xmm4 ; SSE2-NEXT: paddd %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm5 ; SSE2-NEXT: movss 
{{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] ; SSE2-NEXT: movdqa %xmm2, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) @@ -191,17 +191,17 @@ define void @PR42833() { ; ; SSE42-LABEL: PR42833: ; SSE42: # %bb.0: +; SSE42-NEXT: movl b(%rip), %eax ; SSE42-NEXT: movdqa c+144(%rip), %xmm1 ; SSE42-NEXT: movdqa c+128(%rip), %xmm0 -; SSE42-NEXT: movd %xmm0, %eax -; SSE42-NEXT: addl b(%rip), %eax +; SSE42-NEXT: addl c+128(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 ; SSE42-NEXT: paddd %xmm0, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 ; SSE42-NEXT: psubd %xmm1, %xmm3 ; SSE42-NEXT: paddd %xmm1, %xmm1 ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: paddd %xmm0, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, c+144(%rip) ; SSE42-NEXT: movdqa %xmm4, c+128(%rip) diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index ce03f8fad4a19..161e9651a9cf2 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) { ; AVX1-LABEL: PR62286: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -59,12 +65,12 @@ define i64 @PR62286(i32 %a) { ; AVX512-LABEL: PR62286: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: movb $8, %al +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: 
vpexpandd %ymm0, %ymm1 {%k1} {z} -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll index ceccee00c9457..58955265580bd 100644 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ b/llvm/test/CodeGen/X86/pr74736.ll @@ -6,8 +6,8 @@ define void @main(<16 x i32> %0, i32 %1) { ; SSE-LABEL: main: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = [0,1,0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: paddd %xmm3, %xmm3 @@ -32,20 +32,20 @@ define void @main(<16 x i32> %0, i32 %1) { ; AVX-LABEL: main: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] ; AVX-NEXT: movl $1, %eax ; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; AVX-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] -; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vpaddd %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm3 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] ; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,1,3,3,5,5,7] +; AVX-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpxor %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll index 756019d0e98a0..03b61d9235254 100644 --- a/llvm/test/CodeGen/X86/shift-i512.ll +++ b/llvm/test/CodeGen/X86/shift-i512.ll @@ -10,7 +10,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm3 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm3 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4 ; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2 @@ -34,7 +34,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 -; AVX512VBMI-NEXT: vpsllq $1, %xmm0, %xmm4 +; AVX512VBMI-NEXT: vpaddq %xmm0, %xmm0, %xmm4 ; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -51,7 +51,7 @@ define <8 x i64> @shl_i512_1(<8 
x i64> %a) { ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2 ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm4 +; ZNVER4-NEXT: vpaddq %xmm0, %xmm0, %xmm4 ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 ; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index 3f48b22e2b9ff..a48be037ebebc 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -5791,20 +5791,20 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi16: ; SSE: # %bb.0: -; SSE-NEXT: psllw $1, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x01] +; SSE-NEXT: psllw $2, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1) + %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 2) %bc = bitcast <8 x i16> %res to <2 x i64> ret <2 x i64> %bc } @@ -5813,20 +5813,20 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi32: ; SSE: # %bb.0: -; SSE-NEXT: pslld $1, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x01] +; SSE-NEXT: pslld $2, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX1-NEXT: vpslld $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX512-NEXT: vpslld $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1) + %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 2) %bc = bitcast <4 x i32> %res to <2 x i64> ret <2 x i64> %bc } @@ -5835,19 +5835,19 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi64: ; SSE: # %bb.0: -; SSE-NEXT: psllq $1, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x01] +; SSE-NEXT: psllq $2, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 # encoding: 
[0xc5,0xf9,0x73,0xf0,0x01] +; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x01] +; AVX512-NEXT: vpsllq $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1) + %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 2) ret <2 x i64> %res } declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll index 71e659c681d17..219e32c86c848 100644 --- a/llvm/test/CodeGen/X86/vec_shift6.ll +++ b/llvm/test/CodeGen/X86/vec_shift6.ll @@ -28,14 +28,14 @@ define <8 x i16> @test2(<8 x i16> %a) { ; SSE2-LABEL: test2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test2: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; @@ -56,7 +56,7 @@ define <4 x i32> @test3(<4 x i32> %a) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm1 ; SSE2-NEXT: pslld $2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq @@ -81,14 +81,14 @@ define <4 x i32> @test4(<4 x i32> %a) { ; SSE2-LABEL: test4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test4: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: paddd %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll index 5c485592295d3..b4cffcd171b33 100644 --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -122,91 +122,87 @@ define <64 x ptr> @AGEP9(ptr %param, <64 x i32> %off) nounwind { ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-32, %esp ; CHECK-NEXT: subl $160, %esp -; CHECK-NEXT: vmovdqa %ymm2, %ymm5 -; CHECK-NEXT: vmovdqa %ymm1, %ymm3 -; CHECK-NEXT: vmovdqa %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa 72(%ebp), %ymm0 -; CHECK-NEXT: vmovdqa 40(%ebp), %ymm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm4 -; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm7 -; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4 -; CHECK-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: 
vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 104(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 136(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 168(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, (%esp) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm2 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm7, %xmm1 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm7, %xmm6 -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm7, %xmm3 -; CHECK-NEXT: vmovdqa %ymm5, %ymm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm5 -; CHECK-NEXT: vpaddd %xmm5, %xmm7, %xmm5 -; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm4 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 56(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill +; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2 +; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 +; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 +; CHECK-NEXT: vmovdqa 
168(%ebp), %xmm4 ; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4 +; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 ; CHECK-NEXT: movl 8(%ebp), %eax -; CHECK-NEXT: vmovdqa %xmm4, 80(%eax) -; CHECK-NEXT: vmovdqa %xmm5, 64(%eax) -; CHECK-NEXT: vmovdqa %xmm3, 48(%eax) -; CHECK-NEXT: vmovdqa %xmm6, 32(%eax) -; CHECK-NEXT: vmovdqa %xmm1, 16(%eax) -; CHECK-NEXT: vmovdqa %xmm0, (%eax) -; CHECK-NEXT: vmovdqa %xmm2, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm4, 224(%eax) +; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) +; CHECK-NEXT: vmovdqa %xmm6, 192(%eax) +; CHECK-NEXT: vmovdqa %xmm0, 176(%eax) +; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) +; CHECK-NEXT: vmovdqa %xmm2, 144(%eax) ; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 224(%eax) +; CHECK-NEXT: vmovaps %xmm0, 128(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 208(%eax) +; CHECK-NEXT: vmovaps %xmm0, 112(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 192(%eax) +; CHECK-NEXT: vmovaps %xmm0, 96(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 176(%eax) +; CHECK-NEXT: vmovaps %xmm0, 80(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 160(%eax) +; CHECK-NEXT: vmovaps %xmm0, 64(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 144(%eax) +; CHECK-NEXT: vmovaps %xmm0, 48(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 128(%eax) +; CHECK-NEXT: vmovaps %xmm0, 32(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 112(%eax) +; CHECK-NEXT: vmovaps %xmm0, 16(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 96(%eax) +; CHECK-NEXT: vmovaps %xmm0, (%eax) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 13f7d68ccb893..33d80f63dbcc8 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -652,7 +652,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: paddb %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $1, %xmm2 +; SSE2-NEXT: paddw %xmm2, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -678,7 +678,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $1, %xmm2 +; SSE41-NEXT: paddw %xmm2, %xmm2 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $2, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -701,7 +701,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; 
AVX1-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -720,7 +720,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -739,7 +739,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll index 1a5c3730c1839..e43108fe7d784 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -590,7 +590,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm3, %xmm5 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 @@ -609,7 +609,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 @@ -633,7 +633,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX2NOBW-NEXT: vpsllw $1, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm1, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -651,7 +651,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsllw $1, %ymm1, %ymm2 +; AVX512BW-NEXT: vpaddw %ymm1, %ymm1, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll index 9c56894f0c59c..bf98bcca59c04 100644 --- 
a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -485,7 +485,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm5 +; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm5 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 @@ -504,7 +504,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $1, %ymm2, %ymm3 +; AVX512F-NEXT: vpaddw %ymm2, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 @@ -528,7 +528,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm1, %zmm2 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 13b21a747878b..6e1bf25908302 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -821,10 +821,10 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp ; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: paddw %xmm1, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm3 ; X86-SSE-NEXT: paddw %xmm3, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: paddw %xmm0, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm3 ; X86-SSE-NEXT: paddw %xmm2, %xmm0 ; X86-SSE-NEXT: paddw %xmm3, %xmm0 ; X86-SSE-NEXT: paddw 8(%ebp), %xmm1 @@ -835,9 +835,9 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; X64-SSE-LABEL: madd_v16i16_3: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE-NEXT: paddw %xmm1, %xmm4 +; X64-SSE-NEXT: paddw %xmm4, %xmm4 ; X64-SSE-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE-NEXT: paddw %xmm0, %xmm5 +; X64-SSE-NEXT: paddw %xmm5, %xmm5 ; X64-SSE-NEXT: paddw %xmm2, %xmm0 ; X64-SSE-NEXT: paddw %xmm5, %xmm0 ; X64-SSE-NEXT: paddw %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 227e000c6be7f..ab1feba98b008 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -907,7 +907,7 @@ define i1 @mask_v8i32_2(<8 x i32> %a0) { ; SSE2-LABEL: mask_v8i32_2: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll 
b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 2b1cf5b671e53..99dac74d8127b 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -927,7 +927,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: paddq %xmm1, %xmm1 ; SSE2-NEXT: psllq $7, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq @@ -975,7 +975,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: paddq %xmm0, %xmm1 +; X86-SSE-NEXT: paddq %xmm1, %xmm1 ; X86-SSE-NEXT: psllq $7, %xmm0 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 5b61de5a3b772..ee9d8a55aeb3e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3550,14 +3550,14 @@ define <8 x i16> @PR141475(i32 %in) { ; SSE-LABEL: PR141475: ; SSE: # %bb.0: ; SSE-NEXT: movd %edi, %xmm0 -; SSE-NEXT: pslld $1, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: PR141475: ; AVX: # %bb.0: ; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: retq %mul = shl i32 %in, 1 diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index 54dc107fd0c10..3b93734c24deb 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -1438,26 +1438,26 @@ define <8 x i16> @test_128_i16_x_8_65024_mask_ashr_10(<8 x i16> %a0) { define <8 x i16> @test_128_i16_x_8_127_mask_shl_1(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, @@ -1656,26 +1656,26 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_6(<8 x i16> %a0) { define <8 x i16> @test_128_i16_x_8_65024_mask_shl_1(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddw %xmm0, 
%xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, @@ -2373,40 +2373,40 @@ define <4 x i32> @test_128_i32_x_4_4294836224_mask_ashr_18(<4 x i32> %a0) { define <4 x i32> @test_128_i32_x_4_32767_mask_shl_1(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, @@ -2675,40 +2675,40 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_10(<4 x i32> %a0) { define <4 x i32> @test_128_i32_x_4_4294836224_mask_shl_1(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; 
X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, @@ -3325,26 +3325,26 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> % define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddq %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = shl <2 x i64> %t0, @@ -3543,26 +3543,26 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) { define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_shl_1(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: 
test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddq %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = shl <2 x i64> %t0, From 483d73a5e0f63f110776a5abb2e5644f6de51cb3 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 29 Sep 2025 08:37:25 +0100 Subject: [PATCH 082/878] [libclc] Move myself to the list of inactive maintainers Change my email address in the process. I will not be able to keep up maintainership duties on this project in the future. Adding the wording on the inactive maintainers section myself like this feels self-aggrandizing but was copied from other LLVM projects. --- libclc/Maintainers.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/libclc/Maintainers.md b/libclc/Maintainers.md index ac869b6945db5..695695c00be56 100644 --- a/libclc/Maintainers.md +++ b/libclc/Maintainers.md @@ -10,8 +10,14 @@ The following people are the active maintainers for the project. Please reach out to them for code reviews, questions about their area of expertise, or other assistance. -Fraser Cormack \ -fraser@codeplay.com (email), [frasercrmck](https://github.com/frasercrmck) (GitHub) - Tom Stellard \ tstellar@redhat.com (email), [tstellar](https://github.com/tstellar) (GitHub) + +## Inactive Maintainers + +The following people have graciously spent time performing maintainership +responsibilities but are no longer active in that role. Thank you for all your +help with the success of the project! + +Fraser Cormack \ +frasercrmck@pm.me (email), [frasercrmck](https://github.com/frasercrmck) (GitHub) From 306ece0aca25dd6a2109cbd28f1d37b9a0536d94 Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Mon, 29 Sep 2025 00:45:55 -0700 Subject: [PATCH 083/878] [llvm][NFC] Simplify alignment calculations in `TrailingObjects` (#161134) --- llvm/include/llvm/Support/TrailingObjects.h | 33 +++++---------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index d7211a930ae49..3eb7c0bd1f379 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -57,25 +57,9 @@ namespace llvm { namespace trailing_objects_internal { -/// Helper template to calculate the max alignment requirement for a set of -/// objects. -template class AlignmentCalcHelper { -private: - enum { - FirstAlignment = alignof(First), - RestAlignment = AlignmentCalcHelper::Alignment, - }; -public: - enum { - Alignment = FirstAlignment > RestAlignment ? FirstAlignment : RestAlignment - }; -}; - -template class AlignmentCalcHelper { -public: - enum { Alignment = alignof(First) }; -}; +template +inline constexpr size_t MaxAlignment = std::max({alignof(T)...}); /// The base class for TrailingObjects* classes. class TrailingObjectsBase { @@ -209,11 +193,10 @@ class alignas(Align) TrailingObjectsImpl /// See the file comment for details on the usage of the /// TrailingObjects type. 
 template <typename BaseTy, typename... TrailingTys>
-class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl<
-                            trailing_objects_internal::AlignmentCalcHelper<
-                                TrailingTys...>::Alignment,
-                            BaseTy, TrailingObjects<BaseTy, TrailingTys...>,
-                            BaseTy, TrailingTys...> {
+class TrailingObjects
+    : private trailing_objects_internal::TrailingObjectsImpl<
+          trailing_objects_internal::MaxAlignment<BaseTy, TrailingTys...>, BaseTy,
+          TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...> {
   template <int A, typename B, typename T, typename P, typename... M>
   friend class trailing_objects_internal::TrailingObjectsImpl;
 
@@ -221,8 +204,8 @@ class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl<
 
   template <typename... Tys> class Foo {};
 
   typedef trailing_objects_internal::TrailingObjectsImpl<
-      trailing_objects_internal::AlignmentCalcHelper<TrailingTys...>::Alignment,
-      BaseTy, TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...>
+      trailing_objects_internal::MaxAlignment<BaseTy, TrailingTys...>, BaseTy,
+      TrailingObjects<BaseTy, TrailingTys...>, BaseTy, TrailingTys...>
       ParentType;
   using TrailingObjectsBase = trailing_objects_internal::TrailingObjectsBase;

From 99774ec661a4487f0c334d8396487359db918cd1 Mon Sep 17 00:00:00 2001
From: ZhaoQi
Date: Mon, 29 Sep 2025 16:02:39 +0800
Subject: [PATCH 084/878] [LoongArch] Override cost hooks to expose more DAG
 combine opportunities (#157824)

---
 .../LoongArch/LoongArchISelLowering.cpp       | 17 +++++++++++++
 .../Target/LoongArch/LoongArchISelLowering.h  |  3 +++
 .../CodeGen/LoongArch/lasx/scalarize-fp.ll    | 24 +++++++------------
 .../CodeGen/LoongArch/lsx/scalarize-fp.ll     | 17 +++++--------
 4 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ecd003cae3263..098bcfa67d1d3 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -9559,3 +9559,20 @@ bool LoongArchTargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
   EVT ScalarVT = VecVT.getScalarType();
   return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
 }
+
+bool LoongArchTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+                                                      unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  // Extract a 128-bit subvector from index 0 of a 256-bit vector is free.
+  return Index == 0;
+}
+
+bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT,
+                                                   unsigned Index) const {
+  EVT EltVT = VT.getScalarType();
+
+  // Extract a scalar FP value from index 0 of a vector is free.
+  return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 3c00296116ac2..9b60a9fd53726 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -338,6 +338,9 @@ class LoongArchTargetLowering : public TargetLowering {
                                     unsigned Depth) const override;
 
   bool shouldScalarizeBinop(SDValue VecOp) const override;
+  bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+                               unsigned Index) const override;
+  bool isExtractVecEltCheap(EVT VT, unsigned Index) const override;
 
   /// Check if a constant splat can be generated using [x]vldi, where imm[12]
   /// is 1.
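As an illustrative sketch (this function is not one of the committed tests),
reporting lane-0 FP extracts as free lets the generic DAG combiner shrink a
full-width vector op whose only used lane is 0 into a scalar op, which is
what the test updates below show:

  define float @extract_lane0_fadd(<8 x float> %a, <8 x float> %b) {
    ; With isExtractVecEltCheap(v8f32, 0) returning true, the combiner can
    ; narrow the 256-bit xvfadd.s down to a scalar fadd.s.
    %s = fadd <8 x float> %a, %b
    %e = extractelement <8 x float> %s, i32 0
    ret float %e
  }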
diff --git a/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll index c93a6582b9c69..39ac647d6875c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll @@ -5,9 +5,8 @@ define <8 x float> @fadd_elt0_v8f32(float %a) nounwind { ; CHECK-LABEL: fadd_elt0_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: xvldi $xr1, -1424 -; CHECK-NEXT: xvfadd.s $xr0, $xr0, $xr1 +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 ; CHECK-NEXT: ret entry: %b = insertelement <8 x float> poison, float %a, i32 0 @@ -18,9 +17,8 @@ entry: define <4 x double> @fadd_elt0_v4f64(double %a) nounwind { ; CHECK-LABEL: fadd_elt0_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: xvldi $xr1, -912 -; CHECK-NEXT: xvfadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: vldi $vr1, -912 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 ; CHECK-NEXT: ret entry: %b = insertelement <4 x double> poison, double %a, i32 0 @@ -31,11 +29,8 @@ entry: define <8 x float> @fsub_splat_v8f32(float %a, float %b) nounwind { ; CHECK-LABEL: fsub_splat_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f1 killed $f1 def $vr1 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: vfsub.s $vr0, $vr0, $vr1 -; CHECK-NEXT: xvpermi.d $xr0, $xr0, 68 -; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 0 +; CHECK-NEXT: fsub.s $fa0, $fa0, $fa1 +; CHECK-NEXT: xvreplve0.w $xr0, $xr0 ; CHECK-NEXT: ret entry: %insa = insertelement <8 x float> poison, float %a, i32 0 @@ -49,10 +44,9 @@ entry: define <4 x double> @fsub_splat_v4f64(double %a) nounwind { ; CHECK-LABEL: fsub_splat_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: xvldi $xr1, -784 -; CHECK-NEXT: xvfadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr0, $xr0, 0 +; CHECK-NEXT: vldi $vr1, -784 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: xvreplve0.d $xr0, $xr0 ; CHECK-NEXT: ret entry: %insa = insertelement <4 x double> poison, double %a, i32 0 diff --git a/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll index cc2d3d818b412..b651f11596c82 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll @@ -5,9 +5,8 @@ define <4 x float> @fadd_elt0_v4f32(float %a) nounwind { ; CHECK-LABEL: fadd_elt0_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: vldi $vr1, -1424 -; CHECK-NEXT: vfadd.s $vr0, $vr0, $vr1 +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 ; CHECK-NEXT: ret entry: %b = insertelement <4 x float> poison, float %a, i32 0 @@ -18,9 +17,8 @@ entry: define <2 x double> @fadd_elt0_v2f64(double %a) nounwind { ; CHECK-LABEL: fadd_elt0_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 ; CHECK-NEXT: vldi $vr1, -912 -; CHECK-NEXT: vfadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 ; CHECK-NEXT: ret entry: %b = insertelement <2 x double> poison, double %a, i32 0 @@ -31,9 +29,8 @@ entry: define <4 x float> @fsub_splat_v4f32(float %b) nounwind { ; CHECK-LABEL: fsub_splat_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: vldi $vr1, -1424 -; CHECK-NEXT: vfsub.s $vr0, $vr1, $vr0 +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fsub.s $fa0, $fa1, $fa0 ; CHECK-NEXT: 
vreplvei.w $vr0, $vr0, 0
; CHECK-NEXT:    ret
 entry:
@@ -48,9 +45,7 @@ entry:
 define <2 x double> @fsub_splat_v2f64(double %a, double %b) nounwind {
 ; CHECK-LABEL: fsub_splat_v2f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    # kill: def $f1_64 killed $f1_64 def $vr1
-; CHECK-NEXT:    # kill: def $f0_64 killed $f0_64 def $vr0
-; CHECK-NEXT:    vfsub.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    fsub.d $fa0, $fa0, $fa1
 ; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
 ; CHECK-NEXT:    ret
 entry:

From f92c23d713c048f8729b66310ea3fef9715f16e5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Mon, 29 Sep 2025 09:06:25 +0100
Subject: [PATCH 085/878] [AArch64][SME][SDAG] Add basic support for exception
 handling (#159363)

This patch adds basic support for exception handling to SelectionDAG for
ZT0, ZA, and agnostic ZA state. This works based on the following
assumptions:

- To throw an exception requires calling into the runtime
  * Which will be a private ZA call (that commits the lazy save)
- Therefore, as noted in
  https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
  we will always enter the EH block with PSTATE.ZA=0 and TPIDR2_EL0=null,
  so we can emit a restore of ZA/ZT0.

Note: This patch does not handle all cases yet. Currently, there is no
support for committing agnostic ZA state before `invoke`s, regardless of
whether the callee is also agnostic (to ensure ZA state is saved on all
normal returns).
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 156 +++---
 .../test/CodeGen/AArch64/sme-za-exceptions.ll | 474 +++++++++++++++++-
 2 files changed, 565 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a4c1e265f0e63..2ffc36706fb64 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8086,13 +8086,76 @@ static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
       DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
 }
 
+// Emit a call to __arm_sme_save or __arm_sme_restore.
+static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
+                                       SelectionDAG &DAG,
+                                       AArch64FunctionInfo *Info, SDLoc DL,
+                                       SDValue Chain, bool IsSave) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  FuncInfo->setSMESaveBufferUsed();
+  TargetLowering::ArgListTy Args;
+  Args.emplace_back(
+      DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
+      PointerType::getUnqual(*DAG.getContext()));
+
+  RTLIB::Libcall LC =
+      IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy(DAG.getDataLayout()));
+  auto *RetTy = Type::getVoidTy(*DAG.getContext());
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+      TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
+  return TLI.LowerCallTo(CLI).second;
+}
+
+static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
+                                     const AArch64TargetLowering &TLI,
+                                     const AArch64RegisterInfo &TRI,
+                                     AArch64FunctionInfo &FuncInfo,
+                                     SelectionDAG &DAG) {
+  // Conditionally restore the lazy save using a pseudo node.
+  RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
+  TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
+  SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
+      DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC)));
+  SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
+      TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
+  SDValue TPIDR2_EL0 = DAG.getNode(
+      ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
+      DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
+  // Copy the address of the TPIDR2 block into X0 before 'calling' the
+  // RESTORE_ZA pseudo.
+  SDValue Glue;
+  SDValue TPIDR2Block = DAG.getFrameIndex(
+      TPIDR2.FrameIndex,
+      DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
+  Chain =
+      DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
+                  {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
+                   RestoreRoutine, RegMask, Chain.getValue(1)});
+  // Finally reset the TPIDR2_EL0 register to 0.
+  Chain = DAG.getNode(
+      ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
+      DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
+      DAG.getConstant(0, DL, MVT::i64));
+  TPIDR2.Uses++;
+  return Chain;
+}
+
 SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
                                                SelectionDAG &DAG) const {
   assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
 
   SDValue Glue = Chain.getValue(1);
 
   MachineFunction &MF = DAG.getMachineFunction();
-  SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
+  auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
+  auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+
+  SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
 
   // The following conditions are true on entry to an exception handler:
   // - PSTATE.SM is 0.
@@ -8107,14 +8170,43 @@ SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
   // These mode changes are usually optimized away in catch blocks as they
   // occur before the __cxa_begin_catch (which is a non-streaming function),
   // but are necessary in some cases (such as for cleanups).
+  //
+  // Additionally, if the function has ZA or ZT0 state, we must restore it.
+ // [COND_]SMSTART SM if (SMEFnAttrs.hasStreamingInterfaceOrBody()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, - /*Glue*/ Glue, AArch64SME::Always); + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, + /*Glue*/ Glue, AArch64SME::Always); + else if (SMEFnAttrs.hasStreamingCompatibleInterface()) + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, + AArch64SME::IfCallerIsStreaming); + + if (getTM().useNewSMEABILowering()) + return Chain; - if (SMEFnAttrs.hasStreamingCompatibleInterface()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, - AArch64SME::IfCallerIsStreaming); + if (SMEFnAttrs.hasAgnosticZAInterface()) { + // Restore full ZA + Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain, + /*IsSave=*/false); + } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) { + // SMSTART ZA + Chain = DAG.getNode( + AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, + DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32)); + + // Restore ZT0 + if (SMEFnAttrs.hasZT0State()) { + SDValue ZT0FrameIndex = + getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex}); + } + + // Restore ZA + if (SMEFnAttrs.hasZAState()) + Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG); + } return Chain; } @@ -9232,30 +9324,6 @@ SDValue AArch64TargetLowering::changeStreamingMode( return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1)); } -// Emit a call to __arm_sme_save or __arm_sme_restore. -static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, - SelectionDAG &DAG, - AArch64FunctionInfo *Info, SDLoc DL, - SDValue Chain, bool IsSave) { - MachineFunction &MF = DAG.getMachineFunction(); - AArch64FunctionInfo *FuncInfo = MF.getInfo(); - FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; - Args.emplace_back( - DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), - PointerType::getUnqual(*DAG.getContext())); - - RTLIB::Libcall LC = - IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); - auto *RetTy = Type::getVoidTy(*DAG.getContext()); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); - return TLI.LowerCallTo(CLI).second; -} - static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs) { if (!CallAttrs.caller().hasStreamingCompatibleInterface() || @@ -10015,33 +10083,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); if (RequiresLazySave) { - // Conditionally restore the lazy save using a pseudo node. - RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. 
- SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - TPIDR2.FrameIndex, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = - DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. - Result = DAG.getNode( - ISD::INTRINSIC_VOID, DL, MVT::Other, Result, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64)); - TPIDR2.Uses++; + Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG); } else if (RequiresSaveAllZA) { Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index fc43c714d69b3..b6dee97ea2962 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG ; A simple EH test case that corresponds to the following C++ source: ; @@ -87,6 +88,90 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl _Unwind_Resume +; +; CHECK-SDAG-LABEL: za_with_raii: +; CHECK-SDAG: .Lfunc_begin0: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception0 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: tbnz w0, #0, .LBB0_2 +; CHECK-SDAG-NEXT: // %bb.1: // %return_normally +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; CHECK-SDAG-NEXT: .LBB0_2: // %throw_exception +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: mov w0, #8 // =0x8 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl __cxa_allocate_exception +; CHECK-SDAG-NEXT: mov x8, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x9, .LBB0_4 +; CHECK-SDAG-NEXT: // %bb.3: // %throw_exception +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_4: // %throw_exception +; CHECK-SDAG-NEXT: adrp x9, .L.str +; CHECK-SDAG-NEXT: add x9, x9, :lo12:.L.str +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: str x9, [x8] +; CHECK-SDAG-NEXT: .Ltmp0: // EH_LABEL +; CHECK-SDAG-NEXT: adrp x1, :got:typeinfo_for_char_const_ptr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: mov x0, x8 +; CHECK-SDAG-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr] +; CHECK-SDAG-NEXT: mov x2, xzr +; CHECK-SDAG-NEXT: bl __cxa_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_6 +; CHECK-SDAG-NEXT: // %bb.5: // %throw_exception +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_6: // %throw_exception +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp1: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.7: // %throw_fail +; CHECK-SDAG-NEXT: .LBB0_8: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp2: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_10 +; CHECK-SDAG-NEXT: // %bb.9: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_10: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_12 +; CHECK-SDAG-NEXT: // %bb.11: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_12: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr br i1 %fail, label %throw_exception, label %return_normally throw_exception: @@ -124,7 +209,7 @@ throw_fail: ; } ; shared_za_call(); ; } -define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { +define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: try_catch: ; CHECK: .Lfunc_begin1: ; CHECK-NEXT: 
.cfi_startproc @@ -142,11 +227,11 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .Ltmp3: // EH_LABEL ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl may_throw -; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: .Ltmp4: // EH_LABEL ; CHECK-NEXT: .LBB1_1: // %after_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -160,7 +245,7 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call ; CHECK-NEXT: .LBB1_4: // %catch -; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: .Ltmp5: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -175,6 +260,78 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl __cxa_end_catch ; CHECK-NEXT: b .LBB1_1 +; +; CHECK-SDAG-LABEL: try_catch: +; CHECK-SDAG: .Lfunc_begin1: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception1 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp3: // EH_LABEL +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp4: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB1_3: // %after_catch +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; CHECK-SDAG-NEXT: .LBB1_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp5: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl 
__cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB1_3 invoke void @may_throw() to label %after_catch unwind label %catch @@ -235,16 +392,16 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: zero {za} ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: .Ltmp6: // EH_LABEL ; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: .Ltmp7: // EH_LABEL ; CHECK-NEXT: .LBB2_3: // %exit ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_4: // %catch -; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: .Ltmp8: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -260,6 +417,78 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: bl __cxa_end_catch ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: b .LBB2_3 +; +; CHECK-SDAG-LABEL: try_catch_shared_za_callee: +; CHECK-SDAG: .Lfunc_begin2: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception2 +; CHECK-SDAG-NEXT: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB2_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB2_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: .Ltmp6: // EH_LABEL +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: .Ltmp7: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB2_3: // %exit +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB2_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp8: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl 
__arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl noexcept_shared_za_call +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB2_3 invoke void @shared_za_call() #4 to label %exit unwind label %catch catch: @@ -275,6 +504,234 @@ exit: ret void } +; A simple ZT0 exception example that corresponds to: +; +; struct ZT0Resource { +; ~ZT0Resource() __arm_inout("zt0") { +; shared_zt0_call(); // simulate cleanup in destructor +; } +; }; +; +; void za_with_raii() __arm_inout("zt0") { +; ZT0Resource r; +; may_throw(); +; } +; +; This code may require reloading ZT0 in the cleanup for ~ZT0Resource(). +; +; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented). +define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_shared_zt0_callee: +; CHECK: .Lfunc_begin3: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception3 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .Ltmp9: // EH_LABEL +; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: .Ltmp10: // EH_LABEL +; CHECK-NEXT: // %bb.1: // %return_normally +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_2: // %unwind_dtors +; CHECK-NEXT: .Ltmp11: // EH_LABEL +; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB3_4 +; CHECK-NEXT: // %bb.3: // %unwind_dtors +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_4: // %unwind_dtors +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: bl shared_zt0_call +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; +; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee: +; CHECK-SDAG: .Lfunc_begin3: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception3 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: sub sp, sp, #96 +; CHECK-SDAG-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 96 +; 
CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -32 +; CHECK-SDAG-NEXT: .Ltmp9: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: .Ltmp10: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.1: // %return_normally +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #96 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB3_2: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp11: // EH_LABEL +; CHECK-SDAG-NEXT: mov x20, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: bl shared_zt0_call +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] + invoke void @may_throw() + to label %return_normally unwind label %unwind_dtors + +unwind_dtors: + %5 = landingpad { ptr, i32 } + cleanup + tail call void @shared_zt0_call() + resume { ptr, i32 } %5 + +return_normally: + ret void +} + +; This example corresponds to: +; +; __arm_agnostic("sme_za_state") void try_catch_agnostic_za() +; { +; try { +; may_throw(); +; } catch(...) { +; } +; } +; +; In this example we must execute __arm_sme_restore once we enter the catch block +; (before executing __arm_sme_save again, which would invalidate the prior save). +define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_agnostic_za: +; CHECK: .Lfunc_begin4: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception4 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .Ltmp12: // EH_LABEL +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp13: // EH_LABEL +; CHECK-NEXT: .LBB4_1: // %exit +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_2: // %catch +; CHECK-NEXT: .Ltmp14: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB4_1 +; +; CHECK-SDAG-LABEL: try_catch_agnostic_za: +; CHECK-SDAG: .Lfunc_begin4: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception4 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: .Ltmp12: // EH_LABEL +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: .Ltmp13: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB4_1: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB4_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp14: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: b .LBB4_1 + invoke void @may_throw() + to label %exit unwind label %catch +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) @@ -284,3 +741,4 @@ declare i32 @__gxx_personality_v0(...) declare void @may_throw() declare void @shared_za_call() "aarch64_inout_za" declare void @noexcept_shared_za_call() "aarch64_inout_za" +declare void @shared_zt0_call() "aarch64_inout_zt0" From b4be7ecaf06bfcb4aa8d47c4fda1eed9bbe4ae77 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 29 Sep 2025 09:08:09 +0100 Subject: [PATCH 086/878] [VPlan] Compute cost of more replicating loads/stores in ::computeCost. (#160053) Update VPReplicateRecipe::computeCost to compute costs of more replicating loads/stores. There are 2 cases that require extra checks to match the legacy cost model: 1. If the pointer is based on an induction, the legacy cost model passes its SCEV to getAddressComputationCost. In those cases, still fall back to the legacy cost. SCEV computations will be added as follow-up 2. If a load is used as part of an address of another load, the legacy cost model skips the scalarization overhead. Those cases are currently handled by a usedByLoadOrStore helper. Note that getScalarizationOverhead also needs updating, because when the legacy cost model computes the scalarization overhead, scalars have not been collected yet, so we can't each for replicating recipes to skip their cost, except other loads. This again can be further improved by modeling inserts/extracts explicitly and consistently, and compute costs for those operations directly where needed. 
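For illustration, here is a minimal IR sketch of case 2 (a hypothetical
example, not taken from the patch): the value produced by one replicating
load is used as the address of another load, so its scalarization overhead
is skipped.

```llvm
define void @load_feeds_load_address(ptr %base, ptr %out, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr inbounds ptr, ptr %base, i64 %iv
  %addr = load ptr, ptr %gep   ; replicating load ...
  %val = load i32, ptr %addr   ; ... whose result is used as a load address
  %out.gep = getelementptr inbounds i32, ptr %out, i64 %iv
  store i32 %val, ptr %out.gep
  %iv.next = add nuw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %n
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}
```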
PR: https://github.com/llvm/llvm-project/pull/160053
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 16 ++-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  7 +-
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h  | 16 ++-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 118 ++++++++++++++++--
 4 files changed, 130 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 96f52076b1837..ab5c9c99b9448 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3902,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;
 
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
+                          *CM.PSE.getSE());
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(Iter)) {
@@ -4159,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
       // Add on other costs that are modelled in VPlan, but not in the legacy
       // cost model.
-      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
+      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
+                            *CM.PSE.getSE());
       VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
       assert(VectorRegion && "Expected to have a vector region!");
       for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
@@ -6834,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7067,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
+                        *CM.PSE.getSE());
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
   // with early exits and plans with additional VPlan simplifications. The
@@ -8597,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -10054,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 81f1956c96254..728d29107808d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1750,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef Operands, ElementCount VF) { + Type *ResultTy, ArrayRef Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1770,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet UniqueOperands; SmallVector Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa(Op) || + if (Op->isLiveIn() || + (!AlwaysIncludeReplicatingR && + isa(Op)) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774b7c838..2a8baec74b72b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,12 +349,14 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; + ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) + TargetTransformInfo::TargetCostKind CostKind, + ScalarEvolution &SE) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind) {} + CostKind(CostKind), SE(SE) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -374,10 +376,12 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Type *ResultTy, - ArrayRef Operands, - ElementCount VF); + /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR + /// is true, always compute the cost of scalarizing replicating operands. + InstructionCost + getScalarizationOverhead(Type *ResultTy, ArrayRef Operands, + ElementCount VF, + bool AlwaysIncludeReplicatingR = false); }; /// This class can be used to assign names to VPValues. 
For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index cf5e6bf0ac418..b5e30cb1fa655 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3069,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const { }); } +/// Returns true if \p Ptr is a pointer computation for which the legacy cost +/// model computes a SCEV expression when computing the address cost. +static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { + auto *PtrR = Ptr->getDefiningRecipe(); + if (!PtrR || !((isa(PtrR) && + cast(PtrR)->getOpcode() == + Instruction::GetElementPtr) || + isa(PtrR))) + return false; + + // We are looking for a GEP where all indices are either loop invariant or + // inductions. + for (VPValue *Opd : drop_begin(PtrR->operands())) { + if (!Opd->isDefinedOutsideLoopRegions() && + !isa(Opd)) + return false; + } + + return true; +} + +/// Returns true if \p V is used as part of the address of another load or +/// store. +static bool isUsedByLoadStoreAddress(const VPUser *V) { + SmallPtrSet Seen; + SmallVector WorkList = {V}; + + while (!WorkList.empty()) { + auto *Cur = dyn_cast(WorkList.pop_back_val()); + if (!Cur || !Seen.insert(Cur).second) + continue; + + for (VPUser *U : Cur->users()) { + if (auto *InterleaveR = dyn_cast(U)) + if (InterleaveR->getAddr() == Cur) + return true; + if (auto *RepR = dyn_cast(U)) { + if (RepR->getOpcode() == Instruction::Load && + RepR->getOperand(0) == Cur) + return true; + if (RepR->getOpcode() == Instruction::Store && + RepR->getOperand(1) == Cur) + return true; + } + if (auto *MemR = dyn_cast(U)) { + if (MemR->getAddr() == Cur && MemR->isConsecutive()) + return true; + } + } + + append_range(WorkList, cast(Cur)->users()); + } + return false; +} + InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast(getUnderlyingValue()); @@ -3176,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (isSingleScalar()) { - bool IsLoad = UI->getOpcode() == Instruction::Load; - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); - return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); - } + if (VF.isScalable() && !isSingleScalar()) + return InstructionCost::getInvalid(); + // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - break; + const VPRegionBlock *ParentRegion = getParent()->getParent(); + if (ParentRegion && ParentRegion->isReplicator()) + break; + + bool IsLoad = UI->getOpcode() == Instruction::Load; + const VPValue *PtrOp = getOperand(!IsLoad); + // TODO: Handle cases where we need to pass a SCEV to + // getAddressComputationCost. + if (shouldUseAddressAccessSCEV(PtrOp)) + break; + + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? 
this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); + + Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); + + InstructionCost ScalarCost = + ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + PtrTy, &Ctx.SE, nullptr, Ctx.CostKind); + if (isSingleScalar()) + return ScalarCost; + + SmallVector OpsToScalarize; + Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); + // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we + // don't assign scalarization overhead in general, if the target prefers + // vectorized addressing or the loaded value is used as part of an address + // of another load or store. + bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); + if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) { + bool EfficientVectorLoadStore = + Ctx.TTI.supportsEfficientVectorElementLoadStore(); + if (!(IsLoad && !PreferVectorizedAddressing) && + !(!IsLoad && EfficientVectorLoadStore)) + append_range(OpsToScalarize, operands()); + + if (!EfficientVectorLoadStore) + ResultTy = Ctx.Types.inferScalarType(this); + } + + return (ScalarCost * VF.getFixedValue()) + + Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); } } From 600e97dfeafc9a8c691e3a0b6ad744adf99b9a78 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 09:04:25 -0700 Subject: [PATCH 087/878] [MLIR] Apply clang-tidy fixes for modernize-use-emplace in LoopEmitter.cpp (NFC) --- mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp index 659282a995123..f53950242e10c 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp @@ -344,7 +344,7 @@ void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) { // Reverse queue into a stack. 
std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
     for (auto [loop, coeff] : dependentLvlMap[t][lvl])
-      depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
+      depRedOrder.emplace_back(loop, t, lvl);
   }
 
   if (depRedOrder.empty())

From a40918b3fe0c73e3fdbf1cc7cc17823f0d5b83dd Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Thu, 21 Aug 2025 10:22:21 -0700
Subject: [PATCH 088/878] [MLIR] Apply clang-tidy fixes for
 misc-use-internal-linkage in JitRunner.cpp (NFC)

---
 mlir/lib/ExecutionEngine/JitRunner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp
index 0ada4cc96570a..db0516533afcb 100644
--- a/mlir/lib/ExecutionEngine/JitRunner.cpp
+++ b/mlir/lib/ExecutionEngine/JitRunner.cpp
@@ -271,7 +271,7 @@ Error checkCompatibleReturnType(LLVM::LLVMFuncOp mainFunction) {
   return Error::success();
 }
 template <typename Type>
-Error compileAndExecuteSingleReturnFunction(
+static Error compileAndExecuteSingleReturnFunction(
     Options &options, Operation *module, StringRef entryPoint,
     CompileAndExecuteConfig config, std::unique_ptr<llvm::TargetMachine> tm) {
   auto mainFunction = dyn_cast_or_null<LLVM::LLVMFuncOp>(

From 8907adc28cb8c77251176c513ec9f8abd5def5b4 Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Thu, 21 Aug 2025 06:50:48 -0700
Subject: [PATCH 089/878] [MLIR] Apply clang-tidy fixes for
 bugprone-argument-comment in SubgroupReduceLowering.cpp (NFC)

---
 mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
index b45fdf34e78e1..81c3069cec16e 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp
@@ -430,7 +430,7 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op,
       dpp = ROCDL::PermlaneX16Op::create(rewriter, loc, res.getType(), res,
                                          res, uint32Max, uint32Max,
                                          /*fi=*/true,
-                                         /*bound_ctrl=*/false);
+                                         /*boundControl=*/false);
       res = vector::makeArithReduction(
           rewriter, loc, gpu::convertReductionKind(mode), res, dpp);
     } else {

From 2e19666600c91952e39a1fb482594598880a8bd6 Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Thu, 21 Aug 2025 05:36:01 -0700
Subject: [PATCH 090/878] [MLIR] Apply clang-tidy fixes for
 bugprone-argument-comment in PrintCallHelper.cpp (NFC)

---
 mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
index d95aeba8a4488..da4443dc86053 100644
--- a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
+++ b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp
@@ -67,7 +67,7 @@ LogicalResult mlir::LLVM::createPrintStrCall(
   auto arrayTy =
       LLVM::LLVMArrayType::get(IntegerType::get(ctx, 8), elementVals.size());
   auto globalOp = LLVM::GlobalOp::create(
-      builder, loc, arrayTy, /*constant=*/true, LLVM::Linkage::Private,
+      builder, loc, arrayTy, /*isConstant=*/true, LLVM::Linkage::Private,
       ensureSymbolNameIsUnique(moduleOp, symbolName, symbolTables), dataAttr);
   auto ptrTy = LLVM::LLVMPointerType::get(builder.getContext());

From 631b89cc4766d2b8f0f5a17c6d90fe920436a2c9 Mon Sep 17 00:00:00 2001
From: benwu25
Date: Mon, 29 Sep 2025 01:39:31 -0700
Subject: [PATCH 091/878] [MLIR][CF] Avoid collapsing blocks which participate
 in cycles (#160783)

Previously, collapseBranch did not
return failure for successor blocks which were part of a cycle.
mlir-opt --canonicalize would run indefinitely for any N-block cycle
which is kicked off with an unconditional jump. The simplifyPassThroughBr
transform would continue alternating which block was targeted in ^bb0,
resulting in an infinite loop.

collapseBranch will not result in any useful transformation on blocks
which participate in cycles, since the block is aliased by a different
block. To avoid this, we can check for cycles in collapseBranch and abort
when one is detected. Simplification of the cycle is left for other
transforms.

Fixes #159743.
---
 .../OpenMP/infinite-loop-in-construct.f90     |   6 +-
 .../Dialect/ControlFlow/IR/ControlFlowOps.cpp |  10 ++
 .../Dialect/ControlFlow/canonicalize.mlir     | 144 ++++++++++++++++++
 3 files changed, 158 insertions(+), 2 deletions(-)

diff --git a/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90 b/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90
index 16b400a231860..f02d0e5ccc53c 100644
--- a/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90
+++ b/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90
@@ -8,8 +8,10 @@
 ! CHECK: cf.cond_br %{{[0-9]+}}, ^bb1, ^bb2
 ! CHECK-NEXT: ^bb1: // pred: ^bb0
 ! CHECK: cf.br ^bb2
-! CHECK-NEXT: ^bb2: // 3 preds: ^bb0, ^bb1, ^bb2
-! CHECK-NEXT: cf.br ^bb2
+! CHECK-NEXT: ^bb2: // 2 preds: ^bb0, ^bb1
+! CHECK: cf.br ^bb3
+! CHECK-NEXT: ^bb3: // 2 preds: ^bb2, ^bb3
+! CHECK: cf.br ^bb3
 ! CHECK-NEXT: }
 
 subroutine sb(ninter, numnod)
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
index 582593adfa5c0..f1da1a125e9ef 100644
--- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp
@@ -122,6 +122,16 @@ static LogicalResult collapseBranch(Block *&successor,
   Block *successorDest = successorBranch.getDest();
   if (successorDest == successor)
     return failure();
+  // Don't try to collapse branches which participate in a cycle.
+  BranchOp nextBranch = dyn_cast<BranchOp>(successorDest->getTerminator());
+  llvm::DenseSet<Block *> visited{successor, successorDest};
+  while (nextBranch) {
+    Block *nextBranchDest = nextBranch.getDest();
+    if (visited.contains(nextBranchDest))
+      return failure();
+    visited.insert(nextBranchDest);
+    nextBranch = dyn_cast<BranchOp>(nextBranchDest->getTerminator());
+  }
 
   // Update the operands to the successor. If the branch parent has no
   // arguments, we can use the branch operands directly.
diff --git a/mlir/test/Dialect/ControlFlow/canonicalize.mlir b/mlir/test/Dialect/ControlFlow/canonicalize.mlir
index bf69935a00bf0..17f7d28ba59fb 100644
--- a/mlir/test/Dialect/ControlFlow/canonicalize.mlir
+++ b/mlir/test/Dialect/ControlFlow/canonicalize.mlir
@@ -490,3 +490,147 @@ func.func @branchCondProp(%arg0: i1) {
 ^exit:
   return
 }
+
+// -----
+
+/// Test that control-flow cycles are not simplified infinitely.
+ +// CHECK-LABEL: @cycle_2_blocks +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @cycle_2_blocks() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb1 +} + +// CHECK-LABEL: @no_cycle_2_blocks +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 +// CHECK: return %[[VAL_0]] : i32 +func.func @no_cycle_2_blocks() -> i32 { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + %ret = arith.constant 1 : i32 + return %ret : i32 +} + +// CHECK-LABEL: @cycle_4_blocks +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @cycle_4_blocks() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb1 +} + +// CHECK-LABEL: @no_cycle_4_blocks +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 +// CHECK: return %[[VAL_0]] : i32 +func.func @no_cycle_4_blocks() -> i32 { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb5 +^bb5: + %ret = arith.constant 1 : i32 + return %ret : i32 +} + +// CHECK-LABEL: @delayed_3_cycle +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @delayed_3_cycle() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb5 +^bb5: + cf.br ^bb3 +} + +// CHECK-LABEL: @cycle_1_block +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @cycle_1_block() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb2 +} + +// CHECK-LABEL: @unsimplified_cycle_1 +// CHECK-SAME: %[[ARG0:.*]]: i1) { +// CHECK: cf.cond_br %[[ARG0]], ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: cf.br ^bb2 +// CHECK: ^bb2: +// CHECK: cf.br ^bb3 +// CHECK: ^bb3: +// CHECK: cf.br ^bb3 +func.func @unsimplified_cycle_1(%c : i1) { + cf.cond_br %c, ^bb1, ^bb2 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb3 +} + +// Make sure we terminate when other cf passes can't help us. + +// CHECK-LABEL: @unsimplified_cycle_2 +// CHECK-SAME: %[[ARG0:.*]]: i1) { +// CHECK: cf.cond_br %[[ARG0]], ^bb1, ^bb3 +// CHECK: ^bb1: +// CHECK: cf.br ^bb2 {A} +// CHECK: ^bb2: +// CHECK: cf.br ^bb2 {E} +// CHECK: ^bb3: +// CHECK: cf.br ^bb1 +func.func @unsimplified_cycle_2(%c : i1) { + cf.cond_br %c, ^bb6, ^bb7 +^bb6: + cf.br ^bb5 {F} +^bb5: + cf.br ^bb1 {A} +^bb1: + cf.br ^bb2 {B} +^bb2: + cf.br ^bb3 {C} +^bb3: + cf.br ^bb4 {D} +^bb4: + cf.br ^bb1 {E} +^bb7: + cf.br ^bb6 +} From edc76e15ed9bf4a55d866dcd7d6e196f793903d7 Mon Sep 17 00:00:00 2001 From: Keshav Vinayak Jha <31160700+keshavvinayak01@users.noreply.github.com> Date: Mon, 29 Sep 2025 14:17:36 +0530 Subject: [PATCH 092/878] [ROCDL][LLVM] Added rocdl.fmed3 -> Intrinsic::amdgcn_fmed3 (#159332) ## Description Added ROCDL fmed3 op to support rewrite to `amdgcn_fmed3` intrinsic. 
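For reference, here is a sketch of the op's semantics in terms of standard
`arith` ops (a hypothetical expansion, ignoring the NaN-handling details of
the hardware instruction; not part of this patch):

```mlir
// Equivalent of: %r = rocdl.fmed3 %a, %b, %c : f32
%min_ab = arith.minimumf %a, %b : f32
%max_ab = arith.maximumf %a, %b : f32
%mid    = arith.minimumf %max_ab, %c : f32
%r      = arith.maximumf %min_ab, %mid : f32
```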
## Testing
- ROCDL -> LLVMIR lit tests for the new `rocdl.fmed3` ops in
  `/test/Target/LLVMIR/rocdl.mlir`

Addresses [#157052](https://github.com/llvm/llvm-project/issues/157052)

---------

Signed-off-by: keshavvinayak01
---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 31 ++++++++++++++++++++
 mlir/test/Dialect/LLVMIR/rocdl.mlir          | 14 +++++++++
 mlir/test/Target/LLVMIR/rocdl.mlir           | 14 +++++++++
 3 files changed, 59 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 8759f1dc3269d..8b687a7f29bef 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -1360,6 +1360,37 @@ def ROCDL_CvtScaleF32PkFp4F32Op :
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// FMED3 operations
+//===----------------------------------------------------------------------===//
+
+def ROCDL_FMed3Op : ROCDL_IntrOp<"fmed3", [0], [], [Pure, AllTypesMatch<["res", "src0", "src1", "src2"]>], 1>,
+  Arguments<(ins LLVM_ScalarOrVectorOf<AnyFloat>:$src0,
+                 LLVM_ScalarOrVectorOf<AnyFloat>:$src1,
+                 LLVM_ScalarOrVectorOf<AnyFloat>:$src2)> {
+  let results = (outs LLVM_ScalarOrVectorOf<AnyFloat>:$res);
+  let summary = "Median of three float/half values";
+  let description = [{
+    Computes the median of three floating-point values using the AMDGPU fmed3 intrinsic.
+    This operation is equivalent to `max(min(a, b), min(max(a, b), c))` but uses the
+    hardware-accelerated V_MED3_F16/V_MED3_F32 instruction for better performance.
+
+    The operation supports both scalar and vector floating-point types (f16, f32).
+
+    Example:
+    ```mlir
+    // Scalar f32 median
+    %result = rocdl.fmed3 %a, %b, %c : f32
+
+    // Vector f16 median
+    %result = rocdl.fmed3 %va, %vb, %vc : vector<4xf16>
+    ```
+  }];
+  let assemblyFormat = [{
+    $src0 `,` $src1 `,` $src2 attr-dict `:` type($res)
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // ROCDL target attribute.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index e127fdb78a861..0bad151570029 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -29,6 +29,20 @@ func.func @rocdl_special_regs() -> i32 {
   llvm.return %0 : i32
 }
 
+func.func @rocdl.fmed3.scalar(%a: f32, %b: f32, %c: f32) -> f32 {
+  // CHECK-LABEL: rocdl.fmed3.scalar
+  // CHECK: %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f32
+  %0 = rocdl.fmed3 %a, %b, %c : f32
+  llvm.return %0 : f32
+}
+
+func.func @rocdl.fmed3.vector(%a: vector<4xf16>, %b: vector<4xf16>, %c: vector<4xf16>) -> vector<4xf16> {
+  // CHECK-LABEL: rocdl.fmed3.vector
+  // CHECK: %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : vector<4xf16>
+  %0 = rocdl.fmed3 %a, %b, %c : vector<4xf16>
+  llvm.return %0 : vector<4xf16>
+}
+
 func.func @rocdl.barrier() {
   // CHECK: rocdl.barrier
   rocdl.barrier
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index c629877b69b4e..e043a8c533d05 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -1298,6 +1298,20 @@ llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 {
   llvm.return %ret : i32
 }
 
+llvm.func @test_fmed3_f16(%arg0: f16, %arg1: f16, %arg2: f16) -> f16 {
+  // CHECK-LABEL: define half @test_fmed3_f16(half %0, half %1, half %2)
+  %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f16
+  llvm.return %0 : f16
+  // CHECK: call half @llvm.amdgcn.fmed3.f16(half %0, half %1, half %2)
+}
+
+llvm.func @test_fmed3_f32(%arg0: f32, %arg1: f32, %arg2: f32) -> f32 {
+  // CHECK-LABEL: define float @test_fmed3_f32(float %0, float %1, float %2)
+  %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f32
+  llvm.return %0 : f32
+  // CHECK: call float @llvm.amdgcn.fmed3.f32(float %0, float %1, float %2)
+}
+
 // CHECK-LABEL: rocdl.cvt.scale.pk8
 // CHECK-SAME:(i32 %[[I32:.+]], <2 x i32> %[[V2I32:.+]], i32 %[[SCALE:.+]])
 llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) {

From 8d57211d6f33513ccee911a5ac98d8a31d72dcde Mon Sep 17 00:00:00 2001
From: guan jian
Date: Mon, 29 Sep 2025 16:53:50 +0800
Subject: [PATCH 093/878] [LLVM][AArch64] Optimize sign bit tests with TST
 instruction for SIGN_EXTEND patterns (#158061)

Hi, I recently found that in some cases LLVM generates suboptimal code for
sign-bit tests, such as:

```
sxtb w8, w0
cmp w8, #0
csel w0, w1, w2, lt
```

when it could instead test the sign bit directly:

```
tst w0, #0x80
csel w0, w1, w2, mi
```

This optimization is only applied when the following conditions are met:
1. The comparison is setlt (signed less than).
2. The right-hand side is zero.
3. The left-hand side is a sign extension operation (SIGN_EXTEND or
   SIGN_EXTEND_INREG).
4. The sign-extended value has only one use (hasOneUse()).
5. The original type is an integer type.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 22 ++++++++
 .../check-sign-bit-before-extension.ll        | 16 +++---
 llvm/test/CodeGen/AArch64/icmp.ll             | 51 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/vecreduce-bool.ll   | 36 ++++++------
 4 files changed, 97 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2ffc36706fb64..899baa9c998ec 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11778,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
     return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
   }
 
+  // Check for sign bit test patterns that can use TST optimization.
+ // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + // (SELECT_CC setlt, sign_extend, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND)) { + + uint64_t SignBitPos; + std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); + EVT TestVT = LHS.getValueType(); + SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT); + SDValue TST = + DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32), + LHS, SignBitConst); + + SDValue Flags = TST.getValue(1); + return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal, + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags); + } + // Canonicalise absolute difference patterns: // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc diff --git a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll index 0960c4c2a3342..a56d5b1b49b38 100644 --- a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll +++ b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll @@ -78,9 +78,8 @@ B: define i32 @g_i8_sign_extend_inreg(i8 %in, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: g_i8_sign_extend_inreg: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w8, w1, w2, mi +; CHECK-NEXT: tst w0, #0x80 +; CHECK-NEXT: csel w8, w1, w2, ne ; CHECK-NEXT: add w0, w8, w0, uxtb ; CHECK-NEXT: ret entry: @@ -100,9 +99,8 @@ B: define i32 @g_i16_sign_extend_inreg(i16 %in, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: g_i16_sign_extend_inreg: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w8, w1, w2, mi +; CHECK-NEXT: tst w0, #0x8000 +; CHECK-NEXT: csel w8, w1, w2, ne ; CHECK-NEXT: add w0, w8, w0, uxth ; CHECK-NEXT: ret entry: @@ -167,10 +165,8 @@ B: define i64 @g_i32_sign_extend_i64(i32 %in, i64 %a, i64 %b) nounwind { ; CHECK-LABEL: g_i32_sign_extend_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csel x8, x1, x2, mi +; CHECK-NEXT: tst w0, #0x80000000 +; CHECK-NEXT: csel x8, x1, x2, ne ; CHECK-NEXT: add x0, x8, w0, uxtw ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 18665bcbeae83..7195e2b2f1255 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -2093,3 +2093,54 @@ define <2 x i1> @icmp_slt_v2i64_Zero_LHS(<2 x i64> %a) { %c = icmp slt <2 x i64> , %a ret <2 x i1> %c } + +; Test TST optimization for i8 sign bit testing with cross-type select +; This tests the pattern: icmp slt i8 %val, 0; select i1 %cmp, i32 %a, i32 %b +; The optimization should convert sxtb+cmp to tst for sign bit testing. 
+ +define i32 @i8_signbit_tst_constants(i8 %x, i8 %y) { +; CHECK-SD-LABEL: i8_signbit_tst_constants: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w9, w0, w1 +; CHECK-SD-NEXT: mov w8, #42 // =0x2a +; CHECK-SD-NEXT: tst w9, #0x80 +; CHECK-SD-NEXT: mov w9, #20894 // =0x519e +; CHECK-SD-NEXT: csel w0, w9, w8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i8_signbit_tst_constants: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, w1 +; CHECK-GI-NEXT: mov w9, #42 // =0x2a +; CHECK-GI-NEXT: mov w10, #20894 // =0x519e +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: cmp w8, #0 +; CHECK-GI-NEXT: csel w0, w10, w9, mi +; CHECK-GI-NEXT: ret + %add = add i8 %x, %y + %cmp = icmp slt i8 %add, 0 + %sel = select i1 %cmp, i32 20894, i32 42 + ret i32 %sel +} + +; Test i8 sign bit testing with variable select values (problematic case) +define i32 @i8_signbit_variables(i8 %x, i8 %y, i32 %a, i32 %b) { +; CHECK-SD-LABEL: i8_signbit_variables: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x80 +; CHECK-SD-NEXT: csel w0, w2, w3, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i8_signbit_variables: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, w1 +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: cmp w8, #0 +; CHECK-GI-NEXT: csel w0, w2, w3, mi +; CHECK-GI-NEXT: ret + %add = add i8 %x, %y + %cmp = icmp slt i8 %add, 0 + %sel = select i1 %cmp, i32 %a, i32 %b + ret i32 %sel +} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll index 62d41fca10db3..19e1aa5d152ce 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll @@ -26,9 +26,9 @@ define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) @@ -120,9 +120,9 @@ define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) @@ -305,9 +305,9 @@ define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_or_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) @@ -399,9 +399,9 @@ define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_or_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: 
ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) @@ -584,9 +584,9 @@ define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_xor_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) @@ -679,9 +679,9 @@ define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_xor_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) From 97367d1046a2ec81e9b4e708ae7acdc83d99dcf7 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 29 Sep 2025 11:10:19 +0200 Subject: [PATCH 094/878] [libc++] Vectorize std::find (#156431) ``` Apple M4: ----------------------------------------------------------------------------- Benchmark old new ----------------------------------------------------------------------------- std::find(vector) (bail 25%)/8 1.43 ns 1.44 ns std::find(vector) (bail 25%)/1024 5.54 ns 5.59 ns std::find(vector) (bail 25%)/8192 38.4 ns 39.1 ns std::find(vector) (bail 25%)/32768 134 ns 136 ns std::find(vector) (bail 25%)/8 1.56 ns 1.57 ns std::find(vector) (bail 25%)/1024 65.3 ns 65.4 ns std::find(vector) (bail 25%)/8192 465 ns 464 ns std::find(vector) (bail 25%)/32768 1832 ns 1832 ns std::find(vector) (bail 25%)/8 0.920 ns 1.20 ns std::find(vector) (bail 25%)/1024 65.2 ns 31.2 ns std::find(vector) (bail 25%)/8192 464 ns 255 ns std::find(vector) (bail 25%)/32768 1833 ns 992 ns std::find(vector) (process all)/8 1.21 ns 1.22 ns std::find(vector) (process all)/50 1.92 ns 1.93 ns std::find(vector) (process all)/1024 16.6 ns 16.9 ns std::find(vector) (process all)/8192 134 ns 136 ns std::find(vector) (process all)/32768 488 ns 503 ns std::find(vector) (process all)/8 2.45 ns 2.48 ns std::find(vector) (process all)/50 12.7 ns 12.7 ns std::find(vector) (process all)/1024 236 ns 236 ns std::find(vector) (process all)/8192 1830 ns 1834 ns std::find(vector) (process all)/32768 7351 ns 7346 ns std::find(vector) (process all)/8 2.02 ns 1.45 ns std::find(vector) (process all)/50 12.0 ns 6.12 ns std::find(vector) (process all)/1024 235 ns 123 ns std::find(vector) (process all)/8192 1830 ns 983 ns std::find(vector) (process all)/32768 7306 ns 3969 ns std::find(vector) (process all)/8 1.14 ns 1.15 ns std::find(vector) (process all)/50 1.16 ns 1.17 ns std::find(vector) (process all)/1024 4.51 ns 4.53 ns std::find(vector) (process all)/8192 33.6 ns 33.5 ns std::find(vector) (process all)/1048576 3660 ns 3660 ns ``` --- libcxx/docs/ReleaseNotes/22.rst | 2 + libcxx/include/__algorithm/find.h | 110 ++++++++++++++---- libcxx/include/__algorithm/simd_utils.h | 5 + libcxx/include/module.modulemap.in | 2 + .../algorithms/nonmodifying/find.bench.cpp | 1 + 5 files changed, 97 insertions(+), 23 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index b5dcfc7dfef31..87d86c1345618 100644 --- 
a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -64,6 +64,8 @@ Improvements and New Features - Multiple internal types have been refactored to use ``[[no_unique_address]]``, resulting in faster compile times and reduced debug information. +- The performance of ``std::find`` has been improved by up to 2x for integral types + Deprecations and Removals ------------------------- diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h index 8c8cb5820fee3..5f32ae8fc9524 100644 --- a/libcxx/include/__algorithm/find.h +++ b/libcxx/include/__algorithm/find.h @@ -12,6 +12,7 @@ #include <__algorithm/find_segment_if.h> #include <__algorithm/min.h> +#include <__algorithm/simd_utils.h> #include <__algorithm/unwrap_iter.h> #include <__bit/countr.h> #include <__bit/invert_if.h> @@ -44,39 +45,102 @@ _LIBCPP_BEGIN_NAMESPACE_STD // generic implementation template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter -__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { +__find_loop(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { for (; __first != __last; ++__first) if (std::__invoke(__proj, *__first) == __value) break; return __first; } -// trivially equality comparable implementations -template ::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value && - sizeof(_Tp) == 1, - int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { - if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first)) - return __ret; - return __last; +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Iter +__find(_Iter __first, _Sent __last, const _Tp& __value, _Proj& __proj) { + return std::__find_loop(std::move(__first), std::move(__last), __value, __proj); } -#if _LIBCPP_HAS_WIDE_CHARACTERS -template ::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value && - sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t), - int> = 0> +#if _LIBCPP_VECTORIZE_ALGORITHMS +template +[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI +_LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, _Up __value) { + if (!__libcpp_is_constant_evaluated()) { + constexpr size_t __unroll_count = 4; + constexpr size_t __vec_size = __native_vector_size<_Tp>; + using __vec = __simd_vector<_Tp, __vec_size>; + + auto __orig_first = __first; + + auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value); // broadcast the value + while (static_cast(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] { + __vec __lhs[__unroll_count]; + + for (size_t __i = 0; __i != __unroll_count; ++__i) + __lhs[__i] = std::__load_vector<__vec>(__first + __i * __vec_size); + + for (size_t __i = 0; __i != __unroll_count; ++__i) { + if (auto __cmp_res = __lhs[__i] == __values; std::__any_of(__cmp_res)) { + auto __offset = __i * __vec_size + std::__find_first_set(__cmp_res); + return __first + __offset; + } + } + + __first += __unroll_count * __vec_size; + } + + // check the remaining 0-3 vectors + while (static_cast(__last - __first) >= __vec_size) { + if (auto __cmp_res = std::__load_vector<__vec>(__first) == __values; std::__any_of(__cmp_res)) { + return __first + std::__find_first_set(__cmp_res); + } + __first += __vec_size; + } + + if (__last - __first == 0) + return __first; + + // Check if we can load elements in front of the current pointer. 
If that's the case load a vector at + // (last - vector_size) to check the remaining elements + if (static_cast(__first - __orig_first) >= __vec_size) { + __first = __last - __vec_size; + return __first + std::__find_first_set(std::__load_vector<__vec>(__first) == __values); + } + } + + __identity __proj; + return std::__find_loop(__first, __last, __value, __proj); +} +#endif + +#ifndef _LIBCPP_CXX03_LANG +// trivially equality comparable implementations +template < + class _Tp, + class _Up, + class _Proj, + __enable_if_t<__is_identity<_Proj>::value && __libcpp_is_trivially_equality_comparable<_Tp, _Up>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { - if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first)) - return __ret; - return __last; + if constexpr (sizeof(_Tp) == 1) { + if (auto __ret = std::__constexpr_memchr(__first, __value, __last - __first)) + return __ret; + return __last; + } +# if _LIBCPP_HAS_WIDE_CHARACTERS + else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) { + if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first)) + return __ret; + return __last; + } +# endif +# if _LIBCPP_VECTORIZE_ALGORITHMS + else if constexpr (is_integral<_Tp>::value) { + return std::__find_vectorized(__first, __last, __value); + } +# endif + else { + __identity __proj; + return std::__find_loop(__first, __last, __value, __proj); + } } -#endif // _LIBCPP_HAS_WIDE_CHARACTERS +#endif // TODO: This should also be possible to get right with different signedness // cast integral types to allow vectorization diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h index 96b074c063a5d..aaeb8a881df18 100644 --- a/libcxx/include/__algorithm/simd_utils.h +++ b/libcxx/include/__algorithm/simd_utils.h @@ -114,6 +114,11 @@ template }(make_index_sequence<__simd_vector_size_v<_VecT>>{}); } +template +[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __any_of(__simd_vector<_Tp, _Np> __vec) noexcept { + return __builtin_reduce_or(__builtin_convertvector(__vec, __simd_vector)); +} + template [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept { return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector)); diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 5e96adc1aaa65..93d43f8d7e195 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1225,6 +1225,7 @@ module std [system] { header "deque" export * export std.iterator.reverse_iterator + export std.algorithm.simd_utils // This is a workaround for https://llvm.org/PR120108. } module exception { @@ -2238,6 +2239,7 @@ module std [system] { header "vector" export std.iterator.reverse_iterator export * + export std.algorithm.simd_utils // This is a workaround for https://llvm.org/PR120108. 
 }
 
 // Experimental C++ Standard Library interfaces
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
index b2ead1cc75585..afea31fb59e95 100644
--- a/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/find.bench.cpp
@@ -51,6 +51,7 @@ int main(int argc, char** argv) {
     // find
     bm.template operator()>("std::find(vector) (" + comment + ")", std_find);
     bm.template operator()>("std::find(vector) (" + comment + ")", std_find);
+    bm.template operator()>("std::find(vector) (" + comment + ")", std_find);
     bm.template operator()>("std::find(deque) (" + comment + ")", std_find);
     bm.template operator()>("std::find(list) (" + comment + ")", std_find);
 
From 585fd4cea00b4502a435420d5193a6a2b945e93e Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Mon, 29 Sep 2025 12:20:00 +0300
Subject: [PATCH 095/878] [libclc] Propose new libclc maintainer (#161141)

Wenju He has been active on the libclc project for a while now, contributing
to the project's overall health and helping steer its future.
---
 libclc/Maintainers.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libclc/Maintainers.md b/libclc/Maintainers.md
index 695695c00be56..cdd84e059a796 100644
--- a/libclc/Maintainers.md
+++ b/libclc/Maintainers.md
@@ -10,6 +10,9 @@ The following people are the active maintainers for the project. Please reach
 out to them for code reviews, questions about their area of expertise, or
 other assistance.
 
+Wenju He \
+wenju.he@intel.com (email), [wenju-he](https://github.com/wenju-he) (GitHub)
+
 Tom Stellard \
 tstellar@redhat.com (email), [tstellar](https://github.com/tstellar) (GitHub)
 
From 2ab2ffefc726095aaf1b2875d14267e35fbfbec5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 10:33:17 +0100
Subject: [PATCH 096/878] [X86] createVPDPBUSD - only use 512-bit
 X86ISD::VPDPBUSD on AVX512VNNI targets (#161152)

Inspired by #160928 - if we have an AVX512 target capable of AVXVNNI but not
AVX512VNNI, then we must split 512-bit (or larger) types into 256-bit
subvectors.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp |   6 +-
 llvm/test/CodeGen/X86/dpbusd.ll         | 202 ++++++++++++++----------
 llvm/test/CodeGen/X86/dpbusd_const.ll   | 192 ++++++++++++++--------
 3 files changed, 244 insertions(+), 156 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index fcfeb661aa891..e7eb67ab12edc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4456,8 +4456,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                          bool AllowAVX512 = true) {
   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
   unsigned NumSubs = 1;
-  if ((CheckBWI && Subtarget.useBWIRegs()) ||
-      (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
+  if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
+                      (!CheckBWI && Subtarget.useAVX512Regs()))) {
     if (VT.getSizeInBits() > 512) {
       NumSubs = VT.getSizeInBits() / 512;
       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -46197,7 +46197,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
 
   SDValue Zero = DAG.getConstant(0, DL, DpVT);
   return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
-                          DpBuilder, false);
+                          DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
 }
 
 // Create a PSADBW given two sources representable as zexts of vXi8.
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 3aa77c3955c63..7bd22d57347b3 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -1,40 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: no_dpbusd: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq -; -; AVX512-LABEL: no_dpbusd: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; CHECK-LABEL: no_dpbusd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = zext <16 x i8> %0 to <16 x i32> @@ -99,25 +84,44 @@ entry: } define i32 @mul_zext(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: mul_zext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_zext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_zext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; 
AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_zext: ; AVX512: # %bb.0: # %entry @@ -153,25 +157,44 @@ entry: } define i32 @mul_sext(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: mul_sext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_sext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_sext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovsxwd %ymm0, 
%zmm0 +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_sext: ; AVX512: # %bb.0: # %entry @@ -312,17 +335,30 @@ entry: declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: vpdpbusd_128: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: vpdpbusd_128: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: vpdpbusd_128: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: vpdpbusd_128: ; AVX512VNNI: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index 456e6e8f263aa..bb47df59eefad 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -1,20 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=ALL,AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_zc_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_zc_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, @@ -24,14 +25,24 @@ entry: } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_zc: ; AVX512VNNI: # %bb.0: # %entry @@ -62,16 +73,26 @@ entry: } define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi4_cz: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVXVNNI-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} 
vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry @@ -104,15 +125,26 @@ entry: } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_cs: ; AVX512VNNI: # %bb.0: # %entry @@ -145,17 +177,17 @@ entry: } define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_cs_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_cs_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> , %0 @@ -265,24 +297,44 @@ entry: } define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_64xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVXVNNI-NEXT: 
{vex} vpdpbusd %ymm2, %ymm1, %ymm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 -; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX512-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_64xi8_zc: ; AVX512: # %bb.0: # %entry From f1b4a3bde510ce4990b93681458339f2183ab147 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 29 Sep 2025 10:42:34 +0100 Subject: [PATCH 097/878] [llvm-cov] Fix MSVC "not all control paths return a value" warning. NFC. 
(#161150) --- llvm/tools/llvm-cov/CoverageExporterJson.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp index ff86c8dfe951b..4c07c05396732 100644 --- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp +++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp @@ -118,6 +118,7 @@ json::Value renderCondState(const coverage::MCDCRecord::CondState CondState) { case coverage::MCDCRecord::MCDC_False: return json::Value(false); } + llvm_unreachable("Unknown llvm::coverage::MCDCRecord::CondState enum"); } json::Array gatherTestVectors(coverage::MCDCRecord &Record) { From ce70773cff2ab6b5d9b8d7a97ee62c75762a51d2 Mon Sep 17 00:00:00 2001 From: DST Date: Mon, 29 Sep 2025 12:23:09 +0200 Subject: [PATCH 098/878] Fix some typos in machine verifier comments and trace output (#160049) Stumbled across a typo in the `MachineVerifier` file and since I had it open, I changed some other comments. Not important but why not leave it a bit cleaner :slightly_smiling_face: --------- Signed-off-by: Daniel Stadelmann --- llvm/lib/CodeGen/MachineVerifier.cpp | 20 +++++++++---------- .../MachineVerifier/test_g_build_vector.mir | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index e911ce8a75828..115485509c4a5 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1549,7 +1549,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { report("G_BUILD_VECTOR result element type must match source type", MI); if (DstTy.getNumElements() != MI->getNumOperands() - 1) - report("G_BUILD_VECTOR must have an operand for each elemement", MI); + report("G_BUILD_VECTOR must have an operand for each element", MI); for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) @@ -2398,11 +2398,11 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { // The next two checks allow COPY between physical and virtual registers, // when the virtual register has a scalable size and the physical register - // has a fixed size. These checks allow COPY between *potentialy* mismatched - // sizes. However, once RegisterBankSelection occurs, MachineVerifier should - // be able to resolve a fixed size for the scalable vector, and at that - // point this function will know for sure whether the sizes are mismatched - // and correctly report a size mismatch. + // has a fixed size. These checks allow COPY between *potentially* + // mismatched sizes. However, once RegisterBankSelection occurs, + // MachineVerifier should be able to resolve a fixed size for the scalable + // vector, and at that point this function will know for sure whether the + // sizes are mismatched and correctly report a size mismatch. if (SrcReg.isPhysical() && DstReg.isVirtual() && DstSize.isScalable() && !SrcSize.isScalable()) break; @@ -3213,13 +3213,13 @@ struct VRegFilter { private: static constexpr unsigned SparseUniverseMax = 10 * 1024 * 8; - // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyound - // are tracked by Dense. The only purpose of the threashold and the Dense set + // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyond + // are tracked by Dense. 
The only purpose of the threshold and the Dense set // is to have a reasonably growing memory usage in pathological cases (large // number of very sparse VRegFilter instances live at the same time). In // practice even in the worst-by-execution time cases having all elements // tracked by Sparse (very large SparseUniverseMax scenario) tends to be more - // space efficient than if tracked by Dense. The threashold is set to keep the + // space efficient than if tracked by Dense. The threshold is set to keep the // worst-case memory usage within 2x of figures determined empirically for // "all Dense" scenario in such worst-by-execution-time cases. BitVector Sparse; @@ -3459,7 +3459,7 @@ void MachineVerifier::visitMachineFunctionAfter() { // Check live-in list of each MBB. If a register is live into MBB, check // that the register is in regsLiveOut of each predecessor block. Since - // this must come from a definition in the predecesssor or its live-in + // this must come from a definition in the predecessor or its live-in // list, this will catch a live-through case where the predecessor does not // have the register in its live-in list. This currently only checks // registers that have no aliases, are not allocatable and are not diff --git a/llvm/test/MachineVerifier/test_g_build_vector.mir b/llvm/test/MachineVerifier/test_g_build_vector.mir index 50b98017a49a7..9857306737108 100644 --- a/llvm/test/MachineVerifier/test_g_build_vector.mir +++ b/llvm/test/MachineVerifier/test_g_build_vector.mir @@ -16,17 +16,17 @@ body: | ; CHECK: Bad machine code: G_BUILD_VECTOR must produce a vector from scalar operands %3:_(<2 x s32>) = G_BUILD_VECTOR %2 - ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement + ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element %4:_(<2 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0 ; CHECK: Bad machine code: G_BUILD_VECTOR result element type must match source type - ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement + ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element %5:_(<4 x s16>) = G_BUILD_VECTOR %0, %0 %6:_(s16) = IMPLICIT_DEF ; CHECK: Bad machine code: G_BUILD_VECTOR result element type must match source type - ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement + ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element %7:_(<2 x s32>) = G_BUILD_VECTOR %6, %6, %6, %6 %8:_(p0) = IMPLICIT_DEF From f628a5467addf2f8a597141ab01f7e7453e6d9a7 Mon Sep 17 00:00:00 2001 From: Hongyu Chen Date: Mon, 29 Sep 2025 18:38:50 +0800 Subject: [PATCH 099/878] [ConstantFold] Fold inttoptr, ptrtoaddr to bitcast (#161087) Fixes #157334. 
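For illustration, a representative fold from the new test file, assuming the `p1:64:64:64:32` datalayout used there (64-bit addrspace(1) pointers with a 32-bit address width); the function name is made up:

```llvm
target datalayout = "p1:64:64:64:32"

; The inttoptr source width (i32) matches the ptrtoaddr destination width and
; does not exceed the pointer size, so the cast pair folds to a bitcast of the
; original integer: after InstCombine this simply returns i32 -1.
define i32 @round_trip() {
  ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i32 -1 to ptr addrspace(1)) to i32)
}
```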
--- llvm/lib/IR/Instructions.cpp | 6 +- llvm/test/Transforms/InstCombine/ptrtoaddr.ll | 65 +++++++++++++++++++ 2 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/ptrtoaddr.ll diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index daebf447a2107..dd83168ab3c6e 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2847,6 +2847,7 @@ unsigned CastInst::isEliminableCastPair( // FPTRUNC > FloatPt n/a FloatPt n/a // FPEXT < FloatPt n/a FloatPt n/a // PTRTOINT n/a Pointer n/a Integral Unsigned + // PTRTOADDR n/a Pointer n/a Integral Unsigned // INTTOPTR n/a Integral Unsigned Pointer n/a // BITCAST = FirstClass n/a FirstClass n/a // ADDRSPCST n/a Pointer n/a Pointer n/a @@ -2878,7 +2879,7 @@ unsigned CastInst::isEliminableCastPair( { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | { 1, 0, 0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | - { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr | + { 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr | { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; @@ -2972,7 +2973,8 @@ unsigned CastInst::isEliminableCastPair( // zext, sext -> zext, because sext can't sign extend after zext return Instruction::ZExt; case 11: { - // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize + // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize and + // SrcSize==DstSize if (!MidIntPtrTy) return 0; unsigned PtrSize = MidIntPtrTy->getScalarSizeInBits(); diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll new file mode 100644 index 0000000000000..61b13312521d2 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s +target datalayout = "p1:64:64:64:32" + +define i32 @ptrtoaddr_inttoptr_arg(i32 %a) { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64 +; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1) +; CHECK-NEXT: [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32 +; CHECK-NEXT: ret i32 [[TOADDR]] +; + %toptr = inttoptr i32 %a to ptr addrspace(1) + %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32 + ret i32 %toaddr +} + +define i32 @ptrtoaddr_inttoptr() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() { +; CHECK-NEXT: ret i32 -1 +; + ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i32 -1 to ptr addrspace(1)) to i32) +} + +define i32 @ptrtoaddr_inttoptr_diff_size1() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() { +; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32) +; + ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32) +} + +define i32 @ptrtoaddr_inttoptr_diff_size2() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() { +; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32) +; + ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32) +} + +define i64 @ptrtoaddr_inttoptr_noas1() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() { +; CHECK-NEXT: ret i64 1 +; + ret i64 ptrtoaddr 
(ptr getelementptr (i8, ptr null, i64 1) to i64) +} + +define i64 @ptr2addr2_inttoptr_noas2() { +; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() { +; CHECK-NEXT: ret i64 123 +; + ret i64 ptrtoaddr (ptr inttoptr (i64 123 to ptr) to i64) +} + +define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { +; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64) +; + ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64) +} + +define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { +; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64) +; + ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64) +} From 9552e899e494e619093e8685173a4af0ba73e049 Mon Sep 17 00:00:00 2001 From: Ye Tian <939808194@qq.com> Date: Mon, 29 Sep 2025 18:49:22 +0800 Subject: [PATCH 100/878] [X86] Remove X86ISD::VSHLDV/VSHRDV and use ISD::FSHL/FSHR opcodes directly (#157616) Fixes [issue](https://github.com/llvm/llvm-project/issues/155591) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++-- llvm/lib/Target/X86/X86ISelLowering.h | 3 +- llvm/lib/Target/X86/X86InstrAVX512.td | 48 +++++++++++--------- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 10 ---- 4 files changed, 30 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e7eb67ab12edc..292eab77e2002 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -31215,16 +31215,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, unsigned NumElts = VT.getVectorNumElements(); if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { - if (IsFSHR) - std::swap(Op0, Op1); if (IsCstSplat) { + if (IsFSHR) + std::swap(Op0, Op1); uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, {Op0, Op1, Imm}, DAG, Subtarget); } - return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, + return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT, {Op0, Op1, Amt}, DAG, Subtarget); } assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || @@ -35139,8 +35139,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VALIGN) NODE_NAME_CASE(VSHLD) NODE_NAME_CASE(VSHRD) - NODE_NAME_CASE(VSHLDV) - NODE_NAME_CASE(VSHRDV) NODE_NAME_CASE(PSHUFD) NODE_NAME_CASE(PSHUFHW) NODE_NAME_CASE(PSHUFLW) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 8ab8c66d76814..b55556aadd867 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -471,8 +471,7 @@ namespace llvm { // VBMI2 Concat & Shift. VSHLD, VSHRD, - VSHLDV, - VSHRDV, + // Shuffle Packed Values at 128-bit granularity. 
SHUF128,
  MOVDDUP,
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 2371ed4ed14a1..564810cb4b88e 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -12300,72 +12300,76 @@ defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
 // VBMI2
 //===----------------------------------------------------------------------===//
 
-multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                               X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
   let Constraints = "$src1 = $dst",
       ExeDomain = VTI.ExeDomain in {
     defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
                 (ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
-                (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>,
+                !if(SwapLR,
+                   (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src3))),
+                   (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src3))))>,
                 T8, PD, EVEX, VVVV, Sched<[sched]>;
     defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
                 (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
-                (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                         (VTI.VT (VTI.LdFrag addr:$src3))))>,
+                !if(SwapLR,
+                   (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src3)))),
+                   (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.LdFrag addr:$src3)))))>,
                 T8, PD, EVEX, VVVV,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
-multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                X86FoldableSchedWrite sched, X86VectorVTInfo VTI>
-         : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> {
+         : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched, VTI> {
   let Constraints = "$src1 = $dst",
       ExeDomain = VTI.ExeDomain in
   defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
               (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
               "${src3}"#VTI.BroadcastStr#", $src2",
               "$src2, ${src3}"#VTI.BroadcastStr,
-              (OpNode VTI.RC:$src1, VTI.RC:$src2,
-               (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
+              !if(SwapLR,
+                 (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))),
+                 (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))))>,
              T8, PD, EVEX, VVVV, EVEX_B,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
-multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                      X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
   let Predicates = [HasVBMI2] in
-  defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+  defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
           EVEX_V512;
   let Predicates = [HasVBMI2, HasVLX] in {
-    defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+    defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
                EVEX_V256;
-    defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+    defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
                EVEX_V128;
   }
 }
 
-multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
+multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, bit SwapLR,
                                      X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> {
   let Predicates = [HasVBMI2] in
-  defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>,
+  defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.ZMM, VTI.info512>,
           EVEX_V512;
   let Predicates = [HasVBMI2, HasVLX] in {
-    defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>,
+    defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.YMM, VTI.info256>,
                EVEX_V256;
-    defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>,
+    defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, SwapLR, sched.XMM, VTI.info128>,
                EVEX_V128;
   }
 }
 multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
-                           SDNode OpNode, X86SchedWriteWidths sched> {
-  defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
+                           SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched> {
+  defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, SwapLR, sched,
              avx512vl_i16_info>, REX_W, EVEX_CD8<16, CD8VF>;
-  defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
+  defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, SwapLR, sched,
             avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-  defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
+  defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, SwapLR, sched,
            avx512vl_i64_info>, REX_W, EVEX_CD8<64, CD8VF>;
 }
@@ -12381,8 +12385,8 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
 }
 
 // Concat & Shift
-defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>;
-defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>;
+defm VPSHLDV :
VBMI2_shift_var<0x70, 0x71, "vpshldv", fshl, 0, SchedWriteVecIMul>; +defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", fshr, 1, SchedWriteVecIMul>; defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>; defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 0c20ffed77e77..5321ecf0c1b2c 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -406,16 +406,6 @@ def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>; def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>; -def X86VShldv : SDNode<"X86ISD::VSHLDV", - SDTypeProfile<1, 3, [SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>>; -def X86VShrdv : SDNode<"X86ISD::VSHRDV", - SDTypeProfile<1, 3, [SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>>; def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; From 3c98be41465e16038aec7b5e088a2d0fbbcf8647 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 29 Sep 2025 12:03:25 +0100 Subject: [PATCH 101/878] [clang][bytecode] Pointer::isZero - fix MSVC "not all control paths return a value" warning. NFC. (#161168) --- clang/lib/AST/ByteCode/Pointer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index af89b66e9f875..cd738ce8b2a3e 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -262,6 +262,7 @@ class Pointer { case Storage::Typeid: return false; } + llvm_unreachable("Unknown clang::interp::Storage enum"); } /// Checks if the pointer is live. 
bool isLive() const {

From c20ef94b3f9365020a9bd39293b92f79745c3f05 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 12:14:49 +0100
Subject: [PATCH 102/878] [clang][x86] tbm-builtins.c - add i386 test coverage
 (#161169)

---
 clang/test/CodeGen/X86/tbm-builtins.c | 78 ++++++++++++++-------------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/clang/test/CodeGen/X86/tbm-builtins.c b/clang/test/CodeGen/X86/tbm-builtins.c
index d916627a23f57..c52744128b4e3 100644
--- a/clang/test/CodeGen/X86/tbm-builtins.c
+++ b/clang/test/CodeGen/X86/tbm-builtins.c
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK
 
 #include <x86intrin.h>
 
@@ -13,14 +15,14 @@ unsigned int test__bextri_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__bextri_u64(unsigned long long a) {
-  // CHECK-LABEL: test__bextri_u64
-  // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 2)
+  // X64-LABEL: test__bextri_u64
+  // X64: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 2)
   return __bextri_u64(a, 2);
 }
 
 unsigned long long test__bextri_u64_bigint(unsigned long long a) {
-  // CHECK-LABEL: test__bextri_u64_bigint
-  // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 549755813887)
+  // X64-LABEL: test__bextri_u64_bigint
+  // X64: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 549755813887)
   return __bextri_u64(a, 0x7fffffffffLL);
 }
 #endif
@@ -34,9 +36,9 @@ unsigned int test__blcfill_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__blcfill_u64(unsigned long long a) {
-  // CHECK-LABEL: test__blcfill_u64
-  // CHECK: [[TMP:%.*]] = add i64 %{{.*}}, 1
-  // CHECK: %{{.*}} = and i64 %{{.*}}, [[TMP]]
+  // X64-LABEL: test__blcfill_u64
+  // X64: [[TMP:%.*]] = add i64 %{{.*}}, 1
+  // X64: %{{.*}} = and i64 %{{.*}}, [[TMP]]
   return __blcfill_u64(a);
 }
 #endif
@@ -51,10 +53,10 @@ unsigned int test__blci_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__blci_u64(unsigned long long a) {
-  // CHECK-LABEL: test__blci_u64
-  // CHECK: [[TMP1:%.*]] = add i64 %{{.*}}, 1
-  // CHECK: [[TMP2:%.*]] = xor i64 [[TMP1]], -1
-  // CHECK: %{{.*}} = or i64 %{{.*}}, [[TMP2]]
+  // X64-LABEL: test__blci_u64
+  // X64: [[TMP1:%.*]] = add i64 %{{.*}}, 1
+  // X64: [[TMP2:%.*]] = xor i64 [[TMP1]], -1
+  // X64: %{{.*}} = or i64 %{{.*}}, [[TMP2]]
   return __blci_u64(a);
 }
 #endif
@@ -69,10 +71,10 @@ unsigned int test__blcic_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__blcic_u64(unsigned long long a) {
-  // CHECK-LABEL: test__blcic_u64
-  // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK: [[TMP2:%.*]] = add i64 %{{.*}}, 1
-  // CHECK-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
+  // X64-LABEL: test__blcic_u64
+  // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+  // X64: [[TMP2:%.*]] = add i64 %{{.*}}, 1
+  // X64-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
   return __blcic_u64(a);
 }
 #endif
@@ -86,9 +88,9 @@ unsigned int test__blcmsk_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__blcmsk_u64(unsigned long long a) {
-  // CHECK-LABEL: test__blcmsk_u64
-  // CHECK: [[TMP:%.*]] = add i64 %{{.*}}, 1
-  // CHECK-NEXT: {{.*}} = xor i64 %{{.*}}, [[TMP]]
+  // X64-LABEL: test__blcmsk_u64
+  // X64: [[TMP:%.*]] = add i64 %{{.*}}, 1
+  // X64-NEXT: {{.*}} = xor i64 %{{.*}}, [[TMP]]
   return __blcmsk_u64(a);
 }
 #endif
@@ -102,9 +104,9 @@ unsigned int test__blcs_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__blcs_u64(unsigned long long a) {
-  // CHECK-LABEL: test__blcs_u64
-  // CHECK: [[TMP:%.*]] = add i64 %{{.*}}, 1
-  // CHECK-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
+  // X64-LABEL: test__blcs_u64
+  // X64: [[TMP:%.*]] = add i64 %{{.*}}, 1
+  // X64-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
   return __blcs_u64(a);
 }
 #endif
@@ -118,9 +120,9 @@ unsigned int test__blsfill_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__blsfill_u64(unsigned long long a) {
-  // CHECK-LABEL: test__blsfill_u64
-  // CHECK: [[TMP:%.*]] = sub i64 %{{.*}}, 1
-  // CHECK-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
+  // X64-LABEL: test__blsfill_u64
+  // X64: [[TMP:%.*]] = sub i64 %{{.*}}, 1
+  // X64-NEXT: {{.*}} = or i64 %{{.*}}, [[TMP]]
   return __blsfill_u64(a);
 }
 #endif
@@ -135,10 +137,10 @@ unsigned int test__blsic_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__blsic_u64(unsigned long long a) {
-  // CHECK-LABEL: test__blsic_u64
-  // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
-  // CHECK-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
+  // X64-LABEL: test__blsic_u64
+  // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+  // X64: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
+  // X64-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
   return __blsic_u64(a);
 }
 #endif
@@ -153,10 +155,10 @@ unsigned int test__t1mskc_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__t1mskc_u64(unsigned long long a) {
-  // CHECK-LABEL: test__t1mskc_u64
-  // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK: [[TMP2:%.*]] = add i64 %{{.*}}, 1
-  // CHECK-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
+  // X64-LABEL: test__t1mskc_u64
+  // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+  // X64: [[TMP2:%.*]] = add i64 %{{.*}}, 1
+  // X64-NEXT: {{.*}} = or i64 [[TMP1]], [[TMP2]]
   return __t1mskc_u64(a);
 }
 #endif
@@ -171,10 +173,10 @@ unsigned int test__tzmsk_u32(unsigned int a) {
 
 #ifdef __x86_64__
 unsigned long long test__tzmsk_u64(unsigned long long a) {
-  // CHECK-LABEL: test__tzmsk_u64
-  // CHECK: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
-  // CHECK-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
+  // X64-LABEL: test__tzmsk_u64
+  // X64: [[TMP1:%.*]] = xor i64 %{{.*}}, -1
+  // X64: [[TMP2:%.*]] = sub i64 %{{.*}}, 1
+  // X64-NEXT: {{.*}} = and i64 [[TMP1]], [[TMP2]]
   return __tzmsk_u64(a);
 }
 #endif

From 84e4c0686e38e27f1e72171e0be2549fff6fb113 Mon Sep 17 00:00:00 2001
From: paperchalice
Date: Mon, 29 Sep 2025 19:19:18 +0800
Subject: [PATCH 103/878] [DAGCombiner] Remove NoSignedZerosFPMath uses in
 visitFSUB (#160974)

Remove the NoSignedZerosFPMath uses in the visitFSUB part; we should
always use instruction-level fast math flags.
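A minimal hand-written sketch of the semantics involved (illustration only, not part of the diff): for %x equal to +0.0, `fsub float 0.0, %x` yields +0.0 while an fneg would yield -0.0, so only the nsz form may become a plain sign flip.

define float @negate_strict(float %x) {
  %r = fsub float 0.0, %x        ; must stay a real subtraction
  ret float %r
}

define float @negate_nsz(float %x) {
  %r = fsub nsz float 0.0, %x    ; may be lowered as a sign-bit flip
  ret float %r
}

This is why the test updates below add explicit nsz flags instead of relying on the global -enable-no-signed-zeros-fp-math option.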
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 ++---- llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 2 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 29 ++++++++++++++----- llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll | 4 +-- 4 files changed, 28 insertions(+), 16 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c81568672de3c..77df4b4598c48 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17983,8 +17983,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) { - if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath || - Flags.hasNoSignedZeros()) { + if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) { return N0; } } @@ -17997,8 +17996,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // (fsub -0.0, N1) -> -N1 if (N0CFP && N0CFP->isZero()) { - if (N0CFP->isNegative() || - (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) { + if (N0CFP->isNegative() || Flags.hasNoSignedZeros()) { // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are // flushed to zero, unless all users treat denorms as zero (DAZ). // FIXME: This transform will change the sign of a NaN and the behavior @@ -18014,8 +18012,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } } - if ((Options.NoSignedZerosFPMath || - (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && + if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() && N1.getOpcode() == ISD::FADD) { // X - (X + Y) -> -Y if (N0 == N1->getOperand(0)) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index 12e9888314fc1..aaea4f76ea49b 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -5015,7 +5015,7 @@ define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext %a = load volatile double, ptr addrspace(1) %a.gep - %fneg.a = fsub double -0.000000e+00, %a + %fneg.a = fsub nsz double -0.000000e+00, %a %fpround = fptrunc double %fneg.a to float %fneg = fneg float %fpround store float %fneg, ptr addrspace(1) %out.gep diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index c4ca79dc85312..3de6df211ac7c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -4441,25 +4441,40 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) { ret float %i3 } -define float @v_fmul_0_fsub_0_infloop_regression(float %arg) { -; GCN-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression: +define float @v_fmul_0_fsub_0_safe_infloop_regression(float %arg) { +; GCN-SAFE-LABEL: v_fmul_0_fsub_0_safe_infloop_regression: ; GCN-SAFE: ; %bb.0: ; %bb ; GCN-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0 ; GCN-SAFE-NEXT: v_sub_f32_e32 v0, 0, v0 ; GCN-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-NSZ-LABEL: v_fmul_0_fsub_0_infloop_regression: -; GCN-NSZ: ; %bb.0: ; %bb -; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; GCN-NSZ-NEXT: s_setpc_b64 s[30:31] +; SI-NSZ-LABEL: v_fmul_0_fsub_0_safe_infloop_regression: +; SI-NSZ: ; %bb.0: ; %bb +; SI-NSZ-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; SI-NSZ-NEXT: s_brev_b32 s4, 1 +; SI-NSZ-NEXT: v_fma_f32 v0, v0, s4, 0 +; SI-NSZ-NEXT: s_setpc_b64 s[30:31] +; FIXME: utils/update_llc_test_checks.py will generate redundant VI +; labels, remove them, they will cause test failure. bb: %i = fmul float %arg, 0.0 %i1 = fsub float 0.0, %i ret float %i1 } +define float @v_fmul_0_fsub_0_nsz_infloop_regression(float %arg) { +; GCN-LABEL: v_fmul_0_fsub_0_nsz_infloop_regression: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +bb: + %i = fmul float %arg, 0.0 + %i1 = fsub nsz float 0.0, %i + ret float %i1 +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) diff --git a/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll b/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll index 23d22e75d1e9d..3f92d2b79c85d 100644 --- a/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll +++ b/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s ; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math. @@ -18,7 +18,7 @@ define <4 x float> @vec_fneg(<4 x float> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %sub = fsub <4 x float> zeroinitializer, %x + %sub = fsub nsz <4 x float> zeroinitializer, %x ret <4 x float> %sub } From 7b25cef4ce5837548ed45a27a49b5d2df955754c Mon Sep 17 00:00:00 2001 From: kper Date: Mon, 29 Sep 2025 13:28:29 +0200 Subject: [PATCH 104/878] [DOC][GlobalISel] Add more explanation to InstructionSelect (#160510) --- llvm/docs/GlobalISel/InstructionSelect.rst | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/llvm/docs/GlobalISel/InstructionSelect.rst b/llvm/docs/GlobalISel/InstructionSelect.rst index 9798ae7a596ca..5513824cf190c 100644 --- a/llvm/docs/GlobalISel/InstructionSelect.rst +++ b/llvm/docs/GlobalISel/InstructionSelect.rst @@ -5,8 +5,22 @@ InstructionSelect ----------------- This pass transforms generic machine instructions into equivalent -target-specific instructions. It traverses the ``MachineFunction`` bottom-up, -selecting uses before definitions, enabling trivial dead code elimination. +target-specific instructions. + +The legacy instruction selector, SelectionDAG, iterated over each function's +basic block and constructed a dataflow graph. Every backend defines +tree patterns in the ``XXXInstrInfo.td``. The legacy selector started +at the bottom and replaced the SDNodes greedily. + +The GlobalISel's instruction selector traverses the ``MachineFunction`` +bottom-up, selecting uses before definitions, enabling trivial dead code +elimination. It does that by iterating over the basic blocks in post-order. +Each gMIR instruction is then replaced by a MIR instruction when a matching +pattern is found. So, when there is a 1:1 mapping between gMIR and MIR, where +is the benefit of the global scope? Even in the case of a 1:1 mapping, +GlobalISel includes a combiner that can match and fuse multiple gMIR +instructions. The scope of the combination is not limited to a basic block, +but can extend across the entire function. .. 
_api-instructionselector:

From 3253ec0082661fda9b37b426c7d1a2edcc5e5a6b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 12:31:34 +0100
Subject: [PATCH 105/878] [clang][x86] bmi-builtins.c - add i386 test coverage
 (#161171)

---
 clang/test/CodeGen/X86/bmi-builtins.c | 92 ++++++++++++++-------------
 1 file changed, 47 insertions(+), 45 deletions(-)

diff --git a/clang/test/CodeGen/X86/bmi-builtins.c b/clang/test/CodeGen/X86/bmi-builtins.c
index ded40ca59781e..8c9d3a64fb177 100644
--- a/clang/test/CodeGen/X86/bmi-builtins.c
+++ b/clang/test/CodeGen/X86/bmi-builtins.c
@@ -1,7 +1,9 @@
-// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
-// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT
-// RUN: %clang_cc1 -x c++ -std=c++11 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
-// RUN: %clang_cc1 -x c++ -std=c++11 -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefix=TZCNT
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefixes=TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c++ -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefixes=TZCNT,TZCNT64
 
 #include <immintrin.h>
 
@@ -48,20 +50,20 @@ unsigned int test_tzcnt_u32(unsigned int __X) {
 
 #ifdef __x86_64__
 unsigned long long test__tzcnt_u64(unsigned long long __X) {
-// TZCNT-LABEL: test__tzcnt_u64
-// TZCNT: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
+// TZCNT64-LABEL: test__tzcnt_u64
+// TZCNT64: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
   return __tzcnt_u64(__X);
 }
 
 long long test_mm_tzcnt_64(unsigned long long __X) {
-// TZCNT-LABEL: test_mm_tzcnt_64
-// TZCNT: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
+// TZCNT64-LABEL: test_mm_tzcnt_64
+// TZCNT64: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
   return _mm_tzcnt_64(__X);
 }
 
 unsigned long long test_tzcnt_u64(unsigned long long __X) {
-// TZCNT-LABEL: test_tzcnt_u64
-// TZCNT: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
+// TZCNT64-LABEL: test_tzcnt_u64
+// TZCNT64: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 false)
   return _tzcnt_u64(__X);
 }
 #endif
@@ -103,36 +105,36 @@ unsigned int test__blsr_u32(unsigned int __X) {
 
 #ifdef __x86_64__
 unsigned long long
test__andn_u64(unsigned long __X, unsigned long __Y) { -// CHECK-LABEL: test__andn_u64 -// CHECK: xor i64 %{{.*}}, -1 -// CHECK: and i64 %{{.*}}, %{{.*}} +// X64-LABEL: test__andn_u64 +// X64: xor i64 %{{.*}}, -1 +// X64: and i64 %{{.*}}, %{{.*}} return __andn_u64(__X, __Y); } unsigned long long test__bextr_u64(unsigned long __X, unsigned long __Y) { -// CHECK-LABEL: test__bextr_u64 -// CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}}) +// X64-LABEL: test__bextr_u64 +// X64: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}}) return __bextr_u64(__X, __Y); } unsigned long long test__blsi_u64(unsigned long long __X) { -// CHECK-LABEL: test__blsi_u64 -// CHECK: sub i64 0, %{{.*}} -// CHECK: and i64 %{{.*}}, %{{.*}} +// X64-LABEL: test__blsi_u64 +// X64: sub i64 0, %{{.*}} +// X64: and i64 %{{.*}}, %{{.*}} return __blsi_u64(__X); } unsigned long long test__blsmsk_u64(unsigned long long __X) { -// CHECK-LABEL: test__blsmsk_u64 -// CHECK: sub i64 %{{.*}}, 1 -// CHECK: xor i64 %{{.*}}, %{{.*}} +// X64-LABEL: test__blsmsk_u64 +// X64: sub i64 %{{.*}}, 1 +// X64: xor i64 %{{.*}}, %{{.*}} return __blsmsk_u64(__X); } unsigned long long test__blsr_u64(unsigned long long __X) { -// CHECK-LABEL: test__blsr_u64 -// CHECK: sub i64 %{{.*}}, 1 -// CHECK: and i64 %{{.*}}, %{{.*}} +// X64-LABEL: test__blsr_u64 +// X64: sub i64 %{{.*}}, 1 +// X64: and i64 %{{.*}}, %{{.*}} return __blsr_u64(__X); } #endif @@ -186,49 +188,49 @@ unsigned int test_blsr_u32(unsigned int __X) { #ifdef __x86_64__ unsigned long long test_andn_u64(unsigned long __X, unsigned long __Y) { -// CHECK-LABEL: test_andn_u64 -// CHECK: xor i64 %{{.*}}, -1 -// CHECK: and i64 %{{.*}}, %{{.*}} +// X64-LABEL: test_andn_u64 +// X64: xor i64 %{{.*}}, -1 +// X64: and i64 %{{.*}}, %{{.*}} return _andn_u64(__X, __Y); } unsigned long long test_bextr_u64(unsigned long __X, unsigned int __Y, unsigned int __Z) { -// CHECK-LABEL: test_bextr_u64 -// CHECK: and i32 %{{.*}}, 255 -// CHECK: and i32 %{{.*}}, 255 -// CHECK: shl i32 %{{.*}}, 8 -// CHECK: or i32 %{{.*}}, %{{.*}} -// CHECK: zext i32 %{{.*}} to i64 -// CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}}) +// X64-LABEL: test_bextr_u64 +// X64: and i32 %{{.*}}, 255 +// X64: and i32 %{{.*}}, 255 +// X64: shl i32 %{{.*}}, 8 +// X64: or i32 %{{.*}}, %{{.*}} +// X64: zext i32 %{{.*}} to i64 +// X64: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}}) return _bextr_u64(__X, __Y, __Z); } unsigned long long test_bextr2_u64(unsigned long long __X, unsigned long long __Y) { -// CHECK-LABEL: test_bextr2_u64 -// CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}}) +// X64-LABEL: test_bextr2_u64 +// X64: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}}) return _bextr2_u64(__X, __Y); } unsigned long long test_blsi_u64(unsigned long long __X) { -// CHECK-LABEL: test_blsi_u64 -// CHECK: sub i64 0, %{{.*}} -// CHECK: and i64 %{{.*}}, %{{.*}} +// X64-LABEL: test_blsi_u64 +// X64: sub i64 0, %{{.*}} +// X64: and i64 %{{.*}}, %{{.*}} return _blsi_u64(__X); } unsigned long long test_blsmsk_u64(unsigned long long __X) { -// CHECK-LABEL: test_blsmsk_u64 -// CHECK: sub i64 %{{.*}}, 1 -// CHECK: xor i64 %{{.*}}, %{{.*}} +// X64-LABEL: test_blsmsk_u64 +// X64: sub i64 %{{.*}}, 1 +// X64: xor i64 %{{.*}}, %{{.*}} return _blsmsk_u64(__X); } unsigned long long test_blsr_u64(unsigned long long __X) { -// CHECK-LABEL: test_blsr_u64 -// CHECK: sub i64 %{{.*}}, 1 -// CHECK: and i64 %{{.*}}, %{{.*}} +// X64-LABEL: test_blsr_u64 +// X64: sub i64 %{{.*}}, 1 +// X64: and i64 %{{.*}}, %{{.*}} return 
_blsr_u64(__X);
 }
 #endif

From 2d30392e85c6c4c17c59c020a07917277ce36f14 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 12:38:35 +0100
Subject: [PATCH 106/878] [clang][X86] bmi2-builtins.c - add
 -fexperimental-new-constant-interpreter bytecode test coverage (#161172)

Part of #155814

---
 clang/test/CodeGen/X86/bmi2-builtins.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c b/clang/test/CodeGen/X86/bmi2-builtins.c
index 48424f553768b..1b2cb9048adb2 100644
--- a/clang/test/CodeGen/X86/bmi2-builtins.c
+++ b/clang/test/CodeGen/X86/bmi2-builtins.c
@@ -3,6 +3,11 @@
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - | FileCheck %s --check-prefix=B32
 
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix=B32
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi2 -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefix=B32
+
 #include <immintrin.h>

From 9d33b99fdb83ee2313ccd4b3a2faabbb13202c5e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 12:49:15 +0100
Subject: [PATCH 107/878] [clang][X86] tbm-builtins.c - add
 -fexperimental-new-constant-interpreter bytecode test coverage (#161174)

Part of #155814

---
 clang/test/CodeGen/X86/tbm-builtins.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/test/CodeGen/X86/tbm-builtins.c b/clang/test/CodeGen/X86/tbm-builtins.c
index c52744128b4e3..89746bf67e909 100644
--- a/clang/test/CodeGen/X86/tbm-builtins.c
+++ b/clang/test/CodeGen/X86/tbm-builtins.c
@@ -3,6 +3,11 @@
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,X64
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK
 
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-unknown-unknown -target-feature +tbm -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK
+
 #include <x86intrin.h>
 
 // NOTE: This should match the tests in llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll

From 23d08af3d422f10d180af1e0b6261b65f76dfeed Mon Sep 17 00:00:00 2001
From: Piotr Balcer
Date: Mon, 29 Sep 2025 14:10:26 +0200
Subject: [PATCH 108/878] [Offload][NFC] use unique ptrs
 for platforms (#160888)

Currently, devices store a raw pointer back to their owning Platform.
Platforms are stored directly inside a vector. Modifying this vector
risks invalidating all the platform pointers stored in devices.

This patch allocates platforms individually, and changes devices to
store a reference to their platform instead of a pointer. This is safe,
because platforms are guaranteed to outlive the devices they contain.

---
 offload/liboffload/src/OffloadImpl.cpp | 89 +++++++++++++-------------
 1 file changed, 45 insertions(+), 44 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 08a2e25b97d85..051882da7c6c7 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -39,12 +39,28 @@ using namespace llvm::omp::target;
 using namespace llvm::omp::target::plugin;
 using namespace error;
 
+struct ol_platform_impl_t {
+  ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin,
+                     ol_platform_backend_t BackendType)
+      : Plugin(std::move(Plugin)), BackendType(BackendType) {}
+  std::unique_ptr<GenericPluginTy> Plugin;
+  llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices;
+  ol_platform_backend_t BackendType;
+
+  /// Complete all pending work for this platform and perform any needed
+  /// cleanup.
+  ///
+  /// After calling this function, no liboffload functions should be called with
+  /// this platform handle.
+  llvm::Error destroy();
+};
+
 // Handle type definitions. Ideally these would be 1:1 with the plugins, but
 // we add some additional data here for now to avoid churn in the plugin
 // interface.
 struct ol_device_impl_t {
   ol_device_impl_t(int DeviceNum, GenericDeviceTy *Device,
-                   ol_platform_handle_t Platform, InfoTreeNode &&DevInfo)
+                   ol_platform_impl_t &Platform, InfoTreeNode &&DevInfo)
       : DeviceNum(DeviceNum), Device(Device), Platform(Platform),
         Info(std::forward<InfoTreeNode>(DevInfo)) {}
 
@@ -55,7 +71,7 @@ struct ol_device_impl_t {
 
   int DeviceNum;
   GenericDeviceTy *Device;
-  ol_platform_handle_t Platform;
+  ol_platform_impl_t &Platform;
   InfoTreeNode Info;
 
   llvm::SmallVector<__tgt_async_info *> OutstandingQueues;
@@ -102,31 +118,17 @@ struct ol_device_impl_t {
   }
 };
 
-struct ol_platform_impl_t {
-  ol_platform_impl_t(std::unique_ptr<GenericPluginTy> Plugin,
-                     ol_platform_backend_t BackendType)
-      : Plugin(std::move(Plugin)), BackendType(BackendType) {}
-  std::unique_ptr<GenericPluginTy> Plugin;
-  llvm::SmallVector<std::unique_ptr<ol_device_impl_t>> Devices;
-  ol_platform_backend_t BackendType;
-
-  /// Complete all pending work for this platform and perform any needed
-  /// cleanup.
-  ///
-  /// After calling this function, no liboffload functions should be called with
-  /// this platform handle.
-  llvm::Error destroy() {
-    llvm::Error Result = Plugin::success();
-    for (auto &D : Devices)
-      if (auto Err = D->destroy())
-        Result = llvm::joinErrors(std::move(Result), std::move(Err));
+llvm::Error ol_platform_impl_t::destroy() {
+  llvm::Error Result = Plugin::success();
+  for (auto &D : Devices)
+    if (auto Err = D->destroy())
+      Result = llvm::joinErrors(std::move(Result), std::move(Err));
 
-    if (auto Res = Plugin->deinit())
-      Result = llvm::joinErrors(std::move(Result), std::move(Res));
+  if (auto Res = Plugin->deinit())
+    Result = llvm::joinErrors(std::move(Result), std::move(Res));
 
-    return Result;
-  }
-};
+  return Result;
+}
 
 struct ol_queue_impl_t {
   ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device)
@@ -206,12 +208,12 @@ struct OffloadContext {
   // Partitioned list of memory base addresses.
Each element in this list is a
   // key in AllocInfoMap
   llvm::SmallVector<void *> AllocBases{};
-  SmallVector<ol_platform_impl_t, 4> Platforms{};
+  SmallVector<std::unique_ptr<ol_platform_impl_t>, 4> Platforms{};
   size_t RefCount;
 
   ol_device_handle_t HostDevice() {
     // The host platform is always inserted last
-    return Platforms.back().Devices[0].get();
+    return Platforms.back()->Devices[0].get();
   }
 
   static OffloadContext &get() {
@@ -251,35 +253,34 @@ Error initPlugins(OffloadContext &Context) {
 #define PLUGIN_TARGET(Name)                                                    \
   do {                                                                         \
     if (StringRef(#Name) != "host")                                            \
-      Context.Platforms.emplace_back(ol_platform_impl_t{                      \
+      Context.Platforms.emplace_back(std::make_unique<ol_platform_impl_t>(    \
           std::unique_ptr<GenericPluginTy>(createPlugin_##Name()),            \
-          pluginNameToBackend(#Name)});                                       \
+          pluginNameToBackend(#Name)));                                       \
   } while (false);
 #include "Shared/Targets.def"
 
   // Preemptively initialize all devices in the plugin
   for (auto &Platform : Context.Platforms) {
-    auto Err = Platform.Plugin->init();
+    auto Err = Platform->Plugin->init();
     [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
-    for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices();
+    for (auto DevNum = 0; DevNum < Platform->Plugin->number_of_devices();
          DevNum++) {
-      if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
-        auto Device = &Platform.Plugin->getDevice(DevNum);
+      if (Platform->Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) {
+        auto Device = &Platform->Plugin->getDevice(DevNum);
         auto Info = Device->obtainInfoImpl();
         if (auto Err = Info.takeError())
          return Err;
-        Platform.Devices.emplace_back(std::make_unique<ol_device_impl_t>(
-            DevNum, Device, &Platform, std::move(*Info)));
+        Platform->Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+            DevNum, Device, *Platform, std::move(*Info)));
       }
     }
  }
 
   // Add the special host device
   auto &HostPlatform = Context.Platforms.emplace_back(
-      ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST});
-  HostPlatform.Devices.emplace_back(
-      std::make_unique<ol_device_impl_t>(-1, nullptr, nullptr, InfoTreeNode{}));
-  Context.HostDevice()->Platform = &HostPlatform;
+      std::make_unique<ol_platform_impl_t>(nullptr, OL_PLATFORM_BACKEND_HOST));
+  HostPlatform->Devices.emplace_back(std::make_unique<ol_device_impl_t>(
+      -1, nullptr, *HostPlatform, InfoTreeNode{}));
 
   Context.TracingEnabled = std::getenv("OFFLOAD_TRACE");
   Context.ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION");
@@ -316,10 +317,10 @@ Error olShutDown_impl() {
 
   for (auto &P : OldContext->Platforms) {
     // Host plugin is nullptr and has no deinit
-    if (!P.Plugin || !P.Plugin->is_initialized())
+    if (!P->Plugin || !P->Plugin->is_initialized())
       continue;
 
-    if (auto Res = P.destroy())
+    if (auto Res = P->destroy())
       Result = llvm::joinErrors(std::move(Result), std::move(Res));
   }
 
@@ -384,7 +385,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
   // These are not implemented by the plugin interface
   switch (PropName) {
   case OL_DEVICE_INFO_PLATFORM:
-    return Info.write(Device->Platform);
+    return Info.write(&Device->Platform);
   case OL_DEVICE_INFO_TYPE:
     return Info.write(OL_DEVICE_TYPE_GPU);
@@ -517,7 +518,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
 
   switch (PropName) {
   case OL_DEVICE_INFO_PLATFORM:
-    return Info.write(Device->Platform);
+    return Info.write(&Device->Platform);
   case OL_DEVICE_INFO_TYPE:
     return Info.write(OL_DEVICE_TYPE_HOST);
   case OL_DEVICE_INFO_NAME:
@@ -595,7 +596,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device,
 
 Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) {
   for (auto &Platform : OffloadContext::get().Platforms) {
-    for (auto &Device : Platform.Devices) {
+    for (auto &Device : Platform->Devices) {
      if (!Callback(Device.get(), UserData)) {
        break;
      }

From cd9403551a622868fc9c1712cf926eff5eebe938 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 13:18:07 +0100
Subject: [PATCH 109/878] [clang][X86] bmi-builtins.c - add
 -fexperimental-new-constant-interpreter bytecode test coverage (#161182)

Part of #155814

---
 clang/test/CodeGen/X86/bmi-builtins.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/clang/test/CodeGen/X86/bmi-builtins.c b/clang/test/CodeGen/X86/bmi-builtins.c
index 8c9d3a64fb177..d0ae0c7939255 100644
--- a/clang/test/CodeGen/X86/bmi-builtins.c
+++ b/clang/test/CodeGen/X86/bmi-builtins.c
@@ -5,6 +5,13 @@
 // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,TZCNT
 // RUN: %clang_cc1 -x c++ -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT | FileCheck %s --check-prefixes=TZCNT,TZCNT64
 
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,X64,TZCNT,TZCNT64
+// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +bmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK,TZCNT
+// RUN: %clang_cc1 -x c++ -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -ffreestanding %s -triple=x86_64-windows-msvc -emit-llvm -o - -Wall -Werror -DTEST_TZCNT -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=TZCNT,TZCNT64
+
 #include <immintrin.h>

From 8df643f66374fc3fc16523a2d6a63d14d4a560a5 Mon Sep 17 00:00:00 2001
From: Leon Clark
Date: Mon, 29 Sep 2025 13:26:35 +0100
Subject: [PATCH 110/878] [VectorCombine] Fix rotation in phi narrowing.
 (#160465)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix bug in #140188 where incoming vectors are rotated in the wrong
direction.

Co-authored-by: Leon Clark

---
 .../Transforms/Vectorize/VectorCombine.cpp    |  2 +-
 .../AMDGPU/narrow-phi-of-shuffles.ll          | 18 +++++------
 .../X86/narrow-phi-of-shuffles.ll             | 30 +++++++++----------
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0ef933f596604..cbdc621f1878b 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -4433,7 +4433,7 @@ bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
 
   // Create new mask using difference of the two incoming masks.
int MaskOffset = NewMask[0u]; - unsigned Index = (InputNumElements - MaskOffset) % InputNumElements; + unsigned Index = (InputNumElements + MaskOffset) % InputNumElements; NewMask.clear(); for (unsigned I = 0u; I < InputNumElements; ++I) { diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll index 8c504843d87d8..b293976974bf5 100644 --- a/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll @@ -392,7 +392,7 @@ define <4 x i32> @shuffle_v4i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -427,7 +427,7 @@ define <8 x i32> @shuffle_v8i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -462,7 +462,7 @@ define <16 x i32> @shuffle_v16i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -497,7 +497,7 @@ define <32 x i32> @shuffle_v32i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1092,7 +1092,7 @@ define <4 x float> @shuffle_v4f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1127,7 +1127,7 @@ define <6 x float> @shuffle_v6f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1162,7 +1162,7 @@ define <8 x float> @shuffle_v8f32(<3 x float> %arg0, i1 
%cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1197,7 +1197,7 @@ define <16 x float> @shuffle_v16f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1232,7 +1232,7 @@ define <32 x float> @shuffle_v32f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: diff --git a/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll index 59422e98cbcc6..594017ecf84c3 100644 --- a/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll @@ -605,7 +605,7 @@ define <4 x bfloat> @shuffle_v4bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -640,7 +640,7 @@ define <6 x bfloat> @shuffle_v6bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -675,7 +675,7 @@ define <8 x bfloat> @shuffle_v8bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -710,7 +710,7 @@ define <16 x bfloat> @shuffle_v16bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = 
shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -745,7 +745,7 @@ define <32 x bfloat> @shuffle_v32bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -850,7 +850,7 @@ define <4 x half> @shuffle_v4f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -866,7 +866,7 @@ define <4 x half> @shuffle_v4f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -933,7 +933,7 @@ define <6 x half> @shuffle_v6f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -949,7 +949,7 @@ define <6 x half> @shuffle_v6f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -1016,7 +1016,7 @@ define <8 x half> @shuffle_v8f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -1032,7 +1032,7 @@ define <8 x half> @shuffle_v8f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector 
<3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -1099,7 +1099,7 @@ define <16 x half> @shuffle_v16f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -1115,7 +1115,7 @@ define <16 x half> @shuffle_v16f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -1182,7 +1182,7 @@ define <32 x half> @shuffle_v32f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -1198,7 +1198,7 @@ define <32 x half> @shuffle_v32f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: From 5ff9f7b886290f7b6efc9aba33c632c2f1aac9e6 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Mon, 29 Sep 2025 10:56:23 +0200 Subject: [PATCH 111/878] [SimplifyCFG] Ensure selects have not been constant folded in `foldSwitchToSelect` Make sure selects do exist prior to assigning weights to edges. Fixes: https://github.com/llvm/llvm-project/issues/161137. 
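For reference, a hedged sketch of the failure shape (the actual reproducer is the negative_constfold_select test added below): with a poison switch condition, the compare feeding the select folds to a constant, Builder.CreateSelect then returns a folded constant instead of a SelectInst, and the previous unconditional cast<SelectInst> asserted.

define i32 @sketch() {
entry:
  switch i32 poison, label %default [
    i32 0, label %bb
    i32 2, label %bb
  ]

bb:
  br label %default

default:
  %ret = phi i32 [ poison, %entry ], [ poison, %bb ]
  ret i32 %ret
}

The fix therefore uses dyn_cast and attaches branch weights only when a real SelectInst was materialized.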
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 43 +++++++++----------
 .../SimplifyCFG/switch-to-select-two-case.ll  | 19 ++++++++
 2 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 2d84b4ae1ba5c..216bdf4eb9efb 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -84,7 +84,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -6356,25 +6355,25 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
   if (DefaultResult) {
     Value *ValueCompare =
         Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
-    SelectInst *SelectValueInst = cast<SelectInst>(Builder.CreateSelect(
-        ValueCompare, ResultVector[1].first, DefaultResult, "switch.select"));
-    SelectValue = SelectValueInst;
-    if (HasBranchWeights) {
+    SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+                                       DefaultResult, "switch.select");
+    if (auto *SI = dyn_cast<SelectInst>(SelectValue);
+        SI && HasBranchWeights) {
       // We start with 3 probabilities, where the numerator is the
       // corresponding BranchWeights[i], and the denominator is the sum over
       // BranchWeights. We want the probability and negative probability of
       // Condition == SecondCase.
       assert(BranchWeights.size() == 3);
-      setBranchWeights(SelectValueInst, BranchWeights[2],
+      setBranchWeights(SI, BranchWeights[2],
                        BranchWeights[0] + BranchWeights[1],
                        /*IsExpected=*/false);
     }
   }
   Value *ValueCompare =
       Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
-  SelectInst *Ret = cast<SelectInst>(Builder.CreateSelect(
-      ValueCompare, ResultVector[0].first, SelectValue, "switch.select"));
-  if (HasBranchWeights) {
+  Value *Ret = Builder.CreateSelect(ValueCompare, ResultVector[0].first,
                                     SelectValue, "switch.select");
+  if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
     // We may have had a DefaultResult. Base the position of the first and
     // second's branch weights accordingly. Also the proability that Condition
     // != FirstCase needs to take that into account.
@@ -6382,7 +6381,7 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
     size_t FirstCasePos = (Condition != nullptr);
     size_t SecondCasePos = FirstCasePos + 1;
     uint32_t DefaultCase = (Condition != nullptr) ? BranchWeights[0] : 0;
-    setBranchWeights(Ret, BranchWeights[FirstCasePos],
+    setBranchWeights(SI, BranchWeights[FirstCasePos],
                      DefaultCase + BranchWeights[SecondCasePos],
                      /*IsExpected=*/false);
   }
@@ -6422,13 +6421,13 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
       Value *And = Builder.CreateAnd(Condition, AndMask);
       Value *Cmp = Builder.CreateICmpEQ(
           And, Constant::getIntegerValue(And->getType(), AndMask));
-      SelectInst *Ret = cast<SelectInst>(
-          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-      if (HasBranchWeights) {
+      Value *Ret =
+          Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+      if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
        // We know there's a Default case. We base the resulting branch
        // weights off its probability.
      assert(BranchWeights.size() >= 2);
-      setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+      setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                        BranchWeights[0], /*IsExpected=*/false);
     }
     return Ret;
@@ -6448,11 +6447,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
     Value *And = Builder.CreateAnd(Condition, ~BitMask, "switch.and");
     Value *Cmp = Builder.CreateICmpEQ(
         And, Constant::getNullValue(And->getType()), "switch.selectcmp");
-    SelectInst *Ret = cast<SelectInst>(
-        Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-    if (HasBranchWeights) {
+    Value *Ret =
+        Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+    if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
       assert(BranchWeights.size() >= 2);
-      setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+      setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                        BranchWeights[0], /*IsExpected=*/false);
     }
     return Ret;
@@ -6466,11 +6465,11 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector,
     Value *Cmp2 = Builder.CreateICmpEQ(Condition, CaseValues[1],
                                        "switch.selectcmp.case2");
     Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp");
-    SelectInst *Ret = cast<SelectInst>(
-        Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult));
-    if (HasBranchWeights) {
+    Value *Ret =
+        Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult);
+    if (auto *SI = dyn_cast<SelectInst>(Ret); SI && HasBranchWeights) {
       assert(BranchWeights.size() >= 2);
-      setBranchWeights(Ret, accumulate(drop_begin(BranchWeights), 0),
+      setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0),
                        BranchWeights[0], /*IsExpected=*/false);
     }
     return Ret;

diff --git a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
index 39703e9b53b6b..9d78b97c204a8 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll
@@ -755,6 +755,25 @@ bb3:
   ret i1 %phi
 }

+define i32 @negative_constfold_select() {
+; CHECK-LABEL: @negative_constfold_select(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 poison
+;
+entry:
+  switch i32 poison, label %default [
+    i32 0, label %bb
+    i32 2, label %bb
+  ]
+
+bb:
+  br label %default
+
+default:
+  %ret = phi i32 [ poison, %entry ], [ poison, %bb ]
+  ret i32 %ret
+}
+
 !0 = !{!"function_entry_count", i64 1000}
 !1 = !{!"branch_weights", i32 3, i32 5, i32 7}
 !2 = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13}

From 2d1f9c95d9f82217e8e35df0b2f622fa227fe53a Mon Sep 17 00:00:00 2001
From: Vladislav Dzhidzhoev
Date: Mon, 29 Sep 2025 14:40:15 +0200
Subject: [PATCH 112/878] Reland "[DebugInfo][DwarfDebug] Separate creation and
 population of abstract subprogram DIEs" (#160786)

This is an attempt to reland https://github.com/llvm/llvm-project/pull/159104
with the fix for https://github.com/llvm/llvm-project/issues/160197.

The original patch had the following problem: when an abstract subprogram DIE
is constructed from within `DwarfDebug::endFunctionImpl()`,
`DwarfDebug::constructAbstractSubprogramScopeDIE()` honors the `unit:` field
of the DISubprogram. But an abstract subprogram DIE constructed from
`DwarfDebug::beginModule()` was put in the compile unit to which the global
variable referencing the subprogram belonged, regardless of the subprogram's
`unit:`.
This is fixed by adding `DwarfDebug::getOrCreateAbstractSubprogramCU()`, used
by both `DwarfDebug::constructAbstractSubprogramScopeDIE()` and
`DwarfCompileUnit::getOrCreateSubprogramDIE()` when an abstract subprogram is
queried during the creation of DIEs for globals in
`DwarfDebug::beginModule()`.

The fix and the already-reviewed code from
https://github.com/llvm/llvm-project/pull/159104 are two separate commits in
this PR.

=====

The original commit message follows:

With this change, construction of abstract subprogram DIEs is split into two
stages/functions: creation of the DIE (in
DwarfCompileUnit::getOrCreateAbstractSubprogramDIE) and its population with
children (in DwarfCompileUnit::constructAbstractSubprogramScopeDIE). With
that, abstract subprograms can be created/referenced from
DwarfDebug::beginModule, which should solve the issue with DIE creation for
static local variables of inlined functions with optimized-out definitions.
It fixes https://github.com/llvm/llvm-project/issues/29985.

The LexicalScopes class now stores a mapping from DISubprograms to their
corresponding llvm::Functions. It is built before any function is processed
(so LexicalScopes now has a method for "module initialization" alongside the
one for "function initialization"). It is used by DwarfCompileUnit to
determine whether a DISubprogram needs an abstract DIE before
DwarfDebug::beginFunction is invoked.

A DwarfCompileUnit::getOrCreateSubprogramDIE method is added, which can
create either an abstract or a concrete DIE for a subprogram. It accepts an
llvm::Function* argument to determine whether a concrete DIE must be created.

This is a temporary fix for https://github.com/llvm/llvm-project/issues/29985.
Ideally, it will be fixed by moving the emission of global variables and
types to DwarfDebug::endModule (https://reviews.llvm.org/D144007,
https://reviews.llvm.org/D144005).

Some code proposed by Ellis Hoag in
https://github.com/llvm/llvm-project/pull/90523 was taken for this commit.
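For illustration, the resulting lookup pattern is roughly the following
sketch (condensed from the DwarfCompileUnit change below; not verbatim):

    // Once LexicalScopes::initialize(const Module &) has populated the
    // DISubprogram-to-Function map, a unit can decide between a concrete
    // and an abstract DIE before DwarfDebug::beginFunction runs.
    const Function *F = DD->getLexicalScopes().getFunction(SP);
    if (!F) {
      // SP has no attached IR definition: only an abstract DIE can be
      // created, in the CU designated by the subprogram's unit: field.
      return &DD->getOrCreateAbstractSubprogramCU(SP, *this)
                  .getOrCreateAbstractSubprogramDIE(SP);
    }
    // Otherwise, create the concrete DIE for F as before.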
--- llvm/include/llvm/CodeGen/DebugHandlerBase.h | 2 + llvm/include/llvm/CodeGen/LexicalScopes.h | 20 +++- .../CodeGen/AsmPrinter/DebugHandlerBase.cpp | 4 +- .../CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 107 ++++++++++++------ .../lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 35 +++++- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 32 ++++-- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 4 + llvm/lib/CodeGen/AsmPrinter/DwarfFile.h | 9 ++ llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 13 +-- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 16 ++- llvm/lib/CodeGen/LexicalScopes.cpp | 33 ++++-- .../LiveDebugValues/InstrRefBasedImpl.cpp | 2 +- .../LiveDebugValues/VarLocBasedImpl.cpp | 2 +- llvm/lib/CodeGen/LiveDebugVariables.cpp | 2 +- .../test/CodeGen/X86/dbg-distringtype-uint.ll | 10 +- .../DebugInfo/AArch64/abstract-sp-unit.ll | 43 +++++++ llvm/test/DebugInfo/AArch64/debug-types.ll | 59 ++++++++++ .../AArch64/populate-abstract-sp-once.ll | 67 +++++++++++ .../DebugInfo/Generic/inlined-static-var.ll | 93 +++++++++++++++ llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 2 +- llvm/unittests/CodeGen/LexicalScopesTest.cpp | 43 +++++-- llvm/unittests/CodeGen/MFCommon.inc | 7 +- 22 files changed, 515 insertions(+), 90 deletions(-) create mode 100644 llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll create mode 100644 llvm/test/DebugInfo/AArch64/debug-types.ll create mode 100644 llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll create mode 100644 llvm/test/DebugInfo/Generic/inlined-static-var.ll diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h index 2849497f9a43e..fee4bb116bb87 100644 --- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h +++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h @@ -144,6 +144,8 @@ class DebugHandlerBase : public AsmPrinterHandler { static bool isUnsignedDIType(const DIType *Ty); const InstructionOrdering &getInstOrdering() const { return InstOrdering; } + + const LexicalScopes &getLexicalScopes() const { return LScopes; } }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/LexicalScopes.h b/llvm/include/llvm/CodeGen/LexicalScopes.h index 4172e90b4c1b9..993df54c05ad5 100644 --- a/llvm/include/llvm/CodeGen/LexicalScopes.h +++ b/llvm/include/llvm/CodeGen/LexicalScopes.h @@ -141,12 +141,18 @@ class LexicalScopes { public: LexicalScopes() = default; + /// Scan module to build subprogram-to-function map. + LLVM_ABI void initialize(const Module &); + /// Scan machine function and constuct lexical scope nest, resets /// the instance if necessary. - LLVM_ABI void initialize(const MachineFunction &); + LLVM_ABI void scanFunction(const MachineFunction &); + + /// Reset the instance so that it's prepared for another module. + LLVM_ABI void resetModule(); - /// Release memory. - LLVM_ABI void reset(); + /// Reset the instance so that it's prepared for another function. + LLVM_ABI void resetFunction(); /// Return true if there is any lexical scope information available. bool empty() { return CurrentFnLexicalScope == nullptr; } @@ -196,6 +202,11 @@ class LexicalScopes { /// Find or create an abstract lexical scope. LLVM_ABI LexicalScope *getOrCreateAbstractScope(const DILocalScope *Scope); + /// Get function to which the given subprogram is attached, if exists. + const Function *getFunction(const DISubprogram *SP) const { + return FunctionMap.lookup(SP); + } + private: /// Find lexical scope for the given Scope/IA. If not available /// then create new lexical scope. 
@@ -225,6 +236,9 @@ class LexicalScopes {
   const MachineFunction *MF = nullptr;

+  /// Mapping between DISubprograms and IR functions.
+  DenseMap<const DISubprogram *, const Function *> FunctionMap;
+
   /// Tracks the scopes in the current function.
   // Use an unordered_map to ensure value pointer validity over insertion.
   std::unordered_map<const DILocalScope *, LexicalScope> LexicalScopeMap;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 0f3ff985974ce..d98d18035ac6d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -105,6 +105,8 @@ DebugHandlerBase::~DebugHandlerBase() = default;
 void DebugHandlerBase::beginModule(Module *M) {
   if (M->debug_compile_units().empty())
     Asm = nullptr;
+  else
+    LScopes.initialize(*M);
 }

 // Each LexicalScope has first instruction and last instruction to mark
@@ -269,7 +271,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
   // Grab the lexical scopes for the function, if we don't have any of those
   // then we're not going to be able to do anything.
-  LScopes.initialize(*MF);
+  LScopes.scanFunction(*MF);
   if (LScopes.empty()) {
     beginFunctionImpl(MF);
     return;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 67f526fe91464..518121e200190 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -537,8 +537,9 @@ void DwarfCompileUnit::addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName,
 // and DW_AT_high_pc attributes. If there are global variables in this
 // scope then create and insert DIEs for these variables.
 DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP,
+                                                const Function &F,
                                                 MCSymbol *LineTableSym) {
-  DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
+  DIE *SPDie = getOrCreateSubprogramDIE(SP, &F, includeMinimalInlineScopes());
   SmallVector BB_List;
   // If basic block sections are on, ranges for each basic block section has
   // to be emitted separately.
@@ -1122,9 +1123,10 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
 }

 DIE &DwarfCompileUnit::constructSubprogramScopeDIE(const DISubprogram *Sub,
+                                                   const Function &F,
                                                    LexicalScope *Scope,
                                                    MCSymbol *LineTableSym) {
-  DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, LineTableSym);
+  DIE &ScopeDIE = updateSubprogramScopeDIE(Sub, F, LineTableSym);

   if (Scope) {
     assert(!Scope->getInlinedAt());
@@ -1198,32 +1200,17 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
   return ObjectPointer;
 }

-void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
-    LexicalScope *Scope) {
-  auto *SP = cast<DISubprogram>(Scope->getScopeNode());
-  if (getAbstractScopeDIEs().count(SP))
-    return;
+DIE &DwarfCompileUnit::getOrCreateAbstractSubprogramDIE(
+    const DISubprogram *SP) {
+  if (auto *AbsDef = getAbstractScopeDIEs().lookup(SP))
+    return *AbsDef;

-  DIE *ContextDIE;
-  DwarfCompileUnit *ContextCU = this;
-
-  if (includeMinimalInlineScopes())
-    ContextDIE = &getUnitDie();
-  // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with
-  // the important distinction that the debug node is not associated with the
-  // DIE (since the debug node will be associated with the concrete DIE, if
-  // any). It could be refactored to some common utility function.
-  else if (auto *SPDecl = SP->getDeclaration()) {
-    ContextDIE = &getUnitDie();
-    getOrCreateSubprogramDIE(SPDecl);
-  } else {
-    ContextDIE = getOrCreateContextDIE(SP->getScope());
-    // The scope may be shared with a subprogram that has already been
-    // constructed in another CU, in which case we need to construct this
-    // subprogram in the same CU.
-    ContextCU = DD->lookupCU(ContextDIE->getUnitDie());
-  }
+  auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP);
+  return createAbstractSubprogramDIE(SP, ContextDIE, ContextCU);
+}

+DIE &DwarfCompileUnit::createAbstractSubprogramDIE(
+    const DISubprogram *SP, DIE *ContextDIE, DwarfCompileUnit *ContextCU) {
   // Passing null as the associated node because the abstract definition
   // shouldn't be found by lookup.
   DIE &AbsDef = ContextCU->createAndAddDIE(dwarf::DW_TAG_subprogram,
@@ -1237,8 +1224,45 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
                  DD->getDwarfVersion() <= 4 ? std::optional<dwarf::Form>()
                                             : dwarf::DW_FORM_implicit_const,
                  dwarf::DW_INL_inlined);
-  if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, AbsDef))
-    ContextCU->addDIEEntry(AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
+
+  return AbsDef;
+}
+
+std::pair<DIE *, DwarfCompileUnit *>
+DwarfCompileUnit::getOrCreateAbstractSubprogramContextDIE(
+    const DISubprogram *SP) {
+  bool Minimal = includeMinimalInlineScopes();
+  bool IgnoreScope = shouldPlaceInUnitDIE(SP, Minimal);
+  DIE *ContextDIE = getOrCreateSubprogramContextDIE(SP, IgnoreScope);
+
+  if (auto *SPDecl = SP->getDeclaration())
+    if (!Minimal)
+      getOrCreateSubprogramDIE(SPDecl, nullptr);
+
+  // The scope may be shared with a subprogram that has already been
+  // constructed in another CU, in which case we need to construct this
+  // subprogram in the same CU.
+  auto *ContextCU = IgnoreScope ? this : DD->lookupCU(ContextDIE->getUnitDie());
+
+  return std::make_pair(ContextDIE, ContextCU);
+}
+
+void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
+    LexicalScope *Scope) {
+  auto *SP = cast<DISubprogram>(Scope->getScopeNode());
+
+  // Populate subprogram DIE only once.
+  if (!getFinalizedAbstractSubprograms().insert(SP).second)
+    return;
+
+  auto [ContextDIE, ContextCU] = getOrCreateAbstractSubprogramContextDIE(SP);
+  DIE *AbsDef = getAbstractScopeDIEs().lookup(SP);
+  if (!AbsDef)
+    AbsDef = &createAbstractSubprogramDIE(SP, ContextDIE, ContextCU);
+
+  if (DIE *ObjectPointer = ContextCU->createAndAddScopeChildren(Scope, *AbsDef))
+    ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer,
+                           *ObjectPointer);
 }

 bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const {
@@ -1293,9 +1317,9 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
 }

 DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
-    DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail,
-    const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg,
-    DIType *AllocSiteTy) {
+    DIE &ScopeDIE, const DISubprogram *CalleeSP, const Function *CalleeF,
+    bool IsTail, const MCSymbol *PCAddr, const MCSymbol *CallAddr,
+    unsigned CallReg, DIType *AllocSiteTy) {
   // Insert a call site entry DIE within ScopeDIE.
DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site), ScopeDIE, nullptr); @@ -1305,7 +1329,7 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE( addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target), MachineLocation(CallReg)); } else if (CalleeSP) { - DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP); + DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP, CalleeF); assert(CalleeDIE && "Could not create DIE for call site entry origin"); if (AddLinkageNamesToDeclCallOriginsForTuning(DD) && !CalleeSP->isDefinition() && @@ -1396,7 +1420,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE( if (auto *AbsSPDie = getAbstractScopeDIEs().lookup(SP)) EntityDie = AbsSPDie; else - EntityDie = getOrCreateSubprogramDIE(SP); + EntityDie = getOrCreateSubprogramDIE(SP, nullptr); } else if (auto *T = dyn_cast(Entity)) EntityDie = getOrCreateTypeDIE(T); else if (auto *GV = dyn_cast(Entity)) @@ -1805,3 +1829,20 @@ DIE *DwarfCompileUnit::getOrCreateContextDIE(const DIScope *Context) { } return DwarfUnit::getOrCreateContextDIE(Context); } + +DIE *DwarfCompileUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, + const Function *F, + bool Minimal) { + if (!F && SP->isDefinition()) { + F = DD->getLexicalScopes().getFunction(SP); + + if (!F) { + // SP may belong to another CU. Determine the CU similarly + // to DwarfDebug::constructAbstractSubprogramScopeDIE. + return &DD->getOrCreateAbstractSubprogramCU(SP, *this) + .getOrCreateAbstractSubprogramDIE(SP); + } + } + + return DwarfUnit::getOrCreateSubprogramDIE(SP, F, Minimal); +} diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index c2f6ca0913818..a3bbc8364599d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -81,6 +81,7 @@ class DwarfCompileUnit final : public DwarfUnit { // List of abstract local scopes (either DISubprogram or DILexicalBlock). DenseMap AbstractLocalScopeDIEs; + SmallPtrSet FinalizedAbstractSubprograms; // List of inlined lexical block scopes that belong to subprograms within this // CU. @@ -137,12 +138,28 @@ class DwarfCompileUnit final : public DwarfUnit { return DU->getAbstractEntities(); } + auto &getFinalizedAbstractSubprograms() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return FinalizedAbstractSubprograms; + return DU->getFinalizedAbstractSubprograms(); + } + void finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) override; /// Add info for Wasm-global-based relocation. void addWasmRelocBaseGlobal(DIELoc *Loc, StringRef GlobalName, uint64_t GlobalIndex); + /// Create context DIE for abstract subprogram. + /// \returns The context DIE and the compile unit where abstract + /// DIE should be constructed. + std::pair + getOrCreateAbstractSubprogramContextDIE(const DISubprogram *SP); + + /// Create new DIE for abstract subprogram. + DIE &createAbstractSubprogramDIE(const DISubprogram *SP, DIE *ContextDIE, + DwarfCompileUnit *ContextCU); + public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU, @@ -216,7 +233,8 @@ class DwarfCompileUnit final : public DwarfUnit { /// DW_AT_low_pc, DW_AT_high_pc and DW_AT_LLVM_stmt_sequence attributes. /// If there are global variables in this scope then create and insert DIEs /// for these variables. 
- DIE &updateSubprogramScopeDIE(const DISubprogram *SP, MCSymbol *LineTableSym); + DIE &updateSubprogramScopeDIE(const DISubprogram *SP, const Function &F, + MCSymbol *LineTableSym); void constructScopeDIE(LexicalScope *Scope, DIE &ParentScopeDIE); @@ -259,12 +277,18 @@ class DwarfCompileUnit final : public DwarfUnit { /// This instance of 'getOrCreateContextDIE()' can handle DILocalScope. DIE *getOrCreateContextDIE(const DIScope *Ty) override; + DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, const Function *F, + bool Minimal = false) override; + /// Construct a DIE for this subprogram scope. - DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, LexicalScope *Scope, - MCSymbol *LineTableSym); + DIE &constructSubprogramScopeDIE(const DISubprogram *Sub, const Function &F, + LexicalScope *Scope, MCSymbol *LineTableSym); DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE); + /// Create an abstract subprogram DIE, that should later be populated + /// by \ref constructAbstractSubprogramScopeDIE. + DIE &getOrCreateAbstractSubprogramDIE(const DISubprogram *SP); void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); /// Whether to use the GNU analog for a DWARF5 tag, attribute, or location @@ -281,14 +305,15 @@ class DwarfCompileUnit final : public DwarfUnit { dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const; /// Construct a call site entry DIE describing a call within \p Scope to a - /// callee described by \p CalleeSP. + /// callee described by \p CalleeSP and \p CalleeF. /// \p IsTail specifies whether the call is a tail call. /// \p PCAddr points to the PC value after the call instruction. /// \p CallAddr points to the PC value at the call instruction (or is null). /// \p CallReg is a register location for an indirect call. For direct calls /// the \p CallReg is set to 0. DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP, - bool IsTail, const MCSymbol *PCAddr, + const Function *CalleeF, bool IsTail, + const MCSymbol *PCAddr, const MCSymbol *CallAddr, unsigned CallReg, DIType *AllocSiteTy); /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 05d3d18aa9557..09d5f9c57a1a7 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -548,6 +548,16 @@ bool DwarfDebug::shareAcrossDWOCUs() const { return SplitDwarfCrossCuReferences; } +DwarfCompileUnit & +DwarfDebug::getOrCreateAbstractSubprogramCU(const DISubprogram *SP, + DwarfCompileUnit &SrcCU) { + auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + if (CU.getSkeleton()) + return shareAcrossDWOCUs() ? CU : SrcCU; + + return CU; +} + void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope) { assert(Scope && Scope->getScopeNode()); @@ -559,14 +569,11 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); - if (auto *SkelCU = CU.getSkeleton()) { - (shareAcrossDWOCUs() ? 
CU : SrcCU) - .constructAbstractSubprogramScopeDIE(Scope); + auto &TargetCU = getOrCreateAbstractSubprogramCU(SP, SrcCU); + TargetCU.constructAbstractSubprogramScopeDIE(Scope); + if (auto *SkelCU = CU.getSkeleton()) if (CU.getCUNode()->getSplitDebugInlining()) SkelCU->constructAbstractSubprogramScopeDIE(Scope); - } else { - CU.constructAbstractSubprogramScopeDIE(Scope); - } } /// Represents a parameter whose call site value can be described by applying a @@ -997,8 +1004,9 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, ->getName(CallReg))) << (IsTail ? " [IsTail]" : "") << "\n"); - DIE &CallSiteDIE = CU.constructCallSiteEntryDIE( - ScopeDIE, CalleeSP, IsTail, PCAddr, CallAddr, CallReg, AllocSiteTy); + DIE &CallSiteDIE = + CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, CalleeDecl, IsTail, + PCAddr, CallAddr, CallReg, AllocSiteTy); // Optionally emit call-site-param debug info. if (emitDebugEntryValues()) { @@ -2707,7 +2715,8 @@ void DwarfDebug::skippedNonDebugFunction() { // Gather and emit post-function debug information. void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { - const DISubprogram *SP = MF->getFunction().getSubprogram(); + const Function &F = MF->getFunction(); + const DISubprogram *SP = F.getSubprogram(); assert(CurFn == MF && "endFunction should be called with the same function as beginFunction"); @@ -2776,11 +2785,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { ProcessedSPNodes.insert(SP); DIE &ScopeDIE = - TheCU.constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel); + TheCU.constructSubprogramScopeDIE(SP, F, FnScope, FunctionLineTableLabel); if (auto *SkelCU = TheCU.getSkeleton()) if (!LScopes.getAbstractScopesList().empty() && TheCU.getCUNode()->getSplitDebugInlining()) - SkelCU->constructSubprogramScopeDIE(SP, FnScope, FunctionLineTableLabel); + SkelCU->constructSubprogramScopeDIE(SP, F, FnScope, + FunctionLineTableLabel); FunctionLineTableLabel = nullptr; diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 89813dcf0fdab..1a1b28a6fc035 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -906,6 +906,10 @@ class DwarfDebug : public DebugHandlerBase { return CUDieMap.lookup(Die); } + /// Find the matching DwarfCompileUnit for the given SP referenced from SrcCU. + DwarfCompileUnit &getOrCreateAbstractSubprogramCU(const DISubprogram *SP, + DwarfCompileUnit &SrcCU); + unsigned getStringTypeLoc(const DIStringType *ST) const { return StringTypeLocMap.lookup(ST); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h index 0fc2b91ddfa91..ef1524d875c84 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -11,6 +11,7 @@ #include "DwarfStringPool.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/DIE.h" @@ -27,6 +28,7 @@ class DbgVariable; class DbgLabel; class DINode; class DILocalScope; +class DISubprogram; class DwarfCompileUnit; class DwarfUnit; class LexicalScope; @@ -94,6 +96,9 @@ class DwarfFile { // Collection of abstract subprogram DIEs. DenseMap AbstractLocalScopeDIEs; DenseMap> AbstractEntities; + /// Keeps track of abstract subprograms to populate them only once. + // FIXME: merge creation and population of abstract scopes. 
+  SmallPtrSet FinalizedAbstractSubprograms;

   /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
   /// be shared across CUs, that is why we keep the map here instead
@@ -174,6 +179,10 @@ class DwarfFile {
     return AbstractEntities;
   }

+  auto &getFinalizedAbstractSubprograms() {
+    return FinalizedAbstractSubprograms;
+  }
+
   void insertDIE(const MDNode *TypeMD, DIE *Die) {
     DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
   }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index d76fd0c010209..62fb5eb011cf2 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -573,7 +573,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
   if (auto *NS = dyn_cast<DINamespace>(Context))
     return getOrCreateNameSpace(NS);
   if (auto *SP = dyn_cast<DISubprogram>(Context))
-    return getOrCreateSubprogramDIE(SP);
+    return getOrCreateSubprogramDIE(SP, nullptr);
   if (auto *M = dyn_cast<DIModule>(Context))
     return getOrCreateModule(M);
   return getDIE(Context);
@@ -1066,7 +1066,7 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
       if (!Element)
         continue;
       if (auto *SP = dyn_cast<DISubprogram>(Element))
-        getOrCreateSubprogramDIE(SP);
+        getOrCreateSubprogramDIE(SP, nullptr);
       else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
         if (DDTy->getTag() == dwarf::DW_TAG_friend) {
           DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
@@ -1335,22 +1335,21 @@ DIE *DwarfUnit::getOrCreateModule(const DIModule *M) {
   return &MDie;
 }

-DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
+DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP,
+                                         const Function *FnHint, bool Minimal) {
   // Construct the context before querying for the existence of the DIE in case
   // such construction creates the DIE (as is the case for member function
   // declarations).
   DIE *ContextDIE =
-      Minimal ? &getUnitDie() : getOrCreateContextDIE(SP->getScope());
+      getOrCreateSubprogramContextDIE(SP, shouldPlaceInUnitDIE(SP, Minimal));

   if (DIE *SPDie = getDIE(SP))
     return SPDie;

   if (auto *SPDecl = SP->getDeclaration()) {
     if (!Minimal) {
-      // Add subprogram definitions to the CU die directly.
-      ContextDIE = &getUnitDie();
       // Build the decl now to ensure it precedes the definition.
-      getOrCreateSubprogramDIE(SPDecl);
+      getOrCreateSubprogramDIE(SPDecl, nullptr);
       // Check whether the DIE for SP has already been created after the call
       // above.
       // FIXME: Should the creation of definition subprogram DIE during
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index fe05766cf36e1..bb00ec3af9782 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -256,7 +256,9 @@ class DwarfUnit : public DIEUnit {
   DIE *getOrCreateNameSpace(const DINamespace *NS);
   DIE *getOrCreateModule(const DIModule *M);
-  DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);
+  virtual DIE *getOrCreateSubprogramDIE(const DISubprogram *SP,
+                                        const Function *FnHint,
+                                        bool Minimal = false);

   void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
                                  bool SkipSPAttributes = false);
@@ -343,6 +345,18 @@ class DwarfUnit : public DIEUnit {
   /// Emit the common part of the header for this unit.
   void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);

+  bool shouldPlaceInUnitDIE(const DISubprogram *SP, bool Minimal) {
+    // Add subprogram declarations to the CU die directly.
+ return Minimal || SP->getDeclaration(); + } + + DIE *getOrCreateSubprogramContextDIE(const DISubprogram *SP, + bool IgnoreScope) { + if (IgnoreScope) + return &getUnitDie(); + return getOrCreateContextDIE(SP->getScope()); + } + private: /// A helper to add a wide integer constant to a DIE using a block /// form. diff --git a/llvm/lib/CodeGen/LexicalScopes.cpp b/llvm/lib/CodeGen/LexicalScopes.cpp index 5916f619537a4..9fc9ac9a66d41 100644 --- a/llvm/lib/CodeGen/LexicalScopes.cpp +++ b/llvm/lib/CodeGen/LexicalScopes.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -36,8 +37,16 @@ using namespace llvm; #define DEBUG_TYPE "lexicalscopes" -/// reset - Reset the instance so that it's prepared for another function. -void LexicalScopes::reset() { +static bool skipUnit(const DICompileUnit *CU) { + return CU->getEmissionKind() == DICompileUnit::NoDebug; +} + +void LexicalScopes::resetModule() { + FunctionMap.clear(); + resetFunction(); +} + +void LexicalScopes::resetFunction() { MF = nullptr; CurrentFnLexicalScope = nullptr; LexicalScopeMap.clear(); @@ -47,12 +56,19 @@ void LexicalScopes::reset() { DominatedBlocks.clear(); } -/// initialize - Scan machine function and constuct lexical scope nest. -void LexicalScopes::initialize(const MachineFunction &Fn) { - reset(); +void LexicalScopes::initialize(const Module &M) { + resetModule(); + for (const Function &F : M) { + DISubprogram *SP = F.getSubprogram(); + if (SP && (!SP->getUnit() || !skipUnit(SP->getUnit()))) + FunctionMap[SP] = &F; + } +} + +void LexicalScopes::scanFunction(const MachineFunction &Fn) { + resetFunction(); // Don't attempt any lexical scope creation for a NoDebug compile unit. - if (Fn.getFunction().getSubprogram()->getUnit()->getEmissionKind() == - DICompileUnit::NoDebug) + if (skipUnit(Fn.getFunction().getSubprogram()->getUnit())) return; MF = &Fn; SmallVector MIRanges; @@ -143,8 +159,7 @@ LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope, const DILocation *IA) { if (IA) { // Skip scopes inlined from a NoDebug compile unit. - if (Scope->getSubprogram()->getUnit()->getEmissionKind() == - DICompileUnit::NoDebug) + if (skipUnit(Scope->getSubprogram()->getUnit())) return getOrCreateLexicalScope(IA); // Create an abstract scope for inlined function. 
getOrCreateAbstractScope(Scope); diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a8143bd8f4273..0037bdd270ff3 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -3721,7 +3721,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, TFI = MF.getSubtarget().getFrameLowering(); TFI->getCalleeSaves(MF, CalleeSavedRegs); MFI = &MF.getFrameInfo(); - LS.initialize(MF); + LS.scanFunction(MF); const auto &STI = MF.getSubtarget(); AdjustsStackInCalls = MFI->adjustsStack() && diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index 82e0c28f2f26c..b9ea03f949ef8 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -2231,7 +2231,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF, TFI->getCalleeSaves(MF, CalleeSavedRegs); this->ShouldEmitDebugEntryValues = ShouldEmitDebugEntryValues; - LS.initialize(MF); + LS.scanFunction(MF); bool Changed = false; bool OLChanged = false; diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 84e9f030694bf..001ba5250f77e 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -1263,7 +1263,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI, void LiveDebugVariables::LDVImpl::computeIntervals() { LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); for (const auto &UV : userValues) { UV->computeIntervals(MF->getRegInfo(), *TRI, *LIS, LS); diff --git a/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll b/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll index 638a65d4a8b0b..7542c1b8db327 100644 --- a/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll +++ b/llvm/test/CodeGen/X86/dbg-distringtype-uint.ll @@ -1,5 +1,13 @@ ; RUN: llc -mtriple=x86_64 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s -; + +; Ensure that static local variable elemnt is placed in abstract subprogram DIE. +; CHECK: DW_TAG_subprogram +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_inline (DW_INL_inlined) +; CHECK-EMPTY: +; CHECK-NEXT: DW_TAG_variable +; CHECK-NEXT: DW_AT_name ("elemnt") + ; CHECK: [[SYM:[a-z0-9]+]]: DW_TAG_formal_parameter ; CHECK: DW_AT_name ("esym") ; CHECK: DW_AT_type ([[TYPE:[a-z0-9]+]] "CHARACTER_1") diff --git a/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll b/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll new file mode 100644 index 0000000000000..559f20122cc47 --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll @@ -0,0 +1,43 @@ +; RUN: llc --filetype=obj -O0 -o - %s | llvm-dwarfdump --verify - + +; Check that abstract DIE for a subprogram referenced from another compile unit +; is emitted in the correct CU. 
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +define void @a() !dbg !10 { + br label %for.b.c.c, !dbg !13 + for.b.c.c: + br label %for.b.c.c +} + +!llvm.dbg.cu = !{!0, !6} +!llvm.module.flags = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_20, file: !1, emissionKind: FullDebug, globals: !2) +!1 = !DIFile(filename: "foo.cpp", directory: "") +!2 = !{!3} +!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression()) +!4 = !DIGlobalVariable(type: !5) +!5 = !DICompositeType(tag: DW_TAG_class_type) +!6 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_20, file: !7, emissionKind: FullDebug) +!7 = !DIFile(filename: "bar.cpp", directory: "") +!8 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(type: !11, unit: !6) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(scope: !14, inlinedAt: !15) +!14 = distinct !DISubprogram(unit: !6) +!15 = !DILocation(scope: !16, inlinedAt: !25) +!16 = distinct !DISubprogram(type: !11, unit: !6, declaration: !17) +!17 = !DISubprogram(scope: !5, type: !11, spFlags: DISPFlagOptimized, templateParams: !18) +!18 = !{!19} +!19 = !DITemplateTypeParameter(type: !20) +!20 = !DICompositeType(tag: DW_TAG_class_type, scope: !21) +!21 = distinct !DISubprogram(unit: !6, retainedNodes: !22) +!22 = !{!23} +!23 = !DILocalVariable(scope: !21, type: !24) +!24 = !DIBasicType() +!25 = !DILocation(scope: !21, inlinedAt: !26) +!26 = !DILocation(scope: !10) diff --git a/llvm/test/DebugInfo/AArch64/debug-types.ll b/llvm/test/DebugInfo/AArch64/debug-types.ll new file mode 100644 index 0000000000000..0d0fd33f49fdf --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/debug-types.ll @@ -0,0 +1,59 @@ +; Check that composite type DIEs go to debug_types section. 
+ +; RUN: llc -generate-type-units -filetype=obj %s -o - | llvm-dwarfdump -debug-info -debug-types - | FileCheck %s + +; CHECK: .debug_info contents: +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_class_type +; CHECK: DW_AT_signature ([[SIG_A:0x[0-9a-f]+]]) +; CHECK: DW_TAG_subprogram +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: "_ZN1A6AppendEv" +; CHECK: DW_TAG_class_type +; CHECK: DW_AT_signature ([[SIG_LAMBDA:0x[0-9a-f]+]]) +; CHECK: DW_TAG_variable +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: DW_TAG_inlined_subroutine +; CHECK: NULL +; CHECK: NULL + +; CHECK: .debug_types contents: +; CHECK: Type Unit: {{.*}} type_signature = [[SIG_A]] +; CHECK: DW_TAG_class_type +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_name ("A") +; CHECK: Type Unit: {{.*}} type_signature = [[SIG_LAMBDA]] +; CHECK: DW_TAG_class_type +; CHECK: DW_TAG_class_type +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_decl_line (7) + +target triple = "aarch64-unknown-linux-gnu" + +define void @_Z1f1A() !dbg !4 { +entry: + ret void, !dbg !8 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, emissionKind: FullDebug, globals: !2) +!1 = !DIFile(filename: "", directory: "") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "f", linkageName: "_Z1f1A", scope: !5, file: !5, line: 14, type: !6, scopeLine: 14, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!5 = !DIFile(filename: "repro.ii", directory: "") +!6 = distinct !DISubroutineType(types: !7) +!7 = !{null} +!8 = !DILocation(line: 8, column: 12, scope: !9, inlinedAt: !16) +!9 = distinct !DISubprogram(name: "Append", linkageName: "_ZN1A6AppendEv", scope: !10, file: !5, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !12, retainedNodes: !13) +!10 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !5, line: 3, size: 32, flags: DIFlagTypePassByValue, elements: !2, identifier: "_ZTS1A") +!11 = distinct !DISubroutineType(types: !7) +!12 = !DISubprogram(name: "Append", linkageName: "_ZN1A6AppendEv", scope: !10, file: !5, line: 6, type: !11, scopeLine: 6, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!13 = !{!14} +!14 = !DILocalVariable(name: "raw_append", scope: !9, file: !5, line: 7, type: !15) +!15 = distinct !DICompositeType(tag: DW_TAG_class_type, scope: !9, file: !5, line: 7, size: 8, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !2, identifier: "_ZTSZN1A6AppendEvEUlvE_") +!16 = distinct !DILocation(line: 14, column: 15, scope: !4) diff --git a/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll b/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll new file mode 100644 index 0000000000000..20cc98a1bfdcd --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/populate-abstract-sp-once.ll @@ -0,0 +1,67 @@ +; Check that abstract DIEs for inlined subprograms and lexical scopes +; are populated only once. 
+ +; RUN: llc -filetype=obj %s -o - | llvm-dwarfdump - -o - | FileCheck --implicit-check-not=DW_TAG_lexical_scope --implicit-check-not DW_TAG_subprogram %s + +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_namespace +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_declaration (true) +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_declaration (true) +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_declaration (true) +; CHECK: NULL + +; CHECK: [[ABSTRACT_SP:0x[0-9a-f]+]]: DW_TAG_subprogram +; CHECK: DW_AT_inline (DW_INL_inlined) + +; CHECK: DW_TAG_lexical_block +; CHECK: DW_TAG_imported_module +; CHECK: NULL + +; CHECK: NULL + +; CHECK: DW_TAG_subprogram +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_abstract_origin ([[ABSTRACT_SP]] +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_AT_abstract_origin ([[ABSTRACT_SP]] +; CHECK: NULL + +target triple = "aarch64-unknown-linux-gnu" + +define void @_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE() !dbg !4 { +entry: + ret void, !dbg !8 +} + +define void @_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE() !dbg !15 { +entry: + ret void, !dbg !17 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "CodeGenPGO.cpp", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "TraverseIfStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE", scope: !5, file: !1, line: 364, type: !6, scopeLine: 364, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !7, retainedNodes: !2, keyInstructions: true) +!5 = !DINamespace(name: "llvm", scope: null) +!6 = distinct !DISubroutineType(types: !2) +!7 = !DISubprogram(name: "TraverseIfStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters14TraverseIfStmtEPN5clang6IfStmtE", scope: !5, file: !1, line: 364, type: !6, scopeLine: 364, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized) +!8 = !DILocation(line: 982, column: 39, scope: !9, inlinedAt: !14, atomGroup: 6, atomRank: 2) +!9 = distinct !DISubprogram(name: "combine", linkageName: "_ZN12_GLOBAL__N_17PGOHash7combineENS0_8HashTypeE", scope: !5, file: !1, line: 966, type: !6, scopeLine: 966, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, declaration: !10, retainedNodes: !11, keyInstructions: true) +!10 = !DISubprogram(name: "combine", linkageName: "_ZN12_GLOBAL__N_17PGOHash7combineENS0_8HashTypeE", scope: !5, file: !1, line: 140, type: !6, scopeLine: 140, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized) +!11 = !{!12} +!12 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !13, entity: !5, file: !1, line: 973) +!13 = distinct !DILexicalBlock(scope: !9, file: !1, line: 972, column: 7) +!14 = distinct !DILocation(line: 393, column: 10, scope: !4) +!15 = distinct !DISubprogram(name: "VisitStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE", scope: !5, file: !1, line: 355, type: !6, scopeLine: 355, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, 
declaration: !16, retainedNodes: !2, keyInstructions: true) +!16 = !DISubprogram(name: "VisitStmt", linkageName: "_ZN12_GLOBAL__N_117MapRegionCounters9VisitStmtEPN5clang4StmtE", scope: !5, file: !1, line: 355, type: !6, scopeLine: 355, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagOptimized) +!17 = !DILocation(line: 982, column: 13, scope: !9, inlinedAt: !18) +!18 = distinct !DILocation(line: 360, column: 12, scope: !15) diff --git a/llvm/test/DebugInfo/Generic/inlined-static-var.ll b/llvm/test/DebugInfo/Generic/inlined-static-var.ll new file mode 100644 index 0000000000000..1d24646896d80 --- /dev/null +++ b/llvm/test/DebugInfo/Generic/inlined-static-var.ll @@ -0,0 +1,93 @@ +; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck --implicit-check-not "{{DW_TAG|NULL}}" %s + +; inline __attribute__((always_inline)) +; int removed() { static int A; return A++; } +; +; __attribute__((always_inline)) +; int not_removed() { static int B; return B++; } +; +; int foo() { return removed() + not_removed(); } + +; Ensure that global variables belong to the correct subprograms even if those +; subprograms are inlined. + +; CHECK: DW_TAG_compile_unit +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_abstract_origin {{.*}} "_Z11not_removedv" +; TODO: This variable should be emitted in abstract subprogram DIE. +; CHECK: DW_TAG_variable +; CHECK: DW_AT_name ("B") +; CHECK: NULL +; CHECK: DW_TAG_base_type +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_name ("removed") +; CHECK: DW_TAG_variable +; CHECK: DW_AT_name ("A") +; CHECK: NULL +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_name ("not_removed") +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_name ("foo") +; CHECK: DW_TAG_inlined_subroutine +; CHECK: DW_TAG_inlined_subroutine +; CHECK: NULL +; CHECK: NULL + +@_ZZ11not_removedvE1A = internal global i32 0, align 4, !dbg !0 +@_ZZ7removedvE1A = linkonce_odr dso_local global i32 0, align 4, !dbg !10 + +define dso_local i32 @_Z11not_removedv() !dbg !2 { + %1 = load i32, i32* @_ZZ11not_removedvE1A, align 4, !dbg !24 + %2 = add nsw i32 %1, 1, !dbg !24 + store i32 %2, i32* @_ZZ11not_removedvE1A, align 4, !dbg !24 + ret i32 %1, !dbg !25 +} + +define dso_local i32 @_Z3foov() !dbg !26 { + %1 = load i32, i32* @_ZZ7removedvE1A, align 4, !dbg !27 + %2 = add nsw i32 %1, 1, !dbg !27 + store i32 %2, i32* @_ZZ7removedvE1A, align 4, !dbg !27 + %3 = load i32, i32* @_ZZ11not_removedvE1A, align 4, !dbg !29 + %4 = add nsw i32 %3, 1, !dbg !29 + store i32 %4, i32* @_ZZ11not_removedvE1A, align 4, !dbg !29 + %5 = add nsw i32 %1, %3, !dbg !31 + ret i32 %5, !dbg !32 +} + +!llvm.dbg.cu = !{!7} +!llvm.module.flags = !{!14, !15, !16, !17, !18, !19, !20, !21, !22} +!llvm.ident = !{!23} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "B", scope: !2, file: !3, line: 5, type: !6, isLocal: true, isDefinition: true) +!2 = distinct !DISubprogram(name: "not_removed", linkageName: "_Z11not_removedv", scope: !3, file: !3, line: 5, type: !4, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13) +!3 = !DIFile(filename: "example.cpp", directory: "") +!4 = !DISubroutineType(types: !5) +!5 = !{!6} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !8, producer: "clang version 14.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !9, splitDebugInlining: false, nameTableKind: None) +!8 = !DIFile(filename: 
"example.cpp", directory: "") +!9 = !{!0, !10} +!10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression()) +!11 = distinct !DIGlobalVariable(name: "A", scope: !12, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true) +!12 = distinct !DISubprogram(name: "removed", linkageName: "_Z7removedv", scope: !3, file: !3, line: 2, type: !4, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13) +!13 = !{} +!14 = !{i32 7, !"Dwarf Version", i32 4} +!15 = !{i32 2, !"Debug Info Version", i32 3} +!16 = !{i32 1, !"wchar_size", i32 4} +!17 = !{i32 1, !"branch-target-enforcement", i32 0} +!18 = !{i32 1, !"sign-return-address", i32 0} +!19 = !{i32 1, !"sign-return-address-all", i32 0} +!20 = !{i32 1, !"sign-return-address-with-bkey", i32 0} +!21 = !{i32 7, !"uwtable", i32 1} +!22 = !{i32 7, !"frame-pointer", i32 1} +!23 = !{!"clang version 14.0.0"} +!24 = !DILocation(line: 5, column: 43, scope: !2) +!25 = !DILocation(line: 5, column: 35, scope: !2) +!26 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !3, file: !3, line: 7, type: !4, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !7, retainedNodes: !13) +!27 = !DILocation(line: 2, column: 39, scope: !12, inlinedAt: !28) +!28 = distinct !DILocation(line: 7, column: 20, scope: !26) +!29 = !DILocation(line: 5, column: 43, scope: !2, inlinedAt: !30) +!30 = distinct !DILocation(line: 7, column: 32, scope: !26) +!31 = !DILocation(line: 7, column: 30, scope: !26) +!32 = !DILocation(line: 7, column: 13, scope: !26) diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index 53bc0246c126e..3a625b299a96f 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -159,7 +159,7 @@ class InstrRefLDVTest : public testing::Test { // Setup things like the artifical block map, and BlockNo <=> RPO Order // mappings. 
LDV->initialSetup(*MF); - LDV->LS.initialize(*MF); + LDV->LS.scanFunction(*MF); addMTracker(MF); return &*LDV; } diff --git a/llvm/unittests/CodeGen/LexicalScopesTest.cpp b/llvm/unittests/CodeGen/LexicalScopesTest.cpp index 563d496d1e600..34bd37a4afdc2 100644 --- a/llvm/unittests/CodeGen/LexicalScopesTest.cpp +++ b/llvm/unittests/CodeGen/LexicalScopesTest.cpp @@ -44,6 +44,7 @@ class LexicalScopesTest : public testing::Test { std::unique_ptr MF; DICompileUnit *OurCU; DIFile *OurFile; + DISubroutineType *OurSubT; DISubprogram *OurFunc; DILexicalBlock *OurBlock, *AnotherBlock; DISubprogram *ToInlineFunc; @@ -103,7 +104,7 @@ class LexicalScopesTest : public testing::Test { OurFile = DIB.createFile("xyzzy.c", "/cave"); OurCU = DIB.createCompileUnit(dwarf::DW_LANG_C99, OurFile, "nou", false, "", 0); - auto OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({})); + OurSubT = DIB.createSubroutineType(DIB.getOrCreateTypeArray({})); OurFunc = DIB.createFunction(OurCU, "bees", "", OurFile, 1, OurSubT, 1, DINode::FlagZero, DISubprogram::SPFlagDefinition); @@ -136,10 +137,10 @@ TEST_F(LexicalScopesTest, FlatLayout) { LexicalScopes LS; EXPECT_TRUE(LS.empty()); - LS.reset(); + LS.resetFunction(); EXPECT_EQ(LS.getCurrentFunctionScope(), nullptr); - LS.initialize(*MF); + LS.scanFunction(*MF); EXPECT_FALSE(LS.empty()); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); EXPECT_EQ(FuncScope->getParent(), nullptr); @@ -182,7 +183,7 @@ TEST_F(LexicalScopesTest, BlockScopes) { BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); EXPECT_EQ(FuncScope->getDesc(), OurFunc); auto &Children = FuncScope->getChildren(); @@ -217,7 +218,7 @@ TEST_F(LexicalScopesTest, InlinedScopes) { BuildMI(*MBB4, MBB4->end(), InlinedLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); auto &Children = FuncScope->getChildren(); ASSERT_EQ(Children.size(), 1u); @@ -252,7 +253,7 @@ TEST_F(LexicalScopesTest, FuncWithEmptyGap) { BuildMI(*MBB4, MBB4->end(), OutermostLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); // A gap in a range that contains no other location, is not actually a @@ -273,7 +274,7 @@ TEST_F(LexicalScopesTest, FuncWithRealGap) { MachineInstr *LastI = BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); ASSERT_NE(BlockScope, nullptr); @@ -306,7 +307,7 @@ TEST_F(LexicalScopesTest, NotNested) { MachineInstr *FourthI = BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get()); @@ -344,7 +345,7 @@ TEST_F(LexicalScopesTest, TestDominates) { BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst); LexicalScopes LS; - LS.initialize(*MF); + LS.scanFunction(*MF); LexicalScope *FuncScope = LS.getCurrentFunctionScope(); LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get()); LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get()); @@ -386,7 +387,7 @@ TEST_F(LexicalScopesTest, TestGetBlocks) { BuildMI(*MBB4, MBB4->end(), InBlockLoc, 
BeanInst);

   LexicalScopes LS;
-  LS.initialize(*MF);
+  LS.scanFunction(*MF);
   LexicalScope *FuncScope = LS.getCurrentFunctionScope();
   LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get());
   LexicalScope *OtherBlockScope = LS.findLexicalScope(NotNestedBlockLoc.get());
@@ -443,7 +444,7 @@ TEST_F(LexicalScopesTest, TestMetaInst) {
   BuildMI(*MBB4, MBB4->end(), InBlockLoc, BeanInst);

   LexicalScopes LS;
-  LS.initialize(*MF);
+  LS.scanFunction(*MF);
   LexicalScope *FuncScope = LS.getCurrentFunctionScope();
   LexicalScope *BlockScope = LS.findLexicalScope(InBlockLoc.get());
   ASSERT_NE(FuncScope, nullptr);
@@ -459,4 +460,24 @@ TEST_F(LexicalScopesTest, TestMetaInst) {
   EXPECT_TRUE(LS.dominates(InBlockLoc.get(), MBB4));
 }

+// Test function map creation.
+TEST_F(LexicalScopesTest, TestFunctionScan) {
+  auto MF2 = createMachineFunction(Ctx, Mod, "Test2");
+  DIBuilder DIB(Mod, false, OurCU);
+  DISubprogram *Func2 =
+      DIB.createFunction(OurCU, "Func2", "", OurFile, 1, OurSubT, 1,
+                         DINode::FlagZero, DISubprogram::SPFlagDefinition);
+  DISubprogram *UnattachedFunc =
+      DIB.createFunction(OurCU, "UnattachedFunc", "", OurFile, 1, OurSubT, 1,
+                         DINode::FlagZero, DISubprogram::SPFlagDefinition);
+  MF2->getFunction().setSubprogram(Func2);
+  DIB.finalize();
+
+  LexicalScopes LS;
+  LS.initialize(Mod);
+  ASSERT_EQ(LS.getFunction(OurFunc), &MF->getFunction());
+  ASSERT_EQ(LS.getFunction(Func2), &MF2->getFunction());
+  ASSERT_EQ(LS.getFunction(UnattachedFunc), nullptr);
+}
+
 } // anonymous namespace
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index cb4a2410df08b..a86a68cb4adf1 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -132,10 +132,10 @@ BogusTargetMachine *createTargetMachine() {
   return &BogusTM;
 }

-std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
-                                                       Module &M) {
+std::unique_ptr<MachineFunction>
+createMachineFunction(LLVMContext &Ctx, Module &M, const Twine &Name = "Test") {
   auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
-  auto F = Function::Create(Type, GlobalValue::ExternalLinkage, "Test", &M);
+  auto F = Function::Create(Type, GlobalValue::ExternalLinkage, Name, &M);
   auto TM = createTargetMachine();

   unsigned FunctionNum = 42;
@@ -145,4 +145,3 @@ std::unique_ptr<MachineFunction> createMachineFunction(LLVMContext &Ctx,
   return std::make_unique<MachineFunction>(*F, *TM, STI, MMI.getContext(),
                                            FunctionNum);
 }
-

From b555c991e4208cd4829b2dfd0bcb8b2afd0c1292 Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig
Date: Mon, 29 Sep 2025 18:15:51 +0530
Subject: [PATCH 113/878] [clang][modules] Ensure -nostdlib causes no manifest
 to be reported (#161110)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When -nostdlib is specified, Clang should not report any library‑provided
module manifest, even if a manifest for the default standard library is
present.
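The expected driver behavior, sketched in the test file's own idiom (an
assumed check line; `<NOT PRESENT>` is the sentinel that
GetStdModuleManifestPath falls back to when no manifest is found):

    // RUN: %clang -print-library-module-manifest-path -nostdlib \
    // RUN:     --target=x86_64-linux-gnu 2>&1 | FileCheck %s
    // CHECK: <NOT PRESENT>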
---
 clang/lib/Driver/Driver.cpp                                    | 3 +++
 .../Driver/modules-print-library-module-manifest-path.cpp      | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f110dbab3e5a5..85a1335785542 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6613,6 +6613,9 @@ std::string Driver::GetStdModuleManifestPath(const Compilation &C,
                                              const ToolChain &TC) const {
   std::string error = "<NOT PRESENT>";

+  if (C.getArgs().hasArg(options::OPT_nostdlib))
+    return error;
+
   switch (TC.GetCXXStdlibType(C.getArgs())) {
   case ToolChain::CST_Libcxx: {
     auto evaluate = [&](const char *library) -> std::optional<std::string> {
diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
index 7606713bfa22a..af0f124477cf8 100644
--- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -18,6 +18,14 @@
 // RUN:     --target=x86_64-linux-gnu 2>&1 \
 // RUN:   | FileCheck libcxx.cpp

+// check that -nostdlib causes no library-provided module manifest to
+// be reported, even when libc++.modules.json is present.
+// RUN: %clang -print-library-module-manifest-path \
+// RUN:     -nostdlib \
+// RUN:     -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN:     --target=x86_64-linux-gnu 2>&1 \
+// RUN:   | FileCheck libcxx-no-module-json.cpp
+
 // for macos there is a different directory structure
 // where the library and libc++.modules.json file are in lib
 // directly but headers are in clang/ver directory which

From 492bcffea4c2fb1d5e0b0c273166f2b3c847e4f8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 29 Sep 2025 13:47:44 +0100
Subject: [PATCH 114/878] [X86] ftrunc.ll - add nounwind to silence cfi noise
 (#161186)

---
 llvm/test/CodeGen/X86/ftrunc.ll | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index 3ed98589767fb..9095fb1550e70 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -243,7 +243,7 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
   ret <4 x double> %r
 }

-define float @trunc_signed_f32_no_fast_math(float %x) {
+define float @trunc_signed_f32_no_fast_math(float %x) nounwind {
 ; SSE-LABEL: trunc_signed_f32_no_fast_math:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
@@ -259,14 +259,12 @@ define float @trunc_signed_f32_no_fast_math(float %x) {
 ; X86-AVX1-LABEL: trunc_signed_f32_no_fast_math:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    pushl %eax
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vmovss %xmm0, (%esp)
 ; X86-AVX1-NEXT:    flds (%esp)
 ; X86-AVX1-NEXT:    popl %eax
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    retl
   %i = fptosi float %x to i32
   %r = sitofp i32 %i to float
@@ -306,7 +304,7 @@ define float @trunc_signed_f32_nsz(float %x) #0 {
   ret float %r
 }

-define double @trunc_signed32_f64_no_fast_math(double %x) {
+define double @trunc_signed32_f64_no_fast_math(double %x) nounwind {
 ; SSE-LABEL: trunc_signed32_f64_no_fast_math:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
@@ -322,10 +320,7 @@ define double @trunc_signed32_f64_no_fast_math(double %x) {
 ; X86-AVX1-LABEL: trunc_signed32_f64_no_fast_math:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    pushl %ebp
-; X86-AVX1-NEXT:
.cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: .cfi_offset %ebp, -8 ; X86-AVX1-NEXT: movl %esp, %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp ; X86-AVX1-NEXT: andl $-8, %esp ; X86-AVX1-NEXT: subl $8, %esp ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -335,7 +330,6 @@ define double @trunc_signed32_f64_no_fast_math(double %x) { ; X86-AVX1-NEXT: fldl (%esp) ; X86-AVX1-NEXT: movl %ebp, %esp ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 ; X86-AVX1-NEXT: retl %i = fptosi double %x to i32 %r = sitofp i32 %i to double @@ -377,7 +371,7 @@ define double @trunc_signed32_f64_nsz(double %x) #0 { ret double %r } -define double @trunc_f32_signed32_f64_no_fast_math(float %x) { +define double @trunc_f32_signed32_f64_no_fast_math(float %x) nounwind { ; SSE-LABEL: trunc_f32_signed32_f64_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 @@ -393,10 +387,7 @@ define double @trunc_f32_signed32_f64_no_fast_math(float %x) { ; X86-AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: .cfi_offset %ebp, -8 ; X86-AVX1-NEXT: movl %esp, %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp ; X86-AVX1-NEXT: andl $-8, %esp ; X86-AVX1-NEXT: subl $8, %esp ; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -406,7 +397,6 @@ define double @trunc_f32_signed32_f64_no_fast_math(float %x) { ; X86-AVX1-NEXT: fldl (%esp) ; X86-AVX1-NEXT: movl %ebp, %esp ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 ; X86-AVX1-NEXT: retl %i = fptosi float %x to i32 %r = sitofp i32 %i to double @@ -445,7 +435,7 @@ define double @trunc_f32_signed32_f64_nsz(float %x) #0 { ret double %r } -define float @trunc_f64_signed32_f32_no_fast_math(double %x) { +define float @trunc_f64_signed32_f32_no_fast_math(double %x) nounwind { ; SSE-LABEL: trunc_f64_signed32_f32_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 @@ -461,14 +451,12 @@ define float @trunc_f64_signed32_f32_no_fast_math(double %x) { ; X86-AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %eax -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0 ; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ; X86-AVX1-NEXT: flds (%esp) ; X86-AVX1-NEXT: popl %eax -; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX1-NEXT: retl %i = fptosi double %x to i32 %r = sitofp i32 %i to float @@ -503,7 +491,7 @@ define float @trunc_f64_signed32_f32_nsz(double %x) #0 { ret float %r } -define double @trunc_signed_f64_no_fast_math(double %x) { +define double @trunc_signed_f64_no_fast_math(double %x) nounwind { ; SSE-LABEL: trunc_signed_f64_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttsd2si %xmm0, %rax @@ -520,10 +508,7 @@ define double @trunc_signed_f64_no_fast_math(double %x) { ; X86-AVX1-LABEL: trunc_signed_f64_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: .cfi_offset %ebp, -8 ; X86-AVX1-NEXT: movl %esp, %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp ; X86-AVX1-NEXT: andl $-8, %esp ; X86-AVX1-NEXT: subl $24, %esp ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -537,7 +522,6 @@ define double @trunc_signed_f64_no_fast_math(double %x) { ; X86-AVX1-NEXT: fldl {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: movl %ebp, %esp ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 ; X86-AVX1-NEXT: retl %i = fptosi 
double %x to i64 %r = sitofp i64 %i to double From 99f296d2a8119fabbc9cbf6e16a3521d920a44e5 Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Mon, 29 Sep 2025 21:01:48 +0800 Subject: [PATCH 115/878] [CodeGen][test] Fix Buildbot failure due to uninitialized variables (#161085) Under some options used by LLVM Buildbot, an uninitialized variable (recently added to the test suite) caused constant evaluation failure, even though the type of that variable is an empty class. This PR explicitly initializes the variables with `{}` to fix the error. Follow-up to a558d656043734cc4d02e0a0a12e4c308c28f8c7. --- llvm/unittests/CodeGen/TypeTraitsTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/unittests/CodeGen/TypeTraitsTest.cpp b/llvm/unittests/CodeGen/TypeTraitsTest.cpp index 1c8852fc1f071..f0ed0e870cbb3 100644 --- a/llvm/unittests/CodeGen/TypeTraitsTest.cpp +++ b/llvm/unittests/CodeGen/TypeTraitsTest.cpp @@ -39,7 +39,7 @@ static_assert(std::is_trivially_copyable_v, template constexpr bool CheckStdCmpRequirements() { // std::less and std::equal_to are literal, default constructible, and // copyable classes. - Fn f1; + Fn f1{}; auto f2 = f1; auto f3 = std::move(f2); f2 = f3; From 4fe1a8736b198a7cbbad4885f5ad01b3855b263d Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Mon, 29 Sep 2025 21:11:01 +0800 Subject: [PATCH 116/878] [X86][APX] Promote 8/16-bit LEA to 32-bit to avoid partial dependence (#161051) --- llvm/lib/Target/X86/X86InstrArithmetic.td | 23 +++++++++---------- .../CodeGen/X86/apx/ndd-neg-addr-index.ll | 7 +++--- llvm/test/CodeGen/X86/lea-16bit.ll | 3 ++- llvm/test/CodeGen/X86/lea-8bit.ll | 3 ++- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index b476859069a57..031fdc1e7162c 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -25,18 +25,12 @@ let SchedRW = [WriteLEA] in { [(set GR32:$dst, lea32addr:$src)]>, OpSize32, Requires<[Not64BitMode]>; - let Predicates = [HasNDD], isCodeGenOnly = 1 in { - def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR8:$dst), (ins lea64_8mem:$src), - "lea{b}\t{$src|$dst}, {$dst|$src}", - [(set GR8:$dst, lea64_iaddr:$src)]>, - OpSize16, - Requires<[In64BitMode]>; - - def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins lea64_16mem:$src), - "lea{w}\t{$src|$dst}, {$dst|$src}", - [(set GR16:$dst, lea64_iaddr:$src)]>, - OpSize16, - Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in { + def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_8mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32; + + def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_16mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32; } def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src), @@ -51,6 +45,11 @@ let SchedRW = [WriteLEA] in { [(set GR64:$dst, lea64addr:$src)]>; } // SchedRW +let Predicates = [HasNDD] in { + def : Pat<(i8 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_8r lea64_8mem:$src), sub_8bit)>; + def : Pat<(i16 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_16r lea64_16mem:$src), sub_16bit)>; +} + // Pseudo instruction for lea that prevent optimizer from eliminating // the instruction.
let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in { diff --git a/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll index 6679b5f58e8c1..41fa34667af86 100644 --- a/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll +++ b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll @@ -8,7 +8,7 @@ define void @neg_8bit_1(i1 %cmp) { ; NDD-NEXT: andb $1, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xe7,0x01] ; NDD-NEXT: movzbl 0, %ecx # encoding: [0x0f,0xb6,0x0c,0x25,0x00,0x00,0x00,0x00] ; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8] -; NDD-NEXT: leab 2(%rcx,%rax), %al # encoding: [0x66,0x8d,0x44,0x01,0x02] +; NDD-NEXT: leal 2(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0x02] ; NDD-NEXT: movb %al, 0 # encoding: [0x88,0x04,0x25,0x00,0x00,0x00,0x00] ; NDD-NEXT: retq # encoding: [0xc3] entry: @@ -25,7 +25,8 @@ define void @neg_8bit_2(i8 %int8) { ; NDD-NEXT: # kill: def $edi killed $edi def $rdi ; NDD-NEXT: addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff] ; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8] -; NDD-NEXT: leab 1(%rdi,%rax), %al # encoding: [0x66,0x8d,0x44,0x07,0x01] +; NDD-NEXT: leal 1(%rdi,%rax), %eax # encoding: [0x8d,0x44,0x07,0x01] +; NDD-NEXT: # kill: def $al killed $al killed $eax ; NDD-NEXT: mulb %dil # encoding: [0x40,0xf6,0xe7] ; NDD-NEXT: testb %al, %al # encoding: [0x84,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] @@ -55,7 +56,7 @@ define i32 @neg_16bit(i16 %0) { ; NDD-NEXT: cmovsl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x48,0xc1] ; NDD-NEXT: andw $-256, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x25,0x00,0xff] ; NDD-NEXT: negw %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd8] -; NDD-NEXT: leaw 1(%rdi,%rax), %ax # encoding: [0x66,0x8d,0x44,0x07,0x01] +; NDD-NEXT: leal 1(%rdi,%rax), %eax # encoding: [0x8d,0x44,0x07,0x01] ; NDD-NEXT: movzwl %ax, %eax # encoding: [0x0f,0xb7,0xc0] ; NDD-NEXT: movq %rax, 0 # encoding: [0x48,0x89,0x04,0x25,0x00,0x00,0x00,0x00] ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] diff --git a/llvm/test/CodeGen/X86/lea-16bit.ll b/llvm/test/CodeGen/X86/lea-16bit.ll index cec29ab1da6ab..40da01d9ab8f3 100644 --- a/llvm/test/CodeGen/X86/lea-16bit.ll +++ b/llvm/test/CodeGen/X86/lea-16bit.ll @@ -13,7 +13,8 @@ define i16 @lea16bit(i16 %in) { ; NDD-LABEL: lea16bit: ; NDD: # %bb.0: ; NDD-NEXT: # kill: def $edi killed $edi def $rdi -; NDD-NEXT: leaw 1(%rdi,%rdi), %ax +; NDD-NEXT: leal 1(%rdi,%rdi), %eax +; NDD-NEXT: # kill: def $ax killed $ax killed $eax ; NDD-NEXT: retq %shl = shl i16 %in, 1 %or = or i16 %shl, 1 diff --git a/llvm/test/CodeGen/X86/lea-8bit.ll b/llvm/test/CodeGen/X86/lea-8bit.ll index 98222dfc0407c..fc295f75e23c7 100644 --- a/llvm/test/CodeGen/X86/lea-8bit.ll +++ b/llvm/test/CodeGen/X86/lea-8bit.ll @@ -14,7 +14,8 @@ define i8 @lea8bit(i8 %in) { ; NDD-LABEL: lea8bit: ; NDD: # %bb.0: ; NDD-NEXT: # kill: def $edi killed $edi def $rdi -; NDD-NEXT: leab 1(%rdi,%rdi), %al +; NDD-NEXT: leal 1(%rdi,%rdi), %eax +; NDD-NEXT: # kill: def $al killed $al killed $eax ; NDD-NEXT: retq %shl = shl i8 %in, 1 %or = or i8 %shl, 1 From 31818fb5f59184dfb44cc504d43c86da0b234cb1 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 29 Sep 2025 11:13:02 +0100 Subject: [PATCH 117/878] Reland "[clang][DebugInfo][NFC] Simplify CollectRecordLambdaFields" This reverts commit 99a29f640809f32d1271ed5cac9764b839daeed1. 
The original change was reverted because the following assertion started firing: ``` clang++: clang/include/clang/AST/LambdaCapture.h:105: ValueDecl *clang::LambdaCapture::getCapturedVar() const: Assertion `capturesVariable() && "No variable available for capture"' failed. PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script. Stack dump: 0.Program arguments: ../../prebuilt/third_party/clang/custom/bin/clang++ -MD -MF host_x64/obj/third_party/android/platform/system/libbase/libbase.logging.cpp.o.d -D_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS -D_LIBCPP_REMOVE_TRANSITIVE_INCLUDES -I../.. -Ihost_x64/gen -I../../third_party/android/platform/system/libbase/include -I../../third_party/fmtlib/src/include -I../../third_party/android/platfo...com 1. parser at end of file 2.Per-file LLVM IR generation clang++: error: clang frontend command failed with exit code 134 (use -v to see invocation) Fuchsia clang version 22.0.0git (https://llvm.googlesource.com/llvm-project 8553bd2b29ad2b17a9a884f14da6c43b606ec776) ******************** ``` The relanded patch just adds a `Capture.capturesVariable()` check before calling `getCapturedVar`. That's what the code did before the refactor. --- clang/lib/CodeGen/CGDebugInfo.cpp | 64 +++++++++++++++++++------------ clang/lib/CodeGen/CGDebugInfo.h | 1 + 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 12c7d48e20d67..fee6bc0cbb64b 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -26,6 +26,7 @@ #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" +#include "clang/AST/LambdaCapture.h" #include "clang/AST/RecordLayout.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/VTableBuilder.h" @@ -1903,46 +1904,61 @@ CGDebugInfo::createInlinedSubprogram(StringRef FuncName, return SP; } +llvm::StringRef +CGDebugInfo::GetLambdaCaptureName(const LambdaCapture &Capture) { + if (Capture.capturesThis()) + return CGM.getCodeGenOpts().EmitCodeView ? "__this" : "this"; + + assert(Capture.capturesVariable()); + + const ValueDecl *CaptureDecl = Capture.getCapturedVar(); + assert(CaptureDecl && "Expected valid decl for captured variable."); + + return CaptureDecl->getName(); +} + void CGDebugInfo::CollectRecordLambdaFields( const CXXRecordDecl *CXXDecl, SmallVectorImpl &elements, llvm::DIType *RecordTy) { // For C++11 Lambdas a Field will be the same as a Capture, but the Capture // has the name and the location of the variable so we should iterate over // both concurrently.
- const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(CXXDecl); RecordDecl::field_iterator Field = CXXDecl->field_begin(); unsigned fieldno = 0; for (CXXRecordDecl::capture_const_iterator I = CXXDecl->captures_begin(), E = CXXDecl->captures_end(); I != E; ++I, ++Field, ++fieldno) { - const LambdaCapture &C = *I; - if (C.capturesVariable()) { - SourceLocation Loc = C.getLocation(); - assert(!Field->isBitField() && "lambdas don't have bitfield members!"); - ValueDecl *V = C.getCapturedVar(); - StringRef VName = V->getName(); - llvm::DIFile *VUnit = getOrCreateFile(Loc); - auto Align = getDeclAlignIfRequired(V, CGM.getContext()); - llvm::DIType *FieldType = createFieldType( - VName, Field->getType(), Loc, Field->getAccess(), - layout.getFieldOffset(fieldno), Align, VUnit, RecordTy, CXXDecl); - elements.push_back(FieldType); - } else if (C.capturesThis()) { + const LambdaCapture &Capture = *I; + const uint64_t FieldOffset = + CGM.getContext().getASTRecordLayout(CXXDecl).getFieldOffset(fieldno); + + assert(!Field->isBitField() && "lambdas don't have bitfield members!"); + + SourceLocation Loc; + uint32_t Align = 0; + + if (Capture.capturesThis()) { // TODO: Need to handle 'this' in some way by probably renaming the // this of the lambda class and having a field member of 'this' or // by using AT_object_pointer for the function and having that be // used as 'this' for semantic references. - FieldDecl *f = *Field; - llvm::DIFile *VUnit = getOrCreateFile(f->getLocation()); - QualType type = f->getType(); - StringRef ThisName = - CGM.getCodeGenOpts().EmitCodeView ? "__this" : "this"; - llvm::DIType *fieldType = createFieldType( - ThisName, type, f->getLocation(), f->getAccess(), - layout.getFieldOffset(fieldno), VUnit, RecordTy, CXXDecl); - - elements.push_back(fieldType); + Loc = Field->getLocation(); + } else if (Capture.capturesVariable()) { + Loc = Capture.getLocation(); + + const ValueDecl *CaptureDecl = Capture.getCapturedVar(); + assert(CaptureDecl && "Expected valid decl for captured variable."); + + Align = getDeclAlignIfRequired(CaptureDecl, CGM.getContext()); + } else { + continue; } + + llvm::DIFile *VUnit = getOrCreateFile(Loc); + + elements.push_back(createFieldType( + GetLambdaCaptureName(Capture), Field->getType(), Loc, + Field->getAccess(), FieldOffset, Align, VUnit, RecordTy, CXXDecl)); } } diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index f86077369a42a..78c3eb9c5792e 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -397,6 +397,7 @@ class CGDebugInfo { void CollectRecordFields(const RecordDecl *Decl, llvm::DIFile *F, SmallVectorImpl &E, llvm::DICompositeType *RecordTy); + llvm::StringRef GetLambdaCaptureName(const LambdaCapture &Capture); /// If the C++ class has vtable info then insert appropriate debug /// info entry in EltTys vector. From 766c90f43966fe9eb438e5a6f6378f4e3e0bc37e Mon Sep 17 00:00:00 2001 From: Chaitanya Koparkar Date: Mon, 29 Sep 2025 09:21:59 -0400 Subject: [PATCH 118/878] [VectorCombine] foldShuffleOfCastops - handle unary shuffles (#160009) Fixes #156853. 
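For illustration only (not part of this commit): a minimal IRBuilder sketch of the unary pattern the fold now recognizes, a single-source shufflevector fed by one cast. The helper name and the concrete types and mask are hypothetical; after this change, VectorCombine can rewrite such a shuffle(zext(x)) into zext(shuffle(x)) when the cost model finds the narrower shuffle cheaper.

```cpp
#include "llvm/IR/IRBuilder.h"

// Builds shuffle(zext <8 x i8> %x to <8 x i16>, poison, <0,1,2,3>), the
// "before" shape. foldShuffleOfCastops may now turn it into
// zext(shuffle(%x, poison, <0,1,2,3>)), i.e. the cast follows the shuffle.
llvm::Value *buildUnaryShuffleOfZExt(llvm::IRBuilder<> &B, llvm::Value *X) {
  auto *WideTy = llvm::FixedVectorType::get(B.getInt16Ty(), 8);
  llvm::Value *Ext = B.CreateZExt(X, WideTy);      // <8 x i8> -> <8 x i16>
  return B.CreateShuffleVector(Ext, {0, 1, 2, 3}); // unary: 2nd operand poison
}
```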
--- .../Transforms/Vectorize/VectorCombine.cpp | 68 ++++++--- .../AArch64/combine-shuffle-ext.ll | 134 +++++++++--------- .../AArch64/shuffletoidentity.ll | 17 ++- .../VectorCombine/X86/shuffle-of-casts.ll | 56 ++++++++ 4 files changed, 177 insertions(+), 98 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index cbdc621f1878b..32704bdb54f4f 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2487,21 +2487,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) { if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask)))) return false; + // Check whether this is a binary shuffle. + bool IsBinaryShuffle = !isa(V1); + auto *C0 = dyn_cast(V0); auto *C1 = dyn_cast(V1); - if (!C0 || !C1) + if (!C0 || (IsBinaryShuffle && !C1)) return false; Instruction::CastOps Opcode = C0->getOpcode(); - if (C0->getSrcTy() != C1->getSrcTy()) + + // If this is allowed, foldShuffleOfCastops can get stuck in a loop + // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle. + if (!IsBinaryShuffle && Opcode == Instruction::BitCast) return false; - // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds. - if (Opcode != C1->getOpcode()) { - if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value()))) - Opcode = Instruction::SExt; - else + if (IsBinaryShuffle) { + if (C0->getSrcTy() != C1->getSrcTy()) return false; + // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds. + if (Opcode != C1->getOpcode()) { + if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value()))) + Opcode = Instruction::SExt; + else + return false; + } } auto *ShuffleDstTy = dyn_cast(I.getType()); @@ -2544,23 +2554,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) { InstructionCost CostC0 = TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy, TTI::CastContextHint::None, CostKind); - InstructionCost CostC1 = - TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, - TTI::CastContextHint::None, CostKind); - InstructionCost OldCost = CostC0 + CostC1; - OldCost += - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy, - CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I); - InstructionCost NewCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy, - CastSrcTy, NewMask, CostKind); + TargetTransformInfo::ShuffleKind ShuffleKind; + if (IsBinaryShuffle) + ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc; + else + ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc; + + InstructionCost OldCost = CostC0; + OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask, + CostKind, 0, nullptr, {}, &I); + + InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy, + CastSrcTy, NewMask, CostKind); NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy, TTI::CastContextHint::None, CostKind); if (!C0->hasOneUse()) NewCost += CostC0; - if (!C1->hasOneUse()) - NewCost += CostC1; + if (IsBinaryShuffle) { + InstructionCost CostC1 = + TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy, + TTI::CastContextHint::None, CostKind); + OldCost += CostC1; + if (!C1->hasOneUse()) + NewCost += CostC1; + } LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost @@ -2568,14 +2586,20 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) { if (NewCost > OldCost) 
return false; - Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0), - C1->getOperand(0), NewMask); + Value *Shuf; + if (IsBinaryShuffle) + Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0), + NewMask); + else + Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask); + Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy); // Intersect flags from the old casts. if (auto *NewInst = dyn_cast(Cast)) { NewInst->copyIRFlags(C0); - NewInst->andIRFlags(C1); + if (IsBinaryShuffle) + NewInst->andIRFlags(C1); } Worklist.pushValue(Shuf); diff --git a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll index 6341c8945247d..1503a1b51d256 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll @@ -14,9 +14,9 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -36,9 +36,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -58,9 +58,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -80,9 +80,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 
x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -102,9 +102,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -125,9 +125,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) { ; CHECK-NEXT: call void @use.i32(i32 0) ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -148,9 +148,9 @@ define <4 x i32> @load_i32_sext_zext_to_v4i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -170,9 +170,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_load_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; 
CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: call void @use.i32(i32 [[L]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -194,9 +194,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_ins_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: call void @use.v2i32(<2 x i32> [[VEC_INS]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -218,9 +218,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_bc_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: call void @use.v8i8(<8 x i8> [[VEC_BC]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -266,10 +266,10 @@ define <4 x i32> @load_i32_zext_to_v4i32_shuffle_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> -; CHECK-NEXT: call void @use.v8i16(<4 x i16> [[VEC_SHUFFLE]]) +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> +; CHECK-NEXT: call void @use.v8i16(<4 x i16> [[E_1]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -290,9 +290,9 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> , i64 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <16 x i8> [[VEC_BC]] to <16 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x 
i32> -; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[VEC_SHUFFLE]] to <8 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_SHUFFLE]] to <8 x i16> +; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[EXT_1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]] ; entry: @@ -312,9 +312,9 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> , i24 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <6 x i8> [[VEC_BC]] to <6 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[VEC_SHUFFLE]] to <3 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <3 x i8> [[VEC_SHUFFLE]] to <3 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[EXT_1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[EXT_2]] ; entry: @@ -334,9 +334,9 @@ define <4 x i32> @load_i32_insert_idx_1_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 1 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -356,9 +356,9 @@ define <4 x i32> @mask_extracts_not_all_elements_1_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -378,9 +378,9 @@ define <4 x i32> @mask_extracts_not_all_elements_2_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> 
[[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -422,9 +422,9 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -444,9 +444,9 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> , i64 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <16 x i8> [[VEC_BC]] to <16 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[VEC_SHUFFLE]] to <8 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_SHUFFLE]] to <8 x i16> +; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[EXT_1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]] ; entry: @@ -466,9 +466,9 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> , i24 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <6 x i8> [[VEC_BC]] to <6 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[VEC_SHUFFLE]] to <3 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <3 x i8> [[VEC_SHUFFLE]] to <3 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[EXT_1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[EXT_2]] ; entry: @@ -488,9 +488,9 @@ define <4 x i32> @load_i32_insert_idx_1(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 1 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -510,9 +510,9 @@ define <4 x i32> @mask_extracts_not_all_elements_1(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: 
[[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -532,9 +532,9 @@ define <4 x i32> @mask_extracts_not_all_elements_2(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index acbc836ffcab0..ed29719d49493 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -205,8 +205,8 @@ define <8 x i8> @abs_different(<8 x i8> %a) { define <4 x i32> @poison_intrinsic(<2 x i16> %l256) { ; CHECK-LABEL: @poison_intrinsic( ; CHECK-NEXT: [[L266:%.*]] = call <2 x i16> @llvm.abs.v2i16(<2 x i16> [[L256:%.*]], i1 false) -; CHECK-NEXT: [[L267:%.*]] = zext <2 x i16> [[L266]] to <2 x i32> -; CHECK-NEXT: [[L271:%.*]] = shufflevector <2 x i32> [[L267]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[L267:%.*]] = shufflevector <2 x i16> [[L266]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[L271:%.*]] = zext <4 x i16> [[L267]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[L271]] ; %l266 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %l256, i1 false) @@ -534,9 +534,9 @@ define <4 x i64> @single_zext(<4 x i32> %x) { define <4 x i64> @not_zext(<4 x i32> %x) { ; CHECK-LABEL: @not_zext( -; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[X:%.*]] to <4 x i64> -; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[REVSHUF]] +; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[REVSHUF:%.*]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[ZEXT]] ; %zext = zext <4 x i32> %x to <4 x i64> %revshuf = shufflevector <4 x i64> %zext, <4 x i64> poison, <4 x i32> @@ -922,10 +922,9 @@ define <4 x i8> @singleop(<4 x i8> %a, <4 x i8> %b) { define <4 x i64> @cast_mismatched_types(<4 x i32> %x) { ; CHECK-LABEL: @cast_mismatched_types( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i32> [[SHUF]] to <2 x i64> -; CHECK-NEXT: [[EXTSHUF:%.*]] = shufflevector <2 x i64> [[ZEXT]], <2 x i64> poison, <4 
x i32> -; CHECK-NEXT: ret <4 x i64> [[EXTSHUF]] +; CHECK-SAME: <4 x i32> [[X:%.*]]) { +; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[X]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[ZEXT]] ; %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> %zext = zext <2 x i32> %shuf to <2 x i64> diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll index fba4b60ef417b..82a739964c9d0 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll @@ -342,3 +342,59 @@ define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> ret <16 x i32> %r } + +; Unary shuffles + +define <4 x i16> @unary_shuffle_zext_v8i8_v4i16(<8 x i8> %a0) { +; CHECK-LABEL: define <4 x i16> @unary_shuffle_zext_v8i8_v4i16( +; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[A0]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[X1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: ret <4 x i16> [[X1]] +; + %x1 = zext <8 x i8> %a0 to <8 x i16> + %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> + ret <4 x i16> %vec.shuffle +} + +define <4 x i16> @unary_shuffle_sext_v8i8_v4i16(<8 x i8> %a0) { +; CHECK-LABEL: define <4 x i16> @unary_shuffle_sext_v8i8_v4i16( +; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[A0]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[X1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: ret <4 x i16> [[X1]] +; + %x1 = sext <8 x i8> %a0 to <8 x i16> + %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> + ret <4 x i16> %vec.shuffle +} + +; negative - avoid loop with foldBitcastOfShuffle + +define <2 x i32> @unary_shuffle_bitcast_v8i8_v2i32(<8 x i8> %a0) { +; CHECK-LABEL: define <2 x i32> @unary_shuffle_bitcast_v8i8_v2i32( +; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X1:%.*]] = bitcast <8 x i8> [[A0]] to <2 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <2 x i32> [[X1]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[VEC_SHUFFLE]] +; + %x1 = bitcast <8 x i8> %a0 to <2 x i32> + %vec.shuffle = shufflevector <2 x i32> %x1, <2 x i32> poison, <2 x i32> + ret <2 x i32> %vec.shuffle +} + +; negative - multiuse + +define <4 x i16> @unary_shuffle_sext_v8i8_v4i16_multiuse(<8 x i8> %a0, ptr %a1) { +; CHECK-LABEL: define <4 x i16> @unary_shuffle_sext_v8i8_v4i16_multiuse( +; CHECK-SAME: <8 x i8> [[A0:%.*]], ptr [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X1:%.*]] = sext <8 x i8> [[A0]] to <8 x i16> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[X1]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: store <8 x i16> [[X1]], ptr [[A1]], align 16 +; CHECK-NEXT: ret <4 x i16> [[VEC_SHUFFLE]] +; + %x1 = sext <8 x i8> %a0 to <8 x i16> + %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> + store <8 x i16> %x1, ptr %a1, align 16 + ret <4 x i16> %vec.shuffle +} From d8a8d1fc56a6d8ad71157f087548bef31864cb45 Mon Sep 17 00:00:00 2001 From: jiang1997 Date: Mon, 29 Sep 2025 21:25:49 +0800 Subject: [PATCH 119/878] [MLIR][MemRef] Change builders with `int` alignment params to `llvm::MaybeAlign` (#159449) Change remaining OpBuilder methods to use `llvm::MaybeAlign` instead of `uint64_t` for alignment parameters. 
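As a hedged usage sketch (the helper and its arguments are illustrative, not part of this patch), call sites now wrap the requested alignment in `llvm::MaybeAlign` rather than passing a raw integer; a default-constructed `llvm::MaybeAlign()` means no alignment attribute is attached:

```cpp
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "llvm/Support/Alignment.h"

// Hypothetical helper: emit a memref.load with an explicit 16-byte alignment.
// Passing the default llvm::MaybeAlign() instead would omit the attribute.
mlir::Value emitAlignedLoad(mlir::OpBuilder &builder, mlir::Location loc,
                            mlir::Value base, mlir::ValueRange indices) {
  return mlir::memref::LoadOp::create(builder, loc, base, indices,
                                      /*nontemporal=*/false,
                                      llvm::MaybeAlign(16));
}
```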
--------- Co-authored-by: Erick Ochoa Lopez --- .../mlir/Dialect/MemRef/IR/MemRefOps.td | 24 +++++++++---------- .../VectorEmulateMaskedLoadStore.cpp | 9 ++++--- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index cd92ca98b2530..2bf953e32ccce 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -1241,28 +1241,28 @@ def LoadOp : MemRef_Op<"load", OpBuilder<(ins "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? $_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]>, OpBuilder<(ins "Type":$resultType, "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, resultType, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? $_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]>, OpBuilder<(ins "TypeRange":$resultTypes, "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, resultTypes, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? $_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]> ]; @@ -2007,10 +2007,10 @@ def MemRef_StoreOp : MemRef_Op<"store", "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, valueToStore, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? 
$_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]>, OpBuilder<(ins "Value":$valueToStore, "Value":$memref), [{ $_state.addOperands(valueToStore); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp index 78f74eef7bee3..bdbb792041e3d 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp @@ -64,7 +64,6 @@ struct VectorMaskedLoadOpConverter final Value mask = maskedLoadOp.getMask(); Value base = maskedLoadOp.getBase(); Value iValue = maskedLoadOp.getPassThru(); - std::optional alignment = maskedLoadOp.getAlignment(); auto indices = llvm::to_vector_of(maskedLoadOp.getIndices()); Value one = arith::ConstantOp::create(rewriter, loc, indexType, IntegerAttr::get(indexType, 1)); @@ -76,7 +75,7 @@ struct VectorMaskedLoadOpConverter final [&](OpBuilder &builder, Location loc) { auto loadedValue = memref::LoadOp::create( builder, loc, base, indices, /*nontemporal=*/false, - alignment.value_or(0)); + llvm::MaybeAlign(maskedLoadOp.getAlignment().value_or(0))); auto combinedValue = vector::InsertOp::create(builder, loc, loadedValue, iValue, i); scf::YieldOp::create(builder, loc, combinedValue.getResult()); @@ -135,7 +134,6 @@ struct VectorMaskedStoreOpConverter final Value base = maskedStoreOp.getBase(); Value value = maskedStoreOp.getValueToStore(); bool nontemporal = false; - std::optional alignment = maskedStoreOp.getAlignment(); auto indices = llvm::to_vector_of(maskedStoreOp.getIndices()); Value one = arith::ConstantOp::create(rewriter, loc, indexType, IntegerAttr::get(indexType, 1)); @@ -145,8 +143,9 @@ struct VectorMaskedStoreOpConverter final auto ifOp = scf::IfOp::create(rewriter, loc, maskBit, /*else=*/false); rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); auto extractedValue = vector::ExtractOp::create(rewriter, loc, value, i); - memref::StoreOp::create(rewriter, loc, extractedValue, base, indices, - nontemporal, alignment.value_or(0)); + memref::StoreOp::create( + rewriter, loc, extractedValue, base, indices, nontemporal, + llvm::MaybeAlign(maskedStoreOp.getAlignment().value_or(0))); rewriter.setInsertionPointAfter(ifOp); indices.back() = From 41bce81820314933714f8301723185d2a5a10eba Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Mon, 29 Sep 2025 14:35:26 +0100 Subject: [PATCH 120/878] [AMDGPU][Disassembler][NFC] Regenerate check lines of some tests using the update script. 
(#161190) --- .../AMDGPU/gfx1250_dasm_salu_lit64.txt | 37 +++-- .../Disassembler/AMDGPU/gfx1250_dasm_sop1.txt | 23 +-- .../AMDGPU/gfx1250_dasm_valu_lit64.txt | 155 +++++++++--------- 3 files changed, 109 insertions(+), 106 deletions(-) diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt index d2ec2133b1b88..7064479082b7a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt @@ -1,55 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s -# GFX1250: s_mov_b64 s[2:3], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_mov_b64 s[2:3], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_add_nc_u64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_add_nc_u64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_and_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_and_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), lit64(0x10abcdef12345678) ; encoding: [0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), lit64(0x10abcdef12345678) ; encoding: [0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_ashr_i64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_ashr_i64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_bfe_i64 s[2:3], lit64(0x80abcdef12345678), 5 ; encoding: [0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80] 0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80 +# GFX1250: s_bfe_i64 s[2:3], lit64(0x80abcdef12345678), 5 ; encoding: [0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80] -# GFX1250: s_bfe_u64 s[2:3], lit64(0x10abcdef12345678), 5 ; encoding: [0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_bfe_u64 s[2:3], lit64(0x10abcdef12345678), 5 ; encoding: [0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_cselect_b64 s[2:3], s[4:5], 
lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_cselect_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_lshl_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_lshl_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_lshr_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_lshr_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_mul_u64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_mul_u64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_nand_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_nand_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_nor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_nor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_or_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_or_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_or_not1_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_or_not1_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_xnor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_xnor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_xor_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_xor_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt index 963e69370a3ba..227e1c47b3d05 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt 
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt @@ -1,34 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s -# GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00] 0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00 +# GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00] -# GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00] 0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00 +# GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00] -# GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe] 0x84,0x4b,0x80,0xbe +# GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe] -# GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe] 0x02,0x4b,0x80,0xbe +# GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe] -# GFX1250: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4c,0x82,0xbe] 0x88,0x4c,0x82,0xbe +# GFX1250: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4c,0x82,0xbe] -# GFX1250: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4d,0x82,0xbe] 0x88,0x4d,0x82,0xbe +# GFX1250: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4d,0x82,0xbe] -# GFX1250: s_get_shader_cycles_u64 s[2:3] ; encoding: [0x00,0x06,0x82,0xbe] 0x00,0x06,0x82,0xbe +# GFX1250: s_get_shader_cycles_u64 s[2:3] ; encoding: [0x00,0x06,0x82,0xbe] -# GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe] 0xc3,0x4e,0x80,0xbe +# GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe] -# GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe] 0xc3,0x50,0x83,0xbe +# GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe] -# GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe] 0xc4,0x50,0x83,0xbe +# GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe] -# GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] 0x7d,0x50,0x83,0xbe +# GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt index 30650b4fa227f..1571fb96dcf49 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt @@ -1,232 +1,233 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s -# GFX1250: v_add_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_add_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_ceil_f64_e32 
v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_class_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_class_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_eq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_eq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_gt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_gt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_gt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_gt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_gt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_gt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_le_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_le_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_le_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_le_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_le_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_le_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: 
v_cmp_lg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_lt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_lt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_lt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ne_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ne_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ne_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ne_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_neq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_neq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ngt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ngt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nle_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nle_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nlg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 
0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nlg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nlt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nlt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_o_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_o_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_u_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_u_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_class_f64_e32 lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_class_f64_e32 lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_eq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_eq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_eq_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_eq_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_eq_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_eq_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ge_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ge_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ge_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: 
v_cmpx_ge_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_gt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_gt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_gt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_gt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_gt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_gt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_le_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_le_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_le_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_le_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_le_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_le_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: 
[0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ne_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ne_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ne_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ne_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_neq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_neq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ngt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ngt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nle_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nle_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nlg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nlg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nlt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nlt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_o_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_o_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_u_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_u_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cvt_f32_f64_e32 v255, 
lit64(0x10abcdef12345678) ; encoding: [0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cvt_f32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cvt_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cvt_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cvt_u32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cvt_u32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_floor_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_floor_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_fract_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_fract_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_frexp_exp_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_frexp_exp_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_frexp_mant_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_frexp_mant_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_max_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_max_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_min_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_min_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_mul_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_mul_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_rcp_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: 
[0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_rcp_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_rndne_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_rndne_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_rsq_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_rsq_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_sqrt_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_sqrt_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_trunc_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_trunc_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x4063233333333333) ; encoding: [0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40] 0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x4063233333333333) ; encoding: [0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x448969368974c05b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44] 0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x448969368974c05b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44] -# GFX1250: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40] 0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40 +# GFX1250: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40] -# GFX1250: v_mov_b64_e32 v[0:1], 0x12345678 ; encoding: [0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12] 0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12 +# GFX1250: v_mov_b64_e32 v[0:1], 0x12345678 ; encoding: [0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12] -# GFX1250: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f] 0xf8,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], -4.0 ; encoding: [0xf7,0x30,0xfc,0x7f] 0xf7,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], -4.0 ; encoding: [0xf7,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], 2.0 ; encoding: [0xf4,0x30,0xfc,0x7f] 0xf4,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], 2.0 ; encoding: [0xf4,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f] 0x80,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; 
encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] 0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x109a) ; encoding: [0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00] 0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x109a) ; encoding: [0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00] From 668f56d0033333901345e14a0efad39aede0cc90 Mon Sep 17 00:00:00 2001 From: guillem-bartrina-sonarsource Date: Mon, 29 Sep 2025 15:37:47 +0200 Subject: [PATCH 121/878] [analyzer] CStringChecker: Fix crash in `CheckOverlap` when arguments are not pointers (#160511) https://github.com/llvm/llvm-project/blob/main/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp#L675-L678 mistakenly assumes that target expressions are of pointer type. `CheckOverlap` has multiple call sites, most of which do not verify this assumption. Therefore, the simplest solution is to verify it just before that point. --- .../Checkers/CStringChecker.cpp | 4 ++++ clang/test/Analysis/buffer-overlap-decls.c | 23 +++++++++++++++++++ clang/test/Analysis/buffer-overlap.c | 7 ++++++ 3 files changed, 34 insertions(+) create mode 100644 clang/test/Analysis/buffer-overlap-decls.c diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp index 36f316df0c3ff..0ae784c000f60 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp @@ -672,6 +672,10 @@ ProgramStateRef CStringChecker::CheckOverlap(CheckerContext &C, ProgramStateRef stateTrue, stateFalse; + if (!First.Expression->getType()->isAnyPointerType() || + !Second.Expression->getType()->isAnyPointerType()) + return state; + // Assume different address spaces cannot overlap. 
if (First.Expression->getType()->getPointeeType().getAddressSpace() != Second.Expression->getType()->getPointeeType().getAddressSpace()) diff --git a/clang/test/Analysis/buffer-overlap-decls.c b/clang/test/Analysis/buffer-overlap-decls.c new file mode 100644 index 0000000000000..4830f4e9691d8 --- /dev/null +++ b/clang/test/Analysis/buffer-overlap-decls.c @@ -0,0 +1,23 @@ +// RUN: %clang_analyze_cc1 -verify %s -Wno-incompatible-library-redeclaration \ +// RUN: -analyzer-checker=alpha.unix.cstring.BufferOverlap +// expected-no-diagnostics + +typedef typeof(sizeof(int)) size_t; + +void memcpy(int dst, int src, size_t size); + +void test_memcpy_proxy() { + memcpy(42, 42, 42); // no-crash +} + +void strcpy(int dst, char *src); + +void test_strcpy_proxy() { + strcpy(42, (char *)42); // no-crash +} + +void strxfrm(int dst, char *src, size_t size); + +void test_strxfrm_proxy() { + strxfrm(42, (char *)42, 42); // no-crash +} diff --git a/clang/test/Analysis/buffer-overlap.c b/clang/test/Analysis/buffer-overlap.c index 8414a764541e2..defb17a62ae0b 100644 --- a/clang/test/Analysis/buffer-overlap.c +++ b/clang/test/Analysis/buffer-overlap.c @@ -96,3 +96,10 @@ void test_snprintf6() { char b[4] = {0}; snprintf(a, sizeof(a), "%s", b); // no-warning } + +void* memcpy(void* dest, const void* src, size_t count); + +void test_memcpy_esoteric() { +label: + memcpy((char *)&&label, (const char *)memcpy, 1); +} From 36d9e10a7dbe1ade257c2aec8623f048e20b4b28 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 29 Sep 2025 08:40:12 -0500 Subject: [PATCH 122/878] [flang][OpenMP] Fix scope checks for ALLOCATE directive (#160948) Make sure that the ALLOCATE directive adds its source span to the current scope, and that the scope checks compare scoping units, not the specific scopes. 
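To make the new rule concrete before the diff: the check no longer compares the directive's scope against the symbol's scope directly, but first walks each one out to its enclosing scoping unit. Below is a minimal standalone C++ model of that walk; the Kind/Scope types and the isScopingUnit helper are illustrative stand-ins invented for this sketch, not flang's actual classes, and they only mirror the scopingUnit() helper added in the diff that follows.

#include <cassert>

enum class Kind { TopLevel, MainProgram, Subprogram, BlockConstruct, OtherConstruct };

struct Scope {
  Kind kind;
  const Scope *parent = nullptr; // null only for the top-level scope
};

// Scoping units in the model (the real helper also treats block data,
// derived types, and modules as scoping units).
static bool isScopingUnit(Kind k) {
  return k == Kind::TopLevel || k == Kind::MainProgram ||
         k == Kind::Subprogram || k == Kind::BlockConstruct;
}

// Walk outward until a scoping unit (or the top level) is reached.
static const Scope &scopingUnit(const Scope &s) {
  const Scope *iter = &s;
  while (!isScopingUnit(iter->kind))
    iter = iter->parent;
  return *iter;
}

int main() {
  Scope top{Kind::TopLevel};
  Scope subprogram{Kind::Subprogram, &top};
  Scope construct{Kind::OtherConstruct, &subprogram}; // e.g. an OpenMP region
  // Comparing the raw scopes would treat the directive's construct scope and
  // the symbol owner's subprogram scope as different; comparing their scoping
  // units accepts the case, which is the behavior this patch restores.
  assert(&scopingUnit(construct) == &scopingUnit(subprogram));
  return 0;
}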
--- flang/lib/Semantics/resolve-directives.cpp | 64 ++++++++++---------- flang/lib/Semantics/resolve-names.cpp | 4 +- flang/test/Semantics/OpenMP/allocate01.f90 | 1 - flang/test/Semantics/OpenMP/allocate08.f90 | 3 + flang/test/Semantics/OpenMP/allocators04.f90 | 2 + flang/test/Semantics/OpenMP/allocators05.f90 | 2 - 6 files changed, 39 insertions(+), 37 deletions(-) diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index a4c8922f58c6c..270642acb3e9b 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -362,6 +362,24 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { explicit OmpAttributeVisitor(SemanticsContext &context) : DirectiveAttributeVisitor(context) {} + static const Scope &scopingUnit(const Scope &scope) { + const Scope *iter{&scope}; + for (; !iter->IsTopLevel(); iter = &iter->parent()) { + switch (iter->kind()) { + case Scope::Kind::BlockConstruct: + case Scope::Kind::BlockData: + case Scope::Kind::DerivedType: + case Scope::Kind::MainProgram: + case Scope::Kind::Module: + case Scope::Kind::Subprogram: + return *iter; + default: + break; + } + } + return *iter; + } + template <typename A> void Walk(const A &x) { parser::Walk(x, *this); } template <typename A> bool Pre(const A &) { return true; } template <typename A> void Post(const A &) {} @@ -952,7 +970,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { void ResolveOmpNameList(const std::list<parser::Name> &, Symbol::Flag); void ResolveOmpName(const parser::Name &, Symbol::Flag); Symbol *ResolveName(const parser::Name *); - Symbol *ResolveOmpObjectScope(const parser::Name *); Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); void CheckMultipleAppearances( @@ -2920,31 +2937,6 @@ Symbol *OmpAttributeVisitor::ResolveOmpCommonBlockName( return nullptr; } -// Use this function over ResolveOmpName when an omp object's scope needs -// resolving, it's symbol flag isn't important and a simple check for resolution -// failure is desired. Using ResolveOmpName means needing to work with the -// context to check for failure, whereas here a pointer comparison is all that's -// needed. -Symbol *OmpAttributeVisitor::ResolveOmpObjectScope(const parser::Name *name) { - - // TODO: Investigate whether the following block can be replaced by, or - // included in, the ResolveOmpName function - if (auto *prev{name ? GetContext().scope.parent().FindSymbol(name->source) : nullptr}) { - name->symbol = prev; - return nullptr; - } - - // TODO: Investigate whether the following block can be replaced by, or - // included in, the ResolveOmpName function - if (auto *ompSymbol{ - name ?
GetContext().scope.FindSymbol(name->source) : nullptr}) { - name->symbol = ompSymbol; - return ompSymbol; - } - return nullptr; -} - void OmpAttributeVisitor::ResolveOmpObjectList( const parser::OmpObjectList &ompObjectList, Symbol::Flag ompFlag) { for (const auto &ompObject : ompObjectList.v) { @@ -3023,13 +3015,19 @@ void OmpAttributeVisitor::ResolveOmpDesignator( context_.Say(designator.source, "List items specified in the ALLOCATE directive must not have the ALLOCATABLE attribute unless the directive is associated with an ALLOCATE statement"_err_en_US); } - if ((ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective || - ompFlag == Symbol::Flag::OmpExecutableAllocateDirective) && - ResolveOmpObjectScope(name) == nullptr) { - context_.Say(designator.source, // 2.15.3 - "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US, - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName(directive, version))); + bool checkScope{ompFlag == Symbol::Flag::OmpDeclarativeAllocateDirective}; + // In 5.1 the scope check only applies to declarative allocate. + if (version == 50 && !checkScope) { + checkScope = ompFlag == Symbol::Flag::OmpExecutableAllocateDirective; + } + if (checkScope) { + if (scopingUnit(GetContext().scope) != + scopingUnit(symbol->GetUltimate().owner())) { + context_.Say(designator.source, // 2.15.3 + "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US, + parser::ToUpperCaseLetters( + llvm::omp::getOpenMPDirectiveName(directive, version))); + } } if (ompFlag == Symbol::Flag::OmpReduction) { // Using variables inside of a namelist in OpenMP reductions diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2f350f016c1f5..ef0b8cdfd827b 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1618,12 +1618,14 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OpenMPDeclareTargetConstruct &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclarativeAllocate &) { + bool Pre(const parser::OpenMPDeclarativeAllocate &x) { + AddOmpSourceRange(x.source); SkipImplicitTyping(true); return true; } void Post(const parser::OpenMPDeclarativeAllocate &) { SkipImplicitTyping(false); + messageHandler().set_currStmtSource(std::nullopt); } bool Pre(const parser::OpenMPDeclarativeConstruct &x) { AddOmpSourceRange(x.source); diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index e0b084ff0030b..5280d1b68a731 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -20,7 +20,6 @@ subroutine sema() print *, a !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead. 
[-Wopen-mp-usage] - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears !$omp allocate(x) allocator(omp_default_mem_alloc) allocate ( x(a), darray(a, b) ) end subroutine sema diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90 index fc950ea4fca36..5bfa918be4cad 100644 --- a/flang/test/Semantics/OpenMP/allocate08.f90 +++ b/flang/test/Semantics/OpenMP/allocate08.f90 @@ -27,10 +27,12 @@ subroutine allocate() !$omp allocate(x) allocator(omp_default_mem_alloc) !$omp allocate(y) allocator(omp_default_mem_alloc) + !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears !$omp allocate(z) allocator(omp_default_mem_alloc) !$omp allocate(x) !$omp allocate(y) + !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears !$omp allocate(z) !$omp allocate(w) allocator(custom_allocator) @@ -40,5 +42,6 @@ subroutine allocate() !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause !$omp allocate(y) allocator(custom_allocator) !ERROR: If list items within the ALLOCATE directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause + !ERROR: List items must be declared in the same scoping unit in which the ALLOCATE directive appears !$omp allocate(z) allocator(custom_allocator) end subroutine allocate diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90 index 212e48fbd1b26..c71c7ca8466ba 100644 --- a/flang/test/Semantics/OpenMP/allocators04.f90 +++ b/flang/test/Semantics/OpenMP/allocators04.f90 @@ -22,10 +22,12 @@ subroutine allocate() trait(1)%value = default_mem_fb custom_allocator = omp_init_allocator(omp_default_mem_space, 1, trait) + !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears !$omp allocators allocate(omp_default_mem_alloc: a) allocate(a) !ERROR: If list items within the ALLOCATORS directive have the SAVE attribute, are a common block name, or are declared in the scope of a module, then only predefined memory allocator parameters can be used in the allocator clause + !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears !$omp allocators allocate(custom_allocator: b) allocate(b) end subroutine diff --git a/flang/test/Semantics/OpenMP/allocators05.f90 b/flang/test/Semantics/OpenMP/allocators05.f90 index 0e8366a2461e6..efacdfaec7647 100644 --- a/flang/test/Semantics/OpenMP/allocators05.f90 +++ b/flang/test/Semantics/OpenMP/allocators05.f90 @@ -15,11 +15,9 @@ subroutine allocate() integer, parameter :: LEN = 2 !$omp target private(a, b) - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears !$omp allocators allocate(omp_default_mem_alloc: a) allocate(a(LEN)) !ERROR: ALLOCATORS directives that appear in a TARGET region must specify an allocator - !ERROR: List items must be declared in the same scoping unit in which the ALLOCATORS directive appears !$omp allocators allocate(b) allocate(b(LEN)) !$omp end target From b285bac972354f961e611b46ee9fde1b3d760730 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 29 Sep 2025 09:46:18 
-0400 Subject: [PATCH 123/878] [libc++][NFC] In documentation, fix path to benchmark that has now moved --- libcxx/docs/TestingLibcxx.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index 227791031bab0..6171629185af2 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -482,7 +482,7 @@ when running the benchmarks. For example, .. code-block:: bash - $ libcxx/utils/libcxx-lit libcxx/test/benchmarks/string.bench.cpp --show-all --param optimization=speed + $ libcxx/utils/libcxx-lit libcxx/test/benchmarks/containers/string.bench.cpp --show-all --param optimization=speed Note that benchmarks are only dry-run when run via the ``check-cxx`` target since we only want to make sure they don't rot. Do not rely on the results of benchmarks @@ -504,7 +504,7 @@ more benchmarks, as usual: .. code-block:: bash $ cmake -S runtimes -B [...] - $ libcxx/utils/libcxx-lit libcxx/test/benchmarks/string.bench.cpp --param optimization=speed + $ libcxx/utils/libcxx-lit libcxx/test/benchmarks/containers/string.bench.cpp --param optimization=speed Then, get the consolidated benchmark output for that run using ``consolidate-benchmarks``: From e5bbc9feae3c6b7f377f89abae47fc3819c73f95 Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Mon, 29 Sep 2025 15:55:35 +0200 Subject: [PATCH 124/878] [Clang] Fixes __builtin_is_implicit_lifetime for types with deleted ctors (#161163) We failed to check that the trivial constructors were eligible (this implies non-deleted). Fixes #160610 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaTypeTraits.cpp | 17 +++++++----- clang/test/SemaCXX/type-traits.cpp | 43 ++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 270b5d336eba7..e8deae50e4cb0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -432,6 +432,7 @@ Bug Fixes to C++ Support - Fix an assertion failure when taking the address on a non-type template parameter argument of object type. (#GH151531) - Suppress ``-Wdouble-promotion`` when explicitly asked for with C++ list initialization (#GH33409). +- Fix the result of `__builtin_is_implicit_lifetime` for types with a user-provided constructor. (#GH160610) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index c2427dcf52538..6c798d6acb0a0 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -1163,13 +1163,16 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, // - it has at least one trivial eligible constructor and a trivial, // non-deleted destructor.
const CXXDestructorDecl *Dtor = RD->getDestructor(); - if (UnqualT->isAggregateType()) - if (Dtor && !Dtor->isUserProvided()) - return true; - if (RD->hasTrivialDestructor() && (!Dtor || !Dtor->isDeleted())) - if (RD->hasTrivialDefaultConstructor() || - RD->hasTrivialCopyConstructor() || RD->hasTrivialMoveConstructor()) - return true; + if (UnqualT->isAggregateType() && (!Dtor || !Dtor->isUserProvided())) + return true; + if (RD->hasTrivialDestructor() && (!Dtor || !Dtor->isDeleted())) { + for (CXXConstructorDecl *Ctr : RD->ctors()) { + if (Ctr->isIneligibleOrNotSelected() || Ctr->isDeleted()) + continue; + if (Ctr->isTrivial()) + return true; + } + } return false; } case UTT_IsIntangibleType: diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index 3f0124755c674..d49330f97fad0 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -2038,6 +2038,49 @@ void is_implicit_lifetime(int n) { static_assert(__builtin_is_implicit_lifetime(int * __restrict)); } +namespace GH160610 { +class NonAggregate { +public: + NonAggregate() = default; + + NonAggregate(const NonAggregate&) = delete; + NonAggregate& operator=(const NonAggregate&) = delete; +private: + int num; +}; + +class DataMemberInitializer { +public: + DataMemberInitializer() = default; + + DataMemberInitializer(const DataMemberInitializer&) = delete; + DataMemberInitializer& operator=(const DataMemberInitializer&) = delete; +private: + int num = 0; +}; + +class UserProvidedConstructor { +public: + UserProvidedConstructor() {} + + UserProvidedConstructor(const UserProvidedConstructor&) = delete; + UserProvidedConstructor& operator=(const UserProvidedConstructor&) = delete; +}; + +static_assert(__builtin_is_implicit_lifetime(NonAggregate)); +static_assert(!__builtin_is_implicit_lifetime(DataMemberInitializer)); +static_assert(!__builtin_is_implicit_lifetime(UserProvidedConstructor)); + +#if __cplusplus >= 202002L +template <typename T> +class Tpl { + Tpl() requires false = default ; +}; +static_assert(!__builtin_is_implicit_lifetime(Tpl<int>)); + +#endif +} + void is_signed() { //static_assert(__is_signed(char)); From 3408e6af75b6b93bdad05cfe3ed296e58d55fb60 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Sep 2025 15:56:54 +0200 Subject: [PATCH 125/878] [Support] Add SipHash-based 64-bit stable hash function (#160945) Factor out the 64-bit hash calculation in getPointerAuthStableSipHash() as getStableSipHash(). This allows using the full 64-bit hash where we require a stable hash. Similar to getPointerAuthStableSipHash(), the new hash function is meant to be stable across platforms and compiler versions. --- llvm/include/llvm/Support/SipHash.h | 7 +++++++ llvm/lib/Support/SipHash.cpp | 11 ++++++++--- llvm/unittests/Support/SipHashTest.cpp | 7 +++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Support/SipHash.h b/llvm/include/llvm/Support/SipHash.h index 910cf59432c69..b090565641526 100644 --- a/llvm/include/llvm/Support/SipHash.h +++ b/llvm/include/llvm/Support/SipHash.h @@ -33,6 +33,13 @@ LLVM_ABI void getSipHash_2_4_64(ArrayRef<uint8_t> In, const uint8_t (&K)[16], LLVM_ABI void getSipHash_2_4_128(ArrayRef<uint8_t> In, const uint8_t (&K)[16], uint8_t (&Out)[16]); +/// Compute a stable 64-bit hash of the given string. +/// +/// The exact algorithm is the little-endian interpretation of the +/// non-doubled (i.e. 64-bit) result of applying a SipHash-2-4 using +/// a specific seed value which can be found in the source.
+LLVM_ABI uint64_t getStableSipHash(StringRef Str); + /// Compute a stable non-zero 16-bit hash of the given string. /// /// The exact algorithm is the little-endian interpretation of the diff --git a/llvm/lib/Support/SipHash.cpp b/llvm/lib/Support/SipHash.cpp index 86dad66420435..382d36f0a8da5 100644 --- a/llvm/lib/Support/SipHash.cpp +++ b/llvm/lib/Support/SipHash.cpp @@ -35,14 +35,19 @@ void llvm::getSipHash_2_4_128(ArrayRef<uint8_t> In, const uint8_t (&K)[16], siphash<2, 4>(In.data(), In.size(), K, Out); } -/// Compute an ABI-stable 16-bit hash of the given string. -uint16_t llvm::getPointerAuthStableSipHash(StringRef Str) { +/// Compute an ABI-stable 64-bit hash of the given string. +uint64_t llvm::getStableSipHash(StringRef Str) { static const uint8_t K[16] = {0xb5, 0xd4, 0xc9, 0xeb, 0x79, 0x10, 0x4a, 0x79, 0x6f, 0xec, 0x8b, 0x1b, 0x42, 0x87, 0x81, 0xd4}; uint8_t RawHashBytes[8]; getSipHash_2_4_64(arrayRefFromStringRef(Str), K, RawHashBytes); - uint64_t RawHash = endian::read64le(RawHashBytes); + return endian::read64le(RawHashBytes); +} + +/// Compute an ABI-stable 16-bit hash of the given string. +uint16_t llvm::getPointerAuthStableSipHash(StringRef Str) { + uint64_t RawHash = getStableSipHash(Str); // Produce a non-zero 16-bit discriminator. uint16_t Discriminator = (RawHash % 0xFFFF) + 1; diff --git a/llvm/unittests/Support/SipHashTest.cpp b/llvm/unittests/Support/SipHashTest.cpp index 7c557eb488acc..3037e6436e18d 100644 --- a/llvm/unittests/Support/SipHashTest.cpp +++ b/llvm/unittests/Support/SipHashTest.cpp @@ -50,6 +50,13 @@ TEST(SipHashTest, SipHash_2_4_128) { } } +// Tests for the 64-bit stable SipHash wrapper. +TEST(SipHashTest, StableSipHash) { + EXPECT_EQ(0xB2BB69BB0A2AC0F1UL, getStableSipHash("")); + EXPECT_EQ(0x9304ABFF427B72E8UL, getStableSipHash("strlen")); + EXPECT_EQ(0x55F45179A08AE51BUL, getStableSipHash("_ZN1 ind; f")); +} + // Tests for the ptrauth-specific SipHash wrapper. TEST(SipHashTest, PointerAuthSipHash) { // Test some basic cases. From e4d94f4f7f657bf932a86c6719ac2bf851bfbfdc Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Mon, 29 Sep 2025 16:01:52 +0200 Subject: [PATCH 126/878] [OpenMP][Flang] Fix no-loop test (#161162) Fortran no-loop test is supported only for GPU. --- offload/test/offloading/fortran/target-no-loop.f90 | 1 + 1 file changed, 1 insertion(+) diff --git a/offload/test/offloading/fortran/target-no-loop.f90 b/offload/test/offloading/fortran/target-no-loop.f90 index 8e40e20e73e70..3c88b00a53541 100644 --- a/offload/test/offloading/fortran/target-no-loop.f90 +++ b/offload/test/offloading/fortran/target-no-loop.f90 @@ -1,4 +1,5 @@ ! REQUIRES: flang +! REQUIRES: gpu ! RUN: %libomptarget-compile-fortran-generic -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription !
RUN: env LIBOMPTARGET_INFO=16 OMP_NUM_TEAMS=16 OMP_TEAMS_THREAD_LIMIT=16 %libomptarget-run-generic 2>&1 | %fcheck-generic From 7af31bf7016e3a8c9082cdd620de23561245b9b5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 29 Sep 2025 15:16:55 +0100 Subject: [PATCH 127/878] [X86] setoeq.ll - add f<->i64, uint and AVX1/AVX512 test coverage (#161197) --- llvm/test/CodeGen/X86/setoeq.ll | 566 +++++++++++++++++++++++++++++--- 1 file changed, 529 insertions(+), 37 deletions(-) diff --git a/llvm/test/CodeGen/X86/setoeq.ll b/llvm/test/CodeGen/X86/setoeq.ll index f0addf4b64599..131e279aa645c 100644 --- a/llvm/test/CodeGen/X86/setoeq.ll +++ b/llvm/test/CodeGen/X86/setoeq.ll @@ -1,40 +1,532 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s - -define zeroext i8 @t(double %x) nounwind readnone { -; CHECK-LABEL: t: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: cvttpd2dq %xmm0, %xmm1 -; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 -; CHECK-NEXT: cmpeqsd %xmm0, %xmm1 -; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: # kill: def $al killed $al killed $eax -; CHECK-NEXT: retl -entry: - %0 = fptosi double %x to i32 ; [#uses=1] - %1 = sitofp i32 %0 to double ; [#uses=1] - %2 = fcmp oeq double %1, %x ; [#uses=1] - %retval12 = zext i1 %2 to i8 ; [#uses=1] - ret i8 %retval12 -} - -define zeroext i8 @u(double %x) nounwind readnone { -; CHECK-LABEL: u: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: cvttpd2dq %xmm0, %xmm1 -; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 -; CHECK-NEXT: cmpneqsd %xmm0, %xmm1 -; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: # kill: def $al killed $al killed $eax -; CHECK-NEXT: retl -entry: - %0 = fptosi double %x to i32 ; [#uses=1] - %1 = sitofp i32 %0 to double ; [#uses=1] - %2 = fcmp une double %1, %x ; [#uses=1] - %retval12 = zext i1 %2 to i8 ; [#uses=1] +; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define zeroext i8 @oeq_f64_i32(double %x) nounwind readnone { +; SSE-LABEL: oeq_f64_i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE-NEXT: cmpeqsd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retl +; +; AVX-LABEL: oeq_f64_i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1 +; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retl +; +; AVX512-LABEL: oeq_f64_i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1 +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retl +entry: + %0 = fptosi double %x to i32 + %1 = sitofp i32 %0 to double + %2 = fcmp oeq double %1, %x + %retval12 = zext i1 %2 to i8 + ret i8 %retval12 +} + +define zeroext i8 @oeq_f64_u32(double %x) nounwind 
readnone { +; SSE-LABEL: oeq_f64_u32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: cvttsd2si %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: sarl $31, %ecx +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; SSE-NEXT: cvttsd2si %xmm1, %edx +; SSE-NEXT: andl %ecx, %edx +; SSE-NEXT: orl %eax, %edx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; SSE-NEXT: cmpeqsd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retl +; +; AVX-LABEL: oeq_f64_u32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vcvttsd2si %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: sarl $31, %ecx +; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; AVX-NEXT: vcvttsd2si %xmm1, %edx +; AVX-NEXT: andl %ecx, %edx +; AVX-NEXT: orl %eax, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retl +; +; AVX512-LABEL: oeq_f64_u32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1 +; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retl +entry: + %0 = fptoui double %x to i32 + %1 = uitofp i32 %0 to double + %2 = fcmp oeq double %1, %x + %retval12 = zext i1 %2 to i8 + ret i8 %retval12 +} + +define zeroext i8 @oeq_f64_i64(double %x) nounwind readnone { +; SSE-LABEL: oeq_f64_i64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushl %ebp +; SSE-NEXT: movl %esp, %ebp +; SSE-NEXT: andl $-8, %esp +; SSE-NEXT: subl $32, %esp +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movlps %xmm1, {{[0-9]+}}(%esp) +; SSE-NEXT: fildll {{[0-9]+}}(%esp) +; SSE-NEXT: fstpl {{[0-9]+}}(%esp) +; SSE-NEXT: cmpeqsd {{[0-9]+}}(%esp), %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: movl %ebp, %esp +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl +; +; AVX-LABEL: oeq_f64_i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushl %ebp +; AVX-NEXT: movl %esp, %ebp +; AVX-NEXT: andl $-8, %esp +; AVX-NEXT: subl $24, %esp +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd %xmm0, (%esp) +; AVX-NEXT: fldl (%esp) +; AVX-NEXT: fisttpll (%esp) +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX-NEXT: fildll {{[0-9]+}}(%esp) +; AVX-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX-NEXT: vcmpeqsd {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: movl %ebp, %esp +; AVX-NEXT: popl %ebp +; AVX-NEXT: retl +; 
+; AVX512-LABEL: oeq_f64_i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1 +; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retl +entry: + %0 = fptosi double %x to i64 + %1 = sitofp i64 %0 to double + %2 = fcmp oeq double %1, %x + %retval12 = zext i1 %2 to i8 + ret i8 %retval12 +} + +define zeroext i8 @oeq_f64_u64(double %x) nounwind readnone { +; SSE-LABEL: oeq_f64_u64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushl %ebp +; SSE-NEXT: movl %esp, %ebp +; SSE-NEXT: andl $-8, %esp +; SSE-NEXT: subl $16, %esp +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] +; SSE-NEXT: ucomisd %xmm0, %xmm1 +; SSE-NEXT: jbe .LBB3_2 +; SSE-NEXT: # %bb.1: # %entry +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: .LBB3_2: # %entry +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: subsd %xmm1, %xmm2 +; SSE-NEXT: movsd %xmm2, {{[0-9]+}}(%esp) +; SSE-NEXT: setbe %al +; SSE-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: shll $31, %eax +; SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: addsd %xmm2, %xmm1 +; SSE-NEXT: cmpeqsd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: movl %ebp, %esp +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl +; +; AVX-LABEL: oeq_f64_u64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushl %ebp +; AVX-NEXT: movl %esp, %ebp +; AVX-NEXT: andl $-8, %esp +; AVX-NEXT: subl $8, %esp +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] +; AVX-NEXT: vucomisd %xmm0, %xmm1 +; AVX-NEXT: jbe .LBB3_2 +; AVX-NEXT: # %bb.1: # %entry +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: .LBB3_2: # %entry +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vmovsd %xmm1, (%esp) +; AVX-NEXT: fldl (%esp) +; AVX-NEXT: fisttpll (%esp) +; AVX-NEXT: setbe %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: shll $31, %eax +; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: movl %ebp, %esp +; AVX-NEXT: popl %ebp +; AVX-NEXT: retl +; +; AVX512-LABEL: oeq_f64_u64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1 +; 
AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retl +entry: + %0 = fptoui double %x to i64 + %1 = uitofp i64 %0 to double + %2 = fcmp oeq double %1, %x + %retval12 = zext i1 %2 to i8 + ret i8 %retval12 +} + +define zeroext i8 @une_f64_i32(double %x) nounwind readnone { +; SSE-LABEL: une_f64_i32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: cvttpd2dq %xmm0, %xmm1 +; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE-NEXT: cmpneqsd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retl +; +; AVX-LABEL: une_f64_i32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1 +; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retl +; +; AVX512-LABEL: une_f64_i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1 +; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retl +entry: + %0 = fptosi double %x to i32 + %1 = sitofp i32 %0 to double + %2 = fcmp une double %1, %x + %retval12 = zext i1 %2 to i8 + ret i8 %retval12 +} + +define zeroext i8 @une_f64_u32(double %x) nounwind readnone { +; SSE-LABEL: une_f64_u32: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: cvttsd2si %xmm0, %eax +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: sarl $31, %ecx +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; SSE-NEXT: cvttsd2si %xmm1, %edx +; SSE-NEXT: andl %ecx, %edx +; SSE-NEXT: orl %eax, %edx +; SSE-NEXT: movd %edx, %xmm1 +; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; SSE-NEXT: cmpneqsd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: retl +; +; AVX-LABEL: une_f64_u32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vcvttsd2si %xmm0, %eax +; AVX-NEXT: movl %eax, %ecx +; AVX-NEXT: sarl $31, %ecx +; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; AVX-NEXT: vcvttsd2si %xmm1, %edx +; AVX-NEXT: andl %ecx, %edx +; AVX-NEXT: orl %eax, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retl +; +; AVX512-LABEL: une_f64_u32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1 +; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: retl +entry: + %0 = fptoui double %x to i32 + %1 = uitofp i32 %0 to double + %2 = fcmp une double %1, %x + %retval12 = zext i1 %2 to i8 + ret i8 %retval12 +} + +define zeroext i8 @une_f64_i64(double %x) 
nounwind readnone { +; SSE-LABEL: une_f64_i64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushl %ebp +; SSE-NEXT: movl %esp, %ebp +; SSE-NEXT: andl $-8, %esp +; SSE-NEXT: subl $32, %esp +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; SSE-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movlps %xmm1, {{[0-9]+}}(%esp) +; SSE-NEXT: fildll {{[0-9]+}}(%esp) +; SSE-NEXT: fstpl {{[0-9]+}}(%esp) +; SSE-NEXT: cmpneqsd {{[0-9]+}}(%esp), %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: movl %ebp, %esp +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl +; +; AVX-LABEL: une_f64_i64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushl %ebp +; AVX-NEXT: movl %esp, %ebp +; AVX-NEXT: andl $-8, %esp +; AVX-NEXT: subl $24, %esp +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd %xmm0, (%esp) +; AVX-NEXT: fldl (%esp) +; AVX-NEXT: fisttpll (%esp) +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX-NEXT: fildll {{[0-9]+}}(%esp) +; AVX-NEXT: fstpl {{[0-9]+}}(%esp) +; AVX-NEXT: vcmpneqsd {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: movl %ebp, %esp +; AVX-NEXT: popl %ebp +; AVX-NEXT: retl +; +; AVX512-LABEL: une_f64_i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1 +; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retl +entry: + %0 = fptosi double %x to i64 + %1 = sitofp i64 %0 to double + %2 = fcmp une double %1, %x + %retval12 = zext i1 %2 to i8 + ret i8 %retval12 +} + +define zeroext i8 @une_f64_u64(double %x) nounwind readnone { +; SSE-LABEL: une_f64_u64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushl %ebp +; SSE-NEXT: movl %esp, %ebp +; SSE-NEXT: andl $-8, %esp +; SSE-NEXT: subl $16, %esp +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] +; SSE-NEXT: ucomisd %xmm0, %xmm1 +; SSE-NEXT: jbe .LBB7_2 +; SSE-NEXT: # %bb.1: # %entry +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: .LBB7_2: # %entry +; SSE-NEXT: movapd %xmm0, %xmm2 +; SSE-NEXT: subsd %xmm1, %xmm2 +; SSE-NEXT: movsd %xmm2, {{[0-9]+}}(%esp) +; SSE-NEXT: setbe %al +; SSE-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-NEXT: movzbl %al, %eax +; SSE-NEXT: shll $31, %eax +; SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] +; SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; SSE-NEXT: movapd %xmm2, %xmm1 +; SSE-NEXT: 
unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: addsd %xmm2, %xmm1 +; SSE-NEXT: cmpneqsd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: andl $1, %eax +; SSE-NEXT: # kill: def $al killed $al killed $eax +; SSE-NEXT: movl %ebp, %esp +; SSE-NEXT: popl %ebp +; SSE-NEXT: retl +; +; AVX-LABEL: une_f64_u64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushl %ebp +; AVX-NEXT: movl %esp, %ebp +; AVX-NEXT: andl $-8, %esp +; AVX-NEXT: subl $8, %esp +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] +; AVX-NEXT: vucomisd %xmm0, %xmm1 +; AVX-NEXT: jbe .LBB7_2 +; AVX-NEXT: # %bb.1: # %entry +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: .LBB7_2: # %entry +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vmovsd %xmm1, (%esp) +; AVX-NEXT: fldl (%esp) +; AVX-NEXT: fisttpll (%esp) +; AVX-NEXT: setbe %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: shll $31, %eax +; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: andl $1, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: movl %ebp, %esp +; AVX-NEXT: popl %ebp +; AVX-NEXT: retl +; +; AVX512-LABEL: une_f64_u64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1 +; AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retl +entry: + %0 = fptoui double %x to i64 + %1 = uitofp i64 %0 to double + %2 = fcmp une double %1, %x + %retval12 = zext i1 %2 to i8 ret i8 %retval12 } From 1ad31d923d2504ba7127833f957c7020d0372324 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Mon, 29 Sep 2025 07:28:36 -0700 Subject: [PATCH 128/878] Exclude from profcheck some tests (#161207) - Commit f9c2565 - PR #155349 LoopVectorize ones. 
--- llvm/utils/profcheck-xfail.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 98c6d84950ff7..4a69667ec08f6 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -1097,6 +1097,7 @@ Transforms/IROutliner/outlining-remapped-outputs.ll Transforms/IROutliner/outlining-same-constants.ll Transforms/IROutliner/outlining-same-globals.ll Transforms/IROutliner/outlining-same-output-blocks.ll +Transforms/IROutliner/outlining-special-state.ll Transforms/IROutliner/outlining-strip-loop-info.ll Transforms/IROutliner/outlining-swift-error.ll Transforms/IROutliner/phi-node-exit-path-order.ll @@ -1173,6 +1174,7 @@ Transforms/LoopVectorize/AArch64/early_exit_costs.ll Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +Transforms/LoopVectorize/AArch64/epilogue-vectorization-fix-scalar-resume-values.ll Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll @@ -1288,6 +1290,7 @@ Transforms/LoopVectorize/bzip_reverse_loops.ll Transforms/LoopVectorize/calloc.ll Transforms/LoopVectorize/cast-induction.ll Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +Transforms/LoopVectorize/cse-casts.ll Transforms/LoopVectorize/dbg-outer-loop-vect.ll Transforms/LoopVectorize/debugloc.ll Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1406,6 +1409,7 @@ Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll Transforms/LoopVectorize/RISCV/dead-ops-cost.ll Transforms/LoopVectorize/RISCV/divrem.ll Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll +Transforms/LoopVectorize/RISCV/induction-costs.ll Transforms/LoopVectorize/RISCV/inloop-reduction.ll Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -1453,6 +1457,7 @@ Transforms/LoopVectorize/skip-iterations.ll Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll Transforms/LoopVectorize/strict-fadd-interleave-only.ll Transforms/LoopVectorize/struct-return.ll +Transforms/LoopVectorize/struct-return-replicate.ll Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll @@ -1739,6 +1744,7 @@ Transforms/PGOProfile/chr-dead-pred.ll Transforms/PGOProfile/chr-dup-threshold.ll Transforms/PGOProfile/chr.ll Transforms/PGOProfile/chr-poison.ll +Transforms/PGOProfile/chr-lifetimes.ll Transforms/PGOProfile/comdat.ll Transforms/PGOProfile/cspgo_profile_summary.ll Transforms/PGOProfile/memop_profile_funclet_wasm.ll From b79f4eb6b79dde33d34488a38e571ee65010ac1f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 29 Sep 2025 15:33:17 +0100 Subject: [PATCH 129/878] [X86] isint.ll - cleanup prefixes (#161198) Use X86 for 32-bit targets, and X64 for 64-bit targets --- llvm/test/CodeGen/X86/isint.ll | 124 ++++++++++++++++----------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/llvm/test/CodeGen/X86/isint.ll b/llvm/test/CodeGen/X86/isint.ll index 8a56f49a6c755..8c11fe147f0d8 100644 --- a/llvm/test/CodeGen/X86/isint.ll +++ b/llvm/test/CodeGen/X86/isint.ll @@ -1,29 +1,29 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK64 %s -; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=CHECK32 %s +; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=X64 %s +; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 | FileCheck -check-prefix=X86 %s ; PR19059 define i32 @isint_return(double %d) nounwind { -; CHECK64-LABEL: isint_return: -; CHECK64: # %bb.0: -; CHECK64-NEXT: cvttpd2dq %xmm0, %xmm1 -; CHECK64-NEXT: cvtdq2pd %xmm1, %xmm1 -; CHECK64-NEXT: cmpeqsd %xmm0, %xmm1 -; CHECK64-NEXT: movq %xmm1, %rax -; CHECK64-NEXT: andl $1, %eax -; CHECK64-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK64-NEXT: retq +; X64-LABEL: isint_return: +; X64: # %bb.0: +; X64-NEXT: cvttpd2dq %xmm0, %xmm1 +; X64-NEXT: cvtdq2pd %xmm1, %xmm1 +; X64-NEXT: cmpeqsd %xmm0, %xmm1 +; X64-NEXT: movq %xmm1, %rax +; X64-NEXT: andl $1, %eax +; X64-NEXT: # kill: def $eax killed $eax killed $rax +; X64-NEXT: retq ; -; CHECK32-LABEL: isint_return: -; CHECK32: # %bb.0: -; CHECK32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK32-NEXT: cvttpd2dq %xmm0, %xmm1 -; CHECK32-NEXT: cvtdq2pd %xmm1, %xmm1 -; CHECK32-NEXT: cmpeqsd %xmm0, %xmm1 -; CHECK32-NEXT: movd %xmm1, %eax -; CHECK32-NEXT: andl $1, %eax -; CHECK32-NEXT: retl +; X86-LABEL: isint_return: +; X86: # %bb.0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: cvttpd2dq %xmm0, %xmm1 +; X86-NEXT: cvtdq2pd %xmm1, %xmm1 +; X86-NEXT: cmpeqsd %xmm0, %xmm1 +; X86-NEXT: movd %xmm1, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: retl %i = fptosi double %d to i32 %e = sitofp i32 %i to double %c = fcmp oeq double %d, %e @@ -32,24 +32,24 @@ define i32 @isint_return(double %d) nounwind { } define i32 @isint_float_return(float %f) nounwind { -; CHECK64-LABEL: isint_float_return: -; CHECK64: # %bb.0: -; CHECK64-NEXT: cvttps2dq %xmm0, %xmm1 -; CHECK64-NEXT: cvtdq2ps %xmm1, %xmm1 -; CHECK64-NEXT: cmpeqss %xmm0, %xmm1 -; CHECK64-NEXT: movd %xmm1, %eax -; CHECK64-NEXT: andl $1, %eax -; CHECK64-NEXT: retq +; X64-LABEL: isint_float_return: +; X64: # %bb.0: +; X64-NEXT: cvttps2dq %xmm0, %xmm1 +; X64-NEXT: cvtdq2ps %xmm1, %xmm1 +; X64-NEXT: cmpeqss %xmm0, %xmm1 +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: andl $1, %eax +; X64-NEXT: retq ; -; CHECK32-LABEL: isint_float_return: -; CHECK32: # %bb.0: -; CHECK32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK32-NEXT: cvttps2dq %xmm0, %xmm1 -; CHECK32-NEXT: cvtdq2ps %xmm1, %xmm1 -; CHECK32-NEXT: cmpeqss %xmm0, %xmm1 -; CHECK32-NEXT: movd %xmm1, %eax -; CHECK32-NEXT: andl $1, %eax -; CHECK32-NEXT: retl +; X86-LABEL: isint_float_return: +; X86: # %bb.0: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: cvttps2dq %xmm0, %xmm1 +; X86-NEXT: cvtdq2ps %xmm1, %xmm1 +; X86-NEXT: cmpeqss %xmm0, %xmm1 +; X86-NEXT: movd %xmm1, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: retl %i = fptosi float %f to i32 %g = sitofp i32 %i to float %c = fcmp oeq float %f, %g @@ -60,32 +60,32 @@ define i32 @isint_float_return(float %f) nounwind { declare void @foo() define void @isint_branch(double %d) nounwind { -; CHECK64-LABEL: isint_branch: -; CHECK64: # %bb.0: -; CHECK64-NEXT: cvttpd2dq %xmm0, %xmm1 -; CHECK64-NEXT: cvtdq2pd %xmm1, %xmm1 -; CHECK64-NEXT: ucomisd %xmm1, %xmm0 -; CHECK64-NEXT: jne .LBB2_2 -; CHECK64-NEXT: jp .LBB2_2 -; CHECK64-NEXT: # %bb.1: # %true -; CHECK64-NEXT: pushq %rax -; CHECK64-NEXT: callq foo@PLT -; CHECK64-NEXT: popq %rax -; CHECK64-NEXT: 
.LBB2_2: # %false -; CHECK64-NEXT: retq +; X64-LABEL: isint_branch: +; X64: # %bb.0: +; X64-NEXT: cvttpd2dq %xmm0, %xmm1 +; X64-NEXT: cvtdq2pd %xmm1, %xmm1 +; X64-NEXT: ucomisd %xmm1, %xmm0 +; X64-NEXT: jne .LBB2_2 +; X64-NEXT: jp .LBB2_2 +; X64-NEXT: # %bb.1: # %true +; X64-NEXT: pushq %rax +; X64-NEXT: callq foo@PLT +; X64-NEXT: popq %rax +; X64-NEXT: .LBB2_2: # %false +; X64-NEXT: retq ; -; CHECK32-LABEL: isint_branch: -; CHECK32: # %bb.0: -; CHECK32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK32-NEXT: cvttpd2dq %xmm0, %xmm1 -; CHECK32-NEXT: cvtdq2pd %xmm1, %xmm1 -; CHECK32-NEXT: ucomisd %xmm1, %xmm0 -; CHECK32-NEXT: jne .LBB2_2 -; CHECK32-NEXT: jp .LBB2_2 -; CHECK32-NEXT: # %bb.1: # %true -; CHECK32-NEXT: calll foo@PLT -; CHECK32-NEXT: .LBB2_2: # %false -; CHECK32-NEXT: retl +; X86-LABEL: isint_branch: +; X86: # %bb.0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: cvttpd2dq %xmm0, %xmm1 +; X86-NEXT: cvtdq2pd %xmm1, %xmm1 +; X86-NEXT: ucomisd %xmm1, %xmm0 +; X86-NEXT: jne .LBB2_2 +; X86-NEXT: jp .LBB2_2 +; X86-NEXT: # %bb.1: # %true +; X86-NEXT: calll foo@PLT +; X86-NEXT: .LBB2_2: # %false +; X86-NEXT: retl %i = fptosi double %d to i32 %e = sitofp i32 %i to double %c = fcmp oeq double %d, %e From f7aa472800a26e3bed11e49834f446fd43385df7 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Mon, 29 Sep 2025 07:42:27 -0700 Subject: [PATCH 130/878] Add test to profcheck exclusion list (#161209) Introduced in PR #160443 --- llvm/utils/profcheck-xfail.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 4a69667ec08f6..bd6627b5b6158 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -107,6 +107,7 @@ Instrumentation/AddressSanitizer/AMDGPU/global_metadata_addrspacecasts.ll Instrumentation/AddressSanitizer/AMDGPU/instrument-stack.ll Instrumentation/AddressSanitizer/AMDGPU/no_redzones_in_lds_globals.ll Instrumentation/AddressSanitizer/AMDGPU/no_redzones_in_scratch_globals.ll +Instrumentation/AddressSanitizer/RISCV/asan-rvv-intrinsics.ll Instrumentation/AddressSanitizer/asan_address_space_attr.ll Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll Instrumentation/AddressSanitizer/asan-disable-sanitizer-instrumentation.ll From c4a55190a50a986da985d586631979653d679275 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 29 Sep 2025 07:54:14 -0700 Subject: [PATCH 131/878] [ADT] Use DenseMap::contains in EquivalenceClasses.h (NFC) (#161120) While I am at it, this patch adds [[nodiscard]]. --- llvm/include/llvm/ADT/EquivalenceClasses.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index 1a2331c1a0322..7df0d1557af87 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -180,8 +180,8 @@ template class EquivalenceClasses { } /// Returns true if \p V is contained an equivalence class. 
-  bool contains(const ElemTy &V) const {
-    return TheMapping.find(V) != TheMapping.end();
+  [[nodiscard]] bool contains(const ElemTy &V) const {
+    return TheMapping.contains(V);
   }
 
   /// getLeaderValue - Return the leader for the specified value that is in the

From 57f2a2ef33e03fd3e15b0104d56c0db6c91657a9 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 29 Sep 2025 07:54:23 -0700
Subject: [PATCH 132/878] [ADT] Inline PackedVectorBase into PackedVector
 (NFC) (#161122)

This patch "inlines" PackedVectorBase into its sole user PackedVector.
The two variants of PackedVectorBase are dispatched with "if
constexpr".  getValue and setValue are now non-static methods of
PackedVector.

We could further simplify getValue and setValue by storing signed
integers as two's complement, but that's a change for another day.
This patch focuses on the code organization, like removing the
template trick and inheritance and making the two methods non-static.
---
 llvm/include/llvm/ADT/PackedVector.h | 89 +++++++++++-----------------
 1 file changed, 36 insertions(+), 53 deletions(-)

diff --git a/llvm/include/llvm/ADT/PackedVector.h b/llvm/include/llvm/ADT/PackedVector.h
index 77fcbf24b2861..09c20e39d1552 100644
--- a/llvm/include/llvm/ADT/PackedVector.h
+++ b/llvm/include/llvm/ADT/PackedVector.h
@@ -20,53 +20,6 @@
 
 namespace llvm {
 
-template <typename T, unsigned BitNum, typename BitVectorTy, bool isSigned>
-class PackedVectorBase;
-
-// This won't be necessary if we can specialize members without specializing
-// the parent template.
-template <typename T, unsigned BitNum, typename BitVectorTy>
-class PackedVectorBase<T, BitNum, BitVectorTy, false> {
-protected:
-  static T getValue(const BitVectorTy &Bits, unsigned Idx) {
-    T val = T();
-    for (unsigned i = 0; i != BitNum; ++i)
-      val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i));
-    return val;
-  }
-
-  static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
-    assert((val >> BitNum) == 0 && "value is too big");
-    for (unsigned i = 0; i != BitNum; ++i)
-      Bits[(Idx * BitNum) + i] = val & (T(1) << i);
-  }
-};
-
-template <typename T, unsigned BitNum, typename BitVectorTy>
-class PackedVectorBase<T, BitNum, BitVectorTy, true> {
-protected:
-  static T getValue(const BitVectorTy &Bits, unsigned Idx) {
-    T val = T();
-    for (unsigned i = 0; i != BitNum - 1; ++i)
-      val = T(val | ((Bits[(Idx * BitNum) + i] ?
1UL : 0UL) << i));
-    if (Bits[(Idx * BitNum) + BitNum - 1])
-      val = ~val;
-    return val;
-  }
-
-  static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
-    if (val < 0) {
-      val = ~val;
-      Bits.set((Idx * BitNum) + BitNum - 1);
-    } else {
-      Bits.reset((Idx * BitNum) + BitNum - 1);
-    }
-    assert((val >> (BitNum - 1)) == 0 && "value is too big");
-    for (unsigned i = 0; i != BitNum - 1; ++i)
-      Bits[(Idx * BitNum) + i] = val & (T(1) << i);
-  }
-};
-
 /// Store a vector of values using a specific number of bits for each
 /// value. Both signed and unsigned types can be used, e.g
 /// @code
@@ -75,16 +28,46 @@ class PackedVectorBase<T, BitNum, BitVectorTy, true> {
 /// will create a vector accepting values -2, -1, 0, 1. Any other value will hit
 /// an assertion.
 template <typename T, unsigned BitNum, typename BitVectorTy = BitVector>
-class PackedVector
-    : public PackedVectorBase<T, BitNum, BitVectorTy,
-                              std::numeric_limits<T>::is_signed> {
+class PackedVector {
   BitVectorTy Bits;
   // Keep track of the number of elements on our own.
   // We always maintain Bits.size() == NumElements * BitNum.
   // Used to avoid an integer division in size().
   unsigned NumElements = 0;
-
-  using base = PackedVectorBase<T, BitNum, BitVectorTy,
-                                std::numeric_limits<T>::is_signed>;
+
+  static T getValue(const BitVectorTy &Bits, unsigned Idx) {
+    if constexpr (std::numeric_limits<T>::is_signed) {
+      T val = T();
+      for (unsigned i = 0; i != BitNum - 1; ++i)
+        val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i));
+      if (Bits[(Idx * BitNum) + BitNum - 1])
+        val = ~val;
+      return val;
+    } else {
+      T val = T();
+      for (unsigned i = 0; i != BitNum; ++i)
+        val = T(val | ((Bits[(Idx * BitNum) + i] ? 1UL : 0UL) << i));
+      return val;
+    }
+  }
+
+  static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
+    if constexpr (std::numeric_limits<T>::is_signed) {
+      if (val < 0) {
+        val = ~val;
+        Bits.set((Idx * BitNum) + BitNum - 1);
+      } else {
+        Bits.reset((Idx * BitNum) + BitNum - 1);
+      }
+      assert((val >> (BitNum - 1)) == 0 && "value is too big");
+      for (unsigned i = 0; i != BitNum - 1; ++i)
+        Bits[(Idx * BitNum) + i] = val & (T(1) << i);
+    } else {
+      assert((val >> BitNum) == 0 && "value is too big");
+      for (unsigned i = 0; i != BitNum; ++i)
+        Bits[(Idx * BitNum) + i] = val & (T(1) << i);
+    }
+  }
 
 public:
   class reference {
@@ -135,7 +118,7 @@ class PackedVector
 
   reference operator[](unsigned Idx) { return reference(*this, Idx); }
 
-  T operator[](unsigned Idx) const { return base::getValue(Bits, Idx); }
+  T operator[](unsigned Idx) const { return getValue(Bits, Idx); }
 
   bool operator==(const PackedVector &RHS) const { return Bits == RHS.Bits; }

From 41cce3b92efe7ec41df1ed757d5d037f6f0a8421 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 29 Sep 2025 07:54:32 -0700
Subject: [PATCH 133/878] [ADT] Remove DenseMapBase::getHashValue (NFC)
 (#161123)

This patch removes:

  static unsigned getHashValue(const KeyT &Val) {
    return KeyInfoT::getHashValue(Val);
  }

This function is redundant given the templated overload:

  template <typename LookupKeyT>
  static unsigned getHashValue(const LookupKeyT &Val) {
    return KeyInfoT::getHashValue(Val);
  }

Note that the callers doFind and LookupBucketFor are themselves
templated on LookupKeyT.
---
 llvm/include/llvm/ADT/DenseMap.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index bcf3e9676a7b5..4bda50f5a5cc0 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -440,10 +440,6 @@ class DenseMapBase : public DebugEpochBase {
     }
   }
 
-  static unsigned getHashValue(const KeyT &Val) {
-    return KeyInfoT::getHashValue(Val);
-  }
-
   template <typename LookupKeyT>
   static unsigned getHashValue(const LookupKeyT &Val) {
     return KeyInfoT::getHashValue(Val);

From 4a9041b138d8cb07f10da42454e578977db461e9 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 29 Sep 2025 07:54:41 -0700
Subject: [PATCH 134/878] [llvm] Proofread GettingInvolved.rst (#161124)

---
 llvm/docs/GettingInvolved.rst | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 72716fa667487..f7e1374ec2aef 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -42,7 +42,7 @@ LLVM welcomes contributions of all kinds. To get started, please review the foll
   in the LLVM system.
 
 :doc:`BugLifeCycle`
-   Describes how bugs are reported, triaged and closed.
+   Describes how bugs are reported, triaged, and closed.
 
 :doc:`CodingStandards`
   Details the LLVM coding standards and provides useful information on writing
@@ -108,7 +108,7 @@ The :doc:`CodeOfConduct` applies to all these forums and mailing lists.
 `Commits Archive (llvm-commits)`__
   This list contains all commit messages that are made when LLVM developers
   commit code changes to the repository. It also serves as a forum for
-  patch review (i.e. send patches here). It is useful for those who want to
+  patch review (i.e.,
It is useful for those who want to stay on the bleeding edge of LLVM development. This list is very high volume. @@ -121,7 +121,7 @@ The :doc:`CodeOfConduct` applies to all these forums and mailing lists. .. __: http://lists.llvm.org/pipermail/llvm-bugs/ `LLVM Announcements`__ - If you just want project wide announcements such as releases, developers meetings, or blog posts, then you should check out the Announcement category on LLVM Discourse. + If you just want project-wide announcements such as releases, developers meetings, or blog posts, then you should check out the Announcement category on LLVM Discourse. .. __: https://discourse.llvm.org/c/announce/46 @@ -473,7 +473,7 @@ join one in your city. Or start a new one if there is none: Community wide proposals ------------------------ -Proposals for massive changes in how the community behaves and how the work flow +Proposals for large-scale changes in how the community behaves and how the work flow can be better. .. toctree:: @@ -518,7 +518,7 @@ also be seen inline below: Note that the web view of the LLVM community calendar shows events in Coordinated Universal Time (UTC). If you use Google Calendar, consider subscribing to it with the + button in the bottom-right corner to view all -events in your local timezone alongside your other calendars. +events in your local time zone alongside your other calendars. .. _llvm-community-calendar-host-guidance: @@ -554,9 +554,9 @@ An example invite looks as follows This event is a meetup for all developers of LLDB. Meeting agendas are posted on discourse before the event. - Attendees are required to adhere to the LLVM Code of Conduct + Attendees must adhere to the LLVM Code of Conduct (https://llvm.org/docs/CodeOfConduct.html). For any Code of Conduct reports, - please contact the organizers, and also email conduct@llvm.org. + please contact the organizers and also email conduct@llvm.org. 
Agenda/Meeting Minutes: Link to minutes From 27fa1d0cf96469c268c46684ced2fbb7663c7713 Mon Sep 17 00:00:00 2001 From: Rana Pratap Reddy <109514914+ranapratap55@users.noreply.github.com> Date: Mon, 29 Sep 2025 20:24:59 +0530 Subject: [PATCH 135/878] [AMDGPU] Add a new builtin type for image descriptor rsrc (#160258) Adding a new builtin type for AMDGPU's image descriptor rsrc data type This requires for https://github.com/llvm/llvm-project/pull/140210 --- clang/include/clang/Basic/AMDGPUTypes.def | 1 + clang/include/clang/Basic/Builtins.def | 1 + clang/lib/AST/ASTContext.cpp | 4 ++++ .../CodeGen/amdgpu-image-rsrc-type-debug-info.c | 17 +++++++++++++++++ .../CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp | 7 +++++++ clang/test/SemaCXX/amdgpu-image-rsrc.cpp | 17 +++++++++++++++++ clang/test/SemaOpenCL/amdgpu-image-rsrc.cl | 13 +++++++++++++ clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp | 12 ++++++++++++ 8 files changed, 72 insertions(+) create mode 100644 clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c create mode 100644 clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp create mode 100644 clang/test/SemaCXX/amdgpu-image-rsrc.cpp create mode 100644 clang/test/SemaOpenCL/amdgpu-image-rsrc.cl create mode 100644 clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def index d3dff446f9edf..089a72b5c102e 100644 --- a/clang/include/clang/Basic/AMDGPUTypes.def +++ b/clang/include/clang/Basic/AMDGPUTypes.def @@ -21,6 +21,7 @@ #endif AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8) +AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_texture_t", AMDGPUTexture, AMDGPUTextureTy, 256, 256, 0) AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0) diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 9aad00b55d64a..b856ad145824d 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -34,6 +34,7 @@ // Q -> target builtin type, followed by a character to distinguish the builtin type // Qa -> AArch64 svcount_t builtin type. // Qb -> AMDGPU __amdgpu_buffer_rsrc_t builtin type. +// Qt -> AMDGPU __amdgpu_texture_t builtin type. // E -> ext_vector, followed by the number of elements and the base type. // X -> _Complex, followed by the base type. 
// Y -> ptrdiff_t diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 61dd330553860..0fd0e7eb360dd 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -12590,6 +12590,10 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, Type = Context.AMDGPUBufferRsrcTy; break; } + case 't': { + Type = Context.AMDGPUTextureTy; + break; + } default: llvm_unreachable("Unexpected target builtin type"); } diff --git a/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c b/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c new file mode 100644 index 0000000000000..ef68c79bef592 --- /dev/null +++ b/clang/test/CodeGen/amdgpu-image-rsrc-type-debug-info.c @@ -0,0 +1,17 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s -debug-info-kind=limited | FileCheck %s + +// CHECK-LABEL: define dso_local void @test_locals( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !dbg [[DBG6:![0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[IMG:%.*]] = alloca ptr, align 32, addrspace(5) +// CHECK-NEXT: [[IMG_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IMG]] to ptr +// CHECK-NEXT: #dbg_declare(ptr addrspace(5) [[IMG]], [[META11:![0-9]+]], !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), [[META14:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IMG_ASCAST]], align 32, !dbg [[DBG15:![0-9]+]] +// CHECK-NEXT: ret void, !dbg [[DBG16:![0-9]+]] +// +void test_locals(void) { + __amdgpu_texture_t img; + (void)img; +} diff --git a/clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp b/clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp new file mode 100644 index 0000000000000..0dbd51774321b --- /dev/null +++ b/clang/test/CodeGenCXX/amdgpu-image-rsrc-typeinfo.cpp @@ -0,0 +1,7 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn %s -emit-llvm -o - | FileCheck %s +namespace std { class type_info; } +auto &a = typeid(__amdgpu_texture_t); +//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +// CHECK: {{.*}} diff --git a/clang/test/SemaCXX/amdgpu-image-rsrc.cpp b/clang/test/SemaCXX/amdgpu-image-rsrc.cpp new file mode 100644 index 0000000000000..61a82d47cf1c0 --- /dev/null +++ b/clang/test/SemaCXX/amdgpu-image-rsrc.cpp @@ -0,0 +1,17 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 -triple amdgcn -Wno-unused-value %s + +void foo() { + int n = 100; + __amdgpu_texture_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_texture_t' with an rvalue of type 'int'}} + static_cast<__amdgpu_texture_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_texture_t' is not allowed}} + reinterpret_cast<__amdgpu_texture_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_texture_t' is not allowed}} + (void)(v + v); // expected-error {{invalid operands to binary expression ('__amdgpu_texture_t' and '__amdgpu_texture_t')}} + int x(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_texture_t'}} + __amdgpu_texture_t k; +} + +template void bar(T); +void use(__amdgpu_texture_t r) { bar(r); } +struct S { __amdgpu_texture_t r; int a; }; diff --git a/clang/test/SemaOpenCL/amdgpu-image-rsrc.cl b/clang/test/SemaOpenCL/amdgpu-image-rsrc.cl new file mode 100644 index 0000000000000..dc56494d3c2c1 --- /dev/null +++ b/clang/test/SemaOpenCL/amdgpu-image-rsrc.cl @@ -0,0 +1,13 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -verify -cl-std=CL1.2 -triple amdgcn-amd-amdhsa %s +// RUN: %clang_cc1 -verify -cl-std=CL2.0 -triple amdgcn-amd-amdhsa %s + +void f() { + int n = 3; + __amdgpu_texture_t v = (__amdgpu_texture_t)0; // expected-error {{used type '__amdgpu_texture_t' where arithmetic or pointer type is required}} + int k = v; // expected-error {{initializing '__private int' with an expression of incompatible type '__private __amdgpu_texture_t'}} + (void)(v + v); // expected-error {{invalid operands}} + __amdgpu_texture_t r; + int *p = (int*)r; // expected-error {{operand of type '__amdgpu_texture_t' where arithmetic or pointer type is required}} +} diff --git a/clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp b/clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp new file mode 100644 index 0000000000000..51b3f72d12e12 --- /dev/null +++ b/clang/test/SemaOpenMP/amdgpu-image-rsrc.cpp @@ -0,0 +1,12 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -Wno-unused-value %s + +void foo() { +#pragma omp target + { + int n = 5; + __amdgpu_texture_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_texture_t' with an rvalue of type 'int'}} + (void)(v + v); // expected-error {{invalid operands to binary expression}} + } +} From c2fbd12f11f5936cae9be2ebbc45c1721e554766 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 29 Sep 2025 17:19:45 +0200 Subject: [PATCH 136/878] [clang][NFC] Remove const_casts from diagnostic emissions (#161211) This is apparently not necessary anymore. Not sure when exactly it changed though. 
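For context, the redundant pattern being deleted looks roughly like the
sketch below (a minimal illustration; `Loc` and `diag::note_example` are
placeholder names, not identifiers from this patch):

  // Before: call sites stripped constness before streaming the pointer
  // into the diagnostic builder.
  S.Diag(Loc, diag::note_example) << const_cast<Expr *>(E);
  // After: the diagnostic streaming operators accept the const pointer
  // directly, so the cast can simply be dropped.
  S.Diag(Loc, diag::note_example) << E;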
--- clang/lib/Sema/AnalysisBasedWarnings.cpp | 7 +++----
 clang/lib/Sema/SemaConcept.cpp           | 2 +-
 clang/lib/Sema/SemaOpenACCAtomic.cpp     | 7 ++-----
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 1b66d83df5171..8606227152a84 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -983,10 +983,9 @@ static void DiagUninitUse(Sema &S, const VarDecl *VD, const UninitUse &Use,
   case UninitUse::AfterDecl:
   case UninitUse::AfterCall:
     S.Diag(VD->getLocation(), diag::warn_sometimes_uninit_var)
-      << VD->getDeclName() << IsCapturedByBlock
-      << (Use.getKind() == UninitUse::AfterDecl ? 4 : 5)
-      << const_cast<DeclContext *>(VD->getLexicalDeclContext())
-      << VD->getSourceRange();
+        << VD->getDeclName() << IsCapturedByBlock
+        << (Use.getKind() == UninitUse::AfterDecl ? 4 : 5)
+        << VD->getLexicalDeclContext() << VD->getSourceRange();
     S.Diag(Use.getUser()->getBeginLoc(), diag::note_uninit_var_use)
       << IsCapturedByBlock << Use.getUser()->getSourceRange();
     return;
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index d238b7916a330..dc6d232d9a525 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -193,7 +193,7 @@ DiagRecursiveConstraintEval(Sema &S, llvm::FoldingSetNodeID &ID,
   // Sema::InstantiatingTemplate::isAlreadyBeingInstantiated function.
   if (S.SatisfactionStackContains(Templ, ID)) {
     S.Diag(E->getExprLoc(), diag::err_constraint_depends_on_self)
-        << const_cast<Expr *>(E) << E->getSourceRange();
+        << E << E->getSourceRange();
     return true;
   }
 
diff --git a/clang/lib/Sema/SemaOpenACCAtomic.cpp b/clang/lib/Sema/SemaOpenACCAtomic.cpp
index a9319dce6c586..ad21129d30c15 100644
--- a/clang/lib/Sema/SemaOpenACCAtomic.cpp
+++ b/clang/lib/Sema/SemaOpenACCAtomic.cpp
@@ -454,9 +454,7 @@ class AtomicOperandChecker {
     // If nothing matches, error out.
     DiagnoseInvalidAtomic(BinInf->FoundExpr->getExprLoc(),
                           SemaRef.PDiag(diag::note_acc_atomic_mismatch_operand)
-                              << const_cast<Expr *>(AssignInf.LHS)
-                              << const_cast<Expr *>(BinInf->LHS)
-                              << const_cast<Expr *>(BinInf->RHS));
+                              << AssignInf.LHS << BinInf->LHS << BinInf->RHS);
     return IDACInfo::Fail();
   }
 
@@ -592,8 +590,7 @@ class AtomicOperandChecker {
 
       PartialDiagnostic PD =
           SemaRef.PDiag(diag::note_acc_atomic_mismatch_compound_operand)
-          << FirstKind << const_cast<Expr *>(FirstX) << SecondKind
-          << const_cast<Expr *>(SecondX);
+          << FirstKind << FirstX << SecondKind << SecondX;
 
       return DiagnoseInvalidAtomic(SecondX->getExprLoc(), PD);
     }

From 637bfb7ea6d58c1daaa82dc39f7b9ad40870e422 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Andr=C3=A9=20Reuter?=
Date: Mon, 29 Sep 2025 17:33:28 +0200
Subject: [PATCH 137/878] Reapply "[compiler-rt][XRay] Make xray_interface.h C
 compliant" (#141728)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes to initial PR (#140068):
- Mark failing test as unsupported for powerpc64le, as test failure is
unrelated to PR changes. See
https://github.com/llvm/llvm-project/issues/141598
---
Original description (from #140068)

The XRay interface header uses no C++ specific features aside from
using the std namespace and including the C++ variant of C headers.
Yet, these changes prevent using `xray_interface.h` in external tools
relying on C for different reasons.
Make this header C compliant by using C headers, removing the std
namespace from std::size_t and guard `extern "C"`.
To make sure that further changes do not break the interface
accidentally, port one test from C++ to C. This requires the C23
standard, which officially supports the attribute syntax used in this
test case.

Note that this only resolves this issue for `xray_interface.h`.
`xray_records.h` is also not C compliant, but requires more work to
port.

Fixes #139902

Signed-off-by: Jan André Reuter
---
 compiler-rt/include/xray/xray_interface.h     | 37 +++++++-----
 .../TestCases/Posix/patching-unpatching.c     | 56 +++++++++++++++++++
 2 files changed, 79 insertions(+), 14 deletions(-)
 create mode 100644 compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c

diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h
index 675ea0cbc48c8..3ef8ee348540f 100644
--- a/compiler-rt/include/xray/xray_interface.h
+++ b/compiler-rt/include/xray/xray_interface.h
@@ -1,4 +1,4 @@
-//===- xray_interface.h -----------------------------------------*- C++ -*-===//
+//===- xray_interface.h ---------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,10 +14,17 @@
 #ifndef XRAY_XRAY_INTERFACE_H
 #define XRAY_XRAY_INTERFACE_H
 
+#ifdef __cplusplus
 #include <cstddef>
 #include <cstdint>
+#else
+#include <stddef.h>
+#include <stdint.h>
+#endif
 
+#ifdef __cplusplus
 extern "C" {
+#endif
 
 /// Synchronize this with AsmPrinter::SledKind in LLVM.
 enum XRayEntryType {
@@ -49,7 +56,7 @@ enum XRayEntryType {
 /// achieved by marking them all with: __attribute__((xray_never_instrument))
 ///
 /// Returns 1 on success, 0 on error.
-extern int __xray_set_handler(void (*entry)(int32_t, XRayEntryType));
+extern int __xray_set_handler(void (*entry)(int32_t, enum XRayEntryType));
 
 /// This removes whatever the currently provided handler is. Returns 1 on
 /// success, 0 on error.
@@ -60,7 +67,7 @@ extern int __xray_remove_handler();
 /// start logging their subsequent affected function calls (if patched).
 ///
 /// Returns 1 on success, 0 on error.
-extern int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType,
+extern int __xray_set_handler_arg1(void (*entry)(int32_t, enum XRayEntryType,
                                                  uint64_t));
 
 /// Disables the XRay handler used to log first arguments of function calls.
 /// Returns 1 on success, 0 on error.
@@ -68,7 +75,7 @@ extern int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType,
 extern int __xray_remove_handler_arg1();
 
 /// Provide a function to invoke when XRay encounters a custom event.
-extern int __xray_set_customevent_handler(void (*entry)(void *, std::size_t));
+extern int __xray_set_customevent_handler(void (*entry)(void *, size_t));
 
 /// This removes whatever the currently provided custom event handler is.
 /// Returns 1 on success, 0 on error.
@@ -95,39 +102,39 @@ enum XRayPatchingStatus {
 
 /// This tells XRay to patch the instrumentation points in all currently loaded
 /// objects. See XRayPatchingStatus for possible result values.
-extern XRayPatchingStatus __xray_patch();
+extern enum XRayPatchingStatus __xray_patch();
 
 /// This tells XRay to patch the instrumentation points in the given object.
 /// See XRayPatchingStatus for possible result values.
-extern XRayPatchingStatus __xray_patch_object(int32_t ObjId);
+extern enum XRayPatchingStatus __xray_patch_object(int32_t ObjId);
 
 /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible
 /// result values.
-extern XRayPatchingStatus __xray_unpatch();
+extern enum XRayPatchingStatus __xray_unpatch();
 
 /// Reverses the effect of __xray_patch_object.
See XRayPatchingStatus for /// possible result values. -extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); +extern enum XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); /// This unpacks the given (packed) function id and patches /// the corresponding function. See XRayPatchingStatus for possible /// result values. -extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); +extern enum XRayPatchingStatus __xray_patch_function(int32_t FuncId); /// This patches a specific function in the given object. See XRayPatchingStatus /// for possible result values. -extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, - int32_t ObjId); +extern enum XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, + int32_t ObjId); /// This unpacks the given (packed) function id and unpatches /// the corresponding function. See XRayPatchingStatus for possible /// result values. -extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); +extern enum XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); /// This unpatches a specific function in the given object. /// See XRayPatchingStatus for possible result values. -extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, - int32_t ObjId); +extern enum XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, + int32_t ObjId); /// This function unpacks the given (packed) function id and returns the address /// of the corresponding function. We return 0 if we encounter any error, even @@ -173,6 +180,8 @@ extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); /// Calling __xray_init() more than once is safe across multiple threads. extern void __xray_init(); +#ifdef __cplusplus } // end extern "C" +#endif #endif // XRAY_XRAY_INTERFACE_H diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c new file mode 100644 index 0000000000000..2dbc68142dbc5 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching.c @@ -0,0 +1,56 @@ +// Check that we can patch and un-patch on demand, and that logging gets invoked +// appropriately. +// +// Do not run on powerpc64le, as linking XRay with C compiler causes linker error +// due to std::__throw_system_error(int) being present in XRay libraries. 
+// See https://github.com/llvm/llvm-project/issues/141598
+//
+// RUN: %clang_xray -fxray-instrument -std=c23 %s -o %t
+// RUN: env XRAY_OPTIONS="patch_premain=false" %run %t 2>&1 | FileCheck %s
+// RUN: %clang_xray -fxray-instrument -fno-xray-function-index -std=c23 %s -o %t
+// RUN: env XRAY_OPTIONS="patch_premain=false" %run %t 2>&1 | FileCheck %s
+
+// UNSUPPORTED: target-is-mips64,target-is-mips64el
+// UNSUPPORTED: target=powerpc64le-{{.*}}
+
+#include "xray/xray_interface.h"
+
+#include <stdio.h>
+
+bool called = false;
+
+void test_handler(int32_t fid, enum XRayEntryType type) {
+  printf("called: %d, type=%d\n", fid, (int32_t)(type));
+  called = true;
+}
+
+[[clang::xray_always_instrument]] void always_instrument() {
+  printf("always instrumented called\n");
+}
+
+int main() {
+  __xray_set_handler(test_handler);
+  always_instrument();
+  // CHECK: always instrumented called
+  auto status = __xray_patch();
+  printf("patching status: %d\n", (int32_t)status);
+  // CHECK-NEXT: patching status: 1
+  always_instrument();
+  // CHECK-NEXT: called: {{.*}}, type=0
+  // CHECK-NEXT: always instrumented called
+  // CHECK-NEXT: called: {{.*}}, type=1
+  status = __xray_unpatch();
+  printf("patching status: %d\n", (int32_t)status);
+  // CHECK-NEXT: patching status: 1
+  always_instrument();
+  // CHECK-NEXT: always instrumented called
+  status = __xray_patch();
+  printf("patching status: %d\n", (int32_t)status);
+  // CHECK-NEXT: patching status: 1
+  __xray_remove_handler();
+  always_instrument();
+  // CHECK-NEXT: always instrumented called
+  status = __xray_unpatch();
+  printf("patching status: %d\n", (int32_t)status);
+  // CHECK-NEXT: patching status: 1
+}

From 01a7c880d27b7b594910e03d173819d90d14c421 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Mon, 29 Sep 2025 08:46:45 -0700
Subject: [PATCH 138/878] [AMDGPU][LowerBufferFatPointers] Erase dead ptr(7)
 intrinsics (#160798)

Fix a crash that would arise when intrinsics like llvm.masked.load.T.p7
were left in the module when AMDGPULowerBufferFatPointers was applied
and so a captures(none) annotation would be applied to a non-pointer
value, triggering a verifier failure.

---------

Co-authored-by: Shilei Tian
---
 llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp  | 4 +++-
 .../AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll  | 9 +++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index d9bfeae52e213..0a5913293238a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2562,7 +2562,9 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) {
   for (Function *F : NeedsPostProcess)
     Splitter.processFunction(*F);
   for (Function *F : Intrinsics) {
-    if (isRemovablePointerIntrinsic(F->getIntrinsicID())) {
+    // use_empty() can also occur with cases like masked load, which will
+    // have been rewritten out of the module by now but not erased.
+    if (F->use_empty() || isRemovablePointerIntrinsic(F->getIntrinsicID())) {
       F->eraseFromParent();
     } else {
       std::optional<Function *> NewF = Intrinsic::remangleIntrinsicFunction(F);
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll
new file mode 100644
index 0000000000000..d6198f5000c34
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-dead-intrinsics.ll
@@ -0,0 +1,9 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
+
+; CHECK: @arbitrary
+declare amdgpu_kernel void @arbitrary(ptr addrspace(1))
+
+; COM: This used to cause verifier errors when "lowered"
+declare <4 x i8> @llvm.masked.load.v4i8.p7(ptr addrspace(7) captures(none), i32 immarg, <4 x i1>, <4 x i8>)
+; CHECK-NOT: llvm.masked.load

From edca510555fd6c2adfe15dba6993f4e64575e647 Mon Sep 17 00:00:00 2001
From: Valery Dmitriev
Date: Mon, 29 Sep 2025 08:47:18 -0700
Subject: [PATCH 139/878] [flang] Simplify hlfir.index in a few limited cases.
 (#157883)

This primarily targets the case of the substring being a singleton,
which is simplified by inlining a search loop (with an exception where
the runtime function performs better). A few trivial simplifications
are also covered.
---
 .../Transforms/SimplifyHLFIRIntrinsics.cpp    | 208 +++++++++++
 .../HLFIR/simplify-hlfir-intrinsics-index.fir | 345 ++++++++++++++++++
 2 files changed, 553 insertions(+)
 create mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index d8e36ea294cdb..9969ee474ff98 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -2284,6 +2284,213 @@ class CmpCharOpConversion : public mlir::OpRewritePattern<hlfir::CmpCharOp> {
   }
 };
 
+static std::pair<mlir::Value, hlfir::AssociateOp>
+getVariable(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value val) {
+  // If it is an expression - create a variable from it, or forward
+  // the value otherwise.
+  hlfir::AssociateOp associate;
+  if (!mlir::isa<hlfir::ExprType>(val.getType()))
+    return {val, associate};
+  hlfir::Entity entity{val};
+  mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder);
+  associate = hlfir::genAssociateExpr(loc, builder, entity, entity.getType(),
+                                      "", byRefAttr);
+  return {associate.getBase(), associate};
+}
+
+class IndexOpConversion : public mlir::OpRewritePattern<hlfir::IndexOp> {
+public:
+  using mlir::OpRewritePattern<hlfir::IndexOp>::OpRewritePattern;
+
+  llvm::LogicalResult
+  matchAndRewrite(hlfir::IndexOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    // We simplify only limited cases:
+    // 1) a substring length shall be known at compile time
+    // 2) if a substring length is 0 then replace with 1 for forward search,
+    //    or otherwise with the string length + 1 (builder shall const-fold if
+    //    lookup direction is known at compile time).
+    // 3) for known string length at compile time, if it is
+    //    shorter than substring => replace with zero.
+    // 4) if a substring length is one => inline as simple search loop
+    // 5) for forward search with input strings of kind=1 runtime is faster.
+    // Do not simplify in all the other cases relying on a runtime call.
+
+    fir::FirOpBuilder builder{rewriter, op.getOperation()};
+    const mlir::Location &loc = op->getLoc();
+
+    auto resultTy = op.getType();
+    mlir::Value back = op.getBack();
+    mlir::Value substrLen =
+        hlfir::genCharLength(loc, builder, hlfir::Entity{op.getSubstr()});
+
+    auto substrLenCst = fir::getIntIfConstant(substrLen);
+    if (!substrLenCst) {
+      return rewriter.notifyMatchFailure(
+          op, "substring length unknown at compile time");
+    }
+    mlir::Value strLen =
+        hlfir::genCharLength(loc, builder, hlfir::Entity{op.getStr()});
+    auto i1Ty = builder.getI1Type();
+    auto idxTy = builder.getIndexType();
+    if (*substrLenCst == 0) {
+      mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1);
+      // zero length substring. For back search replace with
+      // strLen+1, or otherwise with 1.
+      mlir::Value strEnd = mlir::arith::AddIOp::create(
+          builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx);
+      if (back)
+        back = builder.createConvert(loc, i1Ty, back);
+      else
+        back = builder.createIntegerConstant(loc, i1Ty, 0);
+      mlir::Value result =
+          mlir::arith::SelectOp::create(builder, loc, back, strEnd, oneIdx);
+
+      rewriter.replaceOp(op, builder.createConvert(loc, resultTy, result));
+      return mlir::success();
+    }
+
+    if (auto strLenCst = fir::getIntIfConstant(strLen)) {
+      if (*strLenCst < *substrLenCst) {
+        rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 0));
+        return mlir::success();
+      }
+      if (*strLenCst == 0) {
+        // both strings have zero length
+        rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 1));
+        return mlir::success();
+      }
+    }
+    if (*substrLenCst != 1) {
+      return rewriter.notifyMatchFailure(
+          op, "rely on runtime implementation if substring length > 1");
+    }
+    // For forward search and character kind=1 the runtime uses memchr
+    // which is well optimized. But it looks like memchr idiom is not
+    // recognized in LLVM yet. On a micro-kernel test with strings of length 40
+    // runtime had ~2x less execution time vs inlined code. For unknown search
+    // direction at compile time pessimistically assume "forward".
+    std::optional<bool> isBack;
+    if (back) {
+      if (auto backCst = fir::getIntIfConstant(back))
+        isBack = *backCst != 0;
+    } else {
+      isBack = false;
+    }
+    auto charTy = mlir::cast<fir::CharacterType>(
+        hlfir::getFortranElementType(op.getSubstr().getType()));
+    unsigned kind = charTy.getFKind();
+    if (kind == 1 && (!isBack || !*isBack)) {
+      return rewriter.notifyMatchFailure(
+          op, "rely on runtime implementation for character kind 1");
+    }
+
+    // All checks are passed here. Generate single character search loop.
+    auto [strV, strAssociate] = getVariable(builder, loc, op.getStr());
+    auto [substrV, substrAssociate] = getVariable(builder, loc, op.getSubstr());
+    hlfir::Entity str{strV};
+    hlfir::Entity substr{substrV};
+    mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1);
+
+    auto genExtractAndConvertToInt = [&charTy, &idxTy, &oneIdx,
+                                      kind](mlir::Location loc,
+                                            fir::FirOpBuilder &builder,
+                                            hlfir::Entity &charStr,
+                                            mlir::Value index) {
+      auto bits = builder.getKindMap().getCharacterBitsize(kind);
+      auto intTy = builder.getIntegerType(bits);
+      auto charLen1Ty =
+          fir::CharacterType::getSingleton(builder.getContext(), kind);
+      mlir::Type designatorTy =
+          fir::ReferenceType::get(charLen1Ty, fir::isa_volatile_type(charTy));
+      auto idxAttr = builder.getIntegerAttr(idxTy, 0);
+
+      auto singleChr = hlfir::DesignateOp::create(
+          builder, loc, designatorTy, charStr, /*component=*/{},
+          /*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{},
+          /*substring=*/mlir::ValueRange{index, index},
+          /*complexPart=*/std::nullopt,
+          /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{oneIdx},
+          fir::FortranVariableFlagsAttr{});
+      auto chrVal = fir::LoadOp::create(builder, loc, singleChr);
+      mlir::Value intVal = fir::ExtractValueOp::create(
+          builder, loc, intTy, chrVal, builder.getArrayAttr(idxAttr));
+      return intVal;
+    };
+
+    auto wantChar = genExtractAndConvertToInt(loc, builder, substr, oneIdx);
+
+    // Generate search loop body with the following C equivalent:
+    // idx_t result = 0;
+    // idx_t end = strlen + 1;
+    // char want = substr[0];
+    // for (idx_t idx = 1; idx < end; ++idx) {
+    //   if (result == 0) {
+    //     idx_t at = back ? end - idx: idx;
+    //     result = str[at-1] == want ? at : result;
+    //   }
+    // }
+    if (!back)
+      back = builder.createIntegerConstant(loc, i1Ty, 0);
+    else
+      back = builder.createConvert(loc, i1Ty, back);
+    mlir::Value strEnd = mlir::arith::AddIOp::create(
+        builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx);
+    mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0);
+    auto genSearchBody = [&](mlir::Location loc, fir::FirOpBuilder &builder,
+                             mlir::ValueRange index,
+                             mlir::ValueRange reductionArgs)
+        -> llvm::SmallVector<mlir::Value> {
+      assert(index.size() == 1 && "expected single loop");
+      assert(reductionArgs.size() == 1 && "expected single reduction value");
+      mlir::Value inRes = reductionArgs[0];
+      auto resEQzero = mlir::arith::CmpIOp::create(
+          builder, loc, mlir::arith::CmpIPredicate::eq, inRes, zeroIdx);
+
+      mlir::Value res =
+          builder
+              .genIfOp(loc, {idxTy}, resEQzero,
+                       /*withElseRegion=*/true)
+              .genThen([&]() {
+                mlir::Value idx = builder.createConvert(loc, idxTy, index[0]);
+                // offset = back ?
end - idx : idx; + mlir::Value offset = mlir::arith::SelectOp::create( + builder, loc, back, + mlir::arith::SubIOp::create(builder, loc, strEnd, idx), + idx); + + auto haveChar = + genExtractAndConvertToInt(loc, builder, str, offset); + auto charsEQ = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::eq, haveChar, + wantChar); + mlir::Value newVal = mlir::arith::SelectOp::create( + builder, loc, charsEQ, offset, inRes); + + fir::ResultOp::create(builder, loc, newVal); + }) + .genElse([&]() { fir::ResultOp::create(builder, loc, inRes); }) + .getResults()[0]; + return {res}; + }; + + llvm::SmallVector loopOut = + hlfir::genLoopNestWithReductions(loc, builder, {strLen}, + /*reductionInits=*/{zeroIdx}, + genSearchBody, + /*isUnordered=*/false); + mlir::Value result = builder.createConvert(loc, resultTy, loopOut[0]); + + if (strAssociate) + hlfir::EndAssociateOp::create(builder, loc, strAssociate); + if (substrAssociate) + hlfir::EndAssociateOp::create(builder, loc, substrAssociate); + + rewriter.replaceOp(op, result); + return mlir::success(); + } +}; + template class MatmulConversion : public mlir::OpRewritePattern { public: @@ -2955,6 +3162,7 @@ class SimplifyHLFIRIntrinsics patterns.insert>(context); patterns.insert>(context); patterns.insert(context); + patterns.insert(context); patterns.insert>(context); patterns.insert>(context); patterns.insert>(context); diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir new file mode 100644 index 0000000000000..258a1d899a40d --- /dev/null +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir @@ -0,0 +1,345 @@ +// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s + +// Simplify should reduce hlfir.index to constant (5) +func.func @_QPt1() { +// CHECK-LABEL: func.func @_QPt1() { +// CHECK: %[[VAL_0:.*]] = arith.constant 5 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 3 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 4 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFt1En"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.char<1,4> {bindc_name = "s", uniq_name = "_QFt1Es"} +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_3]] {uniq_name = "_QFt1Es"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX616263) : !fir.ref> +// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_2]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX616263"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_8]]#0 : !fir.ref>, !fir.ref> +// CHECK: %[[VAL_11:.*]] = fir.address_of(@_QQclX) : !fir.ref> +// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_0]] : (index) -> i32 +// CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_6]]#0 : i32, !fir.ref +// CHECK: return +// CHECK: } + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} + %2:2 = hlfir.declare %1 {uniq_name = "_QFt1En"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %c4 = arith.constant 4 : index + %3 = fir.alloca !fir.char<1,4> 
{bindc_name = "s", uniq_name = "_QFt1Es"} + %4:2 = hlfir.declare %3 typeparams %c4 {uniq_name = "_QFt1Es"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %5 = fir.address_of(@_QQclX616263) : !fir.ref> + %c3 = arith.constant 3 : index + %6:2 = hlfir.declare %5 typeparams %c3 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX616263"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + hlfir.assign %6#0 to %4#0 : !fir.ref>, !fir.ref> + %7 = fir.address_of(@_QQclX) : !fir.ref> + %c0 = arith.constant 0 : index + %8:2 = hlfir.declare %7 typeparams %c0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %true = arith.constant true + %9 = hlfir.index %8#0 in %4#0 back %true : (!fir.ref>, !fir.ref>, i1) -> i32 + hlfir.assign %9 to %2#0 : i32, !fir.ref + return +} + +// ! 'back' is unknown at compile time, substring is zero length - generate select (back ? strlen+1 : 1) +func.func @_QPt2(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}, %arg1: !fir.ref> {fir.bindc_name = "b"}) { +// CHECK-LABEL: func.func @_QPt2( +// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFt2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) +// CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQcl2X) : !fir.ref> +// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref> +// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_0]] : index +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<4>) -> i1 +// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_0]] : index +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 +// CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_5]]#0 : i32, !fir.ref +// CHECK: return +// CHECK: } + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt2Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} + %3:2 = hlfir.declare %2 {uniq_name = "_QFt2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt2Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) + %6 = fir.address_of(@_QQcl2X) : !fir.ref> + %c0 = arith.constant 0 : index + %7:2 = hlfir.declare %6 typeparams %c0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %8 = fir.load %1#0 : !fir.ref> + %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref>, 
!fir.boxchar<2>, !fir.logical<4>) -> i32 + hlfir.assign %9 to %3#0 : i32, !fir.ref + return +} + +// inline as search loop (backward) +func.func @_QPt3(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) { +// CHECK-LABEL: func.func @_QPt3( +// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} +// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt3En"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt3Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) +// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref> +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref> +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref> +// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16 +// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index +// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) { +// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index +// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) { +// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index +// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref> +// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref> +// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<2>) -> i16 +// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i16 +// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index +// CHECK: fir.result %[[VAL_23]] : index +// CHECK: } else { +// CHECK: fir.result %[[VAL_15]] : index +// CHECK: } +// CHECK: fir.result %[[VAL_17]] : index +// CHECK: } +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 +// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref +// CHECK: return +// CHECK: } + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} + %2:2 = hlfir.declare %1 {uniq_name = "_QFt3En"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt3Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) + %5 = fir.address_of(@_QQcl2X6500) : !fir.ref> + %c1 = arith.constant 1 : index + %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %true = arith.constant true + %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref>, !fir.boxchar<2>, i1) -> i32 + hlfir.assign %7 to %2#0 : 
i32, !fir.ref
+ return
+}
+
+// inline as search loop (forward)
+func.func @_QPt4(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt4(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt4En"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt4Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref>
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref>
+// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) {
+// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index
+// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) {
+// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref>
+// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref>
+// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16
+// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index
+// CHECK: fir.result %[[VAL_21]] : index
+// CHECK: } else {
+// CHECK: fir.result %[[VAL_14]] : index
+// CHECK: }
+// CHECK: fir.result %[[VAL_16]] : index
+// CHECK: }
+// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i32
+// CHECK: hlfir.assign %[[VAL_22]] to %[[VAL_4]]#0 : i32, !fir.ref
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt4En"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+ %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index)
+ %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt4Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>)
+ %5 = fir.address_of(@_QQcl2X6500) : !fir.ref>
+ %c1 = arith.constant 1 : index
+ %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+ %false = arith.constant false
+ %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref>, !fir.boxchar<2>, i1) -> i32
+ hlfir.assign %7 to %2#0 : i32, !fir.ref
+ return
+}
+
+// Same as t4 above, but result kind=1
+func.func @_QPt5(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt5(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2>
{fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt5En"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt5Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref>
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref>
+// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) {
+// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index
+// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) {
+// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref>
+// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref>
+// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16
+// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16
+// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index
+// CHECK: fir.result %[[VAL_21]] : index
+// CHECK: } else {
+// CHECK: fir.result %[[VAL_14]] : index
+// CHECK: }
+// CHECK: fir.result %[[VAL_16]] : index
+// CHECK: }
+// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i8
+// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i8) -> i32
+// CHECK: hlfir.assign %[[VAL_23]] to %[[VAL_4]]#0 : i32, !fir.ref
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"}
+ %2:2 = hlfir.declare %1 {uniq_name = "_QFt5En"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+ %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index)
+ %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt5Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>)
+ %5 = fir.address_of(@_QQcl2X6500) : !fir.ref>
+ %c1 = arith.constant 1 : index
+ %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+ %false = arith.constant false
+ %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref>, !fir.boxchar<2>, i1) -> i8
+ %8 = fir.convert %7 : (i8) -> i32
+ hlfir.assign %8 to %2#0 : i32, !fir.ref
+ return
+ }
+
+// Do not simplify - runtime call for forward search with character kind=1 is faster
+func.func @_QPt6(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt6(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] =
arith.constant false +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"} +// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt6En"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt6Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref> +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_9:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_0]] : (!fir.ref>, !fir.boxchar<1>, i1) -> i32 +// CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : i32, !fir.ref +// CHECK: return +// CHECK: } + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"} + %2:2 = hlfir.declare %1 {uniq_name = "_QFt6En"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt6Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + %5 = fir.address_of(@_QQclX65) : !fir.ref> + %c1 = arith.constant 1 : index + %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %false = arith.constant false + %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref>, !fir.boxchar<1>, i1) -> i32 + hlfir.assign %7 to %2#0 : i32, !fir.ref + return +} + +// Do not simplify - runtime call for forward search with character kind=1 is faster +// Lookup direction is unknown at compile time, hence forward is pessimistically assumed +func.func @_QPt7(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}, %arg1: !fir.ref> {fir.bindc_name = "b"}) { +// CHECK-LABEL: func.func @_QPt7( +// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt7En"} +// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt7En"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref> +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref> +// CHECK: %[[VAL_10:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_9]] : (!fir.ref>, !fir.boxchar<1>, !fir.logical<4>) -> i32 +// CHECK: 
hlfir.assign %[[VAL_10]] to %[[VAL_4]]#0 : i32, !fir.ref
+// CHECK: return
+// CHECK: }
+ %0 = fir.dummy_scope : !fir.dscope
+ %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt7Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>)
+ %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt7En"}
+ %3:2 = hlfir.declare %2 {uniq_name = "_QFt7En"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+ %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, index)
+ %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt7Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>)
+ %6 = fir.address_of(@_QQclX65) : !fir.ref>
+ %c1 = arith.constant 1 : index
+ %7:2 = hlfir.declare %6 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+ %8 = fir.load %1#0 : !fir.ref>
+ %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref>, !fir.boxchar<1>, !fir.logical<4>) -> i32
+ hlfir.assign %9 to %3#0 : i32, !fir.ref
+ return
+}
+
+// Inline as backward search loop for character kind=1.
+// The case is similar to t7, but the direction is known, so inlining is faster than the runtime call.
+func.func @_QPt8(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK-LABEL: func.func @_QPt8(
+// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"}
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt8En"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt8Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>)
+// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref>
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref>
+// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref>
+// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<1>) -> i8
+// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index
+// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) {
+// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index
+// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) {
+// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index
+// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<1>, index, index, index) -> !fir.ref>
+// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref>
+// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<1>) -> i8
+// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i8
+// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index
+// CHECK: fir.result %[[VAL_23]] : index
+// CHECK: } else {
+// CHECK: fir.result %[[VAL_15]] :
index +// CHECK: } +// CHECK: fir.result %[[VAL_17]] : index +// CHECK: } +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 +// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref +// CHECK: return +// CHECK: } + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"} + %2:2 = hlfir.declare %1 {uniq_name = "_QFt8En"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt8Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + %5 = fir.address_of(@_QQclX65) : !fir.ref> + %c1 = arith.constant 1 : index + %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %true = arith.constant true + %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref>, !fir.boxchar<1>, i1) -> i32 + hlfir.assign %7 to %2#0 : i32, !fir.ref + return +} + From 2e3f2523e624a4a922c386f6f1264c19f25a2e26 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 29 Sep 2025 16:54:49 +0100 Subject: [PATCH 140/878] [AArch64] Add global isel coverage for fp16 tests and strict-fp. NFC --- .../CodeGen/AArch64/fp16-v4-instructions.ll | 732 ++++++--- .../CodeGen/AArch64/fp16-v8-instructions.ll | 1454 ++++++++++++----- llvm/test/CodeGen/AArch64/strict-fp-opt.ll | 150 +- 3 files changed, 1624 insertions(+), 712 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll index 8bc3497ad3c3c..6233ce743b706 100644 --- a/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/fp16-v4-instructions.ll @@ -1,20 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-CVT -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-SD +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-SD +; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-GI define <4 x half> @add_h(<4 x half> %a, <4 x half> %b) { -; CHECK-CVT-LABEL: add_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: add_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: add_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fadd v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: add_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = 
fadd <4 x half> %a, %b @@ -22,28 +32,54 @@ entry: } define <4 x half> @build_h4(<4 x half> %a) { -; CHECK-COMMON-LABEL: build_h4: -; CHECK-COMMON: // %bb.0: // %entry -; CHECK-COMMON-NEXT: mov w8, #15565 // =0x3ccd -; CHECK-COMMON-NEXT: dup v0.4h, w8 -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-SD-LABEL: build_h4: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: mov w8, #15565 // =0x3ccd +; CHECK-CVT-SD-NEXT: dup v0.4h, w8 +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: build_h4: +; CHECK-FP16-SD: // %bb.0: // %entry +; CHECK-FP16-SD-NEXT: mov w8, #15565 // =0x3ccd +; CHECK-FP16-SD-NEXT: dup v0.4h, w8 +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: build_h4: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: adrp x8, .LCPI1_0 +; CHECK-CVT-GI-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: build_h4: +; CHECK-FP16-GI: // %bb.0: // %entry +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI1_0 +; CHECK-FP16-GI-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] +; CHECK-FP16-GI-NEXT: ret entry: ret <4 x half> } define <4 x half> @sub_h(<4 x half> %a, <4 x half> %b) { -; CHECK-CVT-LABEL: sub_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fsub v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: sub_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: sub_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fsub v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sub_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = fsub <4 x half> %a, %b @@ -51,18 +87,26 @@ entry: } define <4 x half> @mul_h(<4 x half> %a, <4 x half> %b) { -; CHECK-CVT-LABEL: mul_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: mul_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: mul_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: mul_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = fmul <4 x half> %a, %b @@ -70,18 +114,26 @@ entry: } define <4 x half> @div_h(<4 x half> %a, <4 x half> %b) { -; CHECK-CVT-LABEL: div_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fdiv v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: div_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; 
CHECK-CVT-SD-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: div_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fdiv v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: div_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fdiv v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = fdiv <4 x half> %a, %b @@ -89,92 +141,162 @@ entry: } define <4 x half> @load_h(ptr %a) { -; CHECK-COMMON-LABEL: load_h: -; CHECK-COMMON: // %bb.0: // %entry -; CHECK-COMMON-NEXT: ldr d0, [x0] -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: load_h: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret entry: %0 = load <4 x half>, ptr %a, align 4 ret <4 x half> %0 } define void @store_h(ptr %a, <4 x half> %b) { -; CHECK-COMMON-LABEL: store_h: -; CHECK-COMMON: // %bb.0: // %entry -; CHECK-COMMON-NEXT: str d0, [x0] -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: store_h: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret entry: store <4 x half> %b, ptr %a, align 4 ret void } define <4 x half> @s_to_h(<4 x float> %a) { -; CHECK-COMMON-LABEL: s_to_h: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: s_to_h: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret %1 = fptrunc <4 x float> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @d_to_h(<4 x double> %a) { -; CHECK-COMMON-LABEL: d_to_h: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: fcvtxn v0.2s, v0.2d -; CHECK-COMMON-NEXT: fcvtxn2 v0.4s, v1.2d -; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-SD-LABEL: d_to_h: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: d_to_h: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: d_to_h: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: mov d2, v0.d[1] +; CHECK-CVT-GI-NEXT: fcvt h0, d0 +; CHECK-CVT-GI-NEXT: mov d3, v1.d[1] +; CHECK-CVT-GI-NEXT: fcvt h1, d1 +; CHECK-CVT-GI-NEXT: fcvt h2, d2 +; CHECK-CVT-GI-NEXT: mov v0.h[1], v2.h[0] +; CHECK-CVT-GI-NEXT: fcvt h2, d3 +; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-CVT-GI-NEXT: mov v0.h[3], v2.h[0] +; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: d_to_h: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: mov d2, v0.d[1] +; CHECK-FP16-GI-NEXT: fcvt h0, d0 +; CHECK-FP16-GI-NEXT: mov d3, v1.d[1] +; CHECK-FP16-GI-NEXT: fcvt h1, d1 +; CHECK-FP16-GI-NEXT: fcvt h2, d2 +; CHECK-FP16-GI-NEXT: mov v0.h[1], v2.h[0] +; CHECK-FP16-GI-NEXT: fcvt h2, d3 +; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-FP16-GI-NEXT: mov v0.h[3], v2.h[0] +; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-FP16-GI-NEXT: ret %1 = fptrunc <4 x double> %a to <4 x half> ret <4 x half> %1 } define <4 x float> @h_to_s(<4 x half> %a) { -; CHECK-COMMON-LABEL: h_to_s: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: fcvtl v0.4s, v0.4h -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: h_to_s: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtl v0.4s, v0.4h +; 
CHECK-NEXT: ret %1 = fpext <4 x half> %a to <4 x float> ret <4 x float> %1 } define <4 x double> @h_to_d(<4 x half> %a) { -; CHECK-COMMON-LABEL: h_to_d: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: fcvtl v0.4s, v0.4h -; CHECK-COMMON-NEXT: fcvtl2 v1.2d, v0.4s -; CHECK-COMMON-NEXT: fcvtl v0.2d, v0.2s -; CHECK-COMMON-NEXT: ret +; CHECK-CVT-SD-LABEL: h_to_d: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl2 v1.2d, v0.4s +; CHECK-CVT-SD-NEXT: fcvtl v0.2d, v0.2s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: h_to_d: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-SD-NEXT: fcvtl2 v1.2d, v0.4s +; CHECK-FP16-SD-NEXT: fcvtl v0.2d, v0.2s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: h_to_d: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-CVT-GI-NEXT: mov h1, v0.h[1] +; CHECK-CVT-GI-NEXT: mov h2, v0.h[2] +; CHECK-CVT-GI-NEXT: mov h3, v0.h[3] +; CHECK-CVT-GI-NEXT: fcvt d0, h0 +; CHECK-CVT-GI-NEXT: fcvt d4, h1 +; CHECK-CVT-GI-NEXT: fcvt d1, h2 +; CHECK-CVT-GI-NEXT: fcvt d2, h3 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v4.d[0] +; CHECK-CVT-GI-NEXT: mov v1.d[1], v2.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: h_to_d: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] +; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] +; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] +; CHECK-FP16-GI-NEXT: fcvt d0, h0 +; CHECK-FP16-GI-NEXT: fcvt d4, h1 +; CHECK-FP16-GI-NEXT: fcvt d1, h2 +; CHECK-FP16-GI-NEXT: fcvt d2, h3 +; CHECK-FP16-GI-NEXT: mov v0.d[1], v4.d[0] +; CHECK-FP16-GI-NEXT: mov v1.d[1], v2.d[0] +; CHECK-FP16-GI-NEXT: ret %1 = fpext <4 x half> %a to <4 x double> ret <4 x double> %1 } define <4 x half> @bitcast_i_to_h(float, <4 x i16> %a) { -; CHECK-COMMON-LABEL: bitcast_i_to_h: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: fmov d0, d1 -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: bitcast_i_to_h: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret %2 = bitcast <4 x i16> %a to <4 x half> ret <4 x half> %2 } define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) { -; CHECK-COMMON-LABEL: bitcast_h_to_i: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: fmov d0, d1 -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: bitcast_h_to_i: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret %2 = bitcast <4 x half> %a to <4 x i16> ret <4 x i16> %2 } define <4 x half> @sitofp_i8(<4 x i8> %a) #0 { -; CHECK-CVT-LABEL: sitofp_i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: sitofp_i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-CVT-SD-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-CVT-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: scvtf v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: sitofp_i8: ; CHECK-FP16: // %bb.0: @@ -182,6 +304,15 @@ define <4 x half> @sitofp_i8(<4 x i8> %a) #0 { ; CHECK-FP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sitofp_i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-CVT-GI-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-CVT-GI-NEXT: scvtf 
v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = sitofp <4 x i8> %a to <4 x half> ret <4 x half> %1 } @@ -204,43 +335,59 @@ define <4 x half> @sitofp_i16(<4 x i16> %a) #0 { define <4 x half> @sitofp_i32(<4 x i32> %a) #0 { -; CHECK-COMMON-LABEL: sitofp_i32: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: scvtf v0.4s, v0.4s -; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: sitofp_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret %1 = sitofp <4 x i32> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @sitofp_i64(<4 x i64> %a) #0 { -; CHECK-COMMON-LABEL: sitofp_i64: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: scvtf v0.2d, v0.2d -; CHECK-COMMON-NEXT: scvtf v1.2d, v1.2d -; CHECK-COMMON-NEXT: fcvtn v0.2s, v0.2d -; CHECK-COMMON-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: sitofp_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf v1.2d, v1.2d +; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret %1 = sitofp <4 x i64> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @uitofp_i8(<4 x i8> %a) #0 { -; CHECK-CVT-LABEL: uitofp_i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: uitofp_i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-CVT-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: ucvtf v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-FP16-LABEL: uitofp_i8: -; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-FP16-NEXT: ucvtf v0.4h, v0.4h -; CHECK-FP16-NEXT: ret +; CHECK-FP16-SD-LABEL: uitofp_i8: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-FP16-SD-NEXT: ucvtf v0.4h, v0.4h +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: uitofp_i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-GI-NEXT: ucvtf v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: uitofp_i8: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-FP16-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-FP16-GI-NEXT: ucvtf v0.4h, v0.4h +; CHECK-FP16-GI-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x half> ret <4 x half> %1 } @@ -264,35 +411,35 @@ define <4 x half> @uitofp_i16(<4 x i16> %a) #0 { define <4 x half> @uitofp_i32(<4 x i32> %a) #0 { -; CHECK-COMMON-LABEL: uitofp_i32: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: ucvtf v0.4s, v0.4s -; CHECK-COMMON-NEXT: fcvtn v0.4h, v0.4s -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: uitofp_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf v0.4s, v0.4s +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret %1 = uitofp <4 x i32> %a to <4 x half> ret <4 x half> %1 } define <4 x half> @uitofp_i64(<4 x i64> %a) #0 { -; CHECK-COMMON-LABEL: uitofp_i64: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: ucvtf v0.2d, v0.2d -; CHECK-COMMON-NEXT: ucvtf v1.2d, v1.2d -; CHECK-COMMON-NEXT: fcvtn v0.2s, v0.2d -; CHECK-COMMON-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-COMMON-NEXT: fcvtn 
v0.4h, v0.4s -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: uitofp_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret %1 = uitofp <4 x i64> %a to <4 x half> ret <4 x half> %1 } define void @test_insert_at_zero(half %a, ptr %b) #0 { -; CHECK-COMMON-LABEL: test_insert_at_zero: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-COMMON-NEXT: str d0, [x0] -; CHECK-COMMON-NEXT: ret +; CHECK-LABEL: test_insert_at_zero: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret %1 = insertelement <4 x half> undef, half %a, i64 0 store <4 x half> %1, ptr %b, align 4 ret void @@ -331,17 +478,29 @@ define <4 x i16> @fptosi_i16(<4 x half> %a) #0 { } define <4 x i8> @fptoui_i8(<4 x half> %a) #0 { -; CHECK-CVT-LABEL: fptoui_i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: fptoui_i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; -; CHECK-FP16-LABEL: fptoui_i8: -; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h -; CHECK-FP16-NEXT: ret +; CHECK-FP16-SD-LABEL: fptoui_i8: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: fptoui_i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: fptoui_i8: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: fcvtzu v0.4h, v0.4h +; CHECK-FP16-GI-NEXT: ret ; NOTE: fcvtzs selected here because the xtn shaves the sign bit %1 = fptoui<4 x half> %a to <4 x i8> ret <4 x i8> %1 @@ -364,36 +523,45 @@ define <4 x i16> @fptoui_i16(<4 x half> %a) #0 { } define <4 x i1> @test_fcmp_une(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_une: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_une: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_une: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmeq v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: mvn v0.8b, v0.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_une: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp une <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ueq: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v2.4s, v0.4s, 
v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: mvn v0.8b, v0.8b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ueq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ueq: ; CHECK-FP16: // %bb.0: @@ -402,102 +570,149 @@ define <4 x i1> @test_fcmp_ueq(<4 x half> %a, <4 x half> %b) #0 { ; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-FP16-NEXT: mvn v0.8b, v0.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ueq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ueq <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ugt(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ugt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcmge v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: mvn v0.8b, v0.8b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ugt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ugt: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmge v0.4h, v1.4h, v0.4h ; CHECK-FP16-NEXT: mvn v0.8b, v0.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ugt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ugt <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_uge(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uge: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: mvn v0.8b, v0.8b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_uge: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h ; CHECK-FP16-NEXT: mvn v0.8b, v0.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; 
CHECK-CVT-GI-NEXT: ret %1 = fcmp uge <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ult(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ult: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: mvn v0.8b, v0.8b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ult: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ult: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmge v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: mvn v0.8b, v0.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ult: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ult <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ule(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ule: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: mvn v0.8b, v0.8b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ule: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ule: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmgt v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: mvn v0.8b, v0.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ule: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ule <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uno: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v2.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: mvn v0.8b, v0.8b -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uno: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: mvn v0.8b, v0.8b +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_uno: ; CHECK-FP16: // %bb.0: @@ -506,21 +721,32 @@ define <4 x i1> @test_fcmp_uno(<4 x half> %a, <4 x half> %b) #0 { ; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-FP16-NEXT: mvn v0.8b, v0.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uno: +; CHECK-CVT-GI: // %bb.0: +; 
CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp uno <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_one: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v2.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_one: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_one: ; CHECK-FP16: // %bb.0: @@ -528,60 +754,94 @@ define <4 x i1> @test_fcmp_one(<4 x half> %a, <4 x half> %b) #0 { ; CHECK-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h ; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_one: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp one <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_oeq(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oeq: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oeq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_oeq: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmeq v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oeq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp oeq <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_ogt(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ogt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ogt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ogt: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmgt v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ogt: +; 
CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ogt <4 x half> %a, %b ret <4 x i1> %1 } define <4 x i1> @test_fcmp_oge(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oge: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_oge: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmge v0.4h, v0.4h, v1.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp oge <4 x half> %a, %b ret <4 x i1> %1 @@ -624,15 +884,15 @@ define <4 x i1> @test_fcmp_ole(<4 x half> %a, <4 x half> %b) #0 { } define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ord: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v2.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-CVT-NEXT: xtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ord: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ord: ; CHECK-FP16: // %bb.0: @@ -640,6 +900,16 @@ define <4 x i1> @test_fcmp_ord(<4 x half> %a, <4 x half> %b) #0 { ; CHECK-FP16-NEXT: fcmgt v0.4h, v1.4h, v0.4h ; CHECK-FP16-NEXT: orr v0.8b, v0.8b, v2.8b ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ord: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ord <4 x half> %a, %b ret <4 x i1> %1 diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll index fcb42a74ce697..86763eb5f9e3b 100644 --- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll @@ -1,24 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-SD +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s 
--check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-SD +; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-CVT,CHECK-CVT-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-GI define <8 x half> @add_h(<8 x half> %a, <8 x half> %b) { -; CHECK-CVT-LABEL: add_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-CVT-NEXT: fadd v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fadd v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: add_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-SD-NEXT: fadd v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fadd v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: add_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: add_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fadd v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fadd v1.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = fadd <8 x half> %a, %b ret <8 x half> %0 @@ -26,22 +40,34 @@ entry: define <8 x half> @sub_h(<8 x half> %a, <8 x half> %b) { -; CHECK-CVT-LABEL: sub_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-CVT-NEXT: fsub v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fsub v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: sub_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-SD-NEXT: fsub v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fsub v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: sub_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fsub v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sub_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fsub v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fsub v1.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = fsub <8 x half> %a, %b ret <8 x half> %0 @@ -49,22 +75,34 @@ entry: define <8 x half> @mul_h(<8 x half> %a, <8 x half> %b) { -; CHECK-CVT-LABEL: mul_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h -; 
CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-CVT-NEXT: fmul v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fmul v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: mul_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-SD-NEXT: fmul v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fmul v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: mul_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: mul_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fmul v1.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = fmul <8 x half> %a, %b ret <8 x half> %0 @@ -72,22 +110,34 @@ entry: define <8 x half> @div_h(<8 x half> %a, <8 x half> %b) { -; CHECK-CVT-LABEL: div_h: -; CHECK-CVT: // %bb.0: // %entry -; CHECK-CVT-NEXT: fcvtl v2.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v3.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v0.4s, v0.8h -; CHECK-CVT-NEXT: fdiv v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fdiv v1.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: div_h: +; CHECK-CVT-SD: // %bb.0: // %entry +; CHECK-CVT-SD-NEXT: fcvtl v2.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v3.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-SD-NEXT: fdiv v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: div_h: ; CHECK-FP16: // %bb.0: // %entry ; CHECK-FP16-NEXT: fdiv v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: div_h: +; CHECK-CVT-GI: // %bb.0: // %entry +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fdiv v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fdiv v1.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-GI-NEXT: ret entry: %0 = fdiv <8 x half> %a, %b ret <8 x half> %0 @@ -126,39 +176,171 @@ define <8 x half> @s_to_h(<8 x float> %a) { } define <8 x half> @d_to_h(<8 x double> %a) { -; CHECK-LABEL: d_to_h: -; CHECK: // %bb.0: -; CHECK-NEXT: fcvtxn v0.2s, v0.2d -; CHECK-NEXT: fcvtxn v2.2s, v2.2d -; CHECK-NEXT: fcvtxn2 v0.4s, v1.2d -; CHECK-NEXT: fcvtxn2 v2.4s, v3.2d -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: fcvtn2 v0.8h, v2.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: d_to_h: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-CVT-SD-NEXT: fcvtxn v2.2s, v2.2d +; CHECK-CVT-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-CVT-SD-NEXT: fcvtxn2 v2.4s, v3.2d +; CHECK-CVT-SD-NEXT: fcvtn 
v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: d_to_h: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: fcvtxn v0.2s, v0.2d +; CHECK-FP16-SD-NEXT: fcvtxn v2.2s, v2.2d +; CHECK-FP16-SD-NEXT: fcvtxn2 v0.4s, v1.2d +; CHECK-FP16-SD-NEXT: fcvtxn2 v2.4s, v3.2d +; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: d_to_h: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: mov d4, v0.d[1] +; CHECK-CVT-GI-NEXT: fcvt h0, d0 +; CHECK-CVT-GI-NEXT: mov d5, v1.d[1] +; CHECK-CVT-GI-NEXT: fcvt h1, d1 +; CHECK-CVT-GI-NEXT: fcvt h4, d4 +; CHECK-CVT-GI-NEXT: mov v0.h[1], v4.h[0] +; CHECK-CVT-GI-NEXT: fcvt h4, d5 +; CHECK-CVT-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-CVT-GI-NEXT: mov d1, v2.d[1] +; CHECK-CVT-GI-NEXT: fcvt h2, d2 +; CHECK-CVT-GI-NEXT: mov v0.h[3], v4.h[0] +; CHECK-CVT-GI-NEXT: fcvt h1, d1 +; CHECK-CVT-GI-NEXT: mov v0.h[4], v2.h[0] +; CHECK-CVT-GI-NEXT: mov d2, v3.d[1] +; CHECK-CVT-GI-NEXT: fcvt h3, d3 +; CHECK-CVT-GI-NEXT: mov v0.h[5], v1.h[0] +; CHECK-CVT-GI-NEXT: fcvt h1, d2 +; CHECK-CVT-GI-NEXT: mov v0.h[6], v3.h[0] +; CHECK-CVT-GI-NEXT: mov v0.h[7], v1.h[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: d_to_h: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: mov d4, v0.d[1] +; CHECK-FP16-GI-NEXT: fcvt h0, d0 +; CHECK-FP16-GI-NEXT: mov d5, v1.d[1] +; CHECK-FP16-GI-NEXT: fcvt h1, d1 +; CHECK-FP16-GI-NEXT: fcvt h4, d4 +; CHECK-FP16-GI-NEXT: mov v0.h[1], v4.h[0] +; CHECK-FP16-GI-NEXT: fcvt h4, d5 +; CHECK-FP16-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-FP16-GI-NEXT: mov d1, v2.d[1] +; CHECK-FP16-GI-NEXT: fcvt h2, d2 +; CHECK-FP16-GI-NEXT: mov v0.h[3], v4.h[0] +; CHECK-FP16-GI-NEXT: fcvt h1, d1 +; CHECK-FP16-GI-NEXT: mov v0.h[4], v2.h[0] +; CHECK-FP16-GI-NEXT: mov d2, v3.d[1] +; CHECK-FP16-GI-NEXT: fcvt h3, d3 +; CHECK-FP16-GI-NEXT: mov v0.h[5], v1.h[0] +; CHECK-FP16-GI-NEXT: fcvt h1, d2 +; CHECK-FP16-GI-NEXT: mov v0.h[6], v3.h[0] +; CHECK-FP16-GI-NEXT: mov v0.h[7], v1.h[0] +; CHECK-FP16-GI-NEXT: ret %1 = fptrunc <8 x double> %a to <8 x half> ret <8 x half> %1 } define <8 x float> @h_to_s(<8 x half> %a) { -; CHECK-LABEL: h_to_s: -; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl2 v1.4s, v0.8h -; CHECK-NEXT: fcvtl v0.4s, v0.4h -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: h_to_s: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: h_to_s: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-FP16-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: h_to_s: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-CVT-GI-NEXT: mov v0.16b, v2.16b +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: h_to_s: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-FP16-GI-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-FP16-GI-NEXT: mov v0.16b, v2.16b +; CHECK-FP16-GI-NEXT: ret %1 = fpext <8 x half> %a to <8 x float> ret <8 x float> %1 } define <8 x double> @h_to_d(<8 x half> %a) { -; CHECK-LABEL: h_to_d: -; CHECK: // %bb.0: -; CHECK-NEXT: fcvtl v1.4s, v0.4h -; CHECK-NEXT: fcvtl2 v2.4s, v0.8h -; CHECK-NEXT: fcvtl v0.2d, v1.2s -; CHECK-NEXT: fcvtl2 v3.2d, v2.4s -; CHECK-NEXT: fcvtl2 v1.2d, v1.4s -; CHECK-NEXT: fcvtl v2.2d, v2.2s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: h_to_d: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, 
v0.4h +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.2d, v1.2s +; CHECK-CVT-SD-NEXT: fcvtl2 v3.2d, v2.4s +; CHECK-CVT-SD-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-CVT-SD-NEXT: fcvtl v2.2d, v2.2s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: h_to_d: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: fcvtl v1.4s, v0.4h +; CHECK-FP16-SD-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-FP16-SD-NEXT: fcvtl v0.2d, v1.2s +; CHECK-FP16-SD-NEXT: fcvtl2 v3.2d, v2.4s +; CHECK-FP16-SD-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-FP16-SD-NEXT: fcvtl v2.2d, v2.2s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: h_to_d: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: mov h1, v0.h[1] +; CHECK-CVT-GI-NEXT: mov h2, v0.h[2] +; CHECK-CVT-GI-NEXT: mov h3, v0.h[3] +; CHECK-CVT-GI-NEXT: mov h4, v0.h[4] +; CHECK-CVT-GI-NEXT: mov h5, v0.h[5] +; CHECK-CVT-GI-NEXT: mov h6, v0.h[6] +; CHECK-CVT-GI-NEXT: mov h7, v0.h[7] +; CHECK-CVT-GI-NEXT: fcvt d0, h0 +; CHECK-CVT-GI-NEXT: fcvt d16, h1 +; CHECK-CVT-GI-NEXT: fcvt d1, h2 +; CHECK-CVT-GI-NEXT: fcvt d17, h3 +; CHECK-CVT-GI-NEXT: fcvt d2, h4 +; CHECK-CVT-GI-NEXT: fcvt d4, h5 +; CHECK-CVT-GI-NEXT: fcvt d3, h6 +; CHECK-CVT-GI-NEXT: fcvt d5, h7 +; CHECK-CVT-GI-NEXT: mov v0.d[1], v16.d[0] +; CHECK-CVT-GI-NEXT: mov v1.d[1], v17.d[0] +; CHECK-CVT-GI-NEXT: mov v2.d[1], v4.d[0] +; CHECK-CVT-GI-NEXT: mov v3.d[1], v5.d[0] +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: h_to_d: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: mov h1, v0.h[1] +; CHECK-FP16-GI-NEXT: mov h2, v0.h[2] +; CHECK-FP16-GI-NEXT: mov h3, v0.h[3] +; CHECK-FP16-GI-NEXT: mov h4, v0.h[4] +; CHECK-FP16-GI-NEXT: mov h5, v0.h[5] +; CHECK-FP16-GI-NEXT: mov h6, v0.h[6] +; CHECK-FP16-GI-NEXT: mov h7, v0.h[7] +; CHECK-FP16-GI-NEXT: fcvt d0, h0 +; CHECK-FP16-GI-NEXT: fcvt d16, h1 +; CHECK-FP16-GI-NEXT: fcvt d1, h2 +; CHECK-FP16-GI-NEXT: fcvt d17, h3 +; CHECK-FP16-GI-NEXT: fcvt d2, h4 +; CHECK-FP16-GI-NEXT: fcvt d4, h5 +; CHECK-FP16-GI-NEXT: fcvt d3, h6 +; CHECK-FP16-GI-NEXT: fcvt d5, h7 +; CHECK-FP16-GI-NEXT: mov v0.d[1], v16.d[0] +; CHECK-FP16-GI-NEXT: mov v1.d[1], v17.d[0] +; CHECK-FP16-GI-NEXT: mov v2.d[1], v4.d[0] +; CHECK-FP16-GI-NEXT: mov v3.d[1], v5.d[0] +; CHECK-FP16-GI-NEXT: ret %1 = fpext <8 x half> %a to <8 x double> ret <8 x double> %1 } @@ -183,14 +365,14 @@ define <8 x i16> @bitcast_h_to_i(float, <8 x half> %a) { } define <4 x half> @sitofp_v4i8(<4 x i8> %a) #0 { -; CHECK-CVT-LABEL: sitofp_v4i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: sitofp_v4i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-CVT-SD-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-CVT-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: scvtf v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: sitofp_v4i8: ; CHECK-FP16: // %bb.0: @@ -198,76 +380,132 @@ define <4 x half> @sitofp_v4i8(<4 x i8> %a) #0 { ; CHECK-FP16-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-FP16-NEXT: scvtf v0.4h, v0.4h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sitofp_v4i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-CVT-GI-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-CVT-GI-NEXT: scvtf v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret %1 = sitofp <4 x i8> %a to <4 
x half> ret <4 x half> %1 } define <8 x half> @sitofp_v8i8(<8 x i8> %a) #0 { -; CHECK-CVT-LABEL: sitofp_v8i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-CVT-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-CVT-NEXT: sshll2 v2.4s, v0.8h, #0 -; CHECK-CVT-NEXT: scvtf v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s -; CHECK-CVT-NEXT: scvtf v1.4s, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: sitofp_v8i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-CVT-SD-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-CVT-SD-NEXT: scvtf v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-SD-NEXT: scvtf v1.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: sitofp_v8i8: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-FP16-NEXT: scvtf v0.8h, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sitofp_v8i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-CVT-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-CVT-GI-NEXT: scvtf v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: scvtf v2.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-GI-NEXT: ret %1 = sitofp <8 x i8> %a to <8 x half> ret <8 x half> %1 } define <16 x half> @sitofp_v16i8(<16 x i8> %a) #0 { -; CHECK-CVT-LABEL: sitofp_v16i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: sshll2 v1.8h, v0.16b, #0 -; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-CVT-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-CVT-NEXT: sshll v3.4s, v0.4h, #0 -; CHECK-CVT-NEXT: sshll2 v4.4s, v1.8h, #0 -; CHECK-CVT-NEXT: sshll2 v5.4s, v0.8h, #0 -; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s -; CHECK-CVT-NEXT: scvtf v3.4s, v3.4s -; CHECK-CVT-NEXT: fcvtn v1.4h, v2.4s -; CHECK-CVT-NEXT: scvtf v2.4s, v4.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v3.4s -; CHECK-CVT-NEXT: scvtf v3.4s, v5.4s -; CHECK-CVT-NEXT: fcvtn2 v1.8h, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v3.4s -; CHECK-CVT-NEXT: ret -; -; CHECK-FP16-LABEL: sitofp_v16i8: -; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: sshll2 v1.8h, v0.16b, #0 -; CHECK-FP16-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-FP16-NEXT: scvtf v1.8h, v1.8h -; CHECK-FP16-NEXT: scvtf v0.8h, v0.8h -; CHECK-FP16-NEXT: ret +; CHECK-CVT-SD-LABEL: sitofp_v16i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-CVT-SD-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-CVT-SD-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-CVT-SD-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: sshll2 v4.4s, v1.8h, #0 +; CHECK-CVT-SD-NEXT: sshll2 v5.4s, v0.8h, #0 +; CHECK-CVT-SD-NEXT: scvtf v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: scvtf v3.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcvtn v1.4h, v2.4s +; CHECK-CVT-SD-NEXT: scvtf v2.4s, v4.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v3.4s +; CHECK-CVT-SD-NEXT: scvtf v3.4s, v5.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v1.8h, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v3.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: sitofp_v16i8: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-FP16-SD-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-FP16-SD-NEXT: scvtf v1.8h, v1.8h +; CHECK-FP16-SD-NEXT: scvtf v0.8h, v0.8h +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sitofp_v16i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: sshll v1.8h, v0.8b, #0 +; CHECK-CVT-GI-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-CVT-GI-NEXT: sshll v2.4s, 
v1.4h, #0 +; CHECK-CVT-GI-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-CVT-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-CVT-GI-NEXT: scvtf v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: scvtf v3.4s, v3.4s +; CHECK-CVT-GI-NEXT: scvtf v4.4s, v1.4s +; CHECK-CVT-GI-NEXT: scvtf v5.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-GI-NEXT: fcvtn v1.4h, v3.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v4.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v1.8h, v5.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: sitofp_v16i8: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: sshll v1.8h, v0.8b, #0 +; CHECK-FP16-GI-NEXT: sshll2 v2.8h, v0.16b, #0 +; CHECK-FP16-GI-NEXT: scvtf v0.8h, v1.8h +; CHECK-FP16-GI-NEXT: scvtf v1.8h, v2.8h +; CHECK-FP16-GI-NEXT: ret %1 = sitofp <16 x i8> %a to <16 x half> ret <16 x half> %1 } define <8 x half> @sitofp_i16(<8 x i16> %a) #0 { -; CHECK-CVT-LABEL: sitofp_i16: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-CVT-NEXT: sshll2 v2.4s, v0.8h, #0 -; CHECK-CVT-NEXT: scvtf v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s -; CHECK-CVT-NEXT: scvtf v1.4s, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: sitofp_i16: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: sshll2 v2.4s, v0.8h, #0 +; CHECK-CVT-SD-NEXT: scvtf v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-SD-NEXT: scvtf v1.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: sitofp_i16: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: scvtf v0.8h, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sitofp_i16: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-CVT-GI-NEXT: scvtf v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: scvtf v2.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-GI-NEXT: ret %1 = sitofp <8 x i16> %a to <8 x half> ret <8 x half> %1 } @@ -286,108 +524,213 @@ define <8 x half> @sitofp_i32(<8 x i32> %a) #0 { define <8 x half> @sitofp_i64(<8 x i64> %a) #0 { -; CHECK-LABEL: sitofp_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: fcvtn2 v0.8h, v2.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: sitofp_i64: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: scvtf v0.2d, v0.2d +; CHECK-CVT-SD-NEXT: scvtf v2.2d, v2.2d +; CHECK-CVT-SD-NEXT: scvtf v1.2d, v1.2d +; CHECK-CVT-SD-NEXT: scvtf v3.2d, v3.2d +; CHECK-CVT-SD-NEXT: fcvtn v0.2s, v0.2d +; CHECK-CVT-SD-NEXT: fcvtn v2.2s, v2.2d +; CHECK-CVT-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-CVT-SD-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: sitofp_i64: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: scvtf v0.2d, v0.2d +; CHECK-FP16-SD-NEXT: scvtf v2.2d, v2.2d +; CHECK-FP16-SD-NEXT: scvtf v1.2d, v1.2d +; CHECK-FP16-SD-NEXT: scvtf v3.2d, v3.2d +; CHECK-FP16-SD-NEXT: fcvtn v0.2s, v0.2d +; CHECK-FP16-SD-NEXT: fcvtn v2.2s, v2.2d +; CHECK-FP16-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-FP16-SD-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s 
+; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: sitofp_i64: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-CVT-GI-NEXT: scvtf v1.2d, v1.2d +; CHECK-CVT-GI-NEXT: scvtf v2.2d, v2.2d +; CHECK-CVT-GI-NEXT: scvtf v3.2d, v3.2d +; CHECK-CVT-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-CVT-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-CVT-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-CVT-GI-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: sitofp_i64: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-FP16-GI-NEXT: scvtf v1.2d, v1.2d +; CHECK-FP16-GI-NEXT: scvtf v2.2d, v2.2d +; CHECK-FP16-GI-NEXT: scvtf v3.2d, v3.2d +; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-FP16-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-FP16-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-FP16-GI-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-FP16-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-FP16-GI-NEXT: ret %1 = sitofp <8 x i64> %a to <8 x half> ret <8 x half> %1 } define <4 x half> @uitofp_v4i8(<4 x i8> %a) #0 { -; CHECK-CVT-LABEL: uitofp_v4i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s -; CHECK-CVT-NEXT: ret -; -; CHECK-FP16-LABEL: uitofp_v4i8: -; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-FP16-NEXT: ucvtf v0.4h, v0.4h -; CHECK-FP16-NEXT: ret +; CHECK-CVT-SD-LABEL: uitofp_v4i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-CVT-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: ucvtf v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: uitofp_v4i8: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-FP16-SD-NEXT: ucvtf v0.4h, v0.4h +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: uitofp_v4i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-CVT-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-CVT-GI-NEXT: ucvtf v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: uitofp_v4i8: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: movi d1, #0xff00ff00ff00ff +; CHECK-FP16-GI-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-FP16-GI-NEXT: ucvtf v0.4h, v0.4h +; CHECK-FP16-GI-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x half> ret <4 x half> %1 } define <8 x half> @uitofp_v8i8(<8 x i8> %a) #0 { -; CHECK-CVT-LABEL: uitofp_v8i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-CVT-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-CVT-NEXT: ushll2 v2.4s, v0.8h, #0 -; CHECK-CVT-NEXT: ucvtf v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s -; CHECK-CVT-NEXT: ucvtf v1.4s, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: uitofp_v8i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-CVT-SD-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: uitofp_v8i8: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: ushll 
v0.8h, v0.8b, #0 ; CHECK-FP16-NEXT: ucvtf v0.8h, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: uitofp_v8i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-CVT-GI-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-CVT-GI-NEXT: ucvtf v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: ucvtf v2.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-GI-NEXT: ret %1 = uitofp <8 x i8> %a to <8 x half> ret <8 x half> %1 } define <16 x half> @uitofp_v16i8(<16 x i8> %a) #0 { -; CHECK-CVT-LABEL: uitofp_v16i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-CVT-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-CVT-NEXT: ushll v3.4s, v0.4h, #0 -; CHECK-CVT-NEXT: ushll2 v4.4s, v1.8h, #0 -; CHECK-CVT-NEXT: ushll2 v5.4s, v0.8h, #0 -; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s -; CHECK-CVT-NEXT: ucvtf v3.4s, v3.4s -; CHECK-CVT-NEXT: fcvtn v1.4h, v2.4s -; CHECK-CVT-NEXT: ucvtf v2.4s, v4.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v3.4s -; CHECK-CVT-NEXT: ucvtf v3.4s, v5.4s -; CHECK-CVT-NEXT: fcvtn2 v1.8h, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v3.4s -; CHECK-CVT-NEXT: ret -; -; CHECK-FP16-LABEL: uitofp_v16i8: -; CHECK-FP16: // %bb.0: -; CHECK-FP16-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-FP16-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-FP16-NEXT: ucvtf v1.8h, v1.8h -; CHECK-FP16-NEXT: ucvtf v0.8h, v0.8h -; CHECK-FP16-NEXT: ret +; CHECK-CVT-SD-LABEL: uitofp_v16i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: ushll2 v1.8h, v0.16b, #0 +; CHECK-CVT-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-CVT-SD-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-CVT-SD-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-CVT-SD-NEXT: ushll2 v5.4s, v0.8h, #0 +; CHECK-CVT-SD-NEXT: ucvtf v2.4s, v2.4s +; CHECK-CVT-SD-NEXT: ucvtf v3.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcvtn v1.4h, v2.4s +; CHECK-CVT-SD-NEXT: ucvtf v2.4s, v4.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v3.4s +; CHECK-CVT-SD-NEXT: ucvtf v3.4s, v5.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v1.8h, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v3.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: uitofp_v16i8: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: ushll2 v1.8h, v0.16b, #0 +; CHECK-FP16-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-FP16-SD-NEXT: ucvtf v1.8h, v1.8h +; CHECK-FP16-SD-NEXT: ucvtf v0.8h, v0.8h +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: uitofp_v16i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: ushll v1.8h, v0.8b, #0 +; CHECK-CVT-GI-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-CVT-GI-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-CVT-GI-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: ushll2 v1.4s, v1.8h, #0 +; CHECK-CVT-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-CVT-GI-NEXT: ucvtf v2.4s, v2.4s +; CHECK-CVT-GI-NEXT: ucvtf v3.4s, v3.4s +; CHECK-CVT-GI-NEXT: ucvtf v4.4s, v1.4s +; CHECK-CVT-GI-NEXT: ucvtf v5.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v2.4s +; CHECK-CVT-GI-NEXT: fcvtn v1.4h, v3.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v4.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v1.8h, v5.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: uitofp_v16i8: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: ushll v1.8h, v0.8b, #0 +; CHECK-FP16-GI-NEXT: ushll2 v2.8h, v0.16b, #0 +; CHECK-FP16-GI-NEXT: ucvtf v0.8h, v1.8h +; CHECK-FP16-GI-NEXT: ucvtf v1.8h, v2.8h +; CHECK-FP16-GI-NEXT: ret %1 = uitofp <16 x i8> %a to <16 x half> ret <16 x half> %1 } define <8 x half> @uitofp_i16(<8 x i16> %a) #0 { -; CHECK-CVT-LABEL: 
uitofp_i16: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-CVT-NEXT: ushll2 v2.4s, v0.8h, #0 -; CHECK-CVT-NEXT: ucvtf v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtn v0.4h, v1.4s -; CHECK-CVT-NEXT: ucvtf v1.4s, v2.4s -; CHECK-CVT-NEXT: fcvtn2 v0.8h, v1.4s -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: uitofp_i16: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-CVT-SD-NEXT: ushll2 v2.4s, v0.8h, #0 +; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-SD-NEXT: ucvtf v1.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v1.4s +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: uitofp_i16: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: ucvtf v0.8h, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: uitofp_i16: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-CVT-GI-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-CVT-GI-NEXT: ucvtf v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: ucvtf v2.4s, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-GI-NEXT: ret %1 = uitofp <8 x i16> %a to <8 x half> ret <8 x half> %1 } @@ -407,19 +750,61 @@ define <8 x half> @uitofp_i32(<8 x i32> %a) #0 { define <8 x half> @uitofp_i64(<8 x i64> %a) #0 { -; CHECK-LABEL: uitofp_i64: -; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-NEXT: fcvtn v0.4h, v0.4s -; CHECK-NEXT: fcvtn2 v0.8h, v2.4s -; CHECK-NEXT: ret +; CHECK-CVT-SD-LABEL: uitofp_i64: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: ucvtf v0.2d, v0.2d +; CHECK-CVT-SD-NEXT: ucvtf v2.2d, v2.2d +; CHECK-CVT-SD-NEXT: ucvtf v1.2d, v1.2d +; CHECK-CVT-SD-NEXT: ucvtf v3.2d, v3.2d +; CHECK-CVT-SD-NEXT: fcvtn v0.2s, v0.2d +; CHECK-CVT-SD-NEXT: fcvtn v2.2s, v2.2d +; CHECK-CVT-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-CVT-SD-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-CVT-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-SD-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: uitofp_i64: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: ucvtf v0.2d, v0.2d +; CHECK-FP16-SD-NEXT: ucvtf v2.2d, v2.2d +; CHECK-FP16-SD-NEXT: ucvtf v1.2d, v1.2d +; CHECK-FP16-SD-NEXT: ucvtf v3.2d, v3.2d +; CHECK-FP16-SD-NEXT: fcvtn v0.2s, v0.2d +; CHECK-FP16-SD-NEXT: fcvtn v2.2s, v2.2d +; CHECK-FP16-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-FP16-SD-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-FP16-SD-NEXT: fcvtn v0.4h, v0.4s +; CHECK-FP16-SD-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-CVT-GI-LABEL: uitofp_i64: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: ucvtf v0.2d, v0.2d +; CHECK-CVT-GI-NEXT: ucvtf v1.2d, v1.2d +; CHECK-CVT-GI-NEXT: ucvtf v2.2d, v2.2d +; CHECK-CVT-GI-NEXT: ucvtf v3.2d, v3.2d +; CHECK-CVT-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-CVT-GI-NEXT: fcvtn v2.2s, v2.2d +; CHECK-CVT-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-CVT-GI-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-CVT-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-CVT-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-CVT-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: uitofp_i64: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d +; CHECK-FP16-GI-NEXT: ucvtf v1.2d, v1.2d +; CHECK-FP16-GI-NEXT: ucvtf v2.2d, v2.2d +; CHECK-FP16-GI-NEXT: ucvtf v3.2d, v3.2d +; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-FP16-GI-NEXT: fcvtn v2.2s, v2.2d +; 
CHECK-FP16-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-FP16-GI-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s +; CHECK-FP16-GI-NEXT: fcvtn2 v0.8h, v2.4s +; CHECK-FP16-GI-NEXT: ret %1 = uitofp <8 x i64> %a to <8 x half> ret <8 x half> %1 } @@ -436,94 +821,132 @@ define void @test_insert_at_zero(half %a, ptr %b) #0 { } define <8 x i8> @fptosi_i8(<8 x half> %a) #0 { -; CHECK-CVT-LABEL: fptosi_i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: fptosi_i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: fptosi_i8: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: fptosi_i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fptosi<8 x half> %a to <8 x i8> ret <8 x i8> %1 } define <8 x i16> @fptosi_i16(<8 x half> %a) #0 { -; CHECK-CVT-LABEL: fptosi_i16: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtzs v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: fptosi_i16: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: fptosi_i16: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: fptosi_i16: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtzs v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fptosi<8 x half> %a to <8 x i16> ret <8 x i16> %1 } define <8 x i8> @fptoui_i8(<8 x half> %a) #0 { -; CHECK-CVT-LABEL: fptoui_i8: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: fptoui_i8: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: fptoui_i8: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret 
+; +; CHECK-CVT-GI-LABEL: fptoui_i8: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fptoui<8 x half> %a to <8 x i8> ret <8 x i8> %1 } define <8 x i16> @fptoui_i16(<8 x half> %a) #0 { -; CHECK-CVT-LABEL: fptoui_i16: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v1.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtzu v1.4s, v1.4s -; CHECK-CVT-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: fptoui_i16: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: fptoui_i16: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzu v0.8h, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: fptoui_i16: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v1.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcvtzu v0.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fptoui<8 x half> %a to <8 x i16> ret <8 x i16> %1 } define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_une: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmeq v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_une: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_une: ; CHECK-FP16: // %bb.0: @@ -531,27 +954,41 @@ define <8 x i1> @test_fcmp_une(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: mvn v0.16b, v0.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_une: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp une <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ueq: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, 
v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v4.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-CVT-NEXT: fcmgt v3.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ueq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ueq: ; CHECK-FP16: // %bb.0: @@ -561,23 +998,41 @@ define <8 x i1> @test_fcmp_ueq(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: mvn v0.16b, v0.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ueq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: mvn v1.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ueq <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ugt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmge v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ugt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ugt: ; CHECK-FP16: // %bb.0: @@ -585,23 +1040,37 @@ define <8 x i1> @test_fcmp_ugt(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: mvn v0.16b, v0.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ugt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: 
fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ugt <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uge: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_uge: ; CHECK-FP16: // %bb.0: @@ -609,23 +1078,37 @@ define <8 x i1> @test_fcmp_uge(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: mvn v0.16b, v0.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp uge <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ult: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ult: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ult: ; CHECK-FP16: // %bb.0: @@ -633,23 +1116,37 @@ define <8 x i1> @test_fcmp_ult(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: mvn v0.16b, v0.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ult: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, 
v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ult <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ule: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ule: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ule: ; CHECK-FP16: // %bb.0: @@ -657,27 +1154,41 @@ define <8 x i1> @test_fcmp_ule(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: mvn v0.16b, v0.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ule: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: mvn v1.16b, v2.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ule <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uno: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v4.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-CVT-NEXT: fcmge v3.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: mvn v0.16b, v0.16b -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_uno: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; 
CHECK-CVT-SD-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_uno: ; CHECK-FP16: // %bb.0: @@ -687,26 +1198,44 @@ define <8 x i1> @test_fcmp_uno(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: mvn v0.16b, v0.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_uno: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: mvn v1.16b, v1.16b +; CHECK-CVT-GI-NEXT: mvn v0.16b, v0.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp uno <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_one: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v4.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-CVT-NEXT: fcmgt v3.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_one: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_one: ; CHECK-FP16: // %bb.0: @@ -715,136 +1244,212 @@ define <8 x i1> @test_fcmp_one(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_one: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmgt v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp one <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_oeq(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oeq: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; 
CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmeq v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oeq: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmeq v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_oeq: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmeq v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oeq: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmeq v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp oeq <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ogt(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ogt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ogt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ogt: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmgt v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ogt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ogt <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_oge(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oge: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmge v0.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_oge: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; 
CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_oge: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmge v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_oge: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp oge <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_olt(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_olt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcmgt v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_olt: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_olt: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_olt: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp olt <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ole(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ole: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcmge v2.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmge v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ole: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ole: ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcmge v0.8h, v1.8h, v0.8h ; CHECK-FP16-NEXT: xtn 
v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ole: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmge v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ole <8 x half> %a, %b ret <8 x i1> %1 } define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ord: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: fcvtl2 v2.4s, v1.8h -; CHECK-CVT-NEXT: fcvtl2 v3.4s, v0.8h -; CHECK-CVT-NEXT: fcvtl v1.4s, v1.4h -; CHECK-CVT-NEXT: fcvtl v0.4s, v0.4h -; CHECK-CVT-NEXT: fcmge v4.4s, v3.4s, v2.4s -; CHECK-CVT-NEXT: fcmgt v2.4s, v2.4s, v3.4s -; CHECK-CVT-NEXT: fcmge v3.4s, v0.4s, v1.4s -; CHECK-CVT-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-CVT-NEXT: orr v1.16b, v2.16b, v4.16b -; CHECK-CVT-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-CVT-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-CVT-NEXT: xtn v0.8b, v0.8h -; CHECK-CVT-NEXT: ret +; CHECK-CVT-SD-LABEL: test_fcmp_ord: +; CHECK-CVT-SD: // %bb.0: +; CHECK-CVT-SD-NEXT: fcvtl2 v2.4s, v1.8h +; CHECK-CVT-SD-NEXT: fcvtl2 v3.4s, v0.8h +; CHECK-CVT-SD-NEXT: fcvtl v1.4s, v1.4h +; CHECK-CVT-SD-NEXT: fcvtl v0.4s, v0.4h +; CHECK-CVT-SD-NEXT: fcmge v4.4s, v3.4s, v2.4s +; CHECK-CVT-SD-NEXT: fcmgt v2.4s, v2.4s, v3.4s +; CHECK-CVT-SD-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-SD-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-SD-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-SD-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-SD-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-CVT-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-SD-NEXT: ret ; ; CHECK-FP16-LABEL: test_fcmp_ord: ; CHECK-FP16: // %bb.0: @@ -853,8 +1458,27 @@ define <8 x i1> @test_fcmp_ord(<8 x half> %a, <8 x half> %b) #0 { ; CHECK-FP16-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret +; +; CHECK-CVT-GI-LABEL: test_fcmp_ord: +; CHECK-CVT-GI: // %bb.0: +; CHECK-CVT-GI-NEXT: fcvtl v2.4s, v0.4h +; CHECK-CVT-GI-NEXT: fcvtl v3.4s, v1.4h +; CHECK-CVT-GI-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-CVT-GI-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-CVT-GI-NEXT: fcmge v4.4s, v2.4s, v3.4s +; CHECK-CVT-GI-NEXT: fcmgt v2.4s, v3.4s, v2.4s +; CHECK-CVT-GI-NEXT: fcmge v3.4s, v0.4s, v1.4s +; CHECK-CVT-GI-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-CVT-GI-NEXT: orr v1.16b, v2.16b, v4.16b +; CHECK-CVT-GI-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-CVT-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-CVT-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-CVT-GI-NEXT: ret %1 = fcmp ord <8 x half> %a, %b ret <8 x i1> %1 } attributes #0 = { nounwind } + +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-CVT: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/strict-fp-opt.ll b/llvm/test/CodeGen/AArch64/strict-fp-opt.ll index bb7cd22c01b41..c433291ff576a 100644 --- a/llvm/test/CodeGen/AArch64/strict-fp-opt.ll +++ b/llvm/test/CodeGen/AArch64/strict-fp-opt.ll @@ -1,31 +1,40 @@ -; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s -; RUN: llc -mtriple=aarch64 -global-isel=true -global-isel-abort=2 %s -o - | FileCheck %s - +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel=true -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for unused_div_fpexcept_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for unused_div_round_dynamic +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for add_twice_fpexcept_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for add_twice_round_dynamic +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for set_rounding +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for set_rounding_fpexcept_strict +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for set_rounding_round_dynamic ; Div whose result is unused should be removed unless we have strict exceptions -; CHECK-LABEL: unused_div: -; CHECK-NOT: fdiv -; CHECK: ret define void @unused_div(float %x, float %y) { +; CHECK-LABEL: unused_div: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ret entry: %add = fdiv float %x, %y ret void } -; CHECK-LABEL: unused_div_fpexcept_strict: -; CHECK: fdiv s0, s0, s1 -; CHECK-NEXT: ret define void @unused_div_fpexcept_strict(float %x, float %y) #0 { +; CHECK-LABEL: unused_div_fpexcept_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fdiv s0, s0, s1 +; CHECK-NEXT: ret entry: %add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 ret void } -; CHECK-LABEL: unused_div_round_dynamic: -; CHECK-NOT: fdiv -; CHECK: ret define void @unused_div_round_dynamic(float %x, float %y) #0 { +; CHECK-LABEL: unused_div_round_dynamic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ret entry: %add = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 ret void @@ -33,14 +42,14 @@ entry: ; Machine CSE should eliminate the second add unless we have strict exceptions - -; CHECK-LABEL: add_twice: -; CHECK: fadd [[ADD:s[0-9]+]], s0, s1 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: fmul [[MUL:s[0-9]+]], [[ADD]], [[ADD]] -; CHECK-NEXT: fcsel s0, [[ADD]], [[MUL]], eq -; CHECK-NEXT: ret define float @add_twice(float %x, float %y, i32 %n) { +; CHECK-LABEL: add_twice: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: fmul s1, s0, s0 +; CHECK-NEXT: fcsel s0, s0, s1, eq +; CHECK-NEXT: ret entry: %add = fadd float %x, %y %tobool.not = icmp eq i32 %n, 0 @@ -56,15 +65,17 @@ if.end: ret float %a.0 } -; CHECK-LABEL: add_twice_fpexcept_strict: -; CHECK: fmov [[X:s[0-9]+]], s0 -; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: cbz w0, [[LABEL:.LBB[0-9_]+]] -; CHECK: fadd [[ADD:s[0-9]+]], [[X]], s1 -; CHECK-NEXT: fmul s0, s0, [[ADD]] -; CHECK: [[LABEL]]: -; CHECK-NEXT: ret define float 
@add_twice_fpexcept_strict(float %x, float %y, i32 %n) #0 { +; CHECK-LABEL: add_twice_fpexcept_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s2, s0 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: cbz w0, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: fadd s1, s2, s1 +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: .LBB4_2: // %if.end +; CHECK-NEXT: ret entry: %add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 %tobool.not = icmp eq i32 %n, 0 @@ -80,14 +91,15 @@ if.end: ret float %a.0 } -; CHECK-LABEL: add_twice_round_dynamic: -; CHECK: fadd s0, s0, s1 -; CHECK-NEXT: cbz w0, [[LABEL:.LBB[0-9_]+]] -; CHECK-NOT: fadd -; CHECK: fmul s0, s0, s0 -; CHECK: [[LABEL]]: -; CHECK-NEXT: ret define float @add_twice_round_dynamic(float %x, float %y, i32 %n) #0 { +; CHECK-LABEL: add_twice_round_dynamic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: cbz w0, .LBB5_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: fmul s0, s0, s0 +; CHECK-NEXT: .LBB5_2: // %if.end +; CHECK-NEXT: ret entry: %add = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 %tobool.not = icmp eq i32 %n, 0 @@ -108,17 +120,18 @@ if.end: ; dynamic (as they may give different results) or when we have strict exceptions ; (the llvm.set.rounding is irrelevant, but both could trap). -; CHECK-LABEL: set_rounding: -; CHECK-DAG: fadd [[SREG:s[0-9]+]], s0, s1 -; CHECK-DAG: mrs [[XREG1:x[0-9]+]], FPCR -; CHECK-DAG: orr [[XREG2:x[0-9]+]], [[XREG1]], #0xc00000 -; CHECK: msr FPCR, [[XREG2]] -; CHECK-NEXT: mrs [[XREG3:x[0-9]+]], FPCR -; CHECK-NEXT: and [[XREG4:x[0-9]+]], [[XREG3]], #0xffffffffff3fffff -; CHECK-NEXT: msr FPCR, [[XREG4]] -; CHECK-NEXT: fsub s0, [[SREG]], [[SREG]] -; CHECK-NEXT: ret define float @set_rounding(float %x, float %y) { +; CHECK-LABEL: set_rounding: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mrs x8, FPCR +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: orr x8, x8, #0xc00000 +; CHECK-NEXT: msr FPCR, x8 +; CHECK-NEXT: mrs x8, FPCR +; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff +; CHECK-NEXT: msr FPCR, x8 +; CHECK-NEXT: fsub s0, s0, s0 +; CHECK-NEXT: ret entry: %add1 = fadd float %x, %y call void @llvm.set.rounding(i32 0) @@ -128,18 +141,19 @@ entry: ret float %sub } -; CHECK-LABEL: set_rounding_fpexcept_strict: -; CHECK-DAG: fadd [[SREG1:s[0-9]+]], s0, s1 -; CHECK-DAG: mrs [[XREG1:x[0-9]+]], FPCR -; CHECK-DAG: orr [[XREG2:x[0-9]+]], [[XREG1]], #0xc00000 -; CHECK: msr FPCR, [[XREG2]] -; CHECK-DAG: fadd [[SREG2:s[0-9]+]], s0, s1 -; CHECK-DAG: mrs [[XREG3:x[0-9]+]], FPCR -; CHECK-DAG: and [[XREG4:x[0-9]+]], [[XREG3]], #0xffffffffff3fffff -; CHECK-NEXT: msr FPCR, [[XREG4]] -; CHECK-NEXT: fsub s0, [[SREG1]], [[SREG2]] -; CHECK-NEXT: ret define float @set_rounding_fpexcept_strict(float %x, float %y) #0 { +; CHECK-LABEL: set_rounding_fpexcept_strict: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd s2, s0, s1 +; CHECK-NEXT: mrs x8, FPCR +; CHECK-NEXT: orr x8, x8, #0xc00000 +; CHECK-NEXT: msr FPCR, x8 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mrs x8, FPCR +; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff +; CHECK-NEXT: msr FPCR, x8 +; CHECK-NEXT: fsub s0, s2, s0 +; CHECK-NEXT: ret entry: %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 call void @llvm.set.rounding(i32 0) #0 @@ -149,18 +163,19 @@ entry: ret float %sub } -; 
CHECK-LABEL: set_rounding_round_dynamic: -; CHECK-DAG: fadd [[SREG1:s[0-9]+]], s0, s1 -; CHECK-DAG: mrs [[XREG1:x[0-9]+]], FPCR -; CHECK-DAG: orr [[XREG2:x[0-9]+]], [[XREG1]], #0xc00000 -; CHECK: msr FPCR, [[XREG2]] -; CHECK-DAG: fadd [[SREG2:s[0-9]+]], s0, s1 -; CHECK-DAG: mrs [[XREG3:x[0-9]+]], FPCR -; CHECK-DAG: and [[XREG4:x[0-9]+]], [[XREG3]], #0xffffffffff3fffff -; CHECK-NEXT: msr FPCR, [[XREG4]] -; CHECK-NEXT: fsub s0, [[SREG1]], [[SREG2]] -; CHECK-NEXT: ret define float @set_rounding_round_dynamic(float %x, float %y) #0 { +; CHECK-LABEL: set_rounding_round_dynamic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mrs x8, FPCR +; CHECK-NEXT: fadd s2, s0, s1 +; CHECK-NEXT: orr x8, x8, #0xc00000 +; CHECK-NEXT: msr FPCR, x8 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mrs x8, FPCR +; CHECK-NEXT: and x8, x8, #0xffffffffff3fffff +; CHECK-NEXT: msr FPCR, x8 +; CHECK-NEXT: fsub s0, s2, s0 +; CHECK-NEXT: ret entry: %add1 = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0 call void @llvm.set.rounding(i32 0) #0 @@ -178,3 +193,6 @@ declare i32 @llvm.get.rounding() declare void @llvm.set.rounding(i32) attributes #0 = { strictfp } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-GI: {{.*}} +; CHECK-SD: {{.*}} From 617854f81900d1776c11796dd4aacc82375e56ba Mon Sep 17 00:00:00 2001 From: michaelselehov Date: Mon, 29 Sep 2025 17:57:07 +0200 Subject: [PATCH 141/878] [AMDGPU] LRO: allow same-BB non-lookthrough users for PHI (#160909) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Loop headers frequently consume the loop-carried value in the header block via non-lookthrough ops (e.g. byte-wise vector binops). LiveRegOptimizer’s same-BB filter currently prunes these users, so the loop-carried PHI is not coerced to i32 and the intended packed form is lost. Relax the filter: when the def is a PHI, allow same-BB non-lookthrough users. Also fix the check to look at the user (CII) rather than the def (II) so the walk does not terminate prematurely. --- .../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 5 +- .../lro-phi-samebb-nonlookthrough-store.ll | 46 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 38718c43a61dd..7504f1a8cea09 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -150,7 +150,10 @@ class LiveRegOptimizer { if (!CVisited.insert(CII).second) continue; - if (CII->getParent() == II->getParent() && !IsLookThru(II)) + // Same-BB filter must look at the *user*; and allow non-lookthrough + // users when the def is a PHI (loop-header pattern). 
+      if (CII->getParent() == II->getParent() && !IsLookThru(CII) &&
+          !isa<PHINode>(II))
         continue;
 
       if (isOpLegal(CII))
diff --git a/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll
new file mode 100644
index 0000000000000..b508f739e7fd3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lro-phi-samebb-nonlookthrough-store.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -passes=amdgpu-late-codegenprepare \
+; RUN:   -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
+
+; Goal: With a loop-header PHI in illegal vector type and a same-BB
+; non-lookthrough user (vector add) in the header, LRO should still coerce
+; the PHI to i32 because a profitable sink (store) exists across BB.
+
+define amdgpu_kernel void @phi_samebb_nonlookthrough_store(
+    ptr addrspace(1) %out, <4 x i8> %v, i1 %exit) {
+; CHECK-LABEL: @phi_samebb_nonlookthrough_store(
+entry:
+  br label %loop
+
+loop:                                             ; preds = %entry, %loop
+  ; Loop-carried PHI in illegal vector type.
+  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]
+
+  ; Same-BB non-lookthrough use in header.
+  %acc.next = add <4 x i8> %acc, %v
+
+  ; Make it a real loop: either iterate or exit to the sink block.
+  br i1 %exit, label %store, label %loop
+
+store:                                            ; preds = %loop
+  ; The across-BB sink: storing the PHI coerced to i32.
+  %acc.bc = bitcast <4 x i8> %acc to i32
+  store i32 %acc.bc, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; After AMDGPULateCodeGenPrepare we expect:
+;  - PHI is coerced to i32
+;  - A header bitcast materializes for the add
+; This proves the same-BB non-lookthrough user (add) did not get pruned
+; when the def is a PHI.
+
+; CHECK: loop:
+; CHECK:   %[[ACC_TC:[^ ]+]] = phi i32
+; CHECK:   %[[ACC_TC_BC:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
+; CHECK:   %[[ACC_NEXT:[^ ]+]] = add <4 x i8> %[[ACC_TC_BC]], %v
+; CHECK:   br i1 %exit, label %store, label %loop
+; CHECK: store:
+; CHECK:   %[[ACC_TC_BC2:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
+; CHECK:   %[[ST_I32:[^ ]+]] = bitcast <4 x i8> %[[ACC_TC_BC2]] to i32
+; CHECK:   store i32 %[[ST_I32]],
+

From 1ef1175b309804b3ce8ee91ee72a3498f9a9a115 Mon Sep 17 00:00:00 2001
From: Marcos Maronas
Date: Mon, 29 Sep 2025 16:57:52 +0100
Subject: [PATCH 142/878] [SPIRV] Fix code quality issues. (#160752)

Address issues reported by a static analysis tool.
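One of the patterns flagged here is an unnecessary copy of a wide APInt
operand: APInt's binary operators take the left operand by value, so
moving an APInt that is no longer needed into the operation lets the
result reuse its multi-word storage instead of allocating a copy. That
is the idiom behind the std::move(Inf) | QNaNBitMask and
std::move(ExpMask) - ExpLSB changes below. A minimal sketch (for
illustration only; the helper name is made up and not part of this
patch):

    #include "llvm/ADT/APInt.h"
    #include <utility>

    // Hypothetical helper: ExpMask is taken by value and moved into the
    // subtraction, so for multi-word APInts no extra bignum allocation
    // is needed for the result.
    llvm::APInt maxExpMinusOne(llvm::APInt ExpMask,
                               const llvm::APInt &ExpLSB) {
      return std::move(ExpMask) - ExpLSB;
    }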
---
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index b4fc8dabbd4df..db85e33a3c6c7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -587,7 +587,8 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
   }
 
   if (FPClassTest PartialCheck = Mask & fcNan) {
-    auto InfWithQnanBitC = buildSPIRVConstant(IntTy, Inf | QNaNBitMask);
+    auto InfWithQnanBitC =
+        buildSPIRVConstant(IntTy, std::move(Inf) | QNaNBitMask);
     if (PartialCheck == fcNan) {
       // isnan(V) ==> abs(V) u> int(inf)
       appendToRes(
@@ -613,7 +614,7 @@ bool SPIRVLegalizerInfo::legalizeIsFPClass(
     APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
     auto ExpMinusOne = assignSPIRVTy(
         MIRBuilder.buildSub(IntTy, Abs, buildSPIRVConstant(IntTy, ExpLSB)));
-    APInt MaxExpMinusOne = ExpMask - ExpLSB;
+    APInt MaxExpMinusOne = std::move(ExpMask) - ExpLSB;
     auto NormalRes = assignSPIRVTy(
         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
                              buildSPIRVConstant(IntTy, MaxExpMinusOne)));

From dd3507b6c021eb77c3b256dc88d6a07fb7d6734e Mon Sep 17 00:00:00 2001
From: Alina Sbirlea
Date: Mon, 29 Sep 2025 08:58:46 -0700
Subject: [PATCH 143/878] Extend LVI to cache ranges per BB predecessor.
 (#159432)

Currently LVI takes the union of the value ranges coming from a block's
predecessors. Storing the ranges per predecessor instead can yield more
restricted ranges and enable additional optimizations. However, this is
costly (memory + compile time), so place it under a flag that is
disabled by default.

See: https://github.com/llvm/llvm-project/issues/158139.
---
 llvm/lib/Analysis/LazyValueInfo.cpp           | 140 +++++++++++++++++-
 .../track-predecessor-ranges.ll               |  98 ++++++++++++
 2 files changed, 235 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll

diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 90bae77bcf703..6fb28072afe46 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -59,6 +59,11 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(LazyValueInfoWrapperPass, "lazy-value-info",
                     "Lazy Value Information Analysis", false, true)
 
+static cl::opt<bool> PerPredRanges(
+    "lvi-per-pred-ranges", cl::Hidden, cl::init(false),
+    cl::desc("Enable tracking of ranges for a value in a block for "
+             "each block predecessor (default = false)"));
+
 namespace llvm {
 FunctionPass *createLazyValueInfoPass() { return new LazyValueInfoWrapperPass(); }
@@ -103,6 +108,10 @@ namespace {
 namespace {
 using NonNullPointerSet = SmallDenseSet<AssertingVH<Value>, 2>;
+using BBLatticeElementMap =
+    SmallDenseMap<PoisoningVH<BasicBlock>, ValueLatticeElement, 4>;
+using PredecessorValueLatticeMap =
+    SmallDenseMap<AssertingVH<Value>, BBLatticeElementMap, 2>;
 
 /// This is the cache kept by LazyValueInfo which
 /// maintains information about queries across the clients' queries.
@@ -117,6 +126,10 @@ class LazyValueInfoCache {
   // std::nullopt indicates that the nonnull pointers for this basic block
   // block have not been computed yet.
   std::optional<NonNullPointerSet> NonNullPointers;
+  // This is an extension of the above LatticeElements, caching, for each
+  // Value, a ValueLatticeElement, for each predecessor of the BB tracked by
+  // this entry.
+  std::optional<PredecessorValueLatticeMap> PredecessorLatticeElements;
 };
 
 /// Cached information per basic block.
@@ -134,8 +147,14 @@ class LazyValueInfoCache {
 
   BlockCacheEntry *getOrCreateBlockEntry(BasicBlock *BB) {
     auto It = BlockCache.find_as(BB);
-    if (It == BlockCache.end())
-      It = BlockCache.insert({BB, std::make_unique<BlockCacheEntry>()}).first;
+    if (It == BlockCache.end()) {
+      std::unique_ptr<BlockCacheEntry> BCE =
+          std::make_unique<BlockCacheEntry>();
+      if (PerPredRanges)
+        BCE->PredecessorLatticeElements =
+            std::make_optional<PredecessorValueLatticeMap>();
+      It = BlockCache.insert({BB, std::move(BCE)}).first;
+    }
 
     return It->second.get();
   }
@@ -161,6 +180,28 @@ class LazyValueInfoCache {
     addValueHandle(Val);
   }
 
+  void insertPredecessorResults(Value *Val, BasicBlock *BB,
+                                BBLatticeElementMap &PredLatticeElements) {
+    BlockCacheEntry *Entry = getOrCreateBlockEntry(BB);
+
+    Entry->PredecessorLatticeElements->insert({Val, PredLatticeElements});
+
+    addValueHandle(Val);
+  }
+
+  std::optional<BBLatticeElementMap>
+  getCachedPredecessorInfo(Value *V, BasicBlock *BB) const {
+    const BlockCacheEntry *Entry = getBlockEntry(BB);
+    if (!Entry)
+      return std::nullopt;
+
+    auto LatticeIt = Entry->PredecessorLatticeElements->find_as(V);
+    if (LatticeIt == Entry->PredecessorLatticeElements->end())
+      return std::nullopt;
+
+    return LatticeIt->second;
+  }
+
   std::optional<ValueLatticeElement> getCachedValueInfo(Value *V,
                                                         BasicBlock *BB) const {
     const BlockCacheEntry *Entry = getBlockEntry(BB);
@@ -216,6 +257,8 @@ void LazyValueInfoCache::eraseValue(Value *V) {
     Pair.second->OverDefined.erase(V);
     if (Pair.second->NonNullPointers)
       Pair.second->NonNullPointers->erase(V);
+    if (PerPredRanges)
+      Pair.second->PredecessorLatticeElements->erase(V);
   }
 
   auto HandleIt = ValueHandles.find_as(V);
@@ -230,6 +273,10 @@ void LVIValueHandle::deleted() {
 }
 
 void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
+  // Clear all when a BB is removed.
+  if (PerPredRanges)
+    for (auto &Pair : BlockCache)
+      Pair.second->PredecessorLatticeElements->clear();
   BlockCache.erase(BB);
 }
 
@@ -691,6 +738,9 @@ LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
   // find a path to function entry.  TODO: We should consider explicitly
   // canonicalizing to make this true rather than relying on this happy
   // accident.
+  std::optional<BBLatticeElementMap> PredLatticeElements;
+  if (PerPredRanges)
+    PredLatticeElements = std::make_optional<BBLatticeElementMap>();
   for (BasicBlock *Pred : predecessors(BB)) {
     // Skip self loops.
     if (Pred == BB)
@@ -710,8 +760,13 @@ LazyValueInfoImpl::solveBlockValueNonLocal(Value *Val, BasicBlock *BB) {
                         << Pred->getName() << "' (non local).\n");
       return Result;
     }
+    if (PerPredRanges)
+      PredLatticeElements->insert({Pred, *EdgeResult});
   }
 
+  if (PerPredRanges)
+    TheCache.insertPredecessorResults(Val, BB, *PredLatticeElements);
+
   // Return the merged value, which is more precise than 'overdefined'.
   assert(!Result.isOverdefined());
   return Result;
@@ -724,6 +779,9 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) {
   // Loop over all of our predecessors, merging what we know from them into
   // result. See the comment about the chosen traversal order in
   // solveBlockValueNonLocal; the same reasoning applies here.
+  std::optional<BBLatticeElementMap> PredLatticeElements;
+  if (PerPredRanges)
+    PredLatticeElements = std::make_optional<BBLatticeElementMap>();
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
     BasicBlock *PhiBB = PN->getIncomingBlock(i);
     Value *PhiVal = PN->getIncomingValue(i);
@@ -746,8 +804,14 @@ LazyValueInfoImpl::solveBlockValuePHINode(PHINode *PN, BasicBlock *BB) {
 
       return Result;
     }
+
+    if (PerPredRanges)
+      PredLatticeElements->insert({PhiBB, *EdgeResult});
   }
 
+  if (PerPredRanges)
+    TheCache.insertPredecessorResults(PN, BB, *PredLatticeElements);
+
   // Return the merged value, which is more precise than 'overdefined'.
   assert(!Result.isOverdefined() && "Possible PHI in entry block?");
   return Result;
@@ -1002,7 +1066,77 @@ LazyValueInfoImpl::solveBlockValueBinaryOpImpl(
 
   const ConstantRange &LHSRange = *LHSRes;
   const ConstantRange &RHSRange = *RHSRes;
-  return ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
+
+  std::optional<ValueLatticeElement> MergedResult =
+      ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
+
+  if (!PerPredRanges)
+    return MergedResult;
+
+  std::optional<BBLatticeElementMap> PredLHS =
+      TheCache.getCachedPredecessorInfo(LHS, BB);
+  if (!PredLHS)
+    return MergedResult;
+  std::optional<BBLatticeElementMap> PredRHS =
+      TheCache.getCachedPredecessorInfo(RHS, BB);
+  if (!PredRHS)
+    return MergedResult;
+
+  const BBLatticeElementMap &LHSPredMap = *PredLHS;
+  const BBLatticeElementMap &RHSPredMap = *PredRHS;
+
+  BBLatticeElementMap PredLatticeElements;
+  ValueLatticeElement OverallPredResult;
+  for (auto *Pred : predecessors(BB)) {
+    auto LHSIt = LHSPredMap.find_as(Pred);
+    if (LHSIt == LHSPredMap.end())
+      return MergedResult;
+    const ValueLatticeElement &LHSFromPred = LHSIt->second;
+    std::optional<ConstantRange> LHSFromPredRes =
+        LHSFromPred.asConstantRange(LHS->getType());
+    if (!LHSFromPredRes)
+      return MergedResult;
+
+    auto RHSIt = RHSPredMap.find_as(Pred);
+    if (RHSIt == RHSPredMap.end())
+      return MergedResult;
+    const ValueLatticeElement &RHSFromPred = RHSIt->second;
+    std::optional<ConstantRange> RHSFromPredRes =
+        RHSFromPred.asConstantRange(RHS->getType());
+    if (!RHSFromPredRes)
+      return MergedResult;
+
+    const ConstantRange &LHSFromPredRange = *LHSFromPredRes;
+    const ConstantRange &RHSFromPredRange = *RHSFromPredRes;
+    std::optional<ValueLatticeElement> PredResult =
+        ValueLatticeElement::getRange(OpFn(LHSFromPredRange, RHSFromPredRange));
+    if (!PredResult)
+      return MergedResult;
+    if (PredResult->isOverdefined()) {
+      LLVM_DEBUG(
+          dbgs() << " pred BB '" << Pred->getName() << "' for BB '"
                 << BB->getName()
                 << "' overdefined. Discarding all predecessor intervals.\n");
+      return MergedResult;
+    }
+    PredLatticeElements.insert({Pred, *PredResult});
+    OverallPredResult.mergeIn(*PredResult);
+  }
+
+  // If this point is reached, all predecessors for both LHS and RHS have
+  // constant ranges previously computed. Can cache the result and use the
+  // OverallPredResult.
+  TheCache.insertPredecessorResults(I, BB, PredLatticeElements);
+
+  LLVM_DEBUG(dbgs() << " Using predecessor intervals, evaluated " << *I
+                    << " to: " << OverallPredResult << ".\n");
+
+  if (!MergedResult)
+    return OverallPredResult;
+
+  LLVM_DEBUG(dbgs() << " Intersecting intervals for " << *I << ": "
+                    << OverallPredResult << " and " << MergedResult << ".\n");
+  return MergedResult->intersect(OverallPredResult);
 }
 
 std::optional<ValueLatticeElement>
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll b/llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll
new file mode 100644
index 0000000000000..b5f688420d9c9
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/track-predecessor-ranges.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes="correlated-propagation" -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes="correlated-propagation" -lvi-per-pred-ranges -S 2>&1 | FileCheck %s -check-prefix=LVI-PRED-RANGES
+
+@global = external local_unnamed_addr global [4338 x i32], align 16
+
+define dso_local noundef zeroext i1 @bar(i64 noundef %arg, ptr noundef writeonly captures(none) %arg1) local_unnamed_addr {
+; CHECK-LABEL: define dso_local noundef zeroext i1 @bar(
+; CHECK-SAME: i64 noundef [[ARG:%.*]], ptr noundef writeonly captures(none) [[ARG1:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[ARG]], 1025
+; CHECK-NEXT: br i1 [[ICMP]], label %[[BB4:.*]], label %[[BB2:.*]]
+; CHECK: [[BB2]]:
+; CHECK-NEXT: [[ICMP3:%.*]] = icmp ult i64 [[ARG]], 262145
+; CHECK-NEXT: br i1 [[ICMP3]], label %[[BB4]], label %[[BB9:.*]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ 7, %[[BB]] ], [ 15487, %[[BB2]] ]
+; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 3, %[[BB]] ], [ 7, %[[BB2]] ]
+; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[PHI]], [[ARG]]
+; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[ADD]], [[PHI5]]
+; CHECK-NEXT: [[ICMP6:%.*]] = icmp samesign ult i64 [[LSHR]], 4338
+; CHECK-NEXT: br i1 [[ICMP6]], label %[[BB8:.*]], label %[[BB7:.*]]
+; CHECK: [[BB7]]:
+; CHECK-NEXT: tail call void @llvm.ubsantrap(i8 18)
+; CHECK-NEXT: unreachable
+; CHECK: [[BB8]]:
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw [4338 x i32], ptr @global, i64 0, i64 [[LSHR]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 4
+; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[LOAD]] to i64
+; CHECK-NEXT: store i64 [[SEXT]], ptr [[ARG1]], align 8
+; CHECK-NEXT: br label %[[BB9]]
+; CHECK: [[BB9]]:
+; CHECK-NEXT: [[PHI10:%.*]] = phi i1 [ true, %[[BB8]] ], [ false, %[[BB2]] ]
+; CHECK-NEXT: ret i1 [[PHI10]]
+;
+; LVI-PRED-RANGES-LABEL: define dso_local noundef zeroext i1 @bar(
+; LVI-PRED-RANGES-SAME: i64 noundef [[ARG:%.*]], ptr noundef writeonly captures(none) [[ARG1:%.*]]) local_unnamed_addr {
+; LVI-PRED-RANGES-NEXT: [[BB:.*]]:
+; LVI-PRED-RANGES-NEXT: [[ICMP:%.*]] = icmp ult i64 [[ARG]], 1025
+; LVI-PRED-RANGES-NEXT: br i1 [[ICMP]], label %[[BB4:.*]], label %[[BB2:.*]]
+; LVI-PRED-RANGES: [[BB2]]:
+; LVI-PRED-RANGES-NEXT: [[ICMP3:%.*]] = icmp ult i64 [[ARG]], 262145
+; LVI-PRED-RANGES-NEXT: br i1 [[ICMP3]], label %[[BB4]], label %[[BB9:.*]]
+; LVI-PRED-RANGES: [[BB4]]:
+; LVI-PRED-RANGES-NEXT: [[PHI:%.*]] = phi i64 [ 7, %[[BB]] ], [ 15487, %[[BB2]] ]
+; LVI-PRED-RANGES-NEXT: [[PHI5:%.*]] = phi i64 [ 3, %[[BB]] ], [ 7, %[[BB2]] ]
+; LVI-PRED-RANGES-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[PHI]], [[ARG]]
+; LVI-PRED-RANGES-NEXT: [[LSHR:%.*]] = lshr i64 [[ADD]], [[PHI5]]
+; LVI-PRED-RANGES-NEXT: br i1 true, label %[[BB8:.*]], label %[[BB7:.*]]
+; LVI-PRED-RANGES: [[BB7]]:
+; LVI-PRED-RANGES-NEXT: tail call void @llvm.ubsantrap(i8 18)
+; LVI-PRED-RANGES-NEXT: unreachable
+; LVI-PRED-RANGES: [[BB8]]:
+; LVI-PRED-RANGES-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds nuw [4338 x i32], ptr @global, i64 0, i64 [[LSHR]]
+; LVI-PRED-RANGES-NEXT: [[LOAD:%.*]] = load i32, ptr [[GETELEMENTPTR]], align 4
+; LVI-PRED-RANGES-NEXT: [[SEXT:%.*]] = sext i32 [[LOAD]] to i64
+; LVI-PRED-RANGES-NEXT: store i64 [[SEXT]], ptr [[ARG1]], align 8
+; LVI-PRED-RANGES-NEXT: br label %[[BB9]]
+; LVI-PRED-RANGES: [[BB9]]:
+; LVI-PRED-RANGES-NEXT: [[PHI10:%.*]] = phi i1 [ true, %[[BB8]] ], [ false, %[[BB2]] ]
+; LVI-PRED-RANGES-NEXT: ret i1 [[PHI10]]
+;
+bb:
+  %icmp = icmp ult i64 %arg, 1025
+  br i1 %icmp, label %bb4, label %bb2
+
+bb2:                                              ; preds = %bb
+  %icmp3 = icmp ult i64 %arg, 262145
+  br i1 %icmp3, label %bb4, label %bb9
+
+bb4:                                              ; preds = %bb2, %bb
+  %phi = phi i64 [ 7, %bb ], [ 15487, %bb2 ]
+  %phi5 = phi i64 [ 3, %bb ], [ 7, %bb2 ]
+  %add = add nuw nsw i64 %phi, %arg
+  %lshr = lshr i64 %add, %phi5
+  %icmp6 = icmp samesign ult i64 %lshr, 4338
+  br i1 %icmp6, label %bb8, label %bb7
+
+bb7:                                              ; preds = %bb4
+  tail call void @llvm.ubsantrap(i8 18)
+  unreachable
+
+bb8:                                              ; preds = %bb4
+  %getelementptr = getelementptr inbounds nuw [4338 x i32], ptr @global, i64 0, i64 %lshr
+  %load = load i32, ptr %getelementptr, align 4
+  %sext = sext i32 %load to i64
+  store i64 %sext, ptr %arg1, align 8
+  br label %bb9
+
+bb9:                                              ; preds = %bb8, %bb2
+  %phi10 = phi i1 [ true, %bb8 ], [ false, %bb2 ]
+  ret i1 %phi10
+}
+
+; Function Attrs: cold noreturn nounwind
+declare void @llvm.ubsantrap(i8 immarg) #0
+
+attributes #0 = { cold noreturn nounwind }

From f9dbf738d81492eb7891655c5d4b2f481215eee0 Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Mon, 29 Sep 2025 09:00:37 -0700
Subject: [PATCH 144/878] [llvm] Use the VFS to get the real path in
 `FileCollector` (#160943)

This PR starts using the correct VFS for getting a file's real path in
`FileCollector` instead of using the real FS directly. This matches the
compiler's behavior for other input files.
---
 llvm/lib/Support/FileCollector.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Support/FileCollector.cpp b/llvm/lib/Support/FileCollector.cpp
index 5dc224a6d427b..1e5de2c49a2b3 100644
--- a/llvm/lib/Support/FileCollector.cpp
+++ b/llvm/lib/Support/FileCollector.cpp
@@ -68,9 +68,8 @@ void FileCollector::PathCanonicalizer::updateWithRealPath(
   SmallString<256> RealPath;
   auto DirWithSymlink = CachedDirs.find(Directory);
   if (DirWithSymlink == CachedDirs.end()) {
-    // FIXME: Should this be a call to FileSystem::getRealpath(), in some
-    // cases? What if there is nothing on disk?
-    if (sys::fs::real_path(Directory, RealPath))
+    // FIXME: What if there is nothing on disk?
+    if (VFS->getRealPath(Directory, RealPath))
       return;
     CachedDirs[Directory] = std::string(RealPath);
   } else {

From a615249d02549c82913dc8c744d769ab41e7385e Mon Sep 17 00:00:00 2001
From: Billy Zhu
Date: Mon, 29 Sep 2025 09:11:47 -0700
Subject: [PATCH 145/878] [MLIR][Python] Fix PDLResultList bindings (#161102)

Adds argument names to the method stubs for PDLResultList (from
https://github.com/llvm/llvm-project/pull/159926).
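For context: nanobind copies a custom nb::sig(...) string verbatim into
the generated .pyi stubs, so an overload whose parameter carries no name
is not valid Python syntax and gives IDEs and type checkers nothing
useful to display. A hedged sketch of the mechanism on a toy class
(illustrative only; ResultList and the module name are made up, not
part of this patch):

    #include <nanobind/nanobind.h>
    namespace nb = nanobind;

    struct ResultList {
      void append(int value) {}
    };

    NB_MODULE(example, m) {
      nb::class_<ResultList>(m, "ResultList")
          .def("append", &ResultList::append,
               // With a named parameter the stub renders as
               //   def append(self, value: int)
               // instead of the unparsable "def append(self, int)".
               nb::sig("def append(self, value: int)"));
    }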
--- mlir/lib/Bindings/Python/Rewrite.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp index 20392b9002706..f18298ecaf415 100644 --- a/mlir/lib/Bindings/Python/Rewrite.cpp +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -155,7 +155,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) { mlirPDLResultListPushBackValue(results, value); }, // clang-format off - nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Value") ")") + nb::sig("def append(self, value: " MAKE_MLIR_PYTHON_QUALNAME("ir.Value") ")") // clang-format on ) .def( @@ -164,7 +164,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) { mlirPDLResultListPushBackOperation(results, op); }, // clang-format off - nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Operation") ")") + nb::sig("def append(self, op: " MAKE_MLIR_PYTHON_QUALNAME("ir.Operation") ")") // clang-format on ) .def( @@ -173,7 +173,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) { mlirPDLResultListPushBackType(results, type); }, // clang-format off - nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Type") ")") + nb::sig("def append(self, type: " MAKE_MLIR_PYTHON_QUALNAME("ir.Type") ")") // clang-format on ) .def( @@ -182,7 +182,7 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) { mlirPDLResultListPushBackAttribute(results, attr); }, // clang-format off - nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Attribute") ")") + nb::sig("def append(self, attr: " MAKE_MLIR_PYTHON_QUALNAME("ir.Attribute") ")") // clang-format on ); nb::class_(m, "PDLModule") From 3b299af92383cb7558224482ccfa714f0162f772 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 29 Sep 2025 09:15:25 -0700 Subject: [PATCH 146/878] ELF: Store EhInputSection relocations to simplify code. NFC Store relocations directly as `SmallVector` within EhInputSection to avoid processing different relocation formats (REL/RELA/CREL) throughout the codebase. Next: Refactor RelocationScanner to utilize EhInputSection::rels Pull Request: https://github.com/llvm/llvm-project/pull/161041 --- lld/ELF/InputSection.cpp | 48 +++++++++++++++++-------- lld/ELF/InputSection.h | 6 +++- lld/ELF/MarkLive.cpp | 64 ++++++++++++++++++--------------- lld/ELF/SyntheticSections.cpp | 66 +++++++++++------------------------ lld/ELF/SyntheticSections.h | 15 +++----- 5 files changed, 99 insertions(+), 100 deletions(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 98267d1e081db..1270f27a8d96e 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1358,20 +1358,21 @@ SyntheticSection *EhInputSection::getParent() const { // .eh_frame is a sequence of CIE or FDE records. // This function splits an input section into records and returns them. template void EhInputSection::split() { - const RelsOrRelas rels = relsOrRelas(/*supportsCrel=*/false); - // getReloc expects the relocations to be sorted by r_offset. See the comment - // in scanRelocs. - if (rels.areRelocsRel()) { - SmallVector storage; - split(sortRels(rels.rels, storage)); - } else { - SmallVector storage; - split(sortRels(rels.relas, storage)); - } -} + const RelsOrRelas elfRels = relsOrRelas(); + if (elfRels.areRelocsCrel()) + preprocessRelocs(elfRels.crels); + else if (elfRels.areRelocsRel()) + preprocessRelocs(elfRels.rels); + else + preprocessRelocs(elfRels.relas); + + // The loop below expects the relocations to be sorted by offset. 
+ auto cmp = [](const Relocation &a, const Relocation &b) { + return a.offset < b.offset; + }; + if (!llvm::is_sorted(rels, cmp)) + llvm::stable_sort(rels, cmp); -template -void EhInputSection::split(ArrayRef rels) { ArrayRef d = content(); const char *msg = nullptr; unsigned relI = 0; @@ -1397,10 +1398,10 @@ void EhInputSection::split(ArrayRef rels) { // Find the first relocation that points to [off,off+size). Relocations // have been sorted by r_offset. const uint64_t off = d.data() - content().data(); - while (relI != rels.size() && rels[relI].r_offset < off) + while (relI != rels.size() && rels[relI].offset < off) ++relI; unsigned firstRel = -1; - if (relI != rels.size() && rels[relI].r_offset < off + size) + if (relI != rels.size() && rels[relI].offset < off + size) firstRel = relI; (id == 0 ? cies : fdes).emplace_back(off, this, size, firstRel); d = d.slice(size); @@ -1410,6 +1411,23 @@ void EhInputSection::split(ArrayRef rels) { << getObjMsg(d.data() - content().data()); } +template +void EhInputSection::preprocessRelocs(Relocs elfRels) { + Ctx &ctx = file->ctx; + rels.reserve(elfRels.size()); + for (auto rel : elfRels) { + uint64_t offset = rel.r_offset; + Symbol &sym = file->getSymbol(rel.getSymbol(ctx.arg.isMips64EL)); + RelType type = rel.getType(ctx.arg.isMips64EL); + RelExpr expr = ctx.target->getRelExpr(type, sym, content().data() + offset); + int64_t addend = + RelTy::HasAddend + ? getAddend(rel) + : ctx.target->getImplicitAddend(content().data() + offset, type); + rels.push_back({expr, type, offset, addend, &sym}); + } +} + // Return the offset in an output section for a given input offset. uint64_t EhInputSection::getParentOffset(uint64_t offset) const { auto it = partition_point( diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 8462f03bdb77e..dc29fedbc5c53 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -394,7 +394,7 @@ class EhInputSection : public InputSectionBase { StringRef name); static bool classof(const SectionBase *s) { return s->kind() == EHFrame; } template void split(); - template void split(ArrayRef rels); + template void preprocessRelocs(Relocs rels); // Splittable sections are handled as a sequence of data // rather than a single large blob of data. @@ -402,6 +402,10 @@ class EhInputSection : public InputSectionBase { SyntheticSection *getParent() const; uint64_t getParentOffset(uint64_t offset) const; + + // Preprocessed relocations in uniform format to avoid REL/RELA/CREL + // relocation format handling throughout the codebase. + SmallVector rels; }; // This is a section that is added directly to an output section diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index 83ae9fb7689e0..a7b0f08c8d954 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -68,10 +68,9 @@ template class MarkLive { void mark(); template - void resolveReloc(InputSectionBase &sec, RelTy &rel, bool fromFDE); + void resolveReloc(InputSectionBase &sec, const RelTy &rel, bool fromFDE); - template - void scanEhFrameSection(EhInputSection &eh, ArrayRef rels); + void scanEhFrameSection(EhInputSection &eh); Ctx &ctx; // The index of the partition that we are currently processing. @@ -115,23 +114,38 @@ static uint64_t getAddend(Ctx &, InputSectionBase &sec, template template void MarkLive::resolveReloc(InputSectionBase &sec, - RelTy &rel, bool fromFDE) { + const RelTy &rel, + bool fromFDE) { // If a symbol is referenced in a live section, it is used. 
- Symbol &sym = sec.file->getRelocTargetSym(rel); - sym.used = true; + Symbol *sym; + if constexpr (std::is_same_v) { + assert(isa(sec)); + sym = rel.sym; + } else { + sym = &sec.file->getRelocTargetSym(rel); + } + sym->used = true; LiveReason reason; - if (TrackWhyLive) - reason = {SecOffset(&sec, rel.r_offset), "referenced by"}; + if (TrackWhyLive) { + if constexpr (std::is_same_v) + reason = {SecOffset(&sec, rel.offset), "referenced by"}; + else + reason = {SecOffset(&sec, rel.r_offset), "referenced by"}; + } - if (auto *d = dyn_cast(&sym)) { + if (auto *d = dyn_cast(sym)) { auto *relSec = dyn_cast_or_null(d->section); if (!relSec) return; uint64_t offset = d->value; - if (d->isSection()) - offset += getAddend(ctx, sec, rel); + if (d->isSection()) { + if constexpr (std::is_same_v) + offset += rel.addend; + else + offset += getAddend(ctx, sec, rel); + } // fromFDE being true means this is referenced by a FDE in a .eh_frame // piece. The relocation points to the described function or to a LSDA. We @@ -141,8 +155,9 @@ void MarkLive::resolveReloc(InputSectionBase &sec, // associated text section is live, the LSDA will be retained due to section // group/SHF_LINK_ORDER rules (b) if the associated text section should be // discarded, marking the LSDA will unnecessarily retain the text section. - if (!(fromFDE && ((relSec->flags & (SHF_EXECINSTR | SHF_LINK_ORDER)) || - relSec->nextInSectionGroup))) { + if (!(fromFDE && std::is_same_v && + ((relSec->flags & (SHF_EXECINSTR | SHF_LINK_ORDER)) || + relSec->nextInSectionGroup))) { Symbol *canonicalSym = d; if (TrackWhyLive && d->isSection()) { // This is expensive, so ideally this would be deferred until it's known @@ -159,15 +174,15 @@ void MarkLive::resolveReloc(InputSectionBase &sec, return; } - if (auto *ss = dyn_cast(&sym)) { + if (auto *ss = dyn_cast(sym)) { if (!ss->isWeak()) { cast(ss->file)->isNeeded = true; if (TrackWhyLive) - whyLive.try_emplace(&sym, reason); + whyLive.try_emplace(sym, reason); } } - for (InputSectionBase *sec : cNamedSections.lookup(sym.getName())) + for (InputSectionBase *sec : cNamedSections.lookup(sym->getName())) enqueue(sec, /*offset=*/0, /*sym=*/nullptr, reason); } @@ -186,9 +201,8 @@ void MarkLive::resolveReloc(InputSectionBase &sec, // the gc pass. With that we would be able to also gc some sections holding // LSDAs and personality functions if we found that they were unused. template -template -void MarkLive::scanEhFrameSection(EhInputSection &eh, - ArrayRef rels) { +void MarkLive::scanEhFrameSection(EhInputSection &eh) { + ArrayRef rels = eh.rels; for (const EhSectionPiece &cie : eh.cies) if (cie.firstRelocation != unsigned(-1)) resolveReloc(eh, rels[cie.firstRelocation], false); @@ -198,7 +212,7 @@ void MarkLive::scanEhFrameSection(EhInputSection &eh, continue; uint64_t pieceEnd = fde.inputOff + fde.size; for (size_t j = firstRelI, end2 = rels.size(); - j < end2 && rels[j].r_offset < pieceEnd; ++j) + j < end2 && rels[j].offset < pieceEnd; ++j) resolveReloc(eh, rels[j], true); } } @@ -360,14 +374,8 @@ void MarkLive::run() { // that point to .eh_frames. Otherwise, the garbage collector would drop // all of them. We also want to preserve personality routines and LSDA // referenced by .eh_frame sections, so we scan them for that here. 
- for (EhInputSection *eh : ctx.ehInputSections) { - const RelsOrRelas rels = - eh->template relsOrRelas(/*supportsCrel=*/false); - if (rels.areRelocsRel()) - scanEhFrameSection(*eh, rels.rels); - else if (rels.relas.size()) - scanEhFrameSection(*eh, rels.relas); - } + for (EhInputSection *eh : ctx.ehInputSections) + scanEhFrameSection(*eh); for (InputSectionBase *sec : ctx.inputSections) { if (sec->flags & SHF_GNU_RETAIN) { enqueue(sec, /*offset=*/0, /*sym=*/nullptr, {std::nullopt, "retained"}); diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 457a794a8c3a8..bbf4b29a9fda5 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -403,12 +403,12 @@ EhFrameSection::EhFrameSection(Ctx &ctx) // Search for an existing CIE record or create a new one. // CIE records from input object files are uniquified by their contents // and where their relocations point to. -template -CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, ArrayRef rels) { +CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, + ArrayRef rels) { Symbol *personality = nullptr; unsigned firstRelI = cie.firstRelocation; if (firstRelI != (unsigned)-1) - personality = &cie.sec->file->getRelocTargetSym(rels[firstRelI]); + personality = rels[firstRelI].sym; // Search for an existing CIE by CIE contents/relocation target pair. CieRecord *&rec = cieMap[{cie.data(), personality}]; @@ -424,25 +424,20 @@ CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, ArrayRef rels) { // There is one FDE per function. Returns a non-null pointer to the function // symbol if the given FDE points to a live function. -template -Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde, ArrayRef rels) { - auto *sec = cast(fde.sec); - unsigned firstRelI = fde.firstRelocation; - +Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde, + ArrayRef rels) { // An FDE should point to some function because FDEs are to describe // functions. That's however not always the case due to an issue of // ld.gold with -r. ld.gold may discard only functions and leave their // corresponding FDEs, which results in creating bad .eh_frame sections. // To deal with that, we ignore such FDEs. + unsigned firstRelI = fde.firstRelocation; if (firstRelI == (unsigned)-1) return nullptr; - const RelTy &rel = rels[firstRelI]; - Symbol &b = sec->file->getRelocTargetSym(rel); - // FDEs for garbage-collected or merged-by-ICF sections, or sections in // another partition, are dead. - if (auto *d = dyn_cast(&b)) + if (auto *d = dyn_cast(rels[firstRelI].sym)) if (!d->folded && d->section && d->section->partition == partition) return d; return nullptr; @@ -452,13 +447,13 @@ Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde, ArrayRef rels) { // is one CIE record per input object file which is followed by // a list of FDEs. This function searches an existing CIE or create a new // one and associates FDEs to the CIE. 
-template -void EhFrameSection::addRecords(EhInputSection *sec, ArrayRef rels) { +template void EhFrameSection::addRecords(EhInputSection *sec) { + auto rels = sec->rels; offsetToCie.clear(); for (EhSectionPiece &cie : sec->cies) - offsetToCie[cie.inputOff] = addCie(cie, rels); + offsetToCie[cie.inputOff] = addCie(cie, rels); for (EhSectionPiece &fde : sec->fdes) { - uint32_t id = endian::read32(fde.data().data() + 4); + uint32_t id = endian::read32(fde.data().data() + 4); CieRecord *rec = offsetToCie[fde.inputOff + 4 - id]; if (!rec) Fatal(ctx) << sec << ": invalid CIE reference"; @@ -470,23 +465,11 @@ void EhFrameSection::addRecords(EhInputSection *sec, ArrayRef rels) { } } -template -void EhFrameSection::addSectionAux(EhInputSection *sec) { - if (!sec->isLive()) - return; - const RelsOrRelas rels = - sec->template relsOrRelas(/*supportsCrel=*/false); - if (rels.areRelocsRel()) - addRecords(sec, rels.rels); - else - addRecords(sec, rels.relas); -} - // Used by ICF::handleLSDA(). This function is very similar to // EhFrameSection::addRecords(). -template +template void EhFrameSection::iterateFDEWithLSDAAux( - EhInputSection &sec, ArrayRef rels, DenseSet &ciesWithLSDA, + EhInputSection &sec, DenseSet &ciesWithLSDA, llvm::function_ref fn) { for (EhSectionPiece &cie : sec.cies) if (hasLSDA(cie)) @@ -497,7 +480,7 @@ void EhFrameSection::iterateFDEWithLSDAAux( continue; // The CIE has a LSDA argument. Call fn with d's section. - if (Defined *d = isFdeLive(fde, rels)) + if (Defined *d = isFdeLive(fde, sec.rels)) if (auto *s = dyn_cast_or_null(d->section)) fn(*s); } @@ -509,12 +492,7 @@ void EhFrameSection::iterateFDEWithLSDA( DenseSet ciesWithLSDA; for (EhInputSection *sec : sections) { ciesWithLSDA.clear(); - const RelsOrRelas rels = - sec->template relsOrRelas(/*supportsCrel=*/false); - if (rels.areRelocsRel()) - iterateFDEWithLSDAAux(*sec, rels.rels, ciesWithLSDA, fn); - else - iterateFDEWithLSDAAux(*sec, rels.relas, ciesWithLSDA, fn); + iterateFDEWithLSDAAux(*sec, ciesWithLSDA, fn); } } @@ -531,20 +509,16 @@ void EhFrameSection::finalizeContents() { case ELFNoneKind: llvm_unreachable("invalid ekind"); case ELF32LEKind: - for (EhInputSection *sec : sections) - addSectionAux(sec); - break; - case ELF32BEKind: - for (EhInputSection *sec : sections) - addSectionAux(sec); - break; case ELF64LEKind: for (EhInputSection *sec : sections) - addSectionAux(sec); + if (sec->isLive()) + addRecords(sec); break; + case ELF32BEKind: case ELF64BEKind: for (EhInputSection *sec : sections) - addSectionAux(sec); + if (sec->isLive()) + addRecords(sec); break; } diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 55a10716c054b..ac3ec63f0a7a5 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -80,19 +80,14 @@ class EhFrameSection final : public SyntheticSection { uint64_t size = 0; - template - void addRecords(EhInputSection *s, llvm::ArrayRef rels); - template void addSectionAux(EhInputSection *s); - template - void iterateFDEWithLSDAAux(EhInputSection &sec, ArrayRef rels, + template void addRecords(EhInputSection *s); + template + void iterateFDEWithLSDAAux(EhInputSection &sec, llvm::DenseSet &ciesWithLSDA, llvm::function_ref fn); - template - CieRecord *addCie(EhSectionPiece &piece, ArrayRef rels); - - template - Defined *isFdeLive(EhSectionPiece &piece, ArrayRef rels); + CieRecord *addCie(EhSectionPiece &piece, ArrayRef rels); + Defined *isFdeLive(EhSectionPiece &piece, ArrayRef rels); uint64_t getFdePc(uint8_t *buf, size_t off, uint8_t enc) const; From 
30b0215519428ef264d010c60e36dbfea2ab107c Mon Sep 17 00:00:00 2001
From: Ryan Mansfield
Date: Mon, 29 Sep 2025 12:32:28 -0400
Subject: [PATCH 147/878] [llvm-size] Add --exclude-pagezero option for Mach-O to exclude __PAGEZERO size. (#159574)

When `--exclude-pagezero` is used, do not include the ``__PAGEZERO``
segment when calculating size information for Mach-O files. The
``__PAGEZERO`` segment is a virtual memory region used for memory
protection that does not contribute to actual size, and excluding it
can provide a better representation of actual size.

Fixes #86644
---
 llvm/docs/CommandGuide/llvm-size.rst          |   7 ++
 llvm/test/tools/llvm-size/macho-pagezero.test | 108 ++++++++++++++++++
 llvm/tools/llvm-size/Opts.td                  |   3 +
 llvm/tools/llvm-size/llvm-size.cpp            |   6 +-
 4 files changed, 122 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/tools/llvm-size/macho-pagezero.test

diff --git a/llvm/docs/CommandGuide/llvm-size.rst b/llvm/docs/CommandGuide/llvm-size.rst
index f244769545b31..12e7c58c5776d 100644
--- a/llvm/docs/CommandGuide/llvm-size.rst
+++ b/llvm/docs/CommandGuide/llvm-size.rst
@@ -41,6 +41,13 @@ OPTIONS
 as a separate section entry for ``sysv`` output. If not specified, these
 symbols are ignored.
 
+.. option:: --exclude-pagezero
+
+ Do not include the ``__PAGEZERO`` segment when calculating size information
+ for Mach-O files. The ``__PAGEZERO`` segment is a virtual memory region used
+ for memory protection that does not contribute to actual size, and excluding
+ it can provide a better representation of actual size.
+
 .. option:: -d
 
  Equivalent to :option:`--radix` with a value of ``10``.

diff --git a/llvm/test/tools/llvm-size/macho-pagezero.test b/llvm/test/tools/llvm-size/macho-pagezero.test
new file mode 100644
index 0000000000000..db69fd0c9daeb
--- /dev/null
+++ b/llvm/test/tools/llvm-size/macho-pagezero.test
@@ -0,0 +1,108 @@
+## Test the --exclude-pagezero option to skip the __PAGEZERO segment in Mach-O files.
+ +# RUN: yaml2obj %s --docnum=1 -o %t-pagezero.o +# RUN: llvm-size %t-pagezero.o | \ +# RUN: FileCheck %s --check-prefix=NORMAL --match-full-lines +# RUN: llvm-size --exclude-pagezero %t-pagezero.o | \ +# RUN: FileCheck %s --check-prefix=SKIP --match-full-lines + +# RUN: yaml2obj %s --docnum=2 -o %t-pagezero32.o +# RUN: llvm-size %t-pagezero32.o | \ +# RUN: FileCheck %s --check-prefix=NORMAL --match-full-lines +# RUN: llvm-size --exclude-pagezero %t-pagezero32.o | \ +# RUN: FileCheck %s --check-prefix=SKIP --match-full-lines + +# NORMAL:__TEXT __DATA __OBJC others dec hex +# NORMAL-NEXT:20 100 0 4096 4216 1078 + +# SKIP:__TEXT __DATA __OBJC others dec hex +# SKIP-NEXT:20 100 0 0 120 78 + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 216 + flags: 0x2000 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __PAGEZERO + vmaddr: 0x0 + vmsize: 4096 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __TEXT + vmaddr: 0x100000000 + vmsize: 20 + fileoff: 248 + filesize: 20 + maxprot: 7 + initprot: 5 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __DATA + vmaddr: 0x100001000 + vmsize: 100 + fileoff: 268 + filesize: 100 + maxprot: 7 + initprot: 3 + nsects: 0 + flags: 0 + +--- !mach-o +FileHeader: + magic: 0xFEEDFACE + cputype: 0x7 + cpusubtype: 0x3 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 168 + flags: 0x2000 +LoadCommands: + - cmd: LC_SEGMENT + cmdsize: 56 + segname: __PAGEZERO + vmaddr: 0x0 + vmsize: 4096 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT + cmdsize: 56 + segname: __TEXT + vmaddr: 0x1000 + vmsize: 20 + fileoff: 196 + filesize: 20 + maxprot: 7 + initprot: 5 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT + cmdsize: 56 + segname: __DATA + vmaddr: 0x2000 + vmsize: 100 + fileoff: 216 + filesize: 100 + maxprot: 7 + initprot: 3 + nsects: 0 + flags: 0 diff --git a/llvm/tools/llvm-size/Opts.td b/llvm/tools/llvm-size/Opts.td index edae43f1abd24..88e39f293a505 100644 --- a/llvm/tools/llvm-size/Opts.td +++ b/llvm/tools/llvm-size/Opts.td @@ -21,6 +21,9 @@ def grp_mach_o : OptionGroup<"kind">, HelpText<"OPTIONS (Mach-O specific)">; def arch_EQ : Joined<["--"], "arch=">, HelpText<"architecture(s) from a Mach-O file to dump">, Group; def : Separate<["--", "-"], "arch">, Alias; def l : F<"l", "When format is darwin, use long format to include addresses and offsets">, Group; +def exclude_pagezero + : FF<"exclude-pagezero", "Do not include __PAGEZERO segment in totals">, + Group; def : F<"A", "Alias for --format">, Alias, AliasArgs<["sysv"]>; def : F<"B", "Alias for --format">, Alias, AliasArgs<["berkeley"]>; diff --git a/llvm/tools/llvm-size/llvm-size.cpp b/llvm/tools/llvm-size/llvm-size.cpp index acc7843ffac8b..ec94db4ff7382 100644 --- a/llvm/tools/llvm-size/llvm-size.cpp +++ b/llvm/tools/llvm-size/llvm-size.cpp @@ -79,6 +79,7 @@ static bool DarwinLongFormat; static RadixTy Radix = RadixTy::decimal; static bool TotalSizes; static bool HasMachOFiles = false; +static bool ExcludePageZero = false; static std::vector InputFilenames; @@ -313,7 +314,7 @@ static void printDarwinSegmentSizes(MachOObjectFile *MachO) { total_data += Seg.vmsize; else if (SegmentName == "__OBJC") total_objc += Seg.vmsize; - else + else if (!ExcludePageZero || SegmentName != "__PAGEZERO") total_others += Seg.vmsize; } } else if (Load.C.cmd == MachO::LC_SEGMENT) { @@ -339,7 +340,7 @@ 
static void printDarwinSegmentSizes(MachOObjectFile *MachO) { total_data += Seg.vmsize; else if (SegmentName == "__OBJC") total_objc += Seg.vmsize; - else + else if (!ExcludePageZero || SegmentName != "__PAGEZERO") total_others += Seg.vmsize; } } @@ -914,6 +915,7 @@ int llvm_size_main(int argc, char **argv, const llvm::ToolContext &) { ELFCommons = Args.hasArg(OPT_common); DarwinLongFormat = Args.hasArg(OPT_l); + ExcludePageZero = Args.hasArg(OPT_exclude_pagezero); TotalSizes = Args.hasArg(OPT_totals); StringRef V = Args.getLastArgValue(OPT_format_EQ, "berkeley"); if (V == "berkeley") From 2dd743187655261815a95477d3956051e7cf5b04 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Mon, 29 Sep 2025 09:38:49 -0700 Subject: [PATCH 148/878] [clang] Use the VFS in `ModuleDependencyCollector` (#160944) This PR starts using the correct VFS in `ModuleDependencyCollector` instead of using the real FS directly. This matches compiler's behavior for other input files. --- clang/lib/Frontend/ModuleDependencyCollector.cpp | 11 ++++++----- llvm/include/llvm/Support/FileCollector.h | 3 +++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/clang/lib/Frontend/ModuleDependencyCollector.cpp b/clang/lib/Frontend/ModuleDependencyCollector.cpp index 3b363f948a3a8..ff37065885289 100644 --- a/clang/lib/Frontend/ModuleDependencyCollector.cpp +++ b/clang/lib/Frontend/ModuleDependencyCollector.cpp @@ -91,10 +91,10 @@ void ModuleDependencyCollector::attachToPreprocessor(Preprocessor &PP) { std::make_unique(*this)); } -static bool isCaseSensitivePath(StringRef Path) { +static bool isCaseSensitivePath(llvm::vfs::FileSystem &VFS, StringRef Path) { SmallString<256> TmpDest = Path, UpperDest, RealDest; // Remove component traversals, links, etc. - if (llvm::sys::fs::real_path(Path, TmpDest)) + if (VFS.getRealPath(Path, TmpDest)) return true; // Current default value in vfs.yaml Path = TmpDest; @@ -104,7 +104,7 @@ static bool isCaseSensitivePath(StringRef Path) { // already expects when sensitivity isn't setup. for (auto &C : Path) UpperDest.push_back(toUppercase(C)); - if (!llvm::sys::fs::real_path(UpperDest, RealDest) && Path == RealDest) + if (!VFS.getRealPath(UpperDest, RealDest) && Path == RealDest) return false; return true; } @@ -121,7 +121,8 @@ void ModuleDependencyCollector::writeFileMap() { // Explicitly set case sensitivity for the YAML writer. For that, find out // the sensitivity at the path where the headers all collected to. - VFSWriter.setCaseSensitivity(isCaseSensitivePath(VFSDir)); + VFSWriter.setCaseSensitivity( + isCaseSensitivePath(Canonicalizer.getFileSystem(), VFSDir)); // Do not rely on real path names when executing the crash reproducer scripts // since we only want to actually use the files we have on the VFS cache. @@ -153,7 +154,7 @@ std::error_code ModuleDependencyCollector::copyToRoot(StringRef Src, } else { // When collecting entries from input vfsoverlays, copy the external // contents into the cache but still map from the source. - if (!fs::exists(Dst)) + if (!Canonicalizer.getFileSystem().exists(Dst)) return std::error_code(); path::append(CacheDst, Dst); Paths.CopyFrom = Dst; diff --git a/llvm/include/llvm/Support/FileCollector.h b/llvm/include/llvm/Support/FileCollector.h index 9cc6776b948ba..9fa11ba362241 100644 --- a/llvm/include/llvm/Support/FileCollector.h +++ b/llvm/include/llvm/Support/FileCollector.h @@ -81,6 +81,9 @@ class LLVM_ABI FileCollector : public FileCollectorBase { /// Canonicalize a pair of virtual and real paths. 
    LLVM_ABI PathStorage canonicalize(StringRef SrcPath);
 
+    /// Return the underlying file system.
+    vfs::FileSystem &getFileSystem() const { return *VFS; };
+
     explicit PathCanonicalizer(IntrusiveRefCntPtr<vfs::FileSystem> VFS)
         : VFS(std::move(VFS)) {}
 
From 3e54505b439923a34fe5cbf27743d883fb62ad9f Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Mon, 29 Sep 2025 09:43:52 -0700
Subject: [PATCH 149/878] [ADT] Fix a bug in EquivalenceClasses::erase
 (#161121)

This patch fixes a bug in EquivalenceClasses::erase, where we lose a
leader bit in a certain scenario.

Here is some background. In EquivalenceClasses, each equivalence class
is maintained as a singly linked list over its members. When we join
two classes, we concatenate the two singly linked lists.

To support path compression, each member points to the leader (through
lazy updates). This is implemented with the two members:

  class ECValue {
    mutable const ECValue *Leader, *Next;
    :
  };

Each member stores its leader in Leader and its sibling in Next. Now,
the leader uses the Leader field to point to the last element of the
singly linked list to accommodate the list concatenation. We use the
LSB of the Next field to indicate whether a given member is a leader
or not.

Now, imagine we have an equivalence class:

  Elem 1 -> Elem 2 -> nullptr
  Leader

and wish to remove Elem 2. We would like to end up with:

  Elem 1 -> nullptr
  Leader

but we mistakenly drop the leader bit when we update the Next field of
Elem 1 with:

  Pre->Next = nullptr;

This makes Elem 1 the end of the singly linked list, as intended, but
mistakenly clears its leader bit stored in the LSB of Next, so we end
up with an equivalence class with no leader.

This patch fixes the problem by preserving the leader bit:

  Pre->Next = reinterpret_cast<const ECValue *>(
      static_cast<intptr_t>(Pre->isLeader()));

The unit test closely follows the scenario above.
---
 llvm/include/llvm/ADT/EquivalenceClasses.h    |  8 ++++---
 llvm/unittests/ADT/EquivalenceClassesTest.cpp | 23 +++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h
index 7df0d1557af87..90d8948734729 100644
--- a/llvm/include/llvm/ADT/EquivalenceClasses.h
+++ b/llvm/include/llvm/ADT/EquivalenceClasses.h
@@ -256,9 +256,11 @@ template <class ElemTy> class EquivalenceClasses {
       }
       if (!Next) {
         // If the current element is the last element(not leader), set the
-        // successor of the current element's predecessor to null, and set
-        // the 'Leader' field of the class leader to the predecessor element.
-        Pre->Next = nullptr;
+        // successor of the current element's predecessor to null while
+        // preserving the leader bit, and set the 'Leader' field of the class
+        // leader to the predecessor element.
+        Pre->Next = reinterpret_cast<const ECValue *>(
+            static_cast<intptr_t>(Pre->isLeader()));
         Leader->Leader = Pre;
       } else {
         // If the current element is in the middle of class, then simply
diff --git a/llvm/unittests/ADT/EquivalenceClassesTest.cpp b/llvm/unittests/ADT/EquivalenceClassesTest.cpp
index 3d5c48eb8e1b6..8172ff97e5169 100644
--- a/llvm/unittests/ADT/EquivalenceClassesTest.cpp
+++ b/llvm/unittests/ADT/EquivalenceClassesTest.cpp
@@ -108,6 +108,29 @@ TEST(EquivalenceClassesTest, SimpleErase4) {
   EXPECT_FALSE(EqClasses.erase(1));
 }
 
+TEST(EquivalenceClassesTest, EraseKeepsLeaderBit) {
+  EquivalenceClasses<int> EC;
+
+  // Create a set {1, 2} where 1 is the leader.
+  EC.unionSets(1, 2);
+
+  // Verify initial state.
+  EXPECT_EQ(EC.getLeaderValue(2), 1);
+
+  // Erase 2, the non-leader member.
+ EXPECT_TRUE(EC.erase(2)); + + // Verify that we have exactly one equivalence class. + ASSERT_NE(EC.begin(), EC.end()); + ASSERT_EQ(std::next(EC.begin()), EC.end()); + + // Verify that 1 is still a leader after erasing 2. + const auto *Elem = *EC.begin(); + ASSERT_NE(Elem, nullptr); + EXPECT_EQ(Elem->getData(), 1); + EXPECT_TRUE(Elem->isLeader()) << "The leader bit was lost!"; +} + TEST(EquivalenceClassesTest, TwoSets) { EquivalenceClasses EqClasses; // Form sets of odd and even numbers, check that we split them into these From b51b967671884b6d308c5f7b659a8f459517f541 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 29 Sep 2025 17:46:21 +0100 Subject: [PATCH 150/878] [LV] Add test for more precise no-free checks. --- ...able-info-from-assumption-constant-size.ll | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index b3338f475ca1d..8e9cb23c4b4e3 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1519,6 +1519,196 @@ exit: declare ptr @get_ptr() declare void @may_free() +define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nosync { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_nofree_via_context( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP13:%.*]] = 
phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP13]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @deref_assumption_in_header_constant_trip_count_may_free(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nosync { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_may_free( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: call void @may_free() +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr 
[[A]], <2 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP1]], i64 4), "dereferenceable"(ptr [[TMP1]], i64 4) ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1 +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP2]], i64 4), "dereferenceable"(ptr [[TMP2]], i64 4) ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP8]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP14]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] +; CHECK: [[LOOP_THEN]]: +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] +; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 +; 
CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
+  call void @may_free()
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.a = getelementptr i32, ptr %a, i64 %iv
+  call void @llvm.assume(i1 true) [ "align"(ptr %gep.a, i64 4), "dereferenceable"(ptr %gep.a, i64 4) ]
+  %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv
+  %l.b = load i32, ptr %gep.b, align 4
+  %c.1 = icmp sge i32 %l.b, 0
+  br i1 %c.1, label %loop.latch, label %loop.then
+
+loop.then:
+  %l.a = load i32, ptr %gep.a, align 4
+  br label %loop.latch
+
+loop.latch:
+  %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ]
+  %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv
+  store i32 %merge, ptr %gep.c, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -1540,4 +1730,6 @@ declare void @may_free()
 ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
+; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]}
 ;.

From 44f392e999dcf6718d7dceaa7ccb39306b1c1feb Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Mon, 29 Sep 2025 11:55:44 -0500
Subject: [PATCH 151/878] [OpenMP] Fix 'libc' configuration when building OpenMP

Summary:
Forgot to port this option's old handling from offload. It's now way
easier since they're built in the same CMake project. Also delete the
leftover directory that's not used anymore; I don't know how that was
still there.
---
 offload/DeviceRTL/CMakeLists.txt | 188 ------------------------------
 openmp/device/CMakeLists.txt     |   2 +-
 2 files changed, 1 insertion(+), 189 deletions(-)
 delete mode 100644 offload/DeviceRTL/CMakeLists.txt

diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt
deleted file mode 100644
index e4916f4d49755..0000000000000
--- a/offload/DeviceRTL/CMakeLists.txt
+++ /dev/null
@@ -1,188 +0,0 @@
-set(LIBOMPTARGET_BUILD_DEVICERTL_BCLIB TRUE CACHE BOOL
-  "Can be set to false to disable building this library.")
-
-if (NOT LIBOMPTARGET_BUILD_DEVICERTL_BCLIB)
-  message(STATUS "Not building DeviceRTL: Disabled by LIBOMPTARGET_BUILD_DEVICERTL_BCLIB")
-  return()
-endif()
-
-# Check to ensure the host system is a supported host architecture.
-if(NOT ${CMAKE_SIZEOF_VOID_P} EQUAL "8")
-  message(STATUS "Not building DeviceRTL: Runtime does not support 32-bit hosts")
-  return()
-endif()
-
-if (LLVM_DIR)
-  # Builds that use pre-installed LLVM have LLVM_DIR set.
-  # A standalone or LLVM_ENABLE_RUNTIMES=openmp build takes this route
-  find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
-  # LLVM in-tree builds may use CMake target names to discover the tools.
- # A LLVM_ENABLE_PROJECTS=openmp build takes this route - set(CLANG_TOOL $) -else() - message(STATUS "Not building DeviceRTL. No appropriate clang found") - return() -endif() - -set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) -set(include_directory ${devicertl_base_directory}/include) -set(source_directory ${devicertl_base_directory}/src) - -set(include_files - ${include_directory}/Allocator.h - ${include_directory}/Configuration.h - ${include_directory}/Debug.h - ${include_directory}/Interface.h - ${include_directory}/LibC.h - ${include_directory}/Mapping.h - ${include_directory}/Profiling.h - ${include_directory}/State.h - ${include_directory}/Synchronization.h - ${include_directory}/DeviceTypes.h - ${include_directory}/DeviceUtils.h - ${include_directory}/Workshare.h -) - -set(src_files - ${source_directory}/Allocator.cpp - ${source_directory}/Configuration.cpp - ${source_directory}/Debug.cpp - ${source_directory}/Kernel.cpp - ${source_directory}/LibC.cpp - ${source_directory}/Mapping.cpp - ${source_directory}/Misc.cpp - ${source_directory}/Parallelism.cpp - ${source_directory}/Profiling.cpp - ${source_directory}/Reduction.cpp - ${source_directory}/State.cpp - ${source_directory}/Synchronization.cpp - ${source_directory}/Tasking.cpp - ${source_directory}/DeviceUtils.cpp - ${source_directory}/Workshare.cpp -) - -# We disable the slp vectorizer during the runtime optimization to avoid -# vectorized accesses to the shared state. Generally, those are "good" but -# the optimizer pipeline (esp. Attributor) does not fully support vectorized -# instructions yet and we end up missing out on way more important constant -# propagation. That said, we will run the vectorizer again after the runtime -# has been linked into the user program. -set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false ) - -# If the user built with the GPU C library enabled we will use that instead. -if(${LIBOMPTARGET_GPU_LIBC_SUPPORT}) - list(APPEND clang_opt_flags -DOMPTARGET_HAS_LIBC) -endif() - -# Set flags for LLVM Bitcode compilation. -set(bc_flags -c -flto -std=c++17 -fvisibility=hidden - ${clang_opt_flags} -nogpulib -nostdlibinc - -fno-rtti -fno-exceptions -fconvergent-functions - -Wno-unknown-cuda-version - -DOMPTARGET_DEVICE_RUNTIME - -I${include_directory} - -I${devicertl_base_directory}/../include - -I${devicertl_base_directory}/../../libc -) - -# first create an object target -function(compileDeviceRTLLibrary target_name target_triple) - set(target_bc_flags ${ARGN}) - - foreach(src ${src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - set(outfile "${outfile}-${target_name}.o") - set(depfile "${outfile}.d") - - # Passing an empty CPU to -march= suppressed target specific metadata. - add_custom_command(OUTPUT ${outfile} - COMMAND ${CLANG_TOOL} - ${bc_flags} - --target=${target_triple} - ${target_bc_flags} - -MD -MF ${depfile} - ${infile} -o ${outfile} - DEPENDS ${infile} - DEPFILE ${depfile} - COMMENT "Building LLVM bitcode ${outfile}" - VERBATIM - ) - if(TARGET clang) - # Add a file-level dependency to ensure that clang is up-to-date. - # By default, add_custom_command only builds clang if the - # executable is missing. 
- add_custom_command(OUTPUT ${outfile} - DEPENDS clang - APPEND - ) - endif() - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) - - list(APPEND obj_files ${CMAKE_CURRENT_BINARY_DIR}/${outfile}) - endforeach() - # Trick to combine these into a bitcode file via the linker's LTO pass. This - # is used to provide the legacy `libomptarget-.bc` files. Hack this - # through as an executable to get it to use the relocatable link. - add_executable(libomptarget-${target_name} ${obj_files}) - set_target_properties(libomptarget-${target_name} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR} - LINKER_LANGUAGE CXX - BUILD_RPATH "" - INSTALL_RPATH "" - RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc) - target_compile_options(libomptarget-${target_name} PRIVATE - "--target=${target_triple}" "-fuse-ld=lld" "-march=" "-mcpu=" - "-Wno-unused-command-line-argument") - target_link_options(libomptarget-${target_name} PRIVATE "--target=${target_triple}" - "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm" - "-fuse-ld=lld" "-march=" "-mcpu=") - install(TARGETS libomptarget-${target_name} - PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ - DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}") - - add_library(omptarget.${target_name}.all_objs OBJECT IMPORTED) - set_property(TARGET omptarget.${target_name}.all_objs APPEND PROPERTY IMPORTED_OBJECTS - ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/libomptarget-${target_name}.bc) - add_dependencies(omptarget.${target_name}.all_objs libomptarget-${target_name}) - - # Archive all the object files generated above into a static library - add_library(omptarget.${target_name} STATIC) - set_target_properties(omptarget.${target_name} PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/${target_triple}" - ARCHIVE_OUTPUT_NAME ompdevice - LINKER_LANGUAGE CXX - ) - target_link_libraries(omptarget.${target_name} PRIVATE omptarget.${target_name}.all_objs) - target_link_options(omptarget.${target_name} PRIVATE "--target=${target_triple}" - "-Wno-unused-command-line-argument" "-r" "-nostdlib" "-flto" - "-Wl,--lto-emit-llvm" "-fuse-ld=lld" "-march=" "-mcpu=") - - install(TARGETS omptarget.${target_name} - ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}") - - if (CMAKE_EXPORT_COMPILE_COMMANDS) - set(ide_target_name omptarget-ide-${target_name}) - add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files}) - target_compile_options(${ide_target_name} PRIVATE - -fvisibility=hidden --target=${target_triple} - -nogpulib -nostdlibinc -Wno-unknown-cuda-version - ) - target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512) - target_include_directories(${ide_target_name} PRIVATE - ${include_directory} - ${devicertl_base_directory}/../../libc - ${devicertl_base_directory}/../include - ) - install(TARGETS ${ide_target_name} EXCLUDE_FROM_ALL) - endif() -endfunction() - -if(NOT LLVM_TARGETS_TO_BUILD OR "AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) - compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none) -endif() - -if(NOT LLVM_TARGETS_TO_BUILD OR "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) - compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63) -endif() diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt index ded961adcce1f..54cfdfef440a5 100644 --- a/openmp/device/CMakeLists.txt +++ b/openmp/device/CMakeLists.txt @@ -64,7 +64,7 @@ set_target_properties(libompdevice PROPERTIES RUNTIME_OUTPUT_NAME 
libomptarget-${target_name}.bc) # If the user built with the GPU C library enabled we will use that instead. -if(LIBOMPTARGET_GPU_LIBC_SUPPORT) +if(TARGET libc) target_compile_definitions(libompdevice PRIVATE OMPTARGET_HAS_LIBC) endif() target_compile_definitions(libompdevice PRIVATE SHARED_SCRATCHPAD_SIZE=512) From 47b8bc46989b99dc3f68ee5e3ab1d9887bbbd076 Mon Sep 17 00:00:00 2001 From: Chaitanya Koparkar Date: Mon, 29 Sep 2025 12:59:58 -0400 Subject: [PATCH 152/878] [Headers][X86] Allow AVX1 fixed extraction intrinsics to be used in constexpr (#161218) Fixes #161204. --- clang/lib/Headers/avxintrin.h | 23 ++++++++++------------- clang/test/CodeGen/X86/avx-builtins.c | 3 +++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h index a7f70994be9db..d6ba19a6c78af 100644 --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -2311,10 +2311,9 @@ _mm256_cvttps_epi32(__m256 __a) /// \param __a /// A 256-bit vector of [4 x double]. /// \returns A 64 bit double containing the first element of the input vector. -static __inline double __DEFAULT_FN_ATTRS -_mm256_cvtsd_f64(__m256d __a) -{ - return __a[0]; +static __inline double __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_cvtsd_f64(__m256d __a) { + return __a[0]; } /// Returns the first element of the input vector of [8 x i32]. @@ -2327,11 +2326,10 @@ _mm256_cvtsd_f64(__m256d __a) /// \param __a /// A 256-bit vector of [8 x i32]. /// \returns A 32 bit integer containing the first element of the input vector. -static __inline int __DEFAULT_FN_ATTRS -_mm256_cvtsi256_si32(__m256i __a) -{ - __v8si __b = (__v8si)__a; - return __b[0]; +static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_cvtsi256_si32(__m256i __a) { + __v8si __b = (__v8si)__a; + return __b[0]; } /// Returns the first element of the input vector of [8 x float]. @@ -2344,10 +2342,9 @@ _mm256_cvtsi256_si32(__m256i __a) /// \param __a /// A 256-bit vector of [8 x float]. /// \returns A 32 bit float containing the first element of the input vector. 
-static __inline float __DEFAULT_FN_ATTRS -_mm256_cvtss_f32(__m256 __a) -{ - return __a[0]; +static __inline float __DEFAULT_FN_ATTRS_CONSTEXPR +_mm256_cvtss_f32(__m256 __a) { + return __a[0]; } /* Vector replicate */ diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 347cd9ee6a667..3018bb9719b89 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -985,18 +985,21 @@ double test_mm256_cvtsd_f64(__m256d __a) { // CHECK: extractelement <4 x double> %{{.*}}, i32 0 return _mm256_cvtsd_f64(__a); } +TEST_CONSTEXPR(_mm256_cvtsd_f64((__m256d){8.0, 7.0, 6.0, 5.0}) == 8.0); int test_mm256_cvtsi256_si32(__m256i __a) { // CHECK-LABEL: test_mm256_cvtsi256_si32 // CHECK: extractelement <8 x i32> %{{.*}}, i32 0 return _mm256_cvtsi256_si32(__a); } +TEST_CONSTEXPR(_mm256_cvtsi256_si32((__m256i)(__v8si){8, 7, 6, 5, 4, 3, 2, 1}) == 8); float test_mm256_cvtss_f32(__m256 __a) { // CHECK-LABEL: test_mm256_cvtss_f32 // CHECK: extractelement <8 x float> %{{.*}}, i32 0 return _mm256_cvtss_f32(__a); } +TEST_CONSTEXPR(_mm256_cvtss_f32((__m256){8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}) == 8.0f); __m128i test_mm256_cvttpd_epi32(__m256d A) { // CHECK-LABEL: test_mm256_cvttpd_epi32 From 0fc972d242f732766b2dc0590d9f241f47427578 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Mon, 29 Sep 2025 10:01:10 -0700 Subject: [PATCH 153/878] [llvm] Use the underlying VFS when constructing `RedirectingFileSystem` (#160942) When the root node of the `RedirectingFileSystem` is to be resolved to the current working directory, we previously consulted the real FS instead of the provided underlying VFS. This PR fixes that issue. --- llvm/lib/Support/VirtualFileSystem.cpp | 2 +- .../Support/VirtualFileSystemTest.cpp | 32 ++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index cf784595c2f1c..7ff62d43ba205 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -1973,7 +1973,7 @@ class llvm::vfs::RedirectingFileSystemParser { EC = FS->makeAbsolute(FullPath, Name); Name = canonicalize(Name); } else { - EC = sys::fs::make_absolute(Name); + EC = FS->makeAbsolute(Name); } if (EC) { assert(NameValueNode && "Name presence should be checked earlier"); diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp index 6228de8aa897a..d47a4ee986778 100644 --- a/llvm/unittests/Support/VirtualFileSystemTest.cpp +++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp @@ -1941,7 +1941,7 @@ TEST_F(VFSFromYAMLTest, ReturnsExternalPathVFSHit) { EXPECT_EQ(0, NumDiagnostics); } -TEST_F(VFSFromYAMLTest, RootRelativeTest) { +TEST_F(VFSFromYAMLTest, RootRelativeToOverlayDirTest) { auto Lower = makeIntrusiveRefCnt(); Lower->addDirectory("//root/foo/bar"); Lower->addRegularFile("//root/foo/bar/a"); @@ -2004,6 +2004,35 @@ TEST_F(VFSFromYAMLTest, RootRelativeTest) { #endif } +TEST_F(VFSFromYAMLTest, RootRelativeToCWDTest) { + auto Lower = makeIntrusiveRefCnt(); + Lower->addDirectory("//root/foo/bar"); + Lower->addRegularFile("//root/foo/bar/a"); + Lower->addDirectory("//root/foo/bar/cwd"); + Lower->addRegularFile("//root/foo/bar/cwd/a"); + Lower->setCurrentWorkingDirectory("//root/foo/bar/cwd"); + IntrusiveRefCntPtr FS = + getFromYAMLString("{\n" + " 'case-sensitive': false,\n" + " 'root-relative': 'cwd',\n" + " 'roots': [\n" + " { 'name': 'b', 'type': 'file',\n" + 
" 'external-contents': '//root/foo/bar/a'\n" + " }\n" + " ]\n" + "}", + Lower, "//root/foo/bar/overlay"); + + ASSERT_NE(FS.get(), nullptr); + + ErrorOr S1 = FS->status("//root/foo/bar/b"); + ASSERT_TRUE(S1.getError()); + + ErrorOr S2 = FS->status("//root/foo/bar/cwd/b"); + ASSERT_FALSE(S2.getError()); + EXPECT_EQ("//root/foo/bar/a", S2->getName()); +} + TEST_F(VFSFromYAMLTest, ReturnsInternalPathVFSHit) { auto BaseFS = makeIntrusiveRefCnt(); BaseFS->addFile("//root/foo/realname", 0, @@ -2489,6 +2518,7 @@ TEST_F(VFSFromYAMLTest, RelativePaths) { SmallString<128> CWD; EC = llvm::sys::fs::current_path(CWD); ASSERT_FALSE(EC); + Lower->setCurrentWorkingDirectory(CWD); // Filename at root level without a parent directory. IntrusiveRefCntPtr FS = getFromYAMLString( From e5d925faa931174b8e415448cc80e8b4b3e55621 Mon Sep 17 00:00:00 2001 From: SunilKuravinakop <98882378+SunilKuravinakop@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:35:30 +0530 Subject: [PATCH 154/878] [clang][OpenMP] Support for reduction clause with array elements as modifier (#160846) Changes to support for array elements in reduction clause e.g. "reduction (+:a[1])" --------- Co-authored-by: Sunil Kuravinakop --- clang/lib/Sema/SemaOpenMP.cpp | 3 +- clang/test/OpenMP/for_reduction_codegen.cpp | 32 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index f5feed6206494..0fa21e89b1236 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -2490,7 +2490,8 @@ VarDecl *SemaOpenMP::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, DSAStackTy::DSAVarData DVarTop = DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode()); if (DVarTop.CKind != OMPC_unknown && isOpenMPPrivate(DVarTop.CKind) && - (!VD || VD->hasLocalStorage() || !DVarTop.AppliedToPointee)) + (!VD || VD->hasLocalStorage() || + !(DVarTop.AppliedToPointee && DVarTop.CKind != OMPC_reduction))) return VD ? VD : cast(DVarTop.PrivateCopy->getDecl()); // Threadprivate variables must not be captured. if (isOpenMPThreadPrivate(DVarTop.CKind)) diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp index 83632db238484..cb4bcc99c3ff3 100644 --- a/clang/test/OpenMP/for_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_codegen.cpp @@ -27,7 +27,6 @@ struct S { ~S() {} }; - template T tmain() { T t; @@ -60,6 +59,15 @@ T tmain() { } extern S **foo(); +int g_arr[10]; + +void reductionArrayElement() { +#pragma omp parallel +#pragma omp for reduction(+:g_arr[1]) + for (int i = 0; i < 10; i++) { + g_arr[1] += i; + } +} int main() { #ifdef LAMBDA @@ -164,6 +172,7 @@ int main() { #pragma omp for reduction(& : var3) for (int i = 0; i < 10; ++i) ; + reductionArrayElement(); return tmain(); #endif } @@ -535,6 +544,26 @@ int main() { //. // CHECK4: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8 //. 
+ +// CHECK1-LABEL: define {{.*}}reductionArrayElement{{.*}}.omp_outlined{{.*}} +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK1: [[G_ARR:%.*]] = alloca i32, align 4 +// CHECK1: [[TMP0:%.*]] = sdiv exact i64 sub (i64 ptrtoint (ptr @g_arr to i64){{.*}} +// CHECK1-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[G_ARR:%.*]], i64 [[TMP0]] +// CHECK1: omp.inner.for.body: +// CHECK1: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP1]], i64 0, i64 1 +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK1-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP11]],{{.+}} +// CHECK1-NEXT: store i32 [[ADD2]], ptr [[ARRAYIDX]], align 4 +// CHECK1: omp.loop.exit: +// CHECK1-NEXT: call void {{.*}}__kmpc_for_static_fini{{.+}} +// CHECK1: {{.*}}call i32 {{.*}}__kmpc_reduce{{.+}} +// CHECK1: omp.reduction.default: +// CHECK1-NEXT: call void @__kmpc_barrier{{.+}} +// CHECK1-NEXT: ret void +// + // CHECK1-LABEL: define {{[^@]+}}@main // CHECK1-SAME: () #[[ATTR0:[0-9]+]] { // CHECK1-NEXT: entry: @@ -614,6 +643,7 @@ int main() { // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined.11, ptr [[TMP7]]) // CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[VAR3]], align 8 // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined.12, ptr [[TMP8]]) +// CHECK1-NEXT: call void {{.*}}reductionArrayElement{{.*}} // CHECK1-NEXT: [[CALL10:%.*]] = call noundef i32 @_Z5tmainIiLi42EET_v() // CHECK1-NEXT: store i32 [[CALL10]], ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[SAVED_STACK]], align 8 From 0251fd9a72c550bcab4906d9223c96aeb593ce57 Mon Sep 17 00:00:00 2001 From: jiang1997 Date: Tue, 30 Sep 2025 01:27:16 +0800 Subject: [PATCH 155/878] [clang][x86][bytecode] Refactor BMI intrinsic wrappers to use interp__builtin_elementwise_int_binop (#160362) Fixes #160281 --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 145 ++++++++--------------- 1 file changed, 52 insertions(+), 93 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 891344d4e6ed0..a2e97fcafdfef 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1294,95 +1294,6 @@ static bool interp__builtin_assume_aligned(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call) { - if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || - !Call->getArg(1)->getType()->isIntegerType()) - return false; - - APSInt Index = popToAPSInt(S, Call->getArg(1)); - APSInt Val = popToAPSInt(S, Call->getArg(0)); - - unsigned BitWidth = Val.getBitWidth(); - uint64_t Shift = Index.extractBitsAsZExtValue(8, 0); - uint64_t Length = Index.extractBitsAsZExtValue(8, 8); - Length = Length > BitWidth ? BitWidth : Length; - - // Handle out of bounds cases. 
- if (Length == 0 || Shift >= BitWidth) { - pushInteger(S, 0, Call->getType()); - return true; - } - - uint64_t Result = Val.getZExtValue() >> Shift; - Result &= llvm::maskTrailingOnes(Length); - pushInteger(S, Result, Call->getType()); - return true; -} - -static bool interp__builtin_ia32_bzhi(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call) { - QualType CallType = Call->getType(); - if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || - !Call->getArg(1)->getType()->isIntegerType() || - !CallType->isIntegerType()) - return false; - - APSInt Idx = popToAPSInt(S, Call->getArg(1)); - APSInt Val = popToAPSInt(S, Call->getArg(0)); - - unsigned BitWidth = Val.getBitWidth(); - uint64_t Index = Idx.extractBitsAsZExtValue(8, 0); - - if (Index < BitWidth) - Val.clearHighBits(BitWidth - Index); - - pushInteger(S, Val, CallType); - return true; -} - -static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call) { - if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || - !Call->getArg(1)->getType()->isIntegerType()) - return false; - - APSInt Mask = popToAPSInt(S, Call->getArg(1)); - APSInt Val = popToAPSInt(S, Call->getArg(0)); - - unsigned BitWidth = Val.getBitWidth(); - APInt Result = APInt::getZero(BitWidth); - for (unsigned I = 0, P = 0; I != BitWidth; ++I) { - if (Mask[I]) - Result.setBitVal(I, Val[P++]); - } - pushInteger(S, std::move(Result), Call->getType()); - return true; -} - -static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call) { - if (Call->getNumArgs() != 2 || !Call->getArg(0)->getType()->isIntegerType() || - !Call->getArg(1)->getType()->isIntegerType()) - return false; - - APSInt Mask = popToAPSInt(S, Call->getArg(1)); - APSInt Val = popToAPSInt(S, Call->getArg(0)); - - unsigned BitWidth = Val.getBitWidth(); - APInt Result = APInt::getZero(BitWidth); - for (unsigned I = 0, P = 0; I != BitWidth; ++I) { - if (Mask[I]) - Result.setBitVal(P++, Val[I]); - } - pushInteger(S, std::move(Result), Call->getType()); - return true; -} - /// (CarryIn, LHS, RHS, Result) static bool interp__builtin_ia32_addcarry_subborrow(InterpState &S, CodePtr OpPC, @@ -3275,11 +3186,37 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_bextr_u64: case clang::X86::BI__builtin_ia32_bextri_u32: case clang::X86::BI__builtin_ia32_bextri_u64: - return interp__builtin_ia32_bextr(S, OpPC, Frame, Call); + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, [](const APSInt &Val, const APSInt &Idx) { + unsigned BitWidth = Val.getBitWidth(); + uint64_t Shift = Idx.extractBitsAsZExtValue(8, 0); + uint64_t Length = Idx.extractBitsAsZExtValue(8, 8); + if (Length > BitWidth) { + Length = BitWidth; + } + + // Handle out of bounds cases. 
+ if (Length == 0 || Shift >= BitWidth) + return APInt(BitWidth, 0); + + uint64_t Result = Val.getZExtValue() >> Shift; + Result &= llvm::maskTrailingOnes(Length); + return APInt(BitWidth, Result); + }); case clang::X86::BI__builtin_ia32_bzhi_si: case clang::X86::BI__builtin_ia32_bzhi_di: - return interp__builtin_ia32_bzhi(S, OpPC, Frame, Call); + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, [](const APSInt &Val, const APSInt &Idx) { + unsigned BitWidth = Val.getBitWidth(); + uint64_t Index = Idx.extractBitsAsZExtValue(8, 0); + APSInt Result = Val; + + if (Index < BitWidth) + Result.clearHighBits(BitWidth - Index); + + return Result; + }); case clang::X86::BI__builtin_ia32_lzcnt_u16: case clang::X86::BI__builtin_ia32_lzcnt_u32: @@ -3299,11 +3236,33 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_pdep_si: case clang::X86::BI__builtin_ia32_pdep_di: - return interp__builtin_ia32_pdep(S, OpPC, Frame, Call); + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, [](const APSInt &Val, const APSInt &Mask) { + unsigned BitWidth = Val.getBitWidth(); + APInt Result = APInt::getZero(BitWidth); + + for (unsigned I = 0, P = 0; I != BitWidth; ++I) { + if (Mask[I]) + Result.setBitVal(I, Val[P++]); + } + + return Result; + }); case clang::X86::BI__builtin_ia32_pext_si: case clang::X86::BI__builtin_ia32_pext_di: - return interp__builtin_ia32_pext(S, OpPC, Frame, Call); + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, [](const APSInt &Val, const APSInt &Mask) { + unsigned BitWidth = Val.getBitWidth(); + APInt Result = APInt::getZero(BitWidth); + + for (unsigned I = 0, P = 0; I != BitWidth; ++I) { + if (Mask[I]) + Result.setBitVal(P++, Val[I]); + } + + return Result; + }); case clang::X86::BI__builtin_ia32_addcarryx_u32: case clang::X86::BI__builtin_ia32_addcarryx_u64: From 782ab835dcc8f6b55c6053cc38dd299830e4ffed Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 29 Sep 2025 10:27:34 -0700 Subject: [PATCH 156/878] [CIR] Set the module name to the input filename (#160934) This sets the MLIR module name to the main filename (according to the SourceManager), if one is available. The module name gets used when creating global init functions, so we will need it to be set. --- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 13 +++++++++++++ clang/test/CIR/CodeGen/lang-c-cpp.cpp | 4 ++-- clang/test/CIR/CodeGen/module-filename.cpp | 11 +++++++++++ clang/test/CIR/CodeGen/opt-info-attr.cpp | 12 ++++++------ 4 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 clang/test/CIR/CodeGen/module-filename.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index eef23a0ebda7f..c977ff9f06de6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -119,6 +119,19 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, cir::OptInfoAttr::get(&mlirContext, cgo.OptimizationLevel, cgo.OptimizeSize)); + // Set the module name to be the name of the main file. TranslationUnitDecl + // often contains invalid source locations and isn't a reliable source for the + // module location. 
+ FileID mainFileId = astContext.getSourceManager().getMainFileID(); + const FileEntry &mainFile = + *astContext.getSourceManager().getFileEntryForID(mainFileId); + StringRef path = mainFile.tryGetRealPathName(); + if (!path.empty()) { + theModule.setSymName(path); + theModule->setLoc(mlir::FileLineColLoc::get(&mlirContext, path, + /*line=*/0, + /*column=*/0)); + } } CIRGenModule::~CIRGenModule() = default; diff --git a/clang/test/CIR/CodeGen/lang-c-cpp.cpp b/clang/test/CIR/CodeGen/lang-c-cpp.cpp index e126932104de2..893178384b472 100644 --- a/clang/test/CIR/CodeGen/lang-c-cpp.cpp +++ b/clang/test/CIR/CodeGen/lang-c-cpp.cpp @@ -3,8 +3,8 @@ // RUN: %clang_cc1 -x c -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.c.cir // RUN: FileCheck --check-prefix=CIR-C --input-file=%t.c.cir %s -// CIR-CPP: module attributes {{{.*}}cir.lang = #cir.lang{{.*}}} -// CIR-C: module attributes {{{.*}}cir.lang = #cir.lang{{.*}}} +// CIR-CPP: module{{.*}} attributes {{{.*}}cir.lang = #cir.lang{{.*}}} +// CIR-C: module{{.*}} attributes {{{.*}}cir.lang = #cir.lang{{.*}}} int main() { return 0; diff --git a/clang/test/CIR/CodeGen/module-filename.cpp b/clang/test/CIR/CodeGen/module-filename.cpp new file mode 100644 index 0000000000000..05e2e929e3238 --- /dev/null +++ b/clang/test/CIR/CodeGen/module-filename.cpp @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s + +// Normally, we try to avoid checking the filename of a test, but that's the +// entire point of this test, so we use a wildcard for the path but check the +// filename. +// CIR: module @"{{.*}}module-filename.cpp" + +int main() { + return 0; +} diff --git a/clang/test/CIR/CodeGen/opt-info-attr.cpp b/clang/test/CIR/CodeGen/opt-info-attr.cpp index 444286b8db8a9..97071d7ac2b2b 100644 --- a/clang/test/CIR/CodeGen/opt-info-attr.cpp +++ b/clang/test/CIR/CodeGen/opt-info-attr.cpp @@ -13,10 +13,10 @@ void f() {} -// CHECK-O0: module attributes +// CHECK-O0: module{{.*}} attributes // CHECK-O0-NOT: cir.opt_info -// CHECK-O1: module attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} -// CHECK-O2: module attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} -// CHECK-O3: module attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} -// CHECK-Os: module attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} -// CHECK-Oz: module attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} +// CHECK-O1: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} +// CHECK-O2: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} +// CHECK-O3: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} +// CHECK-Os: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} +// CHECK-Oz: module{{.*}} attributes {{.+}}cir.opt_info = #cir.opt_info{{.+}} From 38953f4d66a43a6a09b3c782e9efd12d2ca4d0d2 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 29 Sep 2025 10:29:41 -0700 Subject: [PATCH 157/878] [CIR] Add initial support for operator delete (#160574) This adds basic operator delete handling in CIR. This does not yet handle destroying delete or array delete, which will be added later. It also does not insert non-null checks when not optimizing for size. 
--- clang/include/clang/CIR/MissingFeatures.h | 2 + clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 215 +++++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 4 + clang/lib/CIR/CodeGen/CIRGenFunction.h | 5 + clang/test/CIR/CodeGen/delete.cpp | 88 +++++++++ 5 files changed, 314 insertions(+) create mode 100644 clang/test/CIR/CodeGen/delete.cpp diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 0fac1b211239a..7e59989dc09f1 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -208,6 +208,7 @@ struct MissingFeatures { static bool dataLayoutTypeAllocSize() { return false; } static bool dataLayoutTypeStoreSize() { return false; } static bool deferredCXXGlobalInit() { return false; } + static bool deleteArray() { return false; } static bool devirtualizeMemberFunction() { return false; } static bool ehCleanupFlags() { return false; } static bool ehCleanupScope() { return false; } @@ -219,6 +220,7 @@ struct MissingFeatures { static bool emitCondLikelihoodViaExpectIntrinsic() { return false; } static bool emitLifetimeMarkers() { return false; } static bool emitLValueAlignmentAssumption() { return false; } + static bool emitNullCheckForDeleteCalls() { return false; } static bool emitNullabilityCheck() { return false; } static bool emitTypeCheck() { return false; } static bool emitTypeMetadataCodeForVCall() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 1f7e3dd1fa7d2..83208bf226882 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -210,6 +210,60 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorCall( return emitCall(fnInfo, callee, returnValue, args, nullptr, loc); } +namespace { +/// The parameters to pass to a usual operator delete. +struct UsualDeleteParams { + TypeAwareAllocationMode typeAwareDelete = TypeAwareAllocationMode::No; + bool destroyingDelete = false; + bool size = false; + AlignedAllocationMode alignment = AlignedAllocationMode::No; +}; +} // namespace + +// FIXME(cir): this should be shared with LLVM codegen +static UsualDeleteParams getUsualDeleteParams(const FunctionDecl *fd) { + UsualDeleteParams params; + + const FunctionProtoType *fpt = fd->getType()->castAs(); + auto ai = fpt->param_type_begin(), ae = fpt->param_type_end(); + + if (fd->isTypeAwareOperatorNewOrDelete()) { + params.typeAwareDelete = TypeAwareAllocationMode::Yes; + assert(ai != ae); + ++ai; + } + + // The first argument after the type-identity parameter (if any) is + // always a void* (or C* for a destroying operator delete for class + // type C). + ++ai; + + // The next parameter may be a std::destroying_delete_t. + if (fd->isDestroyingOperatorDelete()) { + params.destroyingDelete = true; + assert(ai != ae); + ++ai; + } + + // Figure out what other parameters we should be implicitly passing. 
+ if (ai != ae && (*ai)->isIntegerType()) { + params.size = true; + ++ai; + } else { + assert(!isTypeAwareAllocation(params.typeAwareDelete)); + } + + if (ai != ae && (*ai)->isAlignValT()) { + params.alignment = AlignedAllocationMode::Yes; + ++ai; + } else { + assert(!isTypeAwareAllocation(params.typeAwareDelete)); + } + + assert(ai == ae && "unexpected usual deallocation function parameter"); + return params; +} + static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, unsigned minElements, mlir::Value &numElements, @@ -332,6 +386,117 @@ static RValue emitNewDeleteCall(CIRGenFunction &cgf, return rv; } +namespace { +/// Calls the given 'operator delete' on a single object. +struct CallObjectDelete final : EHScopeStack::Cleanup { + mlir::Value ptr; + const FunctionDecl *operatorDelete; + QualType elementType; + + CallObjectDelete(mlir::Value ptr, const FunctionDecl *operatorDelete, + QualType elementType) + : ptr(ptr), operatorDelete(operatorDelete), elementType(elementType) {} + + void emit(CIRGenFunction &cgf) override { + cgf.emitDeleteCall(operatorDelete, ptr, elementType); + } + + // This is a placeholder until EHCleanupScope is implemented. + size_t getSize() const override { + assert(!cir::MissingFeatures::ehCleanupScope()); + return sizeof(CallObjectDelete); + } +}; +} // namespace + +/// Emit the code for deleting a single object. +static void emitObjectDelete(CIRGenFunction &cgf, const CXXDeleteExpr *de, + Address ptr, QualType elementType) { + // C++11 [expr.delete]p3: + // If the static type of the object to be deleted is different from its + // dynamic type, the static type shall be a base class of the dynamic type + // of the object to be deleted and the static type shall have a virtual + // destructor or the behavior is undefined. + assert(!cir::MissingFeatures::emitTypeCheck()); + + const FunctionDecl *operatorDelete = de->getOperatorDelete(); + assert(!operatorDelete->isDestroyingOperatorDelete()); + + // Find the destructor for the type, if applicable. If the + // destructor is virtual, we'll just emit the vcall and return. + const CXXDestructorDecl *dtor = nullptr; + if (const auto *rd = elementType->getAsCXXRecordDecl()) { + if (rd->hasDefinition() && !rd->hasTrivialDestructor()) { + dtor = rd->getDestructor(); + + if (dtor->isVirtual()) { + cgf.cgm.errorNYI(de->getSourceRange(), + "emitObjectDelete: virtual destructor"); + } + } + } + + // Make sure that we call delete even if the dtor throws. + // This doesn't have to a conditional cleanup because we're going + // to pop it off in a second. + cgf.ehStack.pushCleanup( + NormalAndEHCleanup, ptr.getPointer(), operatorDelete, elementType); + + if (dtor) { + cgf.emitCXXDestructorCall(dtor, Dtor_Complete, + /*ForVirtualBase=*/false, + /*Delegating=*/false, ptr, elementType); + } else if (elementType.getObjCLifetime()) { + assert(!cir::MissingFeatures::objCLifetime()); + cgf.cgm.errorNYI(de->getSourceRange(), "emitObjectDelete: ObjCLifetime"); + } + + // In traditional LLVM codegen null checks are emitted to save a delete call. + // In CIR we optimize for size by default, the null check should be added into + // this function callers. + assert(!cir::MissingFeatures::emitNullCheckForDeleteCalls()); + + cgf.popCleanupBlock(); +} + +void CIRGenFunction::emitCXXDeleteExpr(const CXXDeleteExpr *e) { + const Expr *arg = e->getArgument(); + Address ptr = emitPointerWithAlignment(arg); + + // Null check the pointer. 
+ // + // We could avoid this null check if we can determine that the object + // destruction is trivial and doesn't require an array cookie; we can + // unconditionally perform the operator delete call in that case. For now, we + // assume that deleted pointers are null rarely enough that it's better to + // keep the branch. This might be worth revisiting for a -O0 code size win. + // + // CIR note: emit the code size friendly by default for now, such as mentioned + // in `emitObjectDelete`. + assert(!cir::MissingFeatures::emitNullCheckForDeleteCalls()); + QualType deleteTy = e->getDestroyedType(); + + // A destroying operator delete overrides the entire operation of the + // delete expression. + if (e->getOperatorDelete()->isDestroyingOperatorDelete()) { + cgm.errorNYI(e->getSourceRange(), + "emitCXXDeleteExpr: destroying operator delete"); + return; + } + + // We might be deleting a pointer to array. + deleteTy = getContext().getBaseElementType(deleteTy); + ptr = ptr.withElementType(builder, convertTypeForMem(deleteTy)); + + if (e->isArrayForm()) { + assert(!cir::MissingFeatures::deleteArray()); + cgm.errorNYI(e->getSourceRange(), "emitCXXDeleteExpr: array delete"); + return; + } else { + emitObjectDelete(*this, e, ptr, deleteTy); + } +} + mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) { // The element type being allocated. QualType allocType = getContext().getBaseElementType(e->getAllocatedType()); @@ -443,3 +608,53 @@ mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) { allocSizeWithoutCookie); return result.getPointer(); } + +void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD, + mlir::Value ptr, QualType deleteTy) { + assert(!cir::MissingFeatures::deleteArray()); + + const auto *deleteFTy = deleteFD->getType()->castAs(); + CallArgList deleteArgs; + + UsualDeleteParams params = getUsualDeleteParams(deleteFD); + auto paramTypeIt = deleteFTy->param_type_begin(); + + // Pass std::type_identity tag if present + if (isTypeAwareAllocation(params.typeAwareDelete)) + cgm.errorNYI(deleteFD->getSourceRange(), + "emitDeleteCall: type aware delete"); + + // Pass the pointer itself. + QualType argTy = *paramTypeIt++; + mlir::Value deletePtr = + builder.createBitcast(ptr.getLoc(), ptr, convertType(argTy)); + deleteArgs.add(RValue::get(deletePtr), argTy); + + // Pass the std::destroying_delete tag if present. + if (params.destroyingDelete) + cgm.errorNYI(deleteFD->getSourceRange(), + "emitDeleteCall: destroying delete"); + + // Pass the size if the delete function has a size_t parameter. + if (params.size) { + QualType sizeType = *paramTypeIt++; + CharUnits deleteTypeSize = getContext().getTypeSizeInChars(deleteTy); + assert(mlir::isa(convertType(sizeType)) && + "expected cir::IntType"); + cir::ConstantOp size = builder.getConstInt( + *currSrcLoc, convertType(sizeType), deleteTypeSize.getQuantity()); + + deleteArgs.add(RValue::get(size), sizeType); + } + + // Pass the alignment if the delete function has an align_val_t parameter. + if (isAlignedAllocation(params.alignment)) + cgm.errorNYI(deleteFD->getSourceRange(), + "emitDeleteCall: aligned allocation"); + + assert(paramTypeIt == deleteFTy->param_type_end() && + "unknown parameter to usual delete function"); + + // Emit the call to delete. 
+ emitNewDeleteCall(*this, deleteFD, deleteFTy, deleteArgs); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index bd09d78cd0eb6..b93d9a9c6f883 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -687,6 +687,10 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitCXXNewExpr(const CXXNewExpr *e) { return cgf.emitCXXNewExpr(e); } + mlir::Value VisitCXXDeleteExpr(const CXXDeleteExpr *e) { + cgf.emitCXXDeleteExpr(e); + return {}; + } mlir::Value VisitCXXThrowExpr(const CXXThrowExpr *e) { cgf.emitCXXThrowExpr(e); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 166435f9e7e9e..ef07db3d48ffc 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1197,6 +1197,8 @@ class CIRGenFunction : public CIRGenTypeCache { bool delegating, Address thisAddr, CallArgList &args, clang::SourceLocation loc); + void emitCXXDeleteExpr(const CXXDeleteExpr *e); + void emitCXXDestructorCall(const CXXDestructorDecl *dd, CXXDtorType type, bool forVirtualBase, bool delegating, Address thisAddr, QualType thisTy); @@ -1244,6 +1246,9 @@ class CIRGenFunction : public CIRGenTypeCache { void emitDelegatingCXXConstructorCall(const CXXConstructorDecl *ctor, const FunctionArgList &args); + void emitDeleteCall(const FunctionDecl *deleteFD, mlir::Value ptr, + QualType deleteTy); + mlir::LogicalResult emitDoStmt(const clang::DoStmt &s); /// Emit an expression as an initializer for an object (variable, field, etc.) diff --git a/clang/test/CIR/CodeGen/delete.cpp b/clang/test/CIR/CodeGen/delete.cpp new file mode 100644 index 0000000000000..f21d203f266e5 --- /dev/null +++ b/clang/test/CIR/CodeGen/delete.cpp @@ -0,0 +1,88 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fclangir -mconstructor-aliases -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -fclangir -mconstructor-aliases -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -mconstructor-aliases -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +typedef __typeof(sizeof(int)) size_t; + +struct SizedDelete { + void operator delete(void*, size_t); + int member; +}; +void test_sized_delete(SizedDelete *x) { + delete x; +} + +// SizedDelete::operator delete(void*, unsigned long) +// CIR: cir.func private @_ZN11SizedDeletedlEPvm(!cir.ptr, !u64i) +// LLVM: declare void @_ZN11SizedDeletedlEPvm(ptr, i64) + +// CIR: cir.func dso_local @_Z17test_sized_deleteP11SizedDelete +// CIR: %[[X:.*]] = cir.load{{.*}} %{{.*}} +// CIR: %[[X_CAST:.*]] = cir.cast(bitcast, %[[X]] : !cir.ptr), !cir.ptr +// CIR: %[[OBJ_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CIR: cir.call @_ZN11SizedDeletedlEPvm(%[[X_CAST]], %[[OBJ_SIZE]]) nothrow : (!cir.ptr, !u64i) -> () + +// LLVM: define dso_local void @_Z17test_sized_deleteP11SizedDelete +// LLVM: %[[X:.*]] = load ptr, ptr %{{.*}} +// LLVM: call void @_ZN11SizedDeletedlEPvm(ptr %[[X]], i64 4) + +// OGCG: define dso_local void @_Z17test_sized_deleteP11SizedDelete +// OGCG: %[[X:.*]] = load ptr, ptr %{{.*}} +// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[X]], null +// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]] +// OGCG: [[DELETE_NOTNULL]]: +// OGCG: call void 
@_ZN11SizedDeletedlEPvm(ptr noundef %[[X]], i64 noundef 4)
+
+// This function is declared below the call in OGCG.
+// OGCG: declare void @_ZN11SizedDeletedlEPvm(ptr noundef, i64 noundef)
+
+struct Contents {
+  ~Contents() {}
+};
+struct Container {
+  Contents *contents;
+  ~Container();
+};
+Container::~Container() { delete contents; }
+
+// Contents::~Contents()
+// CIR: cir.func comdat linkonce_odr @_ZN8ContentsD2Ev
+// LLVM: define linkonce_odr void @_ZN8ContentsD2Ev
+
+// operator delete(void*, unsigned long)
+// CIR: cir.func private @_ZdlPvm(!cir.ptr<!void>, !u64i)
+// LLVM: declare void @_ZdlPvm(ptr, i64)
+
+// Container::~Container()
+// CIR: cir.func dso_local @_ZN9ContainerD2Ev
+// CIR: %[[THIS:.*]] = cir.load %{{.*}}
+// CIR: %[[CONTENTS_PTR_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "contents"} : !cir.ptr<!rec_Container> -> !cir.ptr<!cir.ptr<!rec_Contents>>
+// CIR: %[[CONTENTS_PTR:.*]] = cir.load{{.*}} %[[CONTENTS_PTR_ADDR]]
+// CIR: cir.call @_ZN8ContentsD2Ev(%[[CONTENTS_PTR]]) nothrow : (!cir.ptr<!rec_Contents>) -> ()
+// CIR: %[[CONTENTS_CAST:.*]] = cir.cast(bitcast, %[[CONTENTS_PTR]] : !cir.ptr<!rec_Contents>), !cir.ptr<!void>
+// CIR: %[[OBJ_SIZE:.*]] = cir.const #cir.int<1> : !u64i
+// CIR: cir.call @_ZdlPvm(%[[CONTENTS_CAST]], %[[OBJ_SIZE]]) nothrow : (!cir.ptr<!void>, !u64i) -> ()
+
+// LLVM: define dso_local void @_ZN9ContainerD2Ev
+// LLVM: %[[THIS:.*]] = load ptr, ptr %{{.*}}
+// LLVM: %[[CONTENTS_PTR_ADDR:.*]] = getelementptr %struct.Container, ptr %[[THIS]], i32 0, i32 0
+// LLVM: %[[CONTENTS_PTR:.*]] = load ptr, ptr %[[CONTENTS_PTR_ADDR]]
+// LLVM: call void @_ZN8ContentsD2Ev(ptr %[[CONTENTS_PTR]])
+// LLVM: call void @_ZdlPvm(ptr %[[CONTENTS_PTR]], i64 1)
+
+// OGCG: define dso_local void @_ZN9ContainerD2Ev
+// OGCG: %[[THIS:.*]] = load ptr, ptr %{{.*}}
+// OGCG: %[[CONTENTS:.*]] = getelementptr inbounds nuw %struct.Container, ptr %[[THIS]], i32 0, i32 0
+// OGCG: %[[CONTENTS_PTR:.*]] = load ptr, ptr %[[CONTENTS]]
+// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[CONTENTS_PTR]], null
+// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]]
+// OGCG: [[DELETE_NOTNULL]]:
+// OGCG: call void @_ZN8ContentsD2Ev(ptr noundef nonnull align 1 dereferenceable(1) %[[CONTENTS_PTR]])
+// OGCG: call void @_ZdlPvm(ptr noundef %[[CONTENTS_PTR]], i64 noundef 1)
+
+// These functions are declared/defined below the calls in OGCG.
+// OGCG: define linkonce_odr void @_ZN8ContentsD2Ev
+// OGCG: declare void @_ZdlPvm(ptr noundef, i64 noundef)

From 87bd7825902ec378d4f6372c2463e51f5df69fc3 Mon Sep 17 00:00:00 2001
From: Marcos Maronas
Date: Mon, 29 Sep 2025 18:32:18 +0100
Subject: [PATCH 158/878] [LLVM][NFC] Fix Rule of Three/Five issues. (#160851)

Fix Rule of Three/Five issues reported by a static analysis tool.
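For readers unfamiliar with the rule: a class that declares any of the destructor, copy, or move operations should declare (or explicitly delete) all of them, so none is left implicitly generated. A minimal sketch of the pattern both hunks below follow (the class name is illustrative, not from the patch):

class Resource {
public:
  Resource() = default;
  // Copying and moving are not meaningful for this type, so delete all four
  // special member functions explicitly rather than relying on the compiler's
  // implicit definitions.
  Resource(const Resource &) = delete;
  Resource &operator=(const Resource &) = delete;
  Resource(Resource &&) = delete;
  Resource &operator=(Resource &&) = delete;
  virtual ~Resource() = default;
};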
---
 llvm/include/llvm/TextAPI/SymbolSet.h | 2 ++
 llvm/lib/CAS/InMemoryCAS.cpp          | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/llvm/include/llvm/TextAPI/SymbolSet.h b/llvm/include/llvm/TextAPI/SymbolSet.h
index 42c411acb6f9d..22f4124f40313 100644
--- a/llvm/include/llvm/TextAPI/SymbolSet.h
+++ b/llvm/include/llvm/TextAPI/SymbolSet.h
@@ -92,6 +92,8 @@ class SymbolSet {
 public:
   SymbolSet() = default;
+  SymbolSet(const SymbolSet &other) = delete;
+  SymbolSet &operator=(const SymbolSet &other) = delete;
   LLVM_ABI ~SymbolSet();
   LLVM_ABI Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags,
                              const Target &Targ);
diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp
index 255b89c15c4c5..c63ee70de0849 100644
--- a/llvm/lib/CAS/InMemoryCAS.cpp
+++ b/llvm/lib/CAS/InMemoryCAS.cpp
@@ -57,6 +57,9 @@ class InMemoryObject {
   InMemoryObject() = delete;
   InMemoryObject(InMemoryObject &&) = delete;
   InMemoryObject(const InMemoryObject &) = delete;
+  InMemoryObject &operator=(const InMemoryObject &) = delete;
+  InMemoryObject &operator=(InMemoryObject &&) = delete;
+  virtual ~InMemoryObject() = default;
 
 protected:
   InMemoryObject(Kind K, const InMemoryIndexValueT &I) : IndexAndKind(&I, K) {}

From 301259a6b1c8146a185e7c1bea46ec02d028243e Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 29 Sep 2025 10:36:27 -0700
Subject: [PATCH 159/878] [RISCV] Teach getIntImmCostInst about (X & -(1 <<
 C1) & 0xffffffff) == C2 << C1 (#160163)

We can rewrite this to (srai(w)/srli X, C1) == C2 so the AND immediate is
free. This transform is done by performSETCCCombine in RISCVISelLowering.cpp.

This fixes the opaque constant case mentioned in #157416.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  39 +++++++
 llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll    | 106 ++++++++++++++++++
 2 files changed, 145 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 563f3bbee81df..d4124ae9aeff0 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -167,6 +167,42 @@ static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
   return false;
 }
 
+// If this is an i64 AND that is part of (X & -(1 << C1) & 0xffffffff) ==
+// (C2 << C1), DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for
+// RV64. On RV32, the type will be split so only the lower 32 bits need to be
+// compared using (srai/srli X, C) == C2.
+static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
+  if (!Inst->hasOneUse())
+    return false;
+
+  // Look for equality comparison.
+  auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
+  if (!Cmp || !Cmp->isEquality())
+    return false;
+
+  // Right hand side of comparison should be a constant.
+  auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
+  if (!C)
+    return false;
+
+  uint64_t Mask = Imm.getZExtValue();
+
+  // Mask should be of the form -(1 << C) in the lower 32 bits.
+  if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
+    return false;
+
+  // Comparison constant should be a subset of Mask.
+  uint64_t CmpC = C->getZExtValue();
+  if ((CmpC & Mask) != CmpC)
+    return false;
+
+  // We'll need to sign extend the comparison constant and shift it right. Make
+  // sure the new constant can use addi/xori+seqz/snez.
+ unsigned ShiftBits = llvm::countr_zero(Mask); + int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits; + return NewCmpC >= -2048 && NewCmpC <= 2048; +} + InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, @@ -224,6 +260,9 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() && canUseShiftPair(Inst, Imm)) return TTI::TCC_Free; + if (Inst && Idx == 1 && Imm.getBitWidth() == 64 && + canUseShiftCmp(Inst, Imm)) + return TTI::TCC_Free; Takes12BitImm = true; break; case Instruction::Add: diff --git a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll index 55b0d1f0bf7be..2a46a59e90535 100644 --- a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll +++ b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll @@ -155,3 +155,109 @@ define i1 @test9(i64 %x) { %b = icmp eq i64 %a, u0x08000000 ret i1 %b } + +; Make sure the and constant doesn't get converted to an opaque constant by +; ConstantHoisting. If it's an opaque constant, we'll have addi -16 and addi 15. +define i64 @test10(i64 %0) #0 { +; RV32-LABEL: test10: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: andi a0, a0, -16 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: test10: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: sraiw a0, a0, 4 +; RV64-NEXT: snez a0, a0 +; RV64-NEXT: ret +entry: + %1 = add nuw nsw i64 %0, u0xffffffff + %2 = and i64 %1, u0xfffffff0 + %3 = icmp ne i64 %2, 0 + %4 = zext i1 %3 to i64 + ret i64 %4 +} + +; Make sure the and constant doesn't get converted to an opaque constant by +; ConstantHoisting. If it's an opaque constant, we'll have addi -16 and addi 15. +define i64 @test11(i64 %0) #0 { +; RV32-LABEL: test11: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: srai a0, a0, 4 +; RV32-NEXT: addi a0, a0, 1621 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: test11: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: sraiw a0, a0, 4 +; RV64-NEXT: addi a0, a0, 1621 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: ret +entry: + %1 = add nuw nsw i64 %0, u0xffffffff + %2 = and i64 %1, u0xfffffff0 + %3 = icmp eq i64 %2, u0xffff9ab0 + %4 = zext i1 %3 to i64 + ret i64 %4 +} + +; Make sure the and constant doesn't get converted to an opaque constant by +; ConstantHoisting. If it's an opaque constant we'll end up with constant +; materialization sequences on RV64. +define i64 @test12(i64 %0) #0 { +; RV32-LABEL: test12: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi a0, a0, -3 +; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: ret +; +; RV64-LABEL: test12: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addiw a0, a0, -16 +; RV64-NEXT: addi a0, a0, 13 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: ret +entry: + %1 = add nuw nsw i64 %0, u0xfffffff0 + %2 = and i64 %1, u0xffffffff + %3 = icmp eq i64 %2, u0xfffffff3 + %4 = zext i1 %3 to i64 + ret i64 %4 +} + +; Make sure the and constant doesn't get converted to an opaque constant by +; ConstantHoisting. 
+define i64 @test13(i64 %0) #0 {
+; RV32-LABEL: test13:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    lui a1, 524288
+; RV32-NEXT:    addi a1, a1, 15
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    li a1, 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: test13:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    lui a1, 524288
+; RV64-NEXT:    addi a1, a1, -15
+; RV64-NEXT:    sub a0, a0, a1
+; RV64-NEXT:    sraiw a0, a0, 31
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    ret
+entry:
+  %1 = add nuw nsw i64 %0, u0x8000000f
+  %2 = and i64 %1, u0x80000000
+  %3 = icmp eq i64 %2, 0
+  %4 = zext i1 %3 to i64
+  ret i64 %4
+}

From 0457644dfb0465cde08d25f4bd704b4b8e7a355c Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 29 Sep 2025 09:22:23 -0700
Subject: [PATCH 160/878] [SLP][NFC]Add a test with the incorrect combination
 of Xor/Mul vector instructions, NFC

---
 .../SLPVectorizer/X86/xor-combined-opcode.ll  | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll b/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll
new file mode 100644
index 0000000000000..7664fda8c5a3c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s -slp-threshold=-100 | FileCheck %s
+define i1 @foo(i1 %v) { ; assume %v is 1
+; CHECK-LABEL: define i1 @foo(
+; CHECK-SAME: i1 [[V:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i1> poison, i1 [[V]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i1> <i1 true, i1 true>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    [[SUB:%.*]] = sub i1 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[SUB]]
+;
+entry:
+  %not = xor i1 %v, 1 ; 0
+  %not1 = xor i1 %not, 1 ; 1
+  %mul = mul i1 %v, 1 ; 1
+  %sub = sub i1 %not1, %mul ; 0
+  ret i1 %sub ; 0
+}

From f7352505ca32d05d3e219bc86e7241bbb8687f21 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Mon, 29 Sep 2025 10:38:16 -0700
Subject: [PATCH 161/878] [profcheck] Exclude LoopVectorize, temporarily
 (#161243)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LV is under active development, and new tests are added. Bulk-excluding it
from `profcheck` for the moment.

Issue #161235 (see also its parent)
---
 llvm/test/lit.cfg.py           |  18 +-
 llvm/utils/profcheck-xfail.txt | 447 ---------------------------------
 2 files changed, 10 insertions(+), 455 deletions(-)

diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index dd3f947b186b3..781240aac94b6 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -48,15 +48,17 @@
 # directories.
 config.excludes = ["Inputs", "CMakeLists.txt", "README.txt", "LICENSE.txt"]
 
-# Exclude llvm-reduce tests for profcheck because we substitute the FileCheck
-# binary with a no-op command for profcheck, but llvm-reduce tests have RUN
-# commands of the form llvm-reduce --test FileCheck, which explode if we
-# substitute FileCheck because llvm-reduce expects FileCheck in these tests.
-# It's not really possible to exclude these tests from the command substitution, -# so we just exclude llvm-reduce tests from this config altogether. This should -# be fine though as profcheck config tests are mostly concerned with opt. if config.enable_profcheck: - config.excludes = config.excludes + ["llvm-reduce"] + # Exclude llvm-reduce tests for profcheck because we substitute the FileCheck + # binary with a no-op command for profcheck, but llvm-reduce tests have RUN + # commands of the form llvm-reduce --test FileCheck, which explode if we + # substitute FileCheck because llvm-reduce expects FileCheck in these tests. + # It's not really possible to exclude these tests from the command substitution, + # so we just exclude llvm-reduce tests from this config altogether. This should + # be fine though as profcheck config tests are mostly concerned with opt. + config.excludes.append("llvm-reduce") + # (Issue #161235) Temporarily exclude LoopVectorize. + config.excludes.append("LoopVectorize") # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index bd6627b5b6158..a2b9e56e93e0e 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -1161,453 +1161,6 @@ Transforms/LoopUnroll/AArch64/unrolling-multi-exit.ll Transforms/LoopUnroll/peel-last-iteration-expansion-cost.ll Transforms/LoopUnroll/peel-last-iteration-with-guards.ll Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll -Transforms/LoopVectorize/12-12-11-if-conv.ll -Transforms/LoopVectorize/AArch64/aarch64-predication.ll -Transforms/LoopVectorize/AArch64/arith-fp-frem-costs.ll -Transforms/LoopVectorize/AArch64/blend-costs.ll -Transforms/LoopVectorize/AArch64/check-prof-info.ll -Transforms/LoopVectorize/AArch64/clamped-trip-count.ll -Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll -Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll -Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll -Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll -Transforms/LoopVectorize/AArch64/early_exit_costs.ll -Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll -Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll -Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll -Transforms/LoopVectorize/AArch64/epilogue-vectorization-fix-scalar-resume-values.ll -Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll -Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll -Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll -Transforms/LoopVectorize/AArch64/first-order-recurrence.ll -Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll -Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll -Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll -Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll -Transforms/LoopVectorize/AArch64/induction-costs.ll -Transforms/LoopVectorize/AArch64/induction-costs-sve.ll -Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll -Transforms/LoopVectorize/AArch64/interleaved_cost.ll -Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll -Transforms/LoopVectorize/AArch64/interleave-with-runtime-checks.ll -Transforms/LoopVectorize/AArch64/interleaving-load-store.ll -Transforms/LoopVectorize/AArch64/interleaving-reduction.ll 
-Transforms/LoopVectorize/AArch64/intrinsiccost.ll -Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll -Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll -Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll -Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll -Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll -Transforms/LoopVectorize/AArch64/masked-call.ll -Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll -Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll -Transforms/LoopVectorize/AArch64/neoverse-epilogue-vect.ll -Transforms/LoopVectorize/AArch64/optsize_minsize.ll -Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll -Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll -Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll -Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll -Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll -Transforms/LoopVectorize/AArch64/partial-reduce.ll -Transforms/LoopVectorize/AArch64/pr31900.ll -Transforms/LoopVectorize/AArch64/pr33053.ll -Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll -Transforms/LoopVectorize/AArch64/predicated-costs.ll -Transforms/LoopVectorize/AArch64/predication_costs.ll -Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll -Transforms/LoopVectorize/AArch64/reduction-small-size.ll -Transforms/LoopVectorize/AArch64/reg-usage.ll -Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll -Transforms/LoopVectorize/AArch64/runtime-check-trip-count-decisions.ll -Transforms/LoopVectorize/AArch64/scalable-call.ll -Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll -Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll -Transforms/LoopVectorize/AArch64/scalable-reductions.ll -Transforms/LoopVectorize/AArch64/scalable-reductions-tf.ll -Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll -Transforms/LoopVectorize/AArch64/scalable-struct-return.ll -Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll -Transforms/LoopVectorize/AArch64/scalable-vectorization.ll -Transforms/LoopVectorize/AArch64/scalarize-store-with-predication.ll -Transforms/LoopVectorize/AArch64/sdiv-pow2.ll -Transforms/LoopVectorize/AArch64/select-costs.ll -Transforms/LoopVectorize/AArch64/simple_early_exit.ll -Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll -Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll -Transforms/LoopVectorize/AArch64/store-costs-sve.ll -Transforms/LoopVectorize/AArch64/strict-fadd.ll -Transforms/LoopVectorize/AArch64/struct-return-cost.ll -Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll -Transforms/LoopVectorize/AArch64/sve-basic-vec.ll -Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll -Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll -Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll -Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll -Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll -Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll -Transforms/LoopVectorize/AArch64/sve-gather-scatter-cost.ll -Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll -Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll -Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll -Transforms/LoopVectorize/AArch64/sve-large-strides.ll -Transforms/LoopVectorize/AArch64/sve-multiexit.ll 
-Transforms/LoopVectorize/AArch64/sve-select-cmp.ll -Transforms/LoopVectorize/AArch64/sve-tail-folding-cost.ll -Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll -Transforms/LoopVectorize/AArch64/sve-tail-folding.ll -Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll -Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll -Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll -Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll -Transforms/LoopVectorize/AArch64/tail-folding-styles.ll -Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll -Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll -Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll -Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll -Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll -Transforms/LoopVectorize/AArch64/veclib-function-calls.ll -Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll -Transforms/LoopVectorize/AArch64/vplan-printing.ll -Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll -Transforms/LoopVectorize/ARM/mve-icmpcost.ll -Transforms/LoopVectorize/ARM/mve-multiexit.ll -Transforms/LoopVectorize/ARM/mve-qabs.ll -Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll -Transforms/LoopVectorize/ARM/mve-reductions.ll -Transforms/LoopVectorize/ARM/mve-reduction-types.ll -Transforms/LoopVectorize/ARM/mve-selectandorcost.ll -Transforms/LoopVectorize/ARM/optsize_minsize.ll -Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll -Transforms/LoopVectorize/ARM/scalar-block-cost.ll -Transforms/LoopVectorize/ARM/tail-folding-allowed.ll -Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll -Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll -Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll -Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll -Transforms/LoopVectorize/ARM/tail-folding-reductions-allowed.ll -Transforms/LoopVectorize/as_cast.ll -Transforms/LoopVectorize/assume.ll -Transforms/LoopVectorize/bzip_reverse_loops.ll -Transforms/LoopVectorize/calloc.ll -Transforms/LoopVectorize/cast-induction.ll -Transforms/LoopVectorize/consecutive-ptr-uniforms.ll -Transforms/LoopVectorize/cse-casts.ll -Transforms/LoopVectorize/dbg-outer-loop-vect.ll -Transforms/LoopVectorize/debugloc.ll -Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll -Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size-needs-loop-guards.ll -Transforms/LoopVectorize/dereferenceable-info-from-assumption-variable-size.ll -Transforms/LoopVectorize/diag-with-hotness-info.ll -Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll -Transforms/LoopVectorize/early_exit_legality.ll -Transforms/LoopVectorize/epilog-iv-select-cmp.ll -Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll -Transforms/LoopVectorize/epilog-vectorization-reductions.ll -Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll -Transforms/LoopVectorize/explicit_outer_detection.ll -Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll -Transforms/LoopVectorize/first-order-recurrence-complex.ll -Transforms/LoopVectorize/first-order-recurrence.ll -Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll -Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll -Transforms/LoopVectorize/float-induction.ll -Transforms/LoopVectorize/float-minmax-instruction-flag.ll 
-Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll -Transforms/LoopVectorize/fmax-without-fast-math-flags.ll -Transforms/LoopVectorize/fmin-without-fast-math-flags.ll -Transforms/LoopVectorize/forked-pointers.ll -Transforms/LoopVectorize/gcc-examples.ll -Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll -Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll -Transforms/LoopVectorize/hoist-loads.ll -Transforms/LoopVectorize/i8-induction.ll -Transforms/LoopVectorize/icmp-uniforms.ll -Transforms/LoopVectorize/if-conversion.ll -Transforms/LoopVectorize/if-conversion-nest.ll -Transforms/LoopVectorize/if-pred-non-void.ll -Transforms/LoopVectorize/if-pred-not-when-safe.ll -Transforms/LoopVectorize/if-pred-stores.ll -Transforms/LoopVectorize/if-reduction.ll -Transforms/LoopVectorize/induction.ll -Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll -Transforms/LoopVectorize/interleave-and-scalarize-only.ll -Transforms/LoopVectorize/interleaved-accesses-2.ll -Transforms/LoopVectorize/interleaved-accesses-3.ll -Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll -Transforms/LoopVectorize/interleaved-accesses.ll -Transforms/LoopVectorize/interleaved-accesses-masked-group.ll -Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll -Transforms/LoopVectorize/interleaved-accesses-requiring-scev-predicates.ll -Transforms/LoopVectorize/interleaved-accesses-uniform-load.ll -Transforms/LoopVectorize/invariant-store-vectorization-2.ll -Transforms/LoopVectorize/invariant-store-vectorization.ll -Transforms/LoopVectorize/is_fpclass.ll -Transforms/LoopVectorize/iv-select-cmp-decreasing.ll -Transforms/LoopVectorize/iv-select-cmp.ll -Transforms/LoopVectorize/iv-select-cmp-nested-loop.ll -Transforms/LoopVectorize/iv-select-cmp-no-wrap.ll -Transforms/LoopVectorize/iv-select-cmp-trunc.ll -Transforms/LoopVectorize/lcssa-crashes.ll -Transforms/LoopVectorize/load-deref-pred-align.ll -Transforms/LoopVectorize/load-deref-pred-neg-off.ll -Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll -Transforms/LoopVectorize/load-of-struct-deref-pred.ll -Transforms/LoopVectorize/loop-form.ll -Transforms/LoopVectorize/loop-with-constant-exit-condition.ll -Transforms/LoopVectorize/memdep-fold-tail.ll -Transforms/LoopVectorize/metadata.ll -Transforms/LoopVectorize/minmax_reduction.ll -Transforms/LoopVectorize/multiple-exits-versioning.ll -Transforms/LoopVectorize/multiple-result-intrinsics.ll -Transforms/LoopVectorize/noalias-scope-decl.ll -Transforms/LoopVectorize/no_outside_user.ll -Transforms/LoopVectorize/no_switch.ll -Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll -Transforms/LoopVectorize/optimal-epilog-vectorization.ll -Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll -Transforms/LoopVectorize/optsize.ll -Transforms/LoopVectorize/outer_loop_hcfg_construction.ll -Transforms/LoopVectorize/outer-loop-inner-latch-successors.ll -Transforms/LoopVectorize/outer_loop_scalable.ll -Transforms/LoopVectorize/outer_loop_test1.ll -Transforms/LoopVectorize/outer_loop_test2.ll -Transforms/LoopVectorize/outer-loop-vec-phi-predecessor-order.ll -Transforms/LoopVectorize/outer-loop-wide-phis.ll -Transforms/LoopVectorize/phi-cost.ll -Transforms/LoopVectorize/pointer-induction.ll -Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll -Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll -Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll 
-Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll -Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll -Transforms/LoopVectorize/PowerPC/vplan-scalarivsext-crash.ll -Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll -Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll -Transforms/LoopVectorize/pr32859.ll -Transforms/LoopVectorize/pr34681.ll -Transforms/LoopVectorize/pr37248.ll -Transforms/LoopVectorize/pr39099.ll -Transforms/LoopVectorize/pr44488-predication.ll -Transforms/LoopVectorize/pr45525.ll -Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll -Transforms/LoopVectorize/pr48832.ll -Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll -Transforms/LoopVectorize/pr55100-expand-scev-predicate-used.ll -Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll -Transforms/LoopVectorize/predicatedinst-loop-invariant.ll -Transforms/LoopVectorize/predicate-switch.ll -Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll -Transforms/LoopVectorize/reduction-inloop-cond.ll -Transforms/LoopVectorize/reduction-inloop.ll -Transforms/LoopVectorize/reduction-inloop-pred.ll -Transforms/LoopVectorize/reduction-inloop-uf4.ll -Transforms/LoopVectorize/reduction.ll -Transforms/LoopVectorize/reduction-order.ll -Transforms/LoopVectorize/reduction-predselect.ll -Transforms/LoopVectorize/reduction-small-size.ll -Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll -Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll -Transforms/LoopVectorize/RISCV/dead-ops-cost.ll -Transforms/LoopVectorize/RISCV/divrem.ll -Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll -Transforms/LoopVectorize/RISCV/induction-costs.ll -Transforms/LoopVectorize/RISCV/inloop-reduction.ll -Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll -Transforms/LoopVectorize/RISCV/mask-index-type.ll -Transforms/LoopVectorize/RISCV/pr154103.ll -Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll -Transforms/LoopVectorize/RISCV/pr88802.ll -Transforms/LoopVectorize/RISCV/preserve-dbg-loc.ll -Transforms/LoopVectorize/RISCV/reductions.ll -Transforms/LoopVectorize/RISCV/safe-dep-distance.ll -Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll -Transforms/LoopVectorize/RISCV/strided-accesses.ll -Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll -Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll -Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll -Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll -Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll -Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll -Transforms/LoopVectorize/RISCV/uniform-load-store.ll -Transforms/LoopVectorize/runtime-checks-difference.ll -Transforms/LoopVectorize/same-base-access.ll -Transforms/LoopVectorize/scalable-assume.ll -Transforms/LoopVectorize/scalable-first-order-recurrence.ll -Transforms/LoopVectorize/scalable-noalias-scope-decl.ll -Transforms/LoopVectorize/scalarized-bitcast.ll -Transforms/LoopVectorize/scalarize-masked-call.ll -Transforms/LoopVectorize/scalar-select.ll -Transforms/LoopVectorize/scev-predicate-reasoning.ll -Transforms/LoopVectorize/select-cmp.ll -Transforms/LoopVectorize/select-cmp-multiuse.ll -Transforms/LoopVectorize/select-cmp-predicated.ll -Transforms/LoopVectorize/select-neg-cond.ll -Transforms/LoopVectorize/select-reduction.ll -Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll -Transforms/LoopVectorize/select-with-fastflags.ll 
-Transforms/LoopVectorize/single-early-exit-cond-poison.ll -Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll -Transforms/LoopVectorize/single-early-exit-interleave-hint.ll -Transforms/LoopVectorize/single-early-exit-interleave.ll -Transforms/LoopVectorize/single-early-exit-interleave-only.ll -Transforms/LoopVectorize/single_early_exit_live_outs.ll -Transforms/LoopVectorize/single_early_exit.ll -Transforms/LoopVectorize/single_early_exit_with_outer_loop.ll -Transforms/LoopVectorize/single-value-blend-phis.ll -Transforms/LoopVectorize/skip-iterations.ll -Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll -Transforms/LoopVectorize/strict-fadd-interleave-only.ll -Transforms/LoopVectorize/struct-return.ll -Transforms/LoopVectorize/struct-return-replicate.ll -Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll -Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll -Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll -Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll -Transforms/LoopVectorize/SystemZ/pr38110.ll -Transforms/LoopVectorize/SystemZ/pr47665.ll -Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll -Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll -Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll -Transforms/LoopVectorize/tail-folding-counting-down.ll -Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll -Transforms/LoopVectorize/tail-folding-switch.ll -Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll -Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll -Transforms/LoopVectorize/tripcount.ll -Transforms/LoopVectorize/trunc-extended-icmps.ll -Transforms/LoopVectorize/uncountable-early-exit-vplan.ll -Transforms/LoopVectorize/uniform-blend.ll -Transforms/LoopVectorize/unroll_nonlatch.ll -Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll -Transforms/LoopVectorize/vectorize-pointer-phis.ll -Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll -Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll -Transforms/LoopVectorize/vect.stats.ll -Transforms/LoopVectorize/VE/disable_lv.ll -Transforms/LoopVectorize/version-stride-with-integer-casts.ll -Transforms/LoopVectorize/vplan-predicate-switch.ll -Transforms/LoopVectorize/vplan-printing.ll -Transforms/LoopVectorize/vplan-printing-outer-loop.ll -Transforms/LoopVectorize/vplan-printing-reductions.ll -Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll -Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll -Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll -Transforms/LoopVectorize/vplan-widen-call-instruction.ll -Transforms/LoopVectorize/vplan-widen-select-instruction.ll -Transforms/LoopVectorize/WebAssembly/memory-interleave.ll -Transforms/LoopVectorize/X86/avx1.ll -Transforms/LoopVectorize/X86/avx512.ll -Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll -Transforms/LoopVectorize/X86/constant-fold.ll -Transforms/LoopVectorize/X86/conversion-cost.ll -Transforms/LoopVectorize/X86/cost-conditional-branches.ll -Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll -Transforms/LoopVectorize/X86/CostModel/handle-iptr-with-data-layout-to-not-assert.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-f32-stride-7.ll 
-Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-f64-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i16-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-01u.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3-indices-0uu.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i32-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i64-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-load-i8-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-f32-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-f64-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-6.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i16-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i32-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i64-stride-7.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-3.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-5.ll -Transforms/LoopVectorize/X86/CostModel/interleaved-store-i8-stride-6.ll -Transforms/LoopVectorize/X86/cost-model.ll -Transforms/LoopVectorize/X86/CostModel/masked-gather-i32-with-i8-index.ll -Transforms/LoopVectorize/X86/CostModel/masked-gather-i64-with-i8-index.ll -Transforms/LoopVectorize/X86/CostModel/masked-interleaved-store-i16.ll -Transforms/LoopVectorize/X86/CostModel/masked-load-i16.ll -Transforms/LoopVectorize/X86/CostModel/masked-load-i32.ll -Transforms/LoopVectorize/X86/CostModel/masked-load-i64.ll -Transforms/LoopVectorize/X86/CostModel/masked-load-i8.ll -Transforms/LoopVectorize/X86/CostModel/masked-scatter-i32-with-i8-index.ll -Transforms/LoopVectorize/X86/CostModel/masked-scatter-i64-with-i8-index.ll -Transforms/LoopVectorize/X86/CostModel/masked-store-i16.ll -Transforms/LoopVectorize/X86/CostModel/masked-store-i32.ll -Transforms/LoopVectorize/X86/CostModel/masked-store-i64.ll -Transforms/LoopVectorize/X86/CostModel/masked-store-i8.ll -Transforms/LoopVectorize/X86/divs-with-tail-folding.ll -Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll -Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll -Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll 
-Transforms/LoopVectorize/X86/fixed-order-recurrence.ll -Transforms/LoopVectorize/X86/float-induction-x86.ll -Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll -Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll -Transforms/LoopVectorize/X86/gather_scatter.ll -Transforms/LoopVectorize/X86/imprecise-through-phis.ll -Transforms/LoopVectorize/X86/induction-costs.ll -Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll -Transforms/LoopVectorize/X86/interleaved-accesses-waw-dependency.ll -Transforms/LoopVectorize/X86/intrinsiccost.ll -Transforms/LoopVectorize/X86/invariant-load-gather.ll -Transforms/LoopVectorize/X86/invariant-store-vectorization.ll -Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll -Transforms/LoopVectorize/X86/load-deref-pred.ll -Transforms/LoopVectorize/X86/masked_load_store.ll -Transforms/LoopVectorize/X86/masked-store-cost.ll -Transforms/LoopVectorize/X86/multi-exit-cost.ll -Transforms/LoopVectorize/X86/no_fpmath.ll -Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll -Transforms/LoopVectorize/X86/optsize.ll -Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll -Transforms/LoopVectorize/X86/pr109581-unused-blend.ll -Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll -Transforms/LoopVectorize/X86/pr23997.ll -Transforms/LoopVectorize/X86/pr47437.ll -Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll -Transforms/LoopVectorize/X86/pr54634.ll -Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll -Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll -Transforms/LoopVectorize/X86/pr81872.ll -Transforms/LoopVectorize/X86/predicate-switch.ll -Transforms/LoopVectorize/X86/propagate-metadata.ll -Transforms/LoopVectorize/X86/reduction-fastmath.ll -Transforms/LoopVectorize/X86/reg-usage.ll -Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll -Transforms/LoopVectorize/X86/replicate-uniform-call.ll -Transforms/LoopVectorize/X86/scatter_crash.ll -Transforms/LoopVectorize/X86/small-size.ll -Transforms/LoopVectorize/X86/strided_load_cost.ll -Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll -Transforms/LoopVectorize/X86/tail_loop_folding.ll -Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll -Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll -Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll -Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll -Transforms/LoopVectorize/X86/vector_max_bandwidth.ll -Transforms/LoopVectorize/X86/vector_ptr_load_store.ll -Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll -Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll -Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll -Transforms/LoopVectorize/X86/x86-pr39099.ll -Transforms/LoopVectorize/X86/x86-predication.ll Transforms/LoopVersioning/add-phi-update-users.ll Transforms/LoopVersioning/basic.ll Transforms/LoopVersioning/bound-check-partially-known.ll From 8d48d69911c1c9a212a0f129a15fc92e803b135e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 29 Sep 2025 18:43:05 +0100 Subject: [PATCH 162/878] [X86] canCreateUndefOrPoisonForTargetNode/isGuaranteedNotToBeUndefOrPoisonForTargetNode - add X86ISD::INSERTPS handling (#161234) X86ISD::INSERTPS shuffles can't create undef/poison itself, allowing us to fold freeze(insertps(x,y,i)) -> insertps(freeze(x),freeze(y),i) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ 
 llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll | 5 +----
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 292eab77e2002..cd04ff5bc7ef4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45169,6 +45169,7 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::Wrapper:
   case X86ISD::WrapperRIP:
     return true;
+  case X86ISD::INSERTPS:
   case X86ISD::BLENDI:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
@@ -45239,6 +45240,7 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   case X86ISD::BLENDV:
     return false;
   // SSE target shuffles.
+  case X86ISD::INSERTPS:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
   case X86ISD::UNPCKL:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index bec33492bbf1e..3590c4d027be7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -62,15 +62,12 @@ define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1)
 define <4 x float> @freeze_insertps(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-LABEL: freeze_insertps:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE-NEXT:    insertps {{.*#+}} xmm1 = xmm0[1],xmm1[1,2,3]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: freeze_insertps:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[1],xmm1[1,2,3]
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %s0 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 16)
   %f0 = freeze <4 x float> %s0

From 68c3b5363d64b15962be18ebb3a59e7713437922 Mon Sep 17 00:00:00 2001
From: Joseph Bak <36170953+josephbak@users.noreply.github.com>
Date: Mon, 29 Sep 2025 13:48:17 -0400
Subject: [PATCH 163/878] =?UTF-8?q?[mlir][doc]=20Fix=20typo=20in=20Ch6=20t?=
 =?UTF-8?q?utorial:=20'toy.cpp'=20=E2=86=92=20'toyc.cpp'=20(NFC)=20(#16124?=
 =?UTF-8?q?5)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes a documentation typo in the MLIR Toy tutorial (Ch6). The
tutorial currently refers to `examples/toy/Ch6/toy.cpp`, but the correct
file name is `examples/toy/Ch6/toyc.cpp`.
---
 mlir/docs/Tutorials/Toy/Ch-6.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/Toy/Ch-6.md b/mlir/docs/Tutorials/Toy/Ch-6.md
index 529de55304206..178c07338ac45 100644
--- a/mlir/docs/Tutorials/Toy/Ch-6.md
+++ b/mlir/docs/Tutorials/Toy/Ch-6.md
@@ -245,7 +245,7 @@ define void @main()
 ```
 
 The full code listing for dumping LLVM IR can be found in
-`examples/toy/Ch6/toy.cpp` in the `dumpLLVMIR()` function:
+`examples/toy/Ch6/toyc.cpp` in the `dumpLLVMIR()` function:
 
 ```c++

From 7b96dfbb7d8cdadc2caf04fecc1060b9eeb1f4e3 Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Mon, 29 Sep 2025 11:00:04 -0700
Subject: [PATCH 164/878] [llvm][mustache] Align standalone partial indentation
 with spec (#159185)

The current implementation did not correctly handle indentation for
standalone partial tags: indentation was only applied to lines following a
newline, not to the first line of the partial's content. This was fixed by
updating the AddIndentation implementation to prepend the indentation to the
first line of the partial.
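Concretely, this is the spec's standalone-indentation behavior that the unit-test change below locks in. A small sketch in the style of the existing test, assuming the upstream Template/registerPartial API and the spec's partial and data values (they are not quoted from this patch):

#include "llvm/Support/Mustache.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::mustache;

int main() {
  // The partial tag is indented by one space, so every line the partial
  // expands to -- including the first -- must gain one space of indentation.
  Template T("\\\n {{>partial}}\n/\n");
  T.registerPartial("partial", "|\n{{{content}}}\n|\n");
  json::Value D = json::Object{{"content", "<\n->"}};
  std::string Out;
  raw_string_ostream OS(Out);
  T.render(D, OS);
  // Expected: "\\\n |\n <\n ->\n |\n/\n". Before this fix, the first line
  // ("|") came out unindented because indentation was only emitted after a
  // newline had been seen.
  outs() << Out;
}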
---
 llvm/lib/Support/Mustache.cpp           | 18 +++++++++++-------
 llvm/unittests/Support/MustacheTest.cpp |  2 +-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 686688ad6c25f..8da6fdb7beff9 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -282,18 +282,15 @@ void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) {
 // For example:
 //  The template string
 //  " \t{{#section}}A{{/section}}"
-// would be considered as having no text ahead and would be render as
+// would be considered as having no text ahead and would be rendered as:
 //  "A"
-// The exception for this is partial tag which requires us to
-// keep track of the indentation once it's rendered.
 void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx,
                       Token &CurrentToken, Token::Type CurrentType) {
   Token &PrevToken = Tokens[Idx - 1];
   StringRef PrevTokenBody = PrevToken.TokenBody;
   StringRef Unindented = PrevTokenBody.rtrim(" \r\t\v");
   size_t Indentation = PrevTokenBody.size() - Unindented.size();
-  if (CurrentType != Token::Type::Partial)
-    PrevToken.TokenBody = Unindented.str();
+  PrevToken.TokenBody = Unindented.str();
   CurrentToken.setIndentation(Indentation);
 }
 
@@ -439,7 +436,8 @@ class AddIndentationStringStream : public raw_ostream {
 public:
   explicit AddIndentationStringStream(llvm::raw_ostream &WrappedStream,
                                       size_t Indentation)
-      : Indentation(Indentation), WrappedStream(WrappedStream) {
+      : Indentation(Indentation), WrappedStream(WrappedStream),
+        NeedsIndent(true) {
     SetUnbuffered();
   }
 
@@ -448,10 +446,15 @@ class AddIndentationStringStream : public raw_ostream {
     llvm::StringRef Data(Ptr, Size);
     SmallString<0> Indent;
     Indent.resize(Indentation, ' ');
+
     for (char C : Data) {
+      if (NeedsIndent && C != '\n') {
+        WrappedStream << Indent;
+        NeedsIndent = false;
+      }
       WrappedStream << C;
       if (C == '\n')
-        WrappedStream << Indent;
+        NeedsIndent = true;
     }
   }
 
@@ -460,6 +463,7 @@ class AddIndentationStringStream : public raw_ostream {
 private:
   size_t Indentation;
   llvm::raw_ostream &WrappedStream;
+  bool NeedsIndent;
 };
 
 class Parser {
diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp
index 02eaed4244cc7..3635463cd7570 100644
--- a/llvm/unittests/Support/MustacheTest.cpp
+++ b/llvm/unittests/Support/MustacheTest.cpp
@@ -998,7 +998,7 @@ TEST(MustachePartials, StandaloneIndentation) {
   std::string Out;
   raw_string_ostream OS(Out);
   T.render(D, OS);
-  EXPECT_NE("\\\n |\n <\n ->\n |\n/\n", Out);
+  EXPECT_EQ("\\\n |\n <\n ->\n |\n/\n", Out);
 }
 
 TEST(MustacheLambdas, BasicInterpolation) {

From 1f82553e385f449efee92da3dca43facb4a1ee66 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 29 Sep 2025 10:57:03 -0700
Subject: [PATCH 165/878] [SLP]Fix mixing xor instructions in the same opcode
 analysis

Xor with a 0 operand should not be compatible with multiplication-based
instructions, only with or/xor/add/sub.
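The identity constants involved make the distinction concrete: the or/xor/add/sub family (and `and`, with an all-ones mask) leaves a value unchanged with constant 0 or -1, while the multiplication family needs 1. A compile-time sketch in plain C++, just to illustrate the algebra the interchangeable-opcode logic relies on (not code from the patch):

// For any x, these all reproduce x, which is why an `xor x, 0` may be
// re-labelled as or/and/sub/add with a suitably rewritten constant.
static_assert((5 ^ 0) == 5 && (5 | 0) == 5 && (5 + 0) == 5 &&
              (5 - 0) == 5 && (5 & ~0) == 5);
// The multiplication family's identity element is 1, not 0, so letting the
// two families mix can pair the wrong constant with the chosen opcode.
static_assert((5 * 1) == 5);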
Fixes #161140
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp          | 8 +++++++-
 .../Transforms/SLPVectorizer/X86/xor-combined-opcode.ll  | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 065622efc7ecc..c547662c3a77e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1100,7 +1100,9 @@ class BinOpSameOpcodeHelper {
       // constant + x cannot be -constant - x
       // instead, it should be x - -constant
       if (Pos == 1 ||
-          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
+          ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
+            FromOpcode == Instruction::Xor) &&
+           ToOpcode == Instruction::Sub))
         return SmallVector<Value *>({LHS, RHS});
       return SmallVector<Value *>({RHS, LHS});
     }
@@ -1188,6 +1190,10 @@ class BinOpSameOpcodeHelper {
       if (CIValue.isAllOnes())
         InterchangeableMask = CanBeAll;
       break;
+    case Instruction::Xor:
+      if (CIValue.isZero())
+        InterchangeableMask = XorBIT | OrBIT | AndBIT | SubBIT | AddBIT;
+      break;
     default:
       if (CIValue.isZero())
         InterchangeableMask = CanBeAll;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll b/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll
index 7664fda8c5a3c..9cdcdf1b5d5ca 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/xor-combined-opcode.ll
@@ -6,7 +6,7 @@ define i1 @foo(i1 %v) { ; assume %v is 1
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i1> poison, i1 [[V]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[TMP0]], <2 x i1> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i1> <i1 true, i1 true>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <2 x i1> zeroinitializer, [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i1 [[TMP3]], [[TMP4]]

From df77a86f9b491afd9816277bcff60c1c9014631d Mon Sep 17 00:00:00 2001
From: Sam Elliott
Date: Mon, 29 Sep 2025 19:15:41 +0100
Subject: [PATCH 166/878] [RISCV][NFC] Rename getOppositeBranchCondition
 (#160972)

---
 llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp | 2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp         | 8 ++++----
 llvm/lib/Target/RISCV/RISCVInstrInfo.h           | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index cb57c4377779f..d4d9e5430d390 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -193,7 +193,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,
   // we need to invert the branch condition to jump over TrueBB when the
   // condition is false.
   auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
-  CC = RISCVCC::getOppositeBranchCondition(CC);
+  CC = RISCVCC::getInverseBranchCondition(CC);
 
   // Insert branch instruction.
 BuildMI(MBB, MBBI, DL, TII->get(RISCVCC::getBrCond(CC)))
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 56db09a286547..6d418fda82534 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1134,7 +1134,7 @@ unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC, unsigned SelectOpc) {
   }
 }
 
-RISCVCC::CondCode RISCVCC::getOppositeBranchCondition(RISCVCC::CondCode CC) {
+RISCVCC::CondCode RISCVCC::getInverseBranchCondition(RISCVCC::CondCode CC) {
   switch (CC) {
   default:
     llvm_unreachable("Unrecognized conditional branch");
@@ -1554,7 +1554,7 @@ bool RISCVInstrInfo::optimizeCondBranch(MachineInstr &MI) const {
     return Register();
   };
 
-  unsigned NewOpc = RISCVCC::getBrCond(getOppositeBranchCondition(CC));
+  unsigned NewOpc = RISCVCC::getBrCond(getInverseBranchCondition(CC));
 
   // Might be case 1.
   // Don't change 0 to 1 since we can use x0.
@@ -1801,7 +1801,7 @@ RISCVInstrInfo::optimizeSelect(MachineInstr &MI,
   // Add condition code, inverting if necessary.
   auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
   if (Invert)
-    CC = RISCVCC::getOppositeBranchCondition(CC);
+    CC = RISCVCC::getInverseBranchCondition(CC);
   NewMI.addImm(CC);
 
   // Copy the false register.
@@ -3978,7 +3978,7 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
   case RISCV::PseudoCCMOVGPR: {
     // CCMOV can be commuted by inverting the condition.
     auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm());
-    CC = RISCVCC::getOppositeBranchCondition(CC);
+    CC = RISCVCC::getInverseBranchCondition(CC);
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.getOperand(3).setImm(CC);
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI*/ false,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 2bc499bf29957..42a0c4c01b472 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -44,7 +44,7 @@ enum CondCode {
   COND_INVALID
 };
 
-CondCode getOppositeBranchCondition(CondCode);
+CondCode getInverseBranchCondition(CondCode);
 unsigned getBrCond(CondCode CC, unsigned SelectOpc = 0);
 } // end of namespace RISCVCC
 

From 8dde784135afcbbf4aaca64db163f3e726cff339 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Mon, 29 Sep 2025 19:16:36 +0100
Subject: [PATCH 167/878] [LV] Add test for more precise no-free checks w/o
 nosync attribute.

Also filter out uninteresting parts from check lines.
--- ...able-info-from-assumption-constant-size.ll | 677 ++++++------------ 1 file changed, 210 insertions(+), 467 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 8e9cb23c4b4e3..75420d40f2aad 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 ; RUN: opt -p loop-vectorize -force-vector-width=2 -S %s | FileCheck %s declare void @llvm.assume(i1) @@ -47,29 +47,8 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: br label %loop.header @@ -123,27 +102,8 @@ define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: br label 
%[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4) ] @@ -216,29 +176,8 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 2) ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: br label %loop.header @@ -312,29 +251,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi 
i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: br label %loop.header @@ -408,29 +326,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: br label %loop.header @@ -504,29 +401,8 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], 
ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: br label %loop.header @@ -596,29 +472,8 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: br label %loop.header @@ -692,29 +547,8 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq 
i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: br label %loop.header @@ -747,7 +581,7 @@ exit: define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i64 %N) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_variable_trip_count( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -792,30 +626,8 @@ define void @deref_assumption_in_header_variable_trip_count(ptr noalias noundef ; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void ; entry: br label %loop.header @@ -867,28 +679,8 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: 
[[LOOP_THEN]]: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ] @@ -958,28 +750,8 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 1 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 3999) ] @@ -1031,28 +803,8 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 
[[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] @@ -1105,28 +857,8 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ] @@ -1196,28 +928,8 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add 
nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 4000) ] @@ -1287,28 +999,8 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %a, i64 3999) ] @@ -1376,27 +1068,8 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_ ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), 
"dereferenceable"(ptr %a, i64 4) ] @@ -1465,27 +1138,8 @@ define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_c ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: %a = call ptr @get_ptr() @@ -1530,10 +1184,10 @@ define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(p ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] ; CHECK: [[PRED_LOAD_IF]]: ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]] @@ -1542,7 +1196,7 @@ define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(p ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] ; CHECK: [[PRED_LOAD_CONTINUE]]: ; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 ; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 @@ -1551,36 +1205,16 @@ define void @deref_assumption_in_header_constant_trip_count_nofree_via_context(p ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_CONTINUE2]]: -; CHECK-NEXT: 
[[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP13]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
 ; CHECK-NEXT:    store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT:    [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT:    [[C_1:%.*]] = icmp sge i32 [[L_B]], 0
-; CHECK-NEXT:    br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]]
-; CHECK:       [[LOOP_THEN]]:
-; CHECK-NEXT:    [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[MERGE]], ptr [[GEP_C]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
-; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    br [[EXIT:label %.*]]
+; CHECK:       [[SCALAR_PH:.*:]]
 ;
 entry:
   call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ]
@@ -1621,19 +1255,14 @@ define void @deref_assumption_in_header_constant_trip_count_may_free(ptr noalias
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[A]], <2 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 0
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP1]], i64 4), "dereferenceable"(ptr [[TMP1]], i64 4) ]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1
-; CHECK-NEXT:    call void @llvm.assume(i1 true) [ "align"(ptr [[TMP2]], i64 4), "dereferenceable"(ptr [[TMP2]], i64 4) ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
 ; CHECK:       [[PRED_LOAD_IF]]:
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x 
ptr> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] @@ -1642,7 +1271,8 @@ define void @deref_assumption_in_header_constant_trip_count_may_free(ptr noalias ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] ; CHECK: [[PRED_LOAD_IF1]]: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x ptr> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i32 1 ; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] @@ -1652,33 +1282,11 @@ define void @deref_assumption_in_header_constant_trip_count_may_free(ptr noalias ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] ; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ] -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 -; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[LOOP_THEN:.*]] -; CHECK: [[LOOP_THEN]]: -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ [[L_A]], %[[LOOP_THEN]] ], [ [[L_B]], %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[GEP_C]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] ; entry: call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] @@ -1688,7 +1296,6 @@ entry: loop.header: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] %gep.a = getelementptr i32, ptr %a, i64 %iv - call void @llvm.assume(i1 true) [ "align"(ptr %gep.a, i64 4), "dereferenceable"(ptr %gep.a, i64 4) ] %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv %l.b = load i32, ptr %gep.b, align 4 %c.1 = icmp sge i32 %l.b, 0 @@ -1709,27 +1316,163 @@ loop.latch: exit: ret void } -;. 
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]} -; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} -; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]} -; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} -; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]]} -;. + +define void @deref_assumption_in_header_constant_trip_count_nofree_via_context_but_missing_nosync(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_nofree_via_context_but_missing_nosync( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement 
<2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP13]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +define void @deref_assumption_in_header_constant_trip_count_multiple_loop_predecessors(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c, i1 %pre) nosync { +; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_multiple_loop_predecessors( +; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i1 [[PRE:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] +; CHECK-NEXT: br i1 [[PRE]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: store i32 0, ptr [[A]], align 4 +; CHECK-NEXT: br label %[[LOOP_HEADER_PREHEADER:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: store i32 0, ptr [[B]], align 4 +; CHECK-NEXT: br label %[[LOOP_HEADER_PREHEADER]] +; CHECK: [[LOOP_HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_LOAD_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; 
CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP6]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP8]], label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP7]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i32> [[TMP13]], <2 x i32> [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %a, i64 4), "dereferenceable"(ptr %a, i64 4000) ] + br i1 %pre, label %then, label %else + +then: + store i32 0, ptr %a + br label %loop.header + +else: + store i32 0, ptr %b + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %loop.latch ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %l.b = load i32, ptr %gep.b, align 4 + %c.1 = icmp sge i32 %l.b, 0 + br i1 %c.1, label %loop.latch, label %loop.then + +loop.then: + %l.a = load i32, ptr %gep.a, align 4 + br label %loop.latch + +loop.latch: + %merge = phi i32 [ %l.a, %loop.then ], [ %l.b, %loop.header ] + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %merge, ptr %gep.c, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + + From 2522a953546dbbf1cc4ad45c3e5218b8c08c0620 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 29 Sep 2025 11:19:47 -0700 Subject: [PATCH 168/878] [llvm][mustache] Precommit test for Set Delimiter (#159186) Adds a new unit test for the Mustache Set Delimiter feature. This test is written with inverted logic (`EXPECT_NE`) so that it passes with the current implementation, which does not support the feature. Once the feature is implemented, this test will fail, signaling that the test logic should be flipped to `EXPECT_EQ`. 
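To make the intent concrete, this is the shape every test below follows (an
illustrative sketch reusing the `Template`/`Value`/`raw_string_ostream` API
already used by the tests in this patch; the test name is made up):

```cpp
TEST(MustacheDelimiters, ExampleInvertedExpectation) {
  // "{{=<% %>=}}" should switch the tag delimiters to <% %>, but the
  // current implementation ignores it, so the spec-mandated output is
  // NOT produced yet.
  Value D = Object{{"text", "Hey!"}};
  auto T = Template("{{=<% %>=}}(<%text%>)");
  std::string Out;
  raw_string_ostream OS(Out);
  T.render(D, OS);
  EXPECT_NE("(Hey!)", Out); // Flip to EXPECT_EQ once the feature lands.
}
```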
--- llvm/unittests/Support/MustacheTest.cpp | 136 ++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp index 3635463cd7570..0ebbc58e023cc 100644 --- a/llvm/unittests/Support/MustacheTest.cpp +++ b/llvm/unittests/Support/MustacheTest.cpp @@ -1328,3 +1328,139 @@ TEST(MustacheTripleMustache, WithPadding) { T.render(D, OS); EXPECT_EQ("|---|", Out); } + +TEST(MustacheDelimiters, PairBehavior) { + Value D = Object{{"text", "Hey!"}}; + auto T = Template("{{=<% %>=}}(<%text%>)"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("(Hey!)", Out); +} + +TEST(MustacheDelimiters, SpecialCharacters) { + Value D = Object{{"text", "It worked!"}}; + auto T = Template("({{=[ ]=}}[text])"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("(It worked!)", Out); +} + +TEST(MustacheDelimiters, Sections) { + Value D = Object{{"section", true}, {"data", "I got interpolated."}}; + auto T = + Template("[\n{{#section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= " + "| | =}}\n|#section|\n {{data}}\n |data|\n|/section|\n]\n"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " + "interpolated.\n]\n", + Out); +} + +TEST(MustacheDelimiters, InvertedSections) { + Value D = Object{{"section", false}, {"data", "I got interpolated."}}; + auto T = + Template("[\n{{^section}}\n {{data}}\n |data|\n{{/section}}\n\n{{= " + "| | =}}\n|^section|\n {{data}}\n |data|\n|/section|\n]\n"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " + "interpolated.\n]\n", + Out); +} + +TEST(MustacheDelimiters, PartialInheritence) { + Value D = Object{{"value", "yes"}}; + auto T = Template("[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n"); + T.registerPartial("include", ".{{value}}."); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("[ .yes. ]\n[ .yes. ]\n", Out); +} + +TEST(MustacheDelimiters, PostPartialBehavior) { + Value D = Object{{"value", "yes"}}; + auto T = Template("[ {{>include}} ]\n[ .{{value}}. .|value|. ]\n"); + T.registerPartial("include", ".{{value}}. {{= | | =}} .|value|."); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("[ .yes. .yes. ]\n[ .yes. .|value|. 
]\n", Out); +} + +TEST(MustacheDelimiters, SurroundingWhitespace) { + Value D = Object{}; + auto T = Template("| {{=@ @=}} |"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("| |", Out); +} + +TEST(MustacheDelimiters, OutlyingWhitespaceInline) { + Value D = Object{}; + auto T = Template(" | {{=@ @=}}\n"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ(" | \n", Out); +} + +TEST(MustacheDelimiters, StandaloneTag) { + Value D = Object{}; + auto T = Template("Begin.\n{{=@ @=}}\nEnd.\n"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("Begin.\nEnd.\n", Out); +} + +TEST(MustacheDelimiters, IndentedStandaloneTag) { + Value D = Object{}; + auto T = Template("Begin.\n {{=@ @=}}\nEnd.\n"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("Begin.\nEnd.\n", Out); +} + +TEST(MustacheDelimiters, StandaloneLineEndings) { + Value D = Object{}; + auto T = Template("|\r\n{{= @ @ =}}\r\n|"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("|\r\n|", Out); +} + +TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) { + Value D = Object{}; + auto T = Template(" {{=@ @=}}\n="); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("=", Out); +} + +TEST(MustacheDelimiters, StandaloneWithoutNewline) { + Value D = Object{}; + auto T = Template("=\n {{=@ @=}}"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("=\n", Out); +} + +TEST(MustacheDelimiters, PairwithPadding) { + Value D = Object{}; + auto T = Template("|{{= @ @ =}}|"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("||", Out); +} From 0f70b440160fb92be3536d26d6d97f4c61de23bd Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Mon, 29 Sep 2025 11:21:21 -0700 Subject: [PATCH 169/878] [DWARFVerifier] Fix test verify_stmt_seq.yaml to write output files to temp directory. (#161247) --- llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml b/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml index 5312c2573902d..17e91f1cc1393 100644 --- a/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml +++ b/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml @@ -2,9 +2,9 @@ # Then manually tempered with some of the value of the attribute # I hope there are easier ways to construct tests like this. -# RUN: yaml2obj %s -o verify_stmt_seq.o -# RUN: not llvm-dwarfdump -verify -debug-info verify_stmt_seq.o | FileCheck %s --check-prefix=CHECK_INVALID --implicit-check-not=error: -# RUN: llvm-dwarfdump -debug-line -verbose -debug-info verify_stmt_seq.o | FileCheck %s --check-prefix=CHECK_DEBUG_LINE +# RUN: yaml2obj %s -o %t.o +# RUN: not llvm-dwarfdump -verify -debug-info %t.o | FileCheck %s --check-prefix=CHECK_INVALID --implicit-check-not=error: +# RUN: llvm-dwarfdump -debug-line -verbose -debug-info %t.o | FileCheck %s --check-prefix=CHECK_DEBUG_LINE # CHECK_INVALID: error: DW_AT_LLVM_stmt_sequence offset 0x00000000 is not within the line table bounds [0x00000034, 0x000000fd) # CHECK_INVALID: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x00000000) From 5da28bd331b243b62f30a211927b4e33b8dd943b Mon Sep 17 00:00:00 2001 From: Victor Chernyakin Date: Mon, 29 Sep 2025 11:28:20 -0700 Subject: [PATCH 170/878] [clang-tidy][NFC] Make a few `std::string`s into `StringRef`s (#160961) Following up 12cb540. 
Also, that commit left behind a few cases where a temporary `StringRef` was being constructed from those variables just to use its `.split()` function, so this PR cleans those up too. --- .../android/ComparisonInTempFailureRetryCheck.cpp | 2 +- .../clang-tidy/bugprone/AssertSideEffectCheck.cpp | 2 +- .../clang-tidy/bugprone/ExceptionEscapeCheck.cpp | 6 +++--- .../clang-tidy/bugprone/ExceptionEscapeCheck.h | 4 ++-- .../clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp | 2 +- .../clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h | 2 +- clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp | 2 +- .../clang-tidy/openmp/ExceptionEscapeCheck.cpp | 2 +- clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp index 78e58bccaeba1..36ac9a44695c9 100644 --- a/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp +++ b/clang-tools-extra/clang-tidy/android/ComparisonInTempFailureRetryCheck.cpp @@ -19,7 +19,7 @@ ComparisonInTempFailureRetryCheck::ComparisonInTempFailureRetryCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), RawRetryList(Options.get("RetryMacros", "TEMP_FAILURE_RETRY")) { - StringRef(RawRetryList).split(RetryMacros, ",", -1, false); + RawRetryList.split(RetryMacros, ",", -1, false); } void ComparisonInTempFailureRetryCheck::storeOptions( diff --git a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp index 227641d73885e..170050247014a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/AssertSideEffectCheck.cpp @@ -92,7 +92,7 @@ AssertSideEffectCheck::AssertSideEffectCheck(StringRef Name, RawAssertList(Options.get("AssertMacros", "assert,NSAssert,NSCAssert")), IgnoredFunctions(utils::options::parseListPair( "__builtin_expect;", Options.get("IgnoredFunctions", ""))) { - StringRef(RawAssertList).split(AssertMacros, ",", -1, false); + RawAssertList.split(AssertMacros, ",", -1, false); } // The options are explained in AssertSideEffectCheck.h. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
index 3d839b5111cc8..837a86ff8655e 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.cpp
@@ -39,12 +39,12 @@ ExceptionEscapeCheck::ExceptionEscapeCheck(StringRef Name,
       RawIgnoredExceptions(Options.get("IgnoredExceptions", "")) {
   llvm::SmallVector<StringRef, 8> FunctionsThatShouldNotThrowVec,
       IgnoredExceptionsVec;
-  StringRef(RawFunctionsThatShouldNotThrow)
-      .split(FunctionsThatShouldNotThrowVec, ",", -1, false);
+  RawFunctionsThatShouldNotThrow.split(FunctionsThatShouldNotThrowVec, ",", -1,
+                                       false);
   FunctionsThatShouldNotThrow.insert_range(FunctionsThatShouldNotThrowVec);
 
   llvm::StringSet<> IgnoredExceptions;
-  StringRef(RawIgnoredExceptions).split(IgnoredExceptionsVec, ",", -1, false);
+  RawIgnoredExceptions.split(IgnoredExceptionsVec, ",", -1, false);
   IgnoredExceptions.insert_range(IgnoredExceptionsVec);
   Tracer.ignoreExceptions(std::move(IgnoredExceptions));
   Tracer.ignoreBadAlloc(true);
diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h
index ae6e2024e415d..bd1e7bae57f5d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h
@@ -33,8 +33,8 @@ class ExceptionEscapeCheck : public ClangTidyCheck {
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
 
 private:
-  std::string RawFunctionsThatShouldNotThrow;
-  std::string RawIgnoredExceptions;
+  StringRef RawFunctionsThatShouldNotThrow;
+  StringRef RawIgnoredExceptions;
   llvm::StringSet<> FunctionsThatShouldNotThrow;
 
   utils::ExceptionAnalyzer Tracer;
diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp
index aa95fadb0290b..b8bca7286ce69 100644
--- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.cpp
@@ -20,7 +20,7 @@ ProperlySeededRandomGeneratorCheck::ProperlySeededRandomGeneratorCheck(
     : ClangTidyCheck(Name, Context),
       RawDisallowedSeedTypes(
          Options.get("DisallowedSeedTypes", "time_t,std::time_t")) {
-  StringRef(RawDisallowedSeedTypes).split(DisallowedSeedTypes, ',');
+  RawDisallowedSeedTypes.split(DisallowedSeedTypes, ',');
 }
 
 void ProperlySeededRandomGeneratorCheck::storeOptions(
diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h
index ea30127e25e08..7da01cc857187 100644
--- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h
+++ b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h
@@ -33,7 +33,7 @@ class ProperlySeededRandomGeneratorCheck : public ClangTidyCheck {
   void checkSeed(const ast_matchers::MatchFinder::MatchResult &Result,
                  const T *Func);
 
-  std::string RawDisallowedSeedTypes;
+  StringRef RawDisallowedSeedTypes;
   SmallVector<StringRef, 5> DisallowedSeedTypes;
 };
 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
index 4084d713665ea..b921819ad13e6 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp
@@ -494,7 +494,7 @@ 
UseNullptrCheck::UseNullptrCheck(StringRef Name, ClangTidyContext *Context)
       NullMacrosStr(Options.get("NullMacros", "NULL")),
       IgnoredTypes(utils::options::parseStringList(Options.get(
           "IgnoredTypes", "_CmpUnspecifiedParam;^std::__cmp_cat::__unspec"))) {
-  StringRef(NullMacrosStr).split(NullMacros, ",");
+  NullMacrosStr.split(NullMacros, ",");
 }
 
 void UseNullptrCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
diff --git a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp
index f9becee92e148..3801fc0f420e5 100644
--- a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp
+++ b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.cpp
@@ -23,7 +23,7 @@ ExceptionEscapeCheck::ExceptionEscapeCheck(StringRef Name,
 
   llvm::SmallVector<StringRef, 8> IgnoredExceptionsVec;
   llvm::StringSet<> IgnoredExceptions;
-  StringRef(RawIgnoredExceptions).split(IgnoredExceptionsVec, ",", -1, false);
+  RawIgnoredExceptions.split(IgnoredExceptionsVec, ",", -1, false);
   llvm::transform(IgnoredExceptionsVec, IgnoredExceptionsVec.begin(),
                   [](StringRef S) { return S.trim(); });
   IgnoredExceptions.insert_range(IgnoredExceptionsVec);
diff --git a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h
index 39da124a4b37c..757a9337bb9ed 100644
--- a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h
+++ b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h
@@ -30,7 +30,7 @@ class ExceptionEscapeCheck : public ClangTidyCheck {
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
 
 private:
-  std::string RawIgnoredExceptions;
+  StringRef RawIgnoredExceptions;
 
   utils::ExceptionAnalyzer Tracer;
 };

From 7de73c4e9d5ee1ec00bb57427ac04746ce858c3c Mon Sep 17 00:00:00 2001
From: Abhinav Gaba
Date: Mon, 29 Sep 2025 11:47:21 -0700
Subject: [PATCH 171/878] [OpenMP][Offload] Support `PRIVATE | ATTACH` maps
 for corresponding-pointer-initialization. (#160760)

`PRIVATE | ATTACH` maps can be used to represent firstprivate pointers
that should be initialized with the pointee's device address, if its
lookup succeeds, or with the original host pointee's address otherwise.

With this, for a test like the following:

```f90
integer, pointer :: p(:)
!$omp target map(p(1))
 ...
 print*, p(1)
!$omp end target
```

The codegen can look like:

```llvm
; maps for p:
; &p(1), &p(1), sizeof(p(1)), TO|FROM                          //(1)
; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH               //(2)
; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE|ATTACH|PARAM //(3)
call... @__omp_outlined...(ptr %ref_ptr_of_p)
```

* `(1)` maps the pointee `p(1)`.
* `(2)` attaches it to the (previously) mapped `ref_ptr(p)`, if present.
It can be controlled via OpenMP 6.1's `attach(auto/always/never)`
map-type modifiers.
* `(3)` privatizes and initializes the local `ref_ptr(p)`, which gets
passed in as the kernel argument `%ref_ptr_of_p`. Can be skipped if `p`
is not referenced directly within the region.

While similar mapping can be used for C/C++, it's more important/useful
for Fortran as we can avoid creating another argument for passing the
descriptor, and use that to initialize the private copy in the body of
the kernel.
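A rough C/C++ analogue of the Fortran case above (an illustrative sketch,
not taken from this patch; the exact map-type flags emitted are up to the
frontend):

```cpp
void inc_first(int *p) {
  // 'p' itself is not mapped here; it becomes firstprivate on the target
  // and is expected to be initialized via a PRIVATE|ATTACH entry: to the
  // device address of p[0] if the pointee lookup succeeds, or keeping the
  // host address otherwise.
  #pragma omp target map(tofrom: p[0])
  { p[0] += 1; }
}
```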
---
 offload/libomptarget/omptarget.cpp | 371 ++++++++++++++++++++++++-----
 1 file changed, 307 insertions(+), 64 deletions(-)

diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp
index 39286d41ec865..a1950cbb62908 100644
--- a/offload/libomptarget/omptarget.cpp
+++ b/offload/libomptarget/omptarget.cpp
@@ -330,6 +330,54 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
   return Rc;
 }
 
+/// Returns a buffer of the requested \p Size, to be used as the source for
+/// `submitData`.
+///
+/// For small buffers (`Size <= sizeof(void*)`), uses \p AsyncInfo's
+/// getVoidPtrLocation().
+/// For larger buffers, creates a dynamic buffer which will be eventually
+/// deleted by \p AsyncInfo's post-processing callback.
+static char *getOrCreateSourceBufferForSubmitData(AsyncInfoTy &AsyncInfo,
+                                                  int64_t Size) {
+  constexpr int64_t VoidPtrSize = sizeof(void *);
+
+  if (Size <= VoidPtrSize) {
+    void *&BufferElement = AsyncInfo.getVoidPtrLocation();
+    return reinterpret_cast<char *>(&BufferElement);
+  }
+
+  // Create a dynamic buffer for larger data and schedule its deletion.
+  char *DataBuffer = new char[Size];
+  AsyncInfo.addPostProcessingFunction([DataBuffer]() {
+    delete[] DataBuffer;
+    return OFFLOAD_SUCCESS;
+  });
+  return DataBuffer;
+}
+
+/// Calculates the target pointee base by applying the host
+/// pointee begin/base delta to the target pointee begin.
+///
+/// ```
+/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase)
+/// ```
+static void *calculateTargetPointeeBase(void *HstPteeBase, void *HstPteeBegin,
+                                        void *TgtPteeBegin) {
+  uint64_t Delta = reinterpret_cast<uintptr_t>(HstPteeBegin) -
+                   reinterpret_cast<uintptr_t>(HstPteeBase);
+  void *TgtPteeBase = reinterpret_cast<void *>(
+      reinterpret_cast<uintptr_t>(TgtPteeBegin) - Delta);
+
+  DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD
+     ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n",
+     DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta);
+  DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD
+     "\n",
+     DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin));
+
+  return TgtPteeBase;
+}
+
 /// Utility function to perform a pointer attachment operation.
 ///
 /// For something like:
@@ -399,16 +447,8 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
   constexpr int64_t VoidPtrSize = sizeof(void *);
   assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small");
 
-  uint64_t Delta = reinterpret_cast<uintptr_t>(HstPteeBegin) -
-                   reinterpret_cast<uintptr_t>(HstPteeBase);
-  void *TgtPteeBase = reinterpret_cast<void *>(
-      reinterpret_cast<uintptr_t>(TgtPteeBegin) - Delta);
-  DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD
-     ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n",
-     DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta);
-  DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD
-     "\n",
-     DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin));
+  void *TgtPteeBase =
+      calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, TgtPteeBegin);
 
   // Add shadow pointer tracking
   if (!PtrTPR.getEntry()->addShadowPointer(
@@ -435,48 +475,32 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo,
     return OFFLOAD_SUCCESS;
   };
 
-  bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize;
-  if (!IsPtrAFortranDescriptor) {
-    // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as
-    // the buffer space for the submission.
-    void *&BufferElement = AsyncInfo.getVoidPtrLocation();
-    BufferElement = TgtPteeBase;
-
-    // Submit the updated pointer value to device
-    return HandleSubmitResult(Device.submitData(
-        TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry()));
+  // Get a buffer to be used as the source for data submission.
+  char *SrcBuffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize);
+
+  // The pointee's address should occupy the first VoidPtrSize bytes
+  // irrespective of HstPtrSize.
+  std::memcpy(SrcBuffer, &TgtPteeBase, VoidPtrSize);
+
+  // For larger "pointers" (e.g., Fortran descriptors), copy remaining
+  // descriptor fields from the host descriptor into the buffer.
+  if (HstPtrSize > VoidPtrSize) {
+    uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+    void *HstDescriptorFieldsAddr =
+        reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize;
+    std::memcpy(SrcBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
+                HstDescriptorFieldsSize);
+
+    DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD
+       ") (pointer + %" PRId64 " additional bytes from host descriptor " DPxMOD
+       ")\n",
+       HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize,
+       DPxPTR(HstDescriptorFieldsAddr));
   }
 
-  // For larger "pointers" (like Fortran's descriptors), we create a dynamic
-  // buffer, which will be eventually destroyed by AsyncInfo's post-processing
-  // callback.
-  char *DataBuffer = new char[HstPtrSize];
-
-  // For such descriptors, to the first VoidPtrSize bytes, we store the
-  // pointee's device address.
-  std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *));
-
-  // And to the remaining bytes, we copy the remaining contents of the host
-  // descriptor after the initial VoidPtrSize bytes.
-  uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
-  void *HstDescriptorFieldsAddr =
-      reinterpret_cast<char *>(HstPtrAddr) + VoidPtrSize;
-  std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr,
-              HstDescriptorFieldsSize);
-
-  DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64
-     " additional bytes from host descriptor " DPxMOD ")\n",
-     HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize,
-     DPxPTR(HstDescriptorFieldsAddr));
-
-  // Submit the entire buffer to device
-  int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize,
+  // Submit the populated source buffer to device.
+  int SubmitResult = Device.submitData(TgtPtrAddr, SrcBuffer, HstPtrSize,
                                        AsyncInfo, PtrTPR.getEntry());
-
-  AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int {
-    delete[] DataBuffer;
-    return OFFLOAD_SUCCESS;
-  });
   return HandleSubmitResult(SubmitResult);
 }
 
@@ -525,10 +549,17 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
     // ATTACH map-types are supposed to be handled after all mapping for the
     // construct is done. Defer their processing.
     if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) {
-      AttachInfo->AttachEntries.emplace_back(
-          /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin,
-          /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I],
-          /*PointeeName=*/HstPtrName);
+      const bool IsCorrespondingPointerInit =
+          (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE);
+      // We don't need to keep track of PRIVATE | ATTACH entries. They
+      // represent corresponding-pointer-initialization, and are handled
+      // similar to firstprivate (PRIVATE | TO) entries by
+      // PrivateArgumentManager.
+      if (!IsCorrespondingPointerInit)
+        AttachInfo->AttachEntries.emplace_back(
+            /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin,
+            /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I],
+            /*PointeeName=*/HstPtrName);
 
       DP("Deferring ATTACH map-type processing for argument %d\n", I);
       continue;
@@ -1397,13 +1428,24 @@ class PrivateArgumentManagerTy {
     uint32_t Padding;
     /// Host pointer name
     map_var_info_t HstPtrName = nullptr;
+    /// For corresponding-pointer-initialization: host pointee base address.
+    void *HstPteeBase = nullptr;
+    /// For corresponding-pointer-initialization: host pointee begin address.
+    void *HstPteeBegin = nullptr;
+    /// Whether this argument needs corresponding-pointer-initialization.
+    bool IsCorrespondingPointerInit = false;
 
     FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                           uint32_t Alignment, uint32_t Padding,
-                          map_var_info_t HstPtrName = nullptr)
+                          map_var_info_t HstPtrName = nullptr,
+                          void *HstPteeBase = nullptr,
+                          void *HstPteeBegin = nullptr,
+                          bool IsCorrespondingPointerInit = false)
         : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
           HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
-          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
+          Size(Size), Padding(Padding), HstPtrName(HstPtrName),
+          HstPteeBase(HstPteeBase), HstPteeBegin(HstPteeBegin),
+          IsCorrespondingPointerInit(IsCorrespondingPointerInit) {}
   };
 
   /// A vector of target pointers for all private arguments
@@ -1421,6 +1463,153 @@ class PrivateArgumentManagerTy {
   /// A pointer to a \p AsyncInfoTy object
   AsyncInfoTy &AsyncInfo;
 
+  /// \returns the value of the target pointee's base to be used for
+  /// corresponding-pointer-initialization.
+  void *getTargetPointeeBaseForCorrespondingPointerInitialization(
+      void *HstPteeBase, void *HstPteeBegin) {
+    // See if the pointee's begin address has corresponding storage on device.
+    void *TgtPteeBegin = [&]() -> void * {
+      if (!HstPteeBegin) {
+        DP("Corresponding-pointer-initialization: pointee begin address is "
+           "null\n");
+        return nullptr;
+      }
+
+      return Device.getMappingInfo()
+          .getTgtPtrBegin(HstPteeBegin, /*Size=*/0, /*UpdateRefCount=*/false,
+                          /*UseHoldRefCount=*/false)
+          .TargetPointer;
+    }();
+
+    // If it does, we calculate target pointee base using it, and return it.
+    // Otherwise, we retain the host pointee's base as the target pointee base
+    // of the initialized pointer. It's the user's responsibility to ensure
+    // that if a lookup fails, the host pointee is accessible on the device.
+    return TgtPteeBegin ? calculateTargetPointeeBase(HstPteeBase, HstPteeBegin,
+                                                     TgtPteeBegin)
+                        : HstPteeBase;
+  }
+
+  /// Initialize the source buffer for corresponding-pointer-initialization.
+  ///
+  /// It computes and stores the target pointee base address (or the host
+  /// pointee's base address, if lookup of the target pointee fails) to the
+  /// first `sizeof(void*)` bytes of \p Buffer, and for larger pointers
+  /// (Fortran descriptors), the remaining fields of the host descriptor
+  /// \p HstPtr after those `sizeof(void*)` bytes.
+  ///
+  /// Corresponding-pointer-initialization represents the initialization of
+  /// the private version of a base-pointer/referring-pointer on a target
+  /// construct.
+  ///
+  /// For example, for the following test:
+  /// ```cpp
+  /// int x[10];
+  /// int *px = &x[0];
+  /// ...
+  /// #pragma omp target data map(tofrom:px)
+  /// {
+  ///   int **ppx = omp_get_mapped_ptr(&px, omp_get_default_device());
+  ///   #pragma omp target map(tofrom:px[1]) is_device_ptr(ppx)
+  ///   {
+  ///     foo(px, ppx);
+  ///   }
+  /// }
+  /// ```
+  /// The following shows a possible way to implement the mapping of `px`,
+  /// which is pre-determined firstprivate and should get initialized
+  /// via corresponding-pointer-initialization:
+  ///
+  /// (A) Possible way to implement the above with PRIVATE | ATTACH:
+  /// ```llvm
+  /// ; maps for px:
+  /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM            // (1)
+  /// ; &px, &px[1], sizeof(px), ATTACH                     // (2)
+  /// ; &px, &px[1], sizeof(px), PRIVATE | ATTACH | PARAM   // (3)
+  /// call... @__omp_outlined...(ptr %px, ptr %ppx)
+  /// define ... @__omp_outlined(ptr %px, ptr %ppx) {...
+  ///   foo(%px, %ppx)
+  /// ...}
+  /// ```
+  /// `(1)` maps the pointee `px[1]`.
+  /// `(2)` attaches it to the mapped version of `px`. It can be controlled by
+  /// the user based on the `attach(auto/always/never)` map-type modifier.
+  /// `(3)` privatizes and initializes the private pointer `px`, and passes it
+  /// into the kernel as the argument `%px`. It can be skipped if `px` is not
+  /// referenced in the target construct.
+  ///
+  /// This method is not much more beneficial than just doing the
+  /// initialization in the body of the kernel, like:
+  /// (B) Possible way to implement the above without PRIVATE | ATTACH:
+  /// ```llvm
+  /// ; maps for px:
+  /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM | PARAM    // (4)
+  /// ; &px, &px[1], sizeof(px), ATTACH                     // (5)
+  /// call... @__omp_outlined...(ptr %px0, ptr %ppx)
+  /// define ... __omp_outlined...(ptr %px0, ptr %ppx) {
+  ///   %px = alloca ptr;
+  ///   store ptr %px0, ptr %px
+  ///   foo(%px, %ppx)
+  /// }
+  /// ```
+  ///
+  /// However, (B) is not so convenient for Fortran descriptors, because in
+  /// addition to the lookup, the remaining fields of the descriptor have
+  /// to be passed into the kernel to initialize the private copy, which
+  /// makes (A) a cleaner option for them. e.g.
+  /// ```f90
+  /// integer, pointer :: p(:)
+  /// !$omp target map(p(1))
+  /// ```
+  ///
+  /// (C) Possible mapping for the above Fortran test using PRIVATE | ATTACH:
+  /// ```llvm
+  /// ; maps for p:
+  /// ; &p(1), &p(1), sizeof(p(1)), TO | FROM
+  /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH
+  /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE | ATTACH | PARAM
+  /// call... @__omp_outlined...(ptr %ref_ptr_of_p)
+  /// ```
+  void initBufferForCorrespondingPointerInitialization(char *Buffer,
+                                                       void *HstPtr,
+                                                       int64_t HstPtrSize,
+                                                       void *HstPteeBase,
+                                                       void *HstPteeBegin) {
+    constexpr int64_t VoidPtrSize = sizeof(void *);
+    assert(HstPtrSize >= VoidPtrSize &&
+           "corresponding-pointer-initialization: pointer size is too small");
+
+    void *TgtPteeBase =
+        getTargetPointeeBaseForCorrespondingPointerInitialization(HstPteeBase,
+                                                                  HstPteeBegin);
+
+    // Store the target pointee base address to the first VoidPtrSize bytes
+    DP("Initializing corresponding-pointer-initialization source buffer "
+       "for " DPxMOD ", with pointee base " DPxMOD "\n",
+       DPxPTR(HstPtr), DPxPTR(TgtPteeBase));
+    std::memcpy(Buffer, &TgtPteeBase, VoidPtrSize);
+    if (HstPtrSize <= VoidPtrSize)
+      return;
+
+    // For Fortran descriptors, copy the remaining descriptor fields from host
+    uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize;
+    void *HstDescriptorFieldsAddr = static_cast<char *>(HstPtr) + VoidPtrSize;
+    DP("Copying %" PRId64
+       " bytes of descriptor fields into corresponding-pointer-initialization "
+       "buffer at offset %" PRId64 ", from " DPxMOD "\n",
+       HstDescriptorFieldsSize, VoidPtrSize, DPxPTR(HstDescriptorFieldsAddr));
+    std::memcpy(Buffer + VoidPtrSize, HstDescriptorFieldsAddr,
+                HstDescriptorFieldsSize);
+  }
+
+  /// Helper function to create and initialize a buffer to be used as the
+  /// source for corresponding-pointer-initialization.
+  void *createAndInitSourceBufferForCorrespondingPointerInitialization(
+      void *HstPtr, int64_t HstPtrSize, void *HstPteeBase, void *HstPteeBegin) {
+    char *Buffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize);
+    initBufferForCorrespondingPointerInitialization(Buffer, HstPtr, HstPtrSize,
+                                                    HstPteeBase, HstPteeBegin);
+    return Buffer;
+  }
+
   // TODO: What would be the best value here? Should we make it configurable?
   // If the size is larger than this threshold, we will allocate and transfer
   // it immediately instead of packing it.
@@ -1435,7 +1624,9 @@ class PrivateArgumentManagerTy {
   int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
              bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
              map_var_info_t HstPtrName = nullptr,
-             const bool AllocImmediately = false) {
+             const bool AllocImmediately = false, void *HstPteeBase = nullptr,
+             void *HstPteeBegin = nullptr,
+             bool IsCorrespondingPointerInit = false) {
     // If the argument is not first-private, or its size is greater than a
     // predefined threshold, we will allocate memory and issue the transfer
     // immediately.
@@ -1458,9 +1649,19 @@ class PrivateArgumentManagerTy {
       // If first-private, copy data from host
       if (IsFirstPrivate) {
         DP("Submitting firstprivate data to the device.\n");
-        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
+
+        // The source value used for corresponding-pointer-initialization
+        // is different from that of regular firstprivates.
+        void *DataSource =
+            IsCorrespondingPointerInit
+                ? createAndInitSourceBufferForCorrespondingPointerInitialization(
+                      HstPtr, ArgSize, HstPteeBase, HstPteeBegin)
+                : HstPtr;
+        int Ret = Device.submitData(TgtPtr, DataSource, ArgSize, AsyncInfo);
         if (Ret != OFFLOAD_SUCCESS) {
-          DP("Copying data to device failed, failed.\n");
+          DP("Copying %s data to device failed.\n",
+             IsCorrespondingPointerInit ? "corresponding-pointer-initialization"
"corresponding-pointer-initialization" + : "firstprivate"); return OFFLOAD_FAIL; } } @@ -1506,8 +1707,10 @@ class PrivateArgumentManagerTy { } } - FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, - StartAlignment, Padding, HstPtrName); + FirstPrivateArgInfo.emplace_back( + TgtArgsIndex, HstPtr, ArgSize, StartAlignment, Padding, HstPtrName, + HstPteeBase, HstPteeBegin, IsCorrespondingPointerInit); + FirstPrivateArgSize += Padding + ArgSize; } @@ -1526,7 +1729,13 @@ class PrivateArgumentManagerTy { for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { // First pad the pointer as we (have to) pad it on the device too. Itr = std::next(Itr, Info.Padding); - std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); + + if (Info.IsCorrespondingPointerInit) + initBufferForCorrespondingPointerInitialization( + &*Itr, Info.HstPtrBegin, Info.Size, Info.HstPteeBase, + Info.HstPteeBegin); + else + std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); Itr = std::next(Itr, Info.Size); } // Allocate target memory @@ -1682,8 +1891,40 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { + // For cases like: + // ``` + // int *p = ...; + // #pragma omp target map(p[0:10]) + // ``` + // `p` is predetermined firstprivate on the target construct, and the + // method to determine the initial value of the private copy on the + // device is called "corresponding-pointer-initialization". + // + // Such firstprivate pointers that need + // corresponding-pointer-initialization are represented using the + // `PRIVATE | ATTACH` map-types, in contrast to regular firstprivate + // entries, which use `PRIVATE | TO`. The structure of these + // `PRIVATE | ATTACH` entries is the same as the non-private + // `ATTACH` entries used to represent pointer-attachments, i.e.: + // ``` + // &hst_ptr_base/begin, &hst_ptee_begin, sizeof(hst_ptr) + // ``` + const bool IsAttach = (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH); + void *HstPteeBase = nullptr; + void *HstPteeBegin = nullptr; + if (IsAttach) { + // For corresponding-pointer-initialization, Args[I] is HstPteeBegin, + // and ArgBases[I] is both HstPtrBase/HstPtrBegin. + HstPteeBase = *reinterpret_cast(HstPtrBase); + HstPteeBegin = Args[I]; + HstPtrBegin = ArgBases[I]; + } TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; - const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); + // Corresponding-pointer-initialization is a special case of firstprivate, + // since it also involves initializing the private pointer. + const bool IsFirstPrivate = + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) || IsAttach; + // If there is a next argument and it depends on the current one, we need // to allocate the private memory immediately. If this is not the case, // then the argument can be marked for optimization and packed with the @@ -1692,9 +1933,11 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); Ret = PrivateArgumentManager.addArg( HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, - TgtArgs.size(), HstPtrName, AllocImmediately); + /*TgtArgsIndex=*/TgtArgs.size(), HstPtrName, AllocImmediately, + HstPteeBase, HstPteeBegin, /*IsCorrespondingPointerInit=*/IsAttach); if (Ret != OFFLOAD_SUCCESS) { - REPORT("Failed to process %sprivate argument " DPxMOD "\n", + REPORT("Failed to process %s%sprivate argument " DPxMOD "\n", + IsAttach ? 
"corresponding-pointer-initialization " : "", (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin)); return OFFLOAD_FAIL; } From 74aa2b7ad59466a12c5e3e4d0bfd5b1dcec8232b Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Mon, 29 Sep 2025 21:03:02 +0200 Subject: [PATCH 172/878] [CIR] Implement UO real on result from imag with type promotion (#160996) Implement UO real on the result from imag with type promotion Issue: https://github.com/llvm/llvm-project/issues/141365 --- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 3 -- clang/test/CIR/CodeGen/complex.cpp | 37 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index b93d9a9c6f883..7fbb2bcb4bbf3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1278,9 +1278,6 @@ mlir::Value ScalarExprEmitter::emitPromoted(const Expr *e, } else if (const auto *uo = dyn_cast(e)) { switch (uo->getOpcode()) { case UO_Imag: - cgf.cgm.errorNYI(e->getSourceRange(), - "ScalarExprEmitter::emitPromoted unary imag"); - return {}; case UO_Real: return VisitRealImag(uo, promotionType); case UO_Minus: diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index e90163172d2df..4c396d312d148 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -1270,3 +1270,40 @@ void real_on_scalar_from_real_with_type_promotion() { // OGCG: %[[A_REAL_F32:.*]] = fpext half %[[A_REAL]] to float // OGCG: %[[A_REAL_F16:.*]] = fptrunc float %[[A_REAL_F32]] to half // OGCG: store half %[[A_REAL_F16]], ptr %[[B_ADDR]], align 2 + +void real_on_scalar_from_imag_with_type_promotion() { + _Float16 _Complex a; + _Float16 b = __real__(__imag__ a); +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["b", init] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16 +// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16 +// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float +// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float +// CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex +// CIR: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float +// CIR: %[[A_IMAG_F16:.*]] = cir.cast(floating, %[[A_IMAG_F32]] : !cir.float), !cir.f16 +// CIR: cir.store{{.*}} %[[A_IMAG_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr + +// LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2 +// LLVM: %[[B_ADDR]] = alloca half, i64 1, align 2 +// LLVM: %[[TMP_A:.*]] = load { half, half }, ptr %[[A_ADDR]], align 2 +// LLVM: %[[A_REAL:.*]] = extractvalue { half, half } %[[TMP_A]], 0 +// LLVM: %[[A_IMAG:.*]] = extractvalue { half, half } %[[TMP_A]], 1 +// LLVM: %[[A_REAL_F32:.*]] = fpext half %[[A_REAL]] to float +// LLVM: %[[A_IMAG_F32:.*]] = fpext half %[[A_IMAG]] to float +// LLVM: %[[TMP_A_COMPLEX_F32:.*]] = insertvalue { float, float } {{.*}}, float %[[A_REAL_F32]], 0 +// LLVM: %[[A_COMPLEX_F32:.*]] = insertvalue { float, float } %[[TMP_A_COMPLEX_F32]], float %[[A_IMAG_F32]], 1 +// LLVM: %[[A_IMAG_F16:.*]] = fptrunc float %[[A_IMAG_F32]] to half +// LLVM: store half %[[A_IMAG_F16]], ptr %[[B_ADDR]], align 2 + +// OGCG: 
+// OGCG: %[[B_ADDR:.*]] = alloca half, align 2
+// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { half, half }, ptr %[[A_ADDR]], i32 0, i32 1
+// OGCG: %[[A_IMAG:.*]] = load half, ptr %[[A_IMAG_PTR]], align 2
+// OGCG: %[[A_IMAG_F32:.*]] = fpext half %[[A_IMAG]] to float
+// OGCG: %[[A_IMAG_F16:.*]] = fptrunc float %[[A_IMAG_F32]] to half
+// OGCG: store half %[[A_IMAG_F16]], ptr %[[B_ADDR]], align 2

From 62e7e8d66d1aa54d231b3005fb0d842ede2cce7b Mon Sep 17 00:00:00 2001
From: Amr Hesham
Date: Mon, 29 Sep 2025 21:03:41 +0200
Subject: [PATCH 173/878] [CIR] Upstream UnaryExtension for Scalar Expr
 (#160997)

Upstream UnaryExtension for Scalar Expr.
---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp |  4 ++++
 clang/test/CIR/CodeGen/vector-ext.cpp      | 20 ++++++++++++++++++++
 clang/test/CIR/CodeGen/vector.cpp          | 20 ++++++++++++++++++++
 3 files changed, 44 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 7fbb2bcb4bbf3..f4bbced781942 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -676,6 +676,10 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   mlir::Value VisitRealImag(const UnaryOperator *e,
                             QualType promotionType = QualType());
 
+  mlir::Value VisitUnaryExtension(const UnaryOperator *e) {
+    return Visit(e->getSubExpr());
+  }
+
   mlir::Value VisitCXXDefaultInitExpr(CXXDefaultInitExpr *die) {
     CIRGenFunction::CXXDefaultInitExprScope scope(cgf, die);
     return Visit(die->getExpr());
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index 8b5379a2b1a88..8bca48d8ffe0c 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -1322,3 +1322,23 @@ void logical_not() {
 // OGCG: %[[RESULT:.*]] = icmp eq <4 x i32> %[[TMP_A]], zeroinitializer
 // OGCG: %[[RESULT_VI4:.*]] = sext <4 x i1> %[[RESULT]] to <4 x i32>
 // OGCG: store <4 x i32> %[[RESULT_VI4]], ptr %[[B_ADDR]], align 16
+
+void unary_extension() {
+  vi4 a;
+  vi4 b = __extension__ a;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[TMP_A]], %[[B_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+
+// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[B_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16
+
+// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[B_ADDR:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16
+// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index d8fdeea179288..f242779502148 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -1390,3 +1390,23 @@ void logical_not_float() {
 // OGCG: %[[RESULT:.*]] = fcmp oeq <4 x float> %[[TMP_A]], zeroinitializer
 // OGCG: %[[RESULT_VI4:.*]] = sext <4 x i1> %[[RESULT]] to <4 x i32>
 // OGCG: store <4 x i32> %[[RESULT_VI4]], ptr %[[B_ADDR]], align 16
+
+void unary_extension() {
+  vi4 a;
+  vi4 b = __extension__ a;
+}
+
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a"]
["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["b", init] +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[TMP_A]], %[[B_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[B_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// LLVM: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16 + +// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[B_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[A_ADDR]], align 16 +// OGCG: store <4 x i32> %[[TMP_A]], ptr %[[B_ADDR]], align 16 From b54250940c2cd70f911386b02239b50c165e5354 Mon Sep 17 00:00:00 2001 From: Akira Hatanaka Date: Mon, 29 Sep 2025 12:07:32 -0700 Subject: [PATCH 174/878] [compiler-rt] Fix declarations of builtins in test files (#161222) Replace `long double` and `long double _Complex` with `fp_t` and `Qcomplex` in the test files. This prepares for reapplying 656707086e5f6fccd2eb57f5aaf987c328c0f4f1 and running tests on targets where `fp_t` is not `long double`. --- .../test/builtins/Unit/fixunstfdi_test.c | 13 ++++++++----- compiler-rt/test/builtins/Unit/multc3_test.c | 19 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c index 982f3a4629dbd..14f0f7f1565a4 100644 --- a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c +++ b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c @@ -6,19 +6,22 @@ #if _ARCH_PPC || __aarch64__ || __arm64ec__ +#define QUAD_PRECISION +#include "fp_lib.h" + #include "int_lib.h" // Returns: convert a to a unsigned long long, rounding toward zero. // Negative values all become zero. 
-// Assumption: fp_t is a 128 bit floating point type
+// Assumption: fp_t is a 128 bit floating point type
 //             du_int is a 64 bit integral type
-//             value in long double is representable in du_int or is negative
+//             value in fp_t is representable in du_int or is negative
 //                 (no range checking performed)
 
-COMPILER_RT_ABI du_int __fixunstfdi(long double a);
+COMPILER_RT_ABI du_int __fixunstfdi(fp_t a);
 
-int test__fixunstfdi(long double a, du_int expected)
+int test__fixunstfdi(fp_t a, du_int expected)
 {
     du_int x = __fixunstfdi(a);
     if (x != expected)
@@ -29,7 +32,7 @@ int test__fixunstfdi(long double a, du_int expected)
 
 char assumption_1[sizeof(du_int) == 2*sizeof(su_int)] = {0};
 char assumption_2[sizeof(du_int)*CHAR_BIT == 64] = {0};
-char assumption_3[sizeof(long double)*CHAR_BIT == 128] = {0};
+char assumption_3[sizeof(fp_t)*CHAR_BIT == 128] = {0};
 
 #endif
 
diff --git a/compiler-rt/test/builtins/Unit/multc3_test.c b/compiler-rt/test/builtins/Unit/multc3_test.c
index e9c99a72be35e..5eec56dc43033 100644
--- a/compiler-rt/test/builtins/Unit/multc3_test.c
+++ b/compiler-rt/test/builtins/Unit/multc3_test.c
@@ -6,19 +6,22 @@
 
 #if _ARCH_PPC || __aarch64__ || __arm64ec__
 
+#define QUAD_PRECISION
+#include "fp_lib.h"
+
 #include "int_lib.h"
 #include <math.h>
 #include <complex.h>
 
 // Returns: the product of a + ib and c + id
 
-COMPILER_RT_ABI long double _Complex
-__multc3(long double __a, long double __b, long double __c, long double __d);
+COMPILER_RT_ABI Qcomplex
+__multc3(fp_t __a, fp_t __b, fp_t __c, fp_t __d);
 
 enum {zero, non_zero, inf, NaN, non_zero_nan};
 
 int
-classify(long double _Complex x)
+classify(Qcomplex x)
 {
     if (x == 0)
         return zero;
@@ -41,13 +44,13 @@ classify(long double _Complex x)
     return non_zero;
 }
 
-int test__multc3(long double a, long double b, long double c, long double d)
+int test__multc3(fp_t a, fp_t b, fp_t c, fp_t d)
 {
-    long double _Complex r = __multc3(a, b, c, d);
+    Qcomplex r = __multc3(a, b, c, d);
 //     printf("test__multc3(%Lf, %Lf, %Lf, %Lf) = %Lf + I%Lf\n",
 //            a, b, c, d, creall(r), cimagl(r));
-    long double _Complex dividend;
-    long double _Complex divisor;
+    Qcomplex dividend;
+    Qcomplex divisor;
 
     __real__ dividend = a;
     __imag__ dividend = b;
@@ -188,7 +191,7 @@ int test__multc3(long double a, long double b, long double c, long double d)
     return 0;
 }
 
-long double x[][2] =
+fp_t x[][2] =
 {
     {  1.e-6,  1.e-6},
     { -1.e-6,  1.e-6},

From 7e4678270fa90aacacb21efb93775754bfd04bf1 Mon Sep 17 00:00:00 2001
From: Corentin Jabot
Date: Mon, 29 Sep 2025 21:30:07 +0200
Subject: [PATCH 175/878] [Clang] Instantiate variables referenced in
 `decltype` with an undeduced type. (#161231)

Fixes #160497
Fixes #56652
Fixes #116319
Fixes #161196
---
 clang/docs/ReleaseNotes.rst       |  1 +
 clang/lib/Sema/SemaExpr.cpp       |  5 +-
 clang/test/CodeGenCXX/gh56652.cpp | 41 ++++++++++++++++++++
 clang/test/SemaCXX/decltype.cpp   | 78 +++++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGenCXX/gh56652.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e8deae50e4cb0..6521fc3e9a9da 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -433,6 +433,7 @@ Bug Fixes to C++ Support
   object type. (#GH151531)
 - Suppress ``-Wdouble-promotion`` when explicitly asked for with C++ list initialization (#GH33409).
 - Fix the result of ``__builtin_is_implicit_lifetime`` for types with a user-provided constructor. (#GH160610)
+- Correctly deduce return types in ``decltype`` expressions. (#GH160497) (#GH56652) (#GH116319) (#GH161196)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 3b267c1b1693d..3302bfce193a2 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20108,8 +20108,9 @@ static void DoMarkVarDeclReferenced(
   bool NeededForConstantEvaluation =
       isPotentiallyConstantEvaluatedContext(SemaRef) && UsableInConstantExpr;
 
-  bool NeedDefinition =
-      OdrUse == OdrUseContext::Used || NeededForConstantEvaluation;
+  bool NeedDefinition = OdrUse == OdrUseContext::Used ||
+                        NeededForConstantEvaluation ||
+                        Var->getType()->isUndeducedType();
 
   assert(!isa<VarTemplatePartialSpecializationDecl>(Var) &&
          "Can't instantiate a partial template specialization.");
diff --git a/clang/test/CodeGenCXX/gh56652.cpp b/clang/test/CodeGenCXX/gh56652.cpp
new file mode 100644
index 0000000000000..06a496e320bfc
--- /dev/null
+++ b/clang/test/CodeGenCXX/gh56652.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-elf-gnu %s -emit-llvm -o - | FileCheck %s
+
+namespace GH56652{
+
+struct foo {};
+
+template <typename T> struct bar {
+    using type = T;
+
+    template <typename V> inline static constexpr auto b = true;
+};
+
+template <typename T>
+concept C = requires(T a) { T::template b<T>; };
+
+template <typename T> auto fn(T) {
+    if constexpr (!C<T>)
+        return foo{};
+    else
+        return T{};
+}
+
+auto a = decltype(fn(bar<int>{})){};
+
+}
+
+namespace GH116319 {
+
+template <int = 0> struct a {
+template <class> static constexpr auto b = 2;
+template <class> static void c() noexcept(noexcept(b<int>)) {}
+};
+
+void test() { a<>::c<int>(); }
+
+}
+
+// CHECK: %"struct.GH56652::bar" = type { i8 }
+// CHECK: $_ZN8GH1163191aILi0EE1cIiEEvv = comdat any
+// CHECK: @_ZN7GH566521aE = global %"struct.GH56652::bar" undef
diff --git a/clang/test/SemaCXX/decltype.cpp b/clang/test/SemaCXX/decltype.cpp
index 739485b57a3ec..45a4c4cf1ac86 100644
--- a/clang/test/SemaCXX/decltype.cpp
+++ b/clang/test/SemaCXX/decltype.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -Wno-c99-designator %s
+// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify -Wno-c99-designator %s
 
 // PR5290
 int const f0();
@@ -156,6 +157,8 @@ struct A {
   }
 };
 
+
+
 // This shouldn't crash.
 static_assert(A<int>().f() == 0, "");
 // The result should not be dependent.
 static_assert(A<int>().f() != 0, ""); // expected-error {{static assertion failed due to requirement 'A<int>().f() != 0'}}
   // expected-note@-1 {{expression evaluates to '0 != 0'}}
 }
 
+#if __cplusplus >= 201703L
+namespace GH160497 {
+
+template <class> struct S {
+  template <class>
+  inline static auto mem =
+      [] { static_assert(false); // expected-error {{static assertion failed}} \
+           // expected-note {{while substituting into a lambda expression here}}
+        return 42;
+      }();
+};
+
+using T = decltype(S<void>::mem<void>);
+  // expected-note@-1 {{in instantiation of static data member 'GH160497::S<void>::mem<void>' requested here}}
+
+
+template <class> struct S2 {
+  template <class>
+  inline static auto* mem =
+      [] { static_assert(false); // expected-error {{static assertion failed}} \
+           // expected-note {{while substituting into a lambda expression here}}
+        return static_cast<int *>(nullptr);
+      }();
+};
+
+using T2 = decltype(S2<void>::mem<void>);
+//expected-note@-1 {{in instantiation of static data member 'GH160497::S2<void>::mem<void>' requested here}}
+
+template <class> struct S3 {
+  template <class>
+  inline static int mem = // Check we don't instantiate when the type is not deduced.
+      [] { static_assert(false);
+        return 42;
+      }();
+};
+
+using T = decltype(S3<void>::mem<void>);
+}
+
+namespace N1 {
+
+template <class>
+struct S {
+  template <class>
+  inline static auto mem = 42;
+};
+
+using T = decltype(S<void>::mem<void>);
+
+T y = 42;
+
+}
+
+namespace GH161196 {
+
+template <typename> struct A {
+  static constexpr int digits = 0;
+};
+
+template <typename> struct B {
+  template <int, typename MaskInt = int, int = A<MaskInt>::digits>
+  static constexpr auto XBitMask = 0;
+};
+
+struct C {
+  using ReferenceHost = B<int>;
+  template <int> static decltype(ReferenceHost::XBitMask<0>) XBitMask;
+};
+
+void test() { (void)C::XBitMask<0>; }
+
+}
+#endif
+
 template class conditional {
 };

From 786358a3d70561f2b2cf7d7ec239c1058818236b Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Mon, 29 Sep 2025 13:37:55 -0500
Subject: [PATCH 176/878] [Offload] Fix incorrect size used in
 llvm-offload-device-info tool

Summary:
This was not using the size previously queried and would fail when the
implementation actually verified it.
---
 offload/tools/deviceinfo/llvm-offload-device-info.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
index 67a6e07fc6b05..9b58d67f017ca 100644
--- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp
+++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp
@@ -137,7 +137,7 @@ ol_result_t printDeviceValue(std::ostream &S, ol_device_handle_t Dev,
     size_t Size;
     OFFLOAD_ERR(olGetDeviceInfoSize(Dev, Info, &Size));
     Val.resize(Size);
-    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, sizeof(Val), Val.data()));
+    OFFLOAD_ERR(olGetDeviceInfo(Dev, Info, Size, Val.data()));
    doWrite(S, reinterpret_cast<const char *>(Val.data()));
  } else {
    T Val;

From 38a4c9c639f6067c3aa4c88a7578d55efd236819 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Mon, 29 Sep 2025 15:40:21 -0400
Subject: [PATCH 177/878] [libc][msvc] fix mathlib build on WoA (#161258)

Fix build errors encountered when building the math library on WoA:

1. Skip the FEnv equality check for MSVC.
2. Provide a placeholder type for vector types.
---
 libc/src/string/memory_utils/op_generic.h | 15 +++++++++++++--
 libc/test/UnitTest/FEnvSafeTest.cpp       |  2 +-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 010f2187a4ffd..a86cbd8bcfc72 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -41,12 +41,22 @@ static_assert((UINTPTR_MAX == 4294967295U) ||
               "We currently only support 32- or 64-bit platforms");
 
 #ifdef LIBC_COMPILER_IS_MSVC
-
+#ifdef LIBC_TARGET_ARCH_IS_X86
 namespace LIBC_NAMESPACE_DECL {
 using generic_v128 = __m128i;
 using generic_v256 = __m256i;
 using generic_v512 = __m512i;
 } // namespace LIBC_NAMESPACE_DECL
+#else
+// Special handling when target does not have real vector types.
+// We can potentially use uint8x16_t etc. However, MSVC does not provide
+// subscript operation.
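+// Design note (a sketch of the intent, inferred from the definitions just
+// below): a cpp::array of uint64_t words gives each placeholder type the
+// expected size and alignment plus an operator[], which is all the generic
+// memory routines need.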
+namespace LIBC_NAMESPACE_DECL {
+struct alignas(16) generic_v128 : public cpp::array<uint64_t, 2> {};
+struct alignas(32) generic_v256 : public cpp::array<uint64_t, 4> {};
+struct alignas(64) generic_v512 : public cpp::array<uint64_t, 8> {};
+} // namespace LIBC_NAMESPACE_DECL
+#endif
 
 #else
 
 namespace LIBC_NAMESPACE_DECL {
@@ -159,7 +169,8 @@ template <typename T> struct Memset {
 
   LIBC_INLINE static void block(Ptr dst, uint8_t value) {
     if constexpr (is_scalar_v<T> || is_vector_v<T>) {
-      store<T>(dst, splat<T>(value));
+      // Avoid ambiguous call due to ADL
+      generic::store<T>(dst, splat<T>(value));
     } else if constexpr (is_array_v<T>) {
       using value_type = typename T::value_type;
       const auto Splat = splat<value_type>(value);
diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp
index 2730de350b39a..4393f9d5e5c3b 100644
--- a/libc/test/UnitTest/FEnvSafeTest.cpp
+++ b/libc/test/UnitTest/FEnvSafeTest.cpp
@@ -43,7 +43,7 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) {
 
 void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv,
                                   const fenv_t &after_fenv) {
-#if defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC)
   using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState;
   const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv);
   const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv);

From 2936a2c882d76c719f9a96e443ad3f75b366bc8f Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 29 Sep 2025 12:47:52 -0700
Subject: [PATCH 178/878] [CAS] Add OnDiskTrieRawHashMap (#114100)

Add OnDiskTrieRawHashMap. This is an on-disk persistent hash map that
uses a trie data structure similar to TrieRawHashMap.
OnDiskTrieRawHashMap is thread-safe and process-safe: it is mostly
lock-free, except that it internally coordinates cross-process creation
and closing using a file lock.
---
 llvm/include/llvm/CAS/FileOffset.h              |   39 +
 llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h    |  236 ++++
 llvm/lib/CAS/CMakeLists.txt                     |    2 +
 llvm/lib/CAS/DatabaseFile.cpp                   |  123 ++
 llvm/lib/CAS/DatabaseFile.h                     |  153 +++
 llvm/lib/CAS/OnDiskTrieRawHashMap.cpp           | 1178 +++++++++++++++++
 llvm/unittests/CAS/CMakeLists.txt               |    5 +-
 .../CAS/OnDiskTrieRawHashMapTest.cpp            |  220 +++
 8 files changed, 1952 insertions(+), 4 deletions(-)
 create mode 100644 llvm/include/llvm/CAS/FileOffset.h
 create mode 100644 llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
 create mode 100644 llvm/lib/CAS/DatabaseFile.cpp
 create mode 100644 llvm/lib/CAS/DatabaseFile.h
 create mode 100644 llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
 create mode 100644 llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp

diff --git a/llvm/include/llvm/CAS/FileOffset.h b/llvm/include/llvm/CAS/FileOffset.h
new file mode 100644
index 0000000000000..21d045e8c9d78
--- /dev/null
+++ b/llvm/include/llvm/CAS/FileOffset.h
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the interface for FileOffset, which represents stored
+/// data at an offset from the beginning of a file.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_FILEOFFSET_H
+#define LLVM_CAS_FILEOFFSET_H
+
+#include <cstdint>
+
+namespace llvm::cas {
+
+/// FileOffset is a wrapper around `uint64_t` to represent the offset of data
+/// from the beginning of the file.
+class FileOffset {
+public:
+  uint64_t get() const { return Offset; }
+
+  explicit operator bool() const { return Offset; }
+
+  FileOffset() = default;
+  explicit FileOffset(uint64_t Offset) : Offset(Offset) {}
+
+private:
+  uint64_t Offset = 0;
+};
+
+} // namespace llvm::cas
+
+#endif // LLVM_CAS_FILEOFFSET_H
diff --git a/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
new file mode 100644
index 0000000000000..5e41bf6ab571e
--- /dev/null
+++ b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h
@@ -0,0 +1,236 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the interface for OnDiskTrieRawHashMap, a thread-safe
+/// and (mostly) lock-free hash map stored as a trie and backed by persistent
+/// files on disk.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_ONDISKTRIERAWHASHMAP_H
+#define LLVM_CAS_ONDISKTRIERAWHASHMAP_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/FileOffset.h"
+#include "llvm/Support/Error.h"
+#include <optional>
+
+namespace llvm {
+
+class raw_ostream;
+
+namespace cas {
+
+/// OnDiskTrieRawHashMap is a persistent trie data structure used as a hash
+/// map. The keys are fixed length, and are expected to be binary hashes with a
+/// normal distribution.
+///
+/// - Thread-safety is achieved through the use of atomics within a shared
+///   memory mapping. Atomic access does not work on networked filesystems.
+/// - Filesystem locks are used, but only sparingly:
+///     - during initialization, for creating / opening an existing store;
+///     - for the lifetime of the instance, a shared/reader lock is held;
+///     - during destruction, if there are no concurrent readers, to shrink the
+///       files to their minimum size.
+/// - Path is used as a directory:
+///     - "index" stores the root trie and subtries.
+///     - "data" stores (most of) the entries, like a bump-ptr-allocator.
+///     - Large entries are stored externally in a file named by the key.
+/// - Code is system-dependent and the binary format itself is not portable.
+///   These are not artifacts that can/should be moved between different
+///   systems; they are only appropriate for local storage.
+class OnDiskTrieRawHashMap {
+public:
+  LLVM_DUMP_METHOD void dump() const;
+  void
+  print(raw_ostream &OS,
+        function_ref<void(ArrayRef<char>)> PrintRecordData = nullptr) const;
+
+public:
+  /// Const value proxy to access the records stored in TrieRawHashMap.
+  struct ConstValueProxy {
+    ConstValueProxy() = default;
+    ConstValueProxy(ArrayRef<uint8_t> Hash, ArrayRef<char> Data)
+        : Hash(Hash), Data(Data) {}
+    ConstValueProxy(ArrayRef<uint8_t> Hash, StringRef Data)
+        : Hash(Hash), Data(Data.begin(), Data.size()) {}
+
+    ArrayRef<uint8_t> Hash;
+    ArrayRef<char> Data;
+  };
+
+  /// Value proxy to access the records stored in TrieRawHashMap.
+  struct ValueProxy {
+    operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); }
+
+    ValueProxy() = default;
+    ValueProxy(ArrayRef<uint8_t> Hash, MutableArrayRef<char> Data)
+        : Hash(Hash), Data(Data) {}
+
+    ArrayRef<uint8_t> Hash;
+    MutableArrayRef<char> Data;
+  };
+
+  /// Validate the trie data structure.
+  ///
+  /// Callback receives the file offset to the data entry and the data stored.
+  Error validate(
+      function_ref<Error(FileOffset, ConstValueProxy)> RecordVerifier) const;
+
+  /// Check the valid range of file offset for OnDiskTrieRawHashMap.
+  static bool validOffset(FileOffset Offset) {
+    return Offset.get() < (1LL << 48);
+  }
+
+public:
+  /// Template class to implement a `pointer` type into the trie data
+  /// structure.
+  ///
+  /// It provides pointer-like operations, e.g., dereference to get the
+  /// underlying data. It also reserves the top 16 bits of the pointer value,
+  /// which can be used to pack additional information if needed.
+  template <class ProxyT> class PointerImpl {
+  public:
+    FileOffset getOffset() const {
+      return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32);
+    }
+
+    explicit operator bool() const { return IsValue; }
+
+    const ProxyT &operator*() const {
+      assert(IsValue);
+      return Value;
+    }
+    const ProxyT *operator->() const {
+      assert(IsValue);
+      return &Value;
+    }
+
+    PointerImpl() = default;
+
+  protected:
+    PointerImpl(ProxyT Value, FileOffset Offset, bool IsValue = true)
+        : Value(Value), OffsetLow32((uint64_t)Offset.get()),
+          OffsetHigh16((uint64_t)Offset.get() >> 32), IsValue(IsValue) {
+      if (IsValue)
+        assert(validOffset(Offset));
+    }
+
+    ProxyT Value;
+    uint32_t OffsetLow32 = 0;
+    uint16_t OffsetHigh16 = 0;
+
+    // True if points to a value (not a "nullptr"). Use an extra field because
+    // 0 can be a valid offset.
+    bool IsValue = false;
+  };
+
+  class pointer;
+  class const_pointer : public PointerImpl<ConstValueProxy> {
+  public:
+    const_pointer() = default;
+
+  private:
+    friend class pointer;
+    friend class OnDiskTrieRawHashMap;
+    using const_pointer::PointerImpl::PointerImpl;
+  };
+
+  class pointer : public PointerImpl<ValueProxy> {
+  public:
+    operator const_pointer() const {
+      return const_pointer(Value, getOffset(), IsValue);
+    }
+
+    pointer() = default;
+
+  private:
+    friend class OnDiskTrieRawHashMap;
+    using pointer::PointerImpl::PointerImpl;
+  };
+
+  /// Find the value from hash.
+  ///
+  /// \returns a pointer to the value if it exists, otherwise returns a
+  /// non-value pointer that evaluates to `false` when converted to boolean.
+  const_pointer find(ArrayRef<uint8_t> Hash) const;
+
+  /// Helper function to recover a pointer into the trie from file offset.
+  Expected<const_pointer> recoverFromFileOffset(FileOffset Offset) const;
+
+  using LazyInsertOnConstructCB =
+      function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue)>;
+  using LazyInsertOnLeakCB =
+      function_ref<void(FileOffset TentativeOffset, ValueProxy TentativeValue,
+                        FileOffset FinalOffset, ValueProxy FinalValue)>;
+
+  /// Insert lazily.
+  ///
+  /// \p OnConstruct is called when ready to insert a value, after allocating
+  /// space for the data. It is called at most once.
+  ///
+  /// \p OnLeak is called only if \p OnConstruct has been called and a race
+  /// occurred before insertion, causing the tentative offset and data to be
+  /// abandoned. This allows clients to clean up other results or update any
+  /// references.
+  ///
+  /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success.
+  /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize
+  /// simultaneous writes, but that seems dangerous to use in a memory-mapped
+  /// file in case a process crashes in the busy state.
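+  ///
+  /// A minimal usage sketch (illustrative only; `Trie` is a hypothetical
+  /// instance whose configured hash size matches the 8-byte key below):
+  /// \code
+  ///   std::array<uint8_t, 8> Hash = {...};
+  ///   Expected<pointer> P = Trie.insertLazy(
+  ///       Hash, [](FileOffset TentativeOffset, ValueProxy TentativeValue) {
+  ///         // Called at most once, to fill in the allocated record data.
+  ///         std::fill(TentativeValue.Data.begin(),
+  ///                   TentativeValue.Data.end(), 0);
+  ///       });
+  /// \endcode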
+  Expected<pointer> insertLazy(ArrayRef<uint8_t> Hash,
+                               LazyInsertOnConstructCB OnConstruct = nullptr,
+                               LazyInsertOnLeakCB OnLeak = nullptr);
+
+  Expected<pointer> insert(const ConstValueProxy &Value) {
+    return insertLazy(Value.Hash, [&](FileOffset, ValueProxy Allocated) {
+      assert(Allocated.Hash == Value.Hash);
+      assert(Allocated.Data.size() == Value.Data.size());
+      llvm::copy(Value.Data, Allocated.Data.begin());
+    });
+  }
+
+  size_t size() const;
+  size_t capacity() const;
+
+  /// Gets or creates a file at \p Path with a hash-mapped trie named \p
+  /// TrieName. The hash size is \p NumHashBits (in bits) and the records store
+  /// data of size \p DataSize (in bytes).
+  ///
+  /// \p MaxFileSize controls the maximum file size to support, limiting the
+  /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting
+  /// size if a new file is created.
+  ///
+  /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to
+  /// configure the trie, if it doesn't already exist.
+  ///
+  /// \pre NumHashBits is a multiple of 8 (byte-aligned).
+  static Expected<OnDiskTrieRawHashMap>
+  create(const Twine &Path, const Twine &TrieName, size_t NumHashBits,
+         uint64_t DataSize, uint64_t MaxFileSize,
+         std::optional<uint64_t> NewFileInitialSize,
+         std::optional<size_t> NewTableNumRootBits = std::nullopt,
+         std::optional<size_t> NewTableNumSubtrieBits = std::nullopt);
+
+  OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS);
+  OnDiskTrieRawHashMap &operator=(OnDiskTrieRawHashMap &&RHS);
+  ~OnDiskTrieRawHashMap();
+
+private:
+  struct ImplType;
+  explicit OnDiskTrieRawHashMap(std::unique_ptr<ImplType> Impl);
+  std::unique_ptr<ImplType> Impl;
+};
+
+} // namespace cas
+} // namespace llvm
+
+#endif // LLVM_CAS_ONDISKTRIERAWHASHMAP_H
diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index 6ed724bc2fd76..cc866f25f3240 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -2,10 +2,12 @@ add_llvm_component_library(LLVMCAS
   ActionCache.cpp
   ActionCaches.cpp
   BuiltinCAS.cpp
+  DatabaseFile.cpp
   InMemoryCAS.cpp
   MappedFileRegionArena.cpp
   ObjectStore.cpp
   OnDiskCommon.cpp
+  OnDiskTrieRawHashMap.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS
diff --git a/llvm/lib/CAS/DatabaseFile.cpp b/llvm/lib/CAS/DatabaseFile.cpp
new file mode 100644
index 0000000000000..db8ce1dc5bb14
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.cpp
@@ -0,0 +1,123 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file implements the common abstractions for the CAS database
+/// file.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DatabaseFile.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::ondisk;
+
+Error ondisk::createTableConfigError(std::errc ErrC, StringRef Path,
+                                     StringRef TableName, const Twine &Msg) {
+  return createStringError(make_error_code(ErrC),
+                           Path + "[" + TableName + "]: " + Msg);
+}
+
+Error ondisk::checkTable(StringRef Label, size_t Expected, size_t Observed,
+                         StringRef Path, StringRef TrieName) {
+  if (Expected == Observed)
+    return Error::success();
+  return createTableConfigError(std::errc::invalid_argument, Path, TrieName,
+                                "mismatched " + Label +
+                                    " (expected: " + Twine(Expected) +
+                                    ", observed: " + Twine(Observed) + ")");
+}
+
+Expected<DatabaseFile>
+DatabaseFile::create(const Twine &Path, uint64_t Capacity,
+                     function_ref<Error(DatabaseFile &)> NewDBConstructor) {
+  // Constructor for if the file doesn't exist.
+  auto NewFileConstructor = [&](MappedFileRegionArena &Alloc) -> Error {
+    if (Alloc.capacity() <
+        sizeof(Header) + sizeof(MappedFileRegionArena::Header))
+      return createTableConfigError(std::errc::argument_out_of_domain,
+                                    Path.str(), "datafile",
+                                    "Allocator too small for header");
+    (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}};
+    DatabaseFile DB(Alloc);
+    return NewDBConstructor(DB);
+  };
+
+  // Get or create the file.
+  MappedFileRegionArena Alloc;
+  if (Error E = MappedFileRegionArena::create(Path, Capacity, sizeof(Header),
+                                              NewFileConstructor)
+                    .moveInto(Alloc))
+    return std::move(E);
+
+  return DatabaseFile::get(
+      std::make_unique<MappedFileRegionArena>(std::move(Alloc)));
+}
+
+Error DatabaseFile::addTable(TableHandle Table) {
+  assert(Table);
+  assert(&Table.getRegion() == &getRegion());
+  int64_t ExistingRootOffset = 0;
+  const int64_t NewOffset =
+      reinterpret_cast<const char *>(&Table.getHeader()) - getRegion().data();
+  if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset,
+                                                 NewOffset))
+    return Error::success();
+
+  // Silently ignore attempts to set the root to itself.
+  if (ExistingRootOffset == NewOffset)
+    return Error::success();
+
+  // Return a proper error message.
+  TableHandle Root(getRegion(), ExistingRootOffset);
+  if (Root.getName() == Table.getName())
+    return createStringError(
+        make_error_code(std::errc::not_supported),
+        "collision with existing table of the same name '" + Table.getName() +
+            "'");
+
+  return createStringError(make_error_code(std::errc::not_supported),
+                           "cannot add new table '" + Table.getName() +
+                               "' to existing root '" + Root.getName() + "'");
+}
+
+std::optional<TableHandle> DatabaseFile::findTable(StringRef Name) {
+  int64_t RootTableOffset = H->RootTableOffset.load();
+  if (!RootTableOffset)
+    return std::nullopt;
+
+  TableHandle Root(getRegion(), RootTableOffset);
+  if (Root.getName() == Name)
+    return Root;
+
+  return std::nullopt;
+}
+
+Error DatabaseFile::validate(MappedFileRegion &Region) {
+  if (Region.size() < sizeof(Header))
+    return createStringError(std::errc::invalid_argument,
+                             "database: missing header");
+
+  // Check the magic and version.
+  auto *H = reinterpret_cast<const Header *>(Region.data());
+  if (H->Magic != getMagic())
+    return createStringError(std::errc::invalid_argument,
+                             "database: bad magic");
+  if (H->Version != getVersion())
+    return createStringError(std::errc::invalid_argument,
+                             "database: wrong version");
+
+  auto *MFH = reinterpret_cast<MappedFileRegionArena::Header *>(Region.data() +
+                                                                sizeof(Header));
+  // Check the bump-ptr, which should point past the header.
+  if (MFH->BumpPtr.load() < (int64_t)sizeof(Header))
+    return createStringError(std::errc::invalid_argument,
+                             "database: corrupt bump-ptr");
+
+  return Error::success();
+}
diff --git a/llvm/lib/CAS/DatabaseFile.h b/llvm/lib/CAS/DatabaseFile.h
new file mode 100644
index 0000000000000..609e5f1357190
--- /dev/null
+++ b/llvm/lib/CAS/DatabaseFile.h
@@ -0,0 +1,153 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file declares the common interface for a DatabaseFile that is used to
+/// implement OnDiskCAS.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CAS_DATABASEFILE_H
+#define LLVM_LIB_CAS_DATABASEFILE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CAS/MappedFileRegionArena.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm::cas::ondisk {
+
+using MappedFileRegion = MappedFileRegionArena::RegionT;
+
+/// Generic handle for a table.
+///
+/// Generic table header layout:
+/// - 2-bytes: TableKind
+/// - 2-bytes: TableNameSize
+/// - 4-bytes: TableNameRelOffset (relative to header)
+class TableHandle {
+public:
+  enum class TableKind : uint16_t {
+    TrieRawHashMap = 1,
+    DataAllocator = 2,
+  };
+  struct Header {
+    TableKind Kind;
+    uint16_t NameSize;
+    int32_t NameRelOffset; ///< Relative to Header.
+  };
+
+  explicit operator bool() const { return H; }
+  const Header &getHeader() const { return *H; }
+  MappedFileRegion &getRegion() const { return *Region; }
+
+  template <class T> static void check() {
+    static_assert(
+        std::is_same<decltype(T::Header::GenericHeader), Header>::value,
+        "T::GenericHeader should be of type TableHandle::Header");
+    static_assert(offsetof(typename T::Header, GenericHeader) == 0,
+                  "T::GenericHeader must be the head of T::Header");
+  }
+  template <class T> bool is() const { return T::Kind == H->Kind; }
+  template <class T> T dyn_cast() const {
+    check<T>();
+    if (is<T>())
+      return T(*Region, *reinterpret_cast<typename T::Header *>(H));
+    return T();
+  }
+  template <class T> T cast() const {
+    assert(is<T>());
+    return dyn_cast<T>();
+  }
+
+  StringRef getName() const {
+    auto *Begin = reinterpret_cast<const char *>(H) + H->NameRelOffset;
+    return StringRef(Begin, H->NameSize);
+  }
+
+  TableHandle() = default;
+  TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {}
+  TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : TableHandle(Region,
+                    *reinterpret_cast<Header *>(Region.data() + HeaderOffset)) {
(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +/// Encapsulate a database file, which: +/// - Sets/checks magic. +/// - Sets/checks version. +/// - Points at an arbitrary root table. +/// - Sets up a MappedFileRegionArena for allocation. +/// +/// Top-level layout: +/// - 4-bytes: Magic +/// - 4-bytes: Version +/// - 8-bytes: RootTableOffset (16-bits: Kind; 48-bits: Offset) +/// - 8-bytes: BumpPtr from MappedFileRegionArena +class DatabaseFile { +public: + static constexpr uint32_t getMagic() { return 0xDA7ABA53UL; } + static constexpr uint32_t getVersion() { return 1UL; } + struct Header { + uint32_t Magic; + uint32_t Version; + std::atomic RootTableOffset; + }; + + const Header &getHeader() { return *H; } + MappedFileRegionArena &getAlloc() { return Alloc; } + MappedFileRegion &getRegion() { return Alloc.getRegion(); } + + /// Add a table. This is currently not thread safe and should be called inside + /// NewDBConstructor. + Error addTable(TableHandle Table); + + /// Find a table. May return null. + std::optional findTable(StringRef Name); + + /// Create the DatabaseFile at Path with Capacity. + static Expected + create(const Twine &Path, uint64_t Capacity, + function_ref NewDBConstructor); + + size_t size() const { return Alloc.size(); } + +private: + static Expected + get(std::unique_ptr Alloc) { + if (Error E = validate(Alloc->getRegion())) + return std::move(E); + return DatabaseFile(std::move(Alloc)); + } + + static Error validate(MappedFileRegion &Region); + + DatabaseFile(MappedFileRegionArena &Alloc) + : H(reinterpret_cast
(Alloc.data())), Alloc(Alloc) {} + DatabaseFile(std::unique_ptr Alloc) + : DatabaseFile(*Alloc) { + OwnedAlloc = std::move(Alloc); + } + + Header *H = nullptr; + MappedFileRegionArena &Alloc; + std::unique_ptr OwnedAlloc; +}; + +Error createTableConfigError(std::errc ErrC, StringRef Path, + StringRef TableName, const Twine &Msg); + +Error checkTable(StringRef Label, size_t Expected, size_t Observed, + StringRef Path, StringRef TrieName); + +} // namespace llvm::cas::ondisk + +#endif diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp new file mode 100644 index 0000000000000..3000c0f0e46f1 --- /dev/null +++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp @@ -0,0 +1,1178 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file Implements OnDiskTrieRawHashMap. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskTrieRawHashMap.h" +#include "DatabaseFile.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/TrieHashIndexGenerator.h" +#include "llvm/CAS/MappedFileRegionArena.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/ThreadPool.h" +#include "llvm/Support/Threading.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +#if LLVM_ENABLE_ONDISK_CAS + +//===----------------------------------------------------------------------===// +// TrieRawHashMap data structures. +//===----------------------------------------------------------------------===// + +namespace { + +class SubtrieHandle; +class TrieRawHashMapHandle; +class TrieVisitor; + +/// A value stored in the slots inside a SubTrie. A stored value can either be a +/// subtrie (encoded after negation) which is the file offset to another +/// subtrie, or it can be a fileset to a DataRecord. 
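A minimal standalone sketch of the sign convention just described, ahead of the class below that implements it (the encode helpers here are hypothetical, for illustration only):

#include <cassert>
#include <cstdint>

// Zero = empty slot; positive = file offset of a DataRecord; negative =
// negated file offset of a child subtrie. One int64_t covers all cases.
static int64_t encodeDataSlot(int64_t FileOffset) { return FileOffset; }
static int64_t encodeSubtrieSlot(int64_t FileOffset) { return -FileOffset; }

int main() {
  int64_t Data = encodeDataSlot(0x1000);
  int64_t Sub = encodeSubtrieSlot(0x2000);
  assert(Data > 0 && Data == 0x1000); // decodes as a data offset
  assert(Sub < 0 && -Sub == 0x2000);  // decodes as a subtrie offset
  assert(encodeDataSlot(0) == 0);     // zero stays "empty"
  return 0;
}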
+class SubtrieSlotValue { +public: + explicit operator bool() const { return !isEmpty(); } + bool isEmpty() const { return !Offset; } + bool isData() const { return Offset > 0; } + bool isSubtrie() const { return Offset < 0; } + uint64_t asData() const { + assert(isData()); + return Offset; + } + uint64_t asSubtrie() const { + assert(isSubtrie()); + return -Offset; + } + + FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); } + + FileOffset asDataFileOffset() const { return FileOffset(asData()); } + + int64_t getRawOffset() const { return Offset; } + + static SubtrieSlotValue getDataOffset(int64_t Offset) { + return SubtrieSlotValue(Offset); + } + + static SubtrieSlotValue getSubtrieOffset(int64_t Offset) { + return SubtrieSlotValue(-Offset); + } + + static SubtrieSlotValue getDataOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getFromSlot(std::atomic &Slot) { + return SubtrieSlotValue(Slot.load()); + } + + SubtrieSlotValue() = default; + +private: + friend class SubtrieHandle; + explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {} + int64_t Offset = 0; +}; + +/// Subtrie layout: +/// - 2-bytes: StartBit +/// - 1-bytes: NumBits=lg(num-slots) +/// - 5-bytes: 0-pad +/// - +class SubtrieHandle { +public: + struct Header { + /// The bit this subtrie starts on. + uint16_t StartBit; + + /// The number of bits this subtrie handles. It has 2^NumBits slots. + uint8_t NumBits; + + /// 0-pad to 8B. + uint8_t ZeroPad1B; + uint32_t ZeroPad4B; + }; + + /// Slot storage: + /// - zero: Empty + /// - positive: RecordOffset + /// - negative: SubtrieOffset + using SlotT = std::atomic; + + static int64_t getSlotsSize(uint32_t NumBits) { + return sizeof(int64_t) * (1u << NumBits); + } + + static int64_t getSize(uint32_t NumBits) { + return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits); + } + + int64_t getSize() const { return getSize(H->NumBits); } + size_t getNumSlots() const { return Slots.size(); } + + SubtrieSlotValue load(size_t I) const { + return SubtrieSlotValue(Slots[I].load()); + } + void store(size_t I, SubtrieSlotValue V) { + return Slots[I].store(V.getRawOffset()); + } + + void printHash(raw_ostream &OS, ArrayRef Bytes) const; + + /// Return None on success, or the existing offset on failure. + bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected, + SubtrieSlotValue New) { + return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset); + } + + /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with + /// \p NumSubtrieBits. + /// + /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a + /// new subtrie is created that isn't used because of a lost race, then it If + /// it's already valid, it should be used instead of allocating a new one. + /// should be returned as an out parameter to be passed back in the future. + /// If it's already valid, it should be used instead of allocating a new one. + /// + /// Returns the subtrie that now lives at \p I. + Expected sink(size_t I, SubtrieSlotValue V, + MappedFileRegionArena &Alloc, + size_t NumSubtrieBits, + SubtrieHandle &UnusedSubtrie, size_t NewI); + + /// Only safe if the subtrie is empty. 
+  void reinitialize(uint32_t StartBit, uint32_t NumBits);
+
+  SubtrieSlotValue getOffset() const {
+    return SubtrieSlotValue::getSubtrieOffset(
+        reinterpret_cast<const char *>(H) - Region->data());
+  }
+
+  FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); }
+
+  explicit operator bool() const { return H; }
+
+  Header &getHeader() const { return *H; }
+  uint32_t getStartBit() const { return H->StartBit; }
+  uint32_t getNumBits() const { return H->NumBits; }
+
+  static Expected<SubtrieHandle> create(MappedFileRegionArena &Alloc,
+                                        uint32_t StartBit, uint32_t NumBits);
+
+  static SubtrieHandle getFromFileOffset(MappedFileRegion &Region,
+                                         FileOffset Offset) {
+    return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset));
+  }
+
+  SubtrieHandle() = default;
+  SubtrieHandle(MappedFileRegion &Region, Header &H)
+      : Region(&Region), H(&H), Slots(getSlots(H)) {}
+  SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset)
+      : SubtrieHandle(Region, *reinterpret_cast<Header *>
( + Region.data() + Offset.asSubtrie())) {} + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; + MutableArrayRef Slots; + + static MutableArrayRef getSlots(Header &H) { + return MutableArrayRef(reinterpret_cast(&H + 1), 1u << H.NumBits); + } +}; + +/// Handle for a TrieRawHashMap table. +/// +/// TrieRawHashMap table layout: +/// - [8-bytes: Generic table header] +/// - 1-byte: NumSubtrieBits +/// - 1-byte: Flags (not used yet) +/// - 2-bytes: NumHashBits +/// - 4-bytes: RecordDataSize (in bytes) +/// - 8-bytes: RootTrieOffset +/// - 8-bytes: AllocatorOffset (reserved for implementing free lists) +/// - '\0' +/// +/// Record layout: +/// - +/// - +class TrieRawHashMapHandle { +public: + static constexpr TableHandle::TableKind Kind = + TableHandle::TableKind::TrieRawHashMap; + + struct Header { + TableHandle::Header GenericHeader; + uint8_t NumSubtrieBits; + uint8_t Flags; ///< None used yet. + uint16_t NumHashBits; + uint32_t RecordDataSize; + std::atomic RootTrieOffset; + std::atomic AllocatorOffset; + }; + + operator TableHandle() const { + if (!H) + return TableHandle(); + return TableHandle(*Region, H->GenericHeader); + } + + struct RecordData { + OnDiskTrieRawHashMap::ValueProxy Proxy; + SubtrieSlotValue Offset; + FileOffset getFileOffset() const { return Offset.asDataFileOffset(); } + }; + + enum Limits : size_t { + /// Seems like 65528 hash bits ought to be enough. + MaxNumHashBytes = UINT16_MAX >> 3, + MaxNumHashBits = MaxNumHashBytes << 3, + + /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit + /// index. This many slots is suspicously large anyway. + MaxNumRootBits = 16, + + /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously + /// large for subtries. + MaxNumSubtrieBits = 10, + }; + + static constexpr size_t getNumHashBytes(size_t NumHashBits) { + assert(NumHashBits % 8 == 0); + return NumHashBits / 8; + } + static constexpr size_t getRecordSize(size_t RecordDataSize, + size_t NumHashBits) { + return RecordDataSize + getNumHashBytes(NumHashBits); + } + + RecordData getRecord(SubtrieSlotValue Offset); + Expected createRecord(MappedFileRegionArena &Alloc, + ArrayRef Hash); + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + SubtrieHandle getRoot() const; + Expected getOrCreateRoot(MappedFileRegionArena &Alloc); + MappedFileRegion &getRegion() const { return *Region; } + + size_t getFlags() const { return H->Flags; } + uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; } + uint64_t getNumHashBits() const { return H->NumHashBits; } + size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); } + size_t getRecordDataSize() const { return H->RecordDataSize; } + size_t getRecordSize() const { + return getRecordSize(H->RecordDataSize, H->NumHashBits); + } + + TrieHashIndexGenerator getIndexGen(SubtrieHandle Root, + ArrayRef Hash) { + assert(Root.getStartBit() == 0); + assert(getNumHashBytes() == Hash.size()); + assert(getNumHashBits() == Hash.size() * 8); + return TrieHashIndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash}; + } + + static Expected + create(MappedFileRegionArena &Alloc, StringRef Name, + std::optional NumRootBits, uint64_t NumSubtrieBits, + uint64_t NumHashBits, uint64_t RecordDataSize); + + void + print(raw_ostream &OS, + function_ref)> PrintRecordData = nullptr) const; + + Error validate( + function_ref + RecordVerifier) const; + TrieRawHashMapHandle() = default; + TrieRawHashMapHandle(MappedFileRegion &Region, Header &H) + 
      : Region(&Region), H(&H) {}
+  TrieRawHashMapHandle(MappedFileRegion &Region, intptr_t HeaderOffset)
+      : TrieRawHashMapHandle(
+            Region, *reinterpret_cast<Header *>
(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +} // end anonymous namespace + +struct OnDiskTrieRawHashMap::ImplType { + DatabaseFile File; + TrieRawHashMapHandle Trie; +}; + +Expected SubtrieHandle::create(MappedFileRegionArena &Alloc, + uint32_t StartBit, + uint32_t NumBits) { + assert(StartBit <= TrieRawHashMapHandle::MaxNumHashBits); + assert(NumBits <= UINT8_MAX); + assert(NumBits <= TrieRawHashMapHandle::MaxNumRootBits); + + auto Mem = Alloc.allocate(getSize(NumBits)); + if (LLVM_UNLIKELY(!Mem)) + return Mem.takeError(); + auto *H = + new (*Mem) SubtrieHandle::Header{(uint16_t)StartBit, (uint8_t)NumBits, + /*ZeroPad1B=*/0, /*ZeroPad4B=*/0}; + SubtrieHandle S(Alloc.getRegion(), *H); + for (auto I = S.Slots.begin(), E = S.Slots.end(); I != E; ++I) + new (I) SlotT(0); + return S; +} + +SubtrieHandle TrieRawHashMapHandle::getRoot() const { + if (int64_t Root = H->RootTrieOffset) + return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Root)); + return SubtrieHandle(); +} + +Expected +TrieRawHashMapHandle::getOrCreateRoot(MappedFileRegionArena &Alloc) { + assert(&Alloc.getRegion() == &getRegion()); + if (SubtrieHandle Root = getRoot()) + return Root; + + int64_t Race = 0; + auto LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits); + if (LLVM_UNLIKELY(!LazyRoot)) + return LazyRoot.takeError(); + if (H->RootTrieOffset.compare_exchange_strong( + Race, LazyRoot->getOffset().asSubtrie())) + return *LazyRoot; + + // There was a race. Return the other root. + // + // TODO: Avoid leaking the lazy root by storing it in an allocator. + return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race)); +} + +Expected +TrieRawHashMapHandle::create(MappedFileRegionArena &Alloc, StringRef Name, + std::optional NumRootBits, + uint64_t NumSubtrieBits, uint64_t NumHashBits, + uint64_t RecordDataSize) { + // Allocate. + auto Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1); + if (LLVM_UNLIKELY(!Offset)) + return Offset.takeError(); + + // Construct the header and the name. + assert(Name.size() <= UINT16_MAX && "Expected smaller table name"); + assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits"); + assert(NumHashBits <= UINT16_MAX && "Expected valid hash size"); + assert(RecordDataSize <= UINT32_MAX && "Expected smaller table name"); + auto *H = new (Alloc.getRegion().data() + *Offset) + Header{{TableHandle::TableKind::TrieRawHashMap, (uint16_t)Name.size(), + (uint32_t)sizeof(Header)}, + (uint8_t)NumSubtrieBits, + /*Flags=*/0, + (uint16_t)NumHashBits, + (uint32_t)RecordDataSize, + /*RootTrieOffset=*/{0}, + /*AllocatorOffset=*/{0}}; + char *NameStorage = reinterpret_cast(H + 1); + llvm::copy(Name, NameStorage); + NameStorage[Name.size()] = 0; + + // Construct a root trie, if requested. 
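Before the root trie is constructed below, it is worth seeing what the allocation above just did: the header and the table name share one allocation, so getName() can later return a StringRef pointing straight into the mapped region. A hedged standalone sketch of that packing (Hdr and the buffer sizes are illustrative, not this patch's types):

#include <cassert>
#include <cstring>
#include <new>
#include <string>

struct Hdr {
  unsigned short NameSize;
  int NameRelOffset; // relative to the start of the header itself
};

int main() {
  std::string Name = "index";
  // One buffer plays the role of allocateOffset(sizeof(Header) +
  // Name.size() + 1): header first, then the name, then a NUL.
  alignas(Hdr) char Buf[sizeof(Hdr) + 16] = {};
  auto *H = new (Buf) Hdr{(unsigned short)Name.size(), (int)sizeof(Hdr)};
  char *NameStorage = reinterpret_cast<char *>(H + 1);
  std::memcpy(NameStorage, Name.data(), Name.size());
  NameStorage[Name.size()] = 0;
  // A getName()-style read: a view into the same buffer, no copy.
  std::string Read(reinterpret_cast<char *>(H) + H->NameRelOffset,
                   H->NameSize);
  assert(Read == Name);
  return 0;
}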
+ TrieRawHashMapHandle Trie(Alloc.getRegion(), *H); + auto Sub = SubtrieHandle::create(Alloc, 0, *NumRootBits); + if (LLVM_UNLIKELY(!Sub)) + return Sub.takeError(); + if (NumRootBits) + H->RootTrieOffset = Sub->getOffset().asSubtrie(); + return Trie; +} + +TrieRawHashMapHandle::RecordData +TrieRawHashMapHandle::getRecord(SubtrieSlotValue Offset) { + char *Begin = Region->data() + Offset.asData(); + OnDiskTrieRawHashMap::ValueProxy Proxy; + Proxy.Data = MutableArrayRef(Begin, getRecordDataSize()); + Proxy.Hash = ArrayRef(reinterpret_cast(Proxy.Data.end()), + getNumHashBytes()); + return RecordData{Proxy, Offset}; +} + +Expected +TrieRawHashMapHandle::createRecord(MappedFileRegionArena &Alloc, + ArrayRef Hash) { + assert(&Alloc.getRegion() == Region); + assert(Hash.size() == getNumHashBytes()); + auto Offset = Alloc.allocateOffset(getRecordSize()); + if (LLVM_UNLIKELY(!Offset)) + return Offset.takeError(); + + RecordData Record = getRecord(SubtrieSlotValue::getDataOffset(*Offset)); + llvm::copy(Hash, const_cast(Record.Proxy.Hash.begin())); + return Record; +} + +Expected +OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const { + // Check alignment. + if (!isAligned(MappedFileRegionArena::getAlign(), Offset.get())) + return createStringError(make_error_code(std::errc::protocol_error), + "unaligned file offset at 0x" + + utohexstr(Offset.get(), /*LowerCase=*/true)); + + // Check bounds. + // + // Note: There's no potential overflow when using \c uint64_t because Offset + // is in valid offset range and the record size is in \c [0,UINT32_MAX]. + if (!validOffset(Offset) || + Offset.get() + Impl->Trie.getRecordSize() > Impl->File.getAlloc().size()) + return createStringError(make_error_code(std::errc::protocol_error), + "file offset too large: 0x" + + utohexstr(Offset.get(), /*LowerCase=*/true)); + + // Looks okay... + TrieRawHashMapHandle::RecordData D = + Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + return const_pointer(D.Proxy, D.getFileOffset()); +} + +OnDiskTrieRawHashMap::const_pointer +OnDiskTrieRawHashMap::find(ArrayRef Hash) const { + TrieRawHashMapHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + SubtrieHandle S = Trie.getRoot(); + if (!S) + return const_pointer(); + + TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash); + size_t Index = IndexGen.next(); + for (;;) { + // Try to set the content. + SubtrieSlotValue V = S.load(Index); + if (!V) + return const_pointer(); + + // Check for an exact match. + if (V.isData()) { + TrieRawHashMapHandle::RecordData D = Trie.getRecord(V); + return D.Proxy.Hash == Hash ? const_pointer(D.Proxy, D.getFileOffset()) + : const_pointer(); + } + + Index = IndexGen.next(); + S = SubtrieHandle(Trie.getRegion(), V); + } +} + +/// Only safe if the subtrie is empty. +void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) { + assert(StartBit > H->StartBit); + assert(NumBits <= H->NumBits); + // Ideally would also assert that all slots are empty, but that's expensive. 
+ + H->StartBit = StartBit; + H->NumBits = NumBits; +} + +Expected +OnDiskTrieRawHashMap::insertLazy(ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct, + LazyInsertOnLeakCB OnLeak) { + TrieRawHashMapHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + MappedFileRegionArena &Alloc = Impl->File.getAlloc(); + std::optional S; + auto Err = Trie.getOrCreateRoot(Alloc).moveInto(S); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + + TrieHashIndexGenerator IndexGen = Trie.getIndexGen(*S, Hash); + size_t Index = IndexGen.next(); + + // Walk through the hash bytes and insert into correct trie position. + std::optional NewRecord; + SubtrieHandle UnusedSubtrie; + for (;;) { + SubtrieSlotValue Existing = S->load(Index); + + // Try to set it, if it's empty. + if (!Existing) { + if (!NewRecord) { + auto Err = Trie.createRecord(Alloc, Hash).moveInto(NewRecord); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + if (OnConstruct) + OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy); + } + + if (S->compare_exchange_strong(Index, Existing, NewRecord->Offset)) + return pointer(NewRecord->Proxy, NewRecord->Offset.asDataFileOffset()); + + // Race means that Existing is no longer empty; fall through... + } + + if (Existing.isSubtrie()) { + S = SubtrieHandle(Trie.getRegion(), Existing); + Index = IndexGen.next(); + continue; + } + + // Check for an exact match. + TrieRawHashMapHandle::RecordData ExistingRecord = Trie.getRecord(Existing); + if (ExistingRecord.Proxy.Hash == Hash) { + if (NewRecord && OnLeak) + OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy, + ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy); + return pointer(ExistingRecord.Proxy, + ExistingRecord.Offset.asDataFileOffset()); + } + + // Sink the existing content as long as the indexes match. + for (;;) { + size_t NextIndex = IndexGen.next(); + size_t NewIndexForExistingContent = + IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash); + + auto Err = S->sink(Index, Existing, Alloc, IndexGen.getNumBits(), + UnusedSubtrie, NewIndexForExistingContent) + .moveInto(S); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + Index = NextIndex; + + // Found the difference. + if (NextIndex != NewIndexForExistingContent) + break; + } + } +} + +Expected SubtrieHandle::sink(size_t I, SubtrieSlotValue V, + MappedFileRegionArena &Alloc, + size_t NumSubtrieBits, + SubtrieHandle &UnusedSubtrie, + size_t NewI) { + std::optional NewS; + if (UnusedSubtrie) { + // Steal UnusedSubtrie and initialize it. + NewS.emplace(); + std::swap(*NewS, UnusedSubtrie); + NewS->reinitialize(getStartBit() + getNumBits(), NumSubtrieBits); + } else { + // Allocate a new, empty subtrie. + auto Err = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(), + NumSubtrieBits) + .moveInto(NewS); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + } + + NewS->store(NewI, V); + if (compare_exchange_strong(I, V, NewS->getOffset())) + return *NewS; // Success! + + // Raced. + assert(V.isSubtrie() && "Expected racing sink() to add a subtrie"); + + // Wipe out the new slot so NewS can be reused and set the out parameter. + NewS->store(NewI, SubtrieSlotValue()); + UnusedSubtrie = *NewS; + + // Return the subtrie added by the concurrent sink() call. 
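The return below hands back the subtrie published by whichever thread won the race. Reduced to a hedged standalone sketch, the publish-or-recycle protocol looks like this (Node, Spare, and publishOrRecycle are illustrative stand-ins, not this patch's API):

#include <atomic>
#include <cassert>

struct Node { int Tag = 0; };

static Node *Spare = nullptr; // one-item free list, like UnusedSubtrie

Node *publishOrRecycle(std::atomic<Node *> &Slot, Node *Fresh) {
  Node *Expected = nullptr;
  if (Slot.compare_exchange_strong(Expected, Fresh))
    return Fresh;  // won: Fresh is now reachable from the structure
  Spare = Fresh;   // lost: keep Fresh around for reuse instead of leaking it
  return Expected; // adopt the node the winning thread installed
}

int main() {
  std::atomic<Node *> Slot{nullptr};
  Node A, B;
  assert(publishOrRecycle(Slot, &A) == &A); // first publish wins
  assert(publishOrRecycle(Slot, &B) == &A); // second loses and recycles B
  assert(Spare == &B);
  return 0;
}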
+ return SubtrieHandle(Alloc.getRegion(), V); +} + +void OnDiskTrieRawHashMap::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { + Impl->Trie.print(OS, PrintRecordData); +} + +Error OnDiskTrieRawHashMap::validate( + function_ref RecordVerifier) const { + return Impl->Trie.validate(RecordVerifier); +} + +// Helper function that prints hexdigit and have a sub-byte starting position. +static void printHexDigits(raw_ostream &OS, ArrayRef Bytes, + size_t StartBit, size_t NumBits) { + assert(StartBit % 4 == 0); + assert(NumBits % 4 == 0); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; I += 4) { + uint8_t HexPair = Bytes[I / 8]; + uint8_t HexDigit = I % 8 == 0 ? HexPair >> 4 : HexPair & 0xf; + OS << hexdigit(HexDigit, /*LowerCase=*/true); + } +} + +static void printBits(raw_ostream &OS, ArrayRef Bytes, size_t StartBit, + size_t NumBits) { + assert(StartBit + NumBits <= Bytes.size() * 8u); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; ++I) { + uint8_t Byte = Bytes[I / 8]; + size_t ByteOffset = I % 8; + if (size_t ByteShift = 8 - ByteOffset - 1) + Byte >>= ByteShift; + OS << (Byte & 0x1 ? '1' : '0'); + } +} + +void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef Bytes) const { + // afb[1c:00*01110*0]def + size_t EndBit = getStartBit() + getNumBits(); + size_t HashEndBit = Bytes.size() * 8u; + + size_t FirstBinaryBit = getStartBit() & ~0x3u; + printHexDigits(OS, Bytes, 0, FirstBinaryBit); + + size_t LastBinaryBit = (EndBit + 3u) & ~0x3u; + OS << "["; + printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit); + OS << "]"; + + printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit); +} + +static void appendIndexBits(std::string &Prefix, size_t Index, + size_t NumSlots) { + std::string Bits; + for (size_t NumBits = 1u; NumBits < NumSlots; NumBits <<= 1) { + Bits.push_back('0' + (Index & 0x1)); + Index >>= 1; + } + for (char Ch : llvm::reverse(Bits)) + Prefix += Ch; +} + +static void printPrefix(raw_ostream &OS, StringRef Prefix) { + while (Prefix.size() >= 4) { + uint8_t Digit; + bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit); + assert(!ErrorParsingBinary); + (void)ErrorParsingBinary; + OS << hexdigit(Digit, /*LowerCase=*/true); + Prefix = Prefix.drop_front(4); + } + if (!Prefix.empty()) + OS << "[" << Prefix << "]"; +} + +LLVM_DUMP_METHOD void OnDiskTrieRawHashMap::dump() const { print(dbgs()); } + +static Expected checkParameter(StringRef Label, size_t Max, + std::optional Value, + std::optional Default, + StringRef Path, StringRef TableName) { + assert(Value || Default); + assert(!Default || *Default <= Max); + if (!Value) + return *Default; + + if (*Value <= Max) + return *Value; + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TableName, + "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")"); +} + +size_t OnDiskTrieRawHashMap::size() const { return Impl->File.size(); } +size_t OnDiskTrieRawHashMap::capacity() const { + return Impl->File.getRegion().size(); +} + +Expected +OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine, + size_t NumHashBits, uint64_t DataSize, + uint64_t MaxFileSize, + std::optional NewFileInitialSize, + std::optional NewTableNumRootBits, + std::optional NewTableNumSubtrieBits) { + SmallString<128> PathStorage; + StringRef Path = PathTwine.toStringRef(PathStorage); + SmallString<128> TrieNameStorage; + StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage); + + constexpr size_t DefaultNumRootBits = 
10; + constexpr size_t DefaultNumSubtrieBits = 6; + + size_t NumRootBits; + if (Error E = checkParameter( + "root bits", TrieRawHashMapHandle::MaxNumRootBits, + NewTableNumRootBits, DefaultNumRootBits, Path, TrieName) + .moveInto(NumRootBits)) + return std::move(E); + + size_t NumSubtrieBits; + if (Error E = checkParameter("subtrie bits", + TrieRawHashMapHandle::MaxNumSubtrieBits, + NewTableNumSubtrieBits, DefaultNumSubtrieBits, + Path, TrieName) + .moveInto(NumSubtrieBits)) + return std::move(E); + + size_t NumHashBytes = NumHashBits >> 3; + if (Error E = + checkParameter("hash size", TrieRawHashMapHandle::MaxNumHashBits, + NumHashBits, std::nullopt, Path, TrieName) + .takeError()) + return std::move(E); + assert(NumHashBits == NumHashBytes << 3 && + "Expected hash size to be byte-aligned"); + if (NumHashBits != NumHashBytes << 3) + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TrieName, + "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)"); + + // Constructor for if the file doesn't exist. + auto NewDBConstructor = [&](DatabaseFile &DB) -> Error { + auto Trie = + TrieRawHashMapHandle::create(DB.getAlloc(), TrieName, NumRootBits, + NumSubtrieBits, NumHashBits, DataSize); + if (LLVM_UNLIKELY(!Trie)) + return Trie.takeError(); + + return DB.addTable(*Trie); + }; + + // Get or create the file. + Expected File = + DatabaseFile::create(Path, MaxFileSize, NewDBConstructor); + if (!File) + return File.takeError(); + + // Find the trie and validate it. + std::optional Table = File->findTable(TrieName); + if (!Table) + return createTableConfigError(std::errc::argument_out_of_domain, Path, + TrieName, "table not found"); + if (Error E = checkTable("table kind", (size_t)TrieRawHashMapHandle::Kind, + (size_t)Table->getHeader().Kind, Path, TrieName)) + return std::move(E); + auto Trie = Table->cast(); + assert(Trie && "Already checked the kind"); + + // Check the hash and data size. + if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(), + Path, TrieName)) + return std::move(E); + if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(), + Path, TrieName)) + return std::move(E); + + // No flags supported right now. Either corrupt, or coming from a future + // writer. + if (size_t Flags = Trie.getFlags()) + return createTableConfigError(std::errc::invalid_argument, Path, TrieName, + "unsupported flags: " + Twine(Flags)); + + // Success. + OnDiskTrieRawHashMap::ImplType Impl{DatabaseFile(std::move(*File)), Trie}; + return OnDiskTrieRawHashMap(std::make_unique(std::move(Impl))); +} + +static Error createInvalidTrieError(uint64_t Offset, const Twine &Msg) { + return createStringError(make_error_code(std::errc::protocol_error), + "invalid trie at 0x" + + utohexstr(Offset, /*LowerCase=*/true) + ": " + + Msg); +} + +//===----------------------------------------------------------------------===// +// TrieVisitor data structures. +//===----------------------------------------------------------------------===// + +namespace { +/// A multi-threaded vistior to traverse the Trie. +/// +/// TODO: add more sanity checks that isn't just plain data corruption. For +/// example, some ill-formed data can be constructed to form a cycle using +/// Sub-Tries and it can lead to inifinite loop when visiting (or inserting +/// data). 
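One plausible hardening against the cycles that TODO describes is a visited-offset guard threaded through the traversal implemented by the class below; a hedged sketch of the idea (not implemented by this patch):

#include <cstdint>
#include <set>

// Returns false when SubtrieOffset was already seen on this traversal,
// i.e. the on-disk links form a cycle and recursion must stop.
bool enterSubtrie(int64_t SubtrieOffset, std::set<int64_t> &Seen) {
  return Seen.insert(SubtrieOffset).second;
}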
+class TrieVisitor { +public: + TrieVisitor(TrieRawHashMapHandle Trie, unsigned ThreadCount = 0, + unsigned ErrorLimit = 50) + : Trie(Trie), ErrorLimit(ErrorLimit), + Threads(hardware_concurrency(ThreadCount)) {} + virtual ~TrieVisitor() = default; + Error visit(); + +private: + // Virtual method to implement the action when visiting a sub-trie. + virtual Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) { + return Error::success(); + } + + // Virtual method to implement the action when visiting a slot in a trie node. + virtual Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, + SubtrieSlotValue Slot) { + return Error::success(); + } + +protected: + TrieRawHashMapHandle Trie; + +private: + Error traverseTrieNode(SubtrieHandle Node, StringRef Prefix); + + Error validateSubTrie(SubtrieHandle Node, bool IsRoot); + + // Helper function to capture errors when visiting the trie nodes. + void addError(Error NewError) { + assert(NewError && "not an error"); + std::lock_guard ErrorLock(Lock); + if (NumError >= ErrorLimit) { + // Too many errors. + consumeError(std::move(NewError)); + return; + } + + if (Err) + Err = joinErrors(std::move(*Err), std::move(NewError)); + else + Err = std::move(NewError); + NumError++; + } + + bool tooManyErrors() { + std::lock_guard ErrorLock(Lock); + return (bool)Err && NumError >= ErrorLimit; + } + + const unsigned ErrorLimit; + std::optional Err; + unsigned NumError = 0; + std::mutex Lock; + DefaultThreadPool Threads; +}; + +/// A visitor that traverse and print the Trie. +class TriePrinter : public TrieVisitor { +public: + TriePrinter(TrieRawHashMapHandle Trie, raw_ostream &OS, + function_ref)> PrintRecordData) + : TrieVisitor(Trie, /*ThreadCount=*/1), OS(OS), + PrintRecordData(PrintRecordData) {} + + Error printRecords() { + if (Records.empty()) + return Error::success(); + + OS << "records\n"; + llvm::sort(Records); + for (int64_t Offset : Records) { + TrieRawHashMapHandle::RecordData Record = + Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + if (auto Err = printRecord(Record)) + return Err; + } + return Error::success(); + } + + Error printRecord(TrieRawHashMapHandle::RecordData &Record) { + OS << "- addr=" << (void *)Record.getFileOffset().get() << " "; + if (PrintRecordData) { + PrintRecordData(Record.Proxy.Data); + } else { + OS << "bytes="; + ArrayRef Data( + reinterpret_cast(Record.Proxy.Data.data()), + Record.Proxy.Data.size()); + printHexDigits(OS, Data, 0, Data.size() * 8); + } + OS << "\n"; + return Error::success(); + } + + Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) override { + if (Prefix.empty()) { + OS << "root"; + } else { + OS << "subtrie="; + printPrefix(OS, Prefix); + } + + OS << " addr=" + << (void *)(reinterpret_cast(&SubTrie.getHeader()) - + Trie.getRegion().data()); + OS << " num-slots=" << SubTrie.getNumSlots() << "\n"; + return Error::success(); + } + + Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, + SubtrieSlotValue Slot) override { + OS << "- index="; + for (size_t Pad : {10, 100, 1000}) + if (I < Pad && Subtrie.getNumSlots() >= Pad) + OS << "0"; + OS << I << " "; + if (Slot.isSubtrie()) { + OS << "addr=" << (void *)Slot.asSubtrie(); + OS << " subtrie="; + printPrefix(OS, Prefix); + OS << "\n"; + return Error::success(); + } + TrieRawHashMapHandle::RecordData Record = Trie.getRecord(Slot); + OS << "addr=" << (void *)Record.getFileOffset().get(); + OS << " content="; + Subtrie.printHash(OS, Record.Proxy.Hash); + OS << "\n"; + Records.push_back(Slot.asData()); 
+ return Error::success(); + } + +private: + raw_ostream &OS; + function_ref)> PrintRecordData; + SmallVector Records; +}; + +/// TrieVerifier that adds additional verification on top of the basic visitor. +class TrieVerifier : public TrieVisitor { +public: + TrieVerifier( + TrieRawHashMapHandle Trie, + function_ref + RecordVerifier) + : TrieVisitor(Trie), RecordVerifier(RecordVerifier) {} + +private: + Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) final { + return Error::success(); + } + + Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, + SubtrieSlotValue Slot) final { + if (RecordVerifier && Slot.isData()) { + if (!isAligned(MappedFileRegionArena::getAlign(), Slot.asData())) + return createInvalidTrieError(Slot.asData(), "mis-aligned data entry"); + + TrieRawHashMapHandle::RecordData Record = + Trie.getRecord(SubtrieSlotValue::getDataOffset(Slot.asData())); + return RecordVerifier(Slot.asDataFileOffset(), + OnDiskTrieRawHashMap::ConstValueProxy{ + Record.Proxy.Hash, Record.Proxy.Data}); + } + return Error::success(); + } + + function_ref + RecordVerifier; +}; +} // namespace + +Error TrieVisitor::visit() { + auto Root = Trie.getRoot(); + if (!Root) + return Error::success(); + + if (auto Err = validateSubTrie(Root, /*IsRoot=*/true)) + return Err; + + if (auto Err = visitSubTrie("", Root)) + return Err; + + SmallVector Subs; + SmallVector Prefixes; + const size_t NumSlots = Root.getNumSlots(); + for (size_t I = 0, E = NumSlots; I != E; ++I) { + SubtrieSlotValue Slot = Root.load(I); + if (!Slot) + continue; + uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData(); + if (Offset >= (uint64_t)Trie.getRegion().size()) + return createInvalidTrieError(Offset, "slot points out of bound"); + std::string SubtriePrefix; + appendIndexBits(SubtriePrefix, I, NumSlots); + if (Slot.isSubtrie()) { + SubtrieHandle S(Trie.getRegion(), Slot); + Subs.push_back(S); + Prefixes.push_back(SubtriePrefix); + } + if (auto Err = visitSlot(I, Root, SubtriePrefix, Slot)) + return Err; + } + + for (size_t I = 0, E = Subs.size(); I != E; ++I) { + Threads.async( + [&](unsigned Idx) { + // Don't run if there is an error already. 
+ if (tooManyErrors()) + return; + if (auto Err = traverseTrieNode(Subs[Idx], Prefixes[Idx])) + addError(std::move(Err)); + }, + I); + } + + Threads.wait(); + if (Err) + return std::move(*Err); + return Error::success(); +} + +Error TrieVisitor::validateSubTrie(SubtrieHandle Node, bool IsRoot) { + char *Addr = reinterpret_cast(&Node.getHeader()); + const int64_t Offset = Node.getFileOffset().get(); + if (Addr + Node.getSize() >= + Trie.getRegion().data() + Trie.getRegion().size()) + return createInvalidTrieError(Offset, "subtrie node spans out of bound"); + + if (!IsRoot && + Node.getStartBit() + Node.getNumBits() > Trie.getNumHashBits()) { + return createInvalidTrieError(Offset, + "subtrie represents too many hash bits"); + } + + if (IsRoot) { + if (Node.getStartBit() != 0) + return createInvalidTrieError(Offset, + "root node doesn't start at 0 index"); + + return Error::success(); + } + + if (Node.getNumBits() > Trie.getNumSubtrieBits()) + return createInvalidTrieError(Offset, "subtrie has wrong number of slots"); + + return Error::success(); +} + +Error TrieVisitor::traverseTrieNode(SubtrieHandle Node, StringRef Prefix) { + if (auto Err = validateSubTrie(Node, /*IsRoot=*/false)) + return Err; + + if (auto Err = visitSubTrie(Prefix, Node)) + return Err; + + SmallVector Subs; + SmallVector Prefixes; + const size_t NumSlots = Node.getNumSlots(); + for (size_t I = 0, E = NumSlots; I != E; ++I) { + SubtrieSlotValue Slot = Node.load(I); + if (!Slot) + continue; + uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData(); + if (Offset >= (uint64_t)Trie.getRegion().size()) + return createInvalidTrieError(Offset, "slot points out of bound"); + std::string SubtriePrefix = Prefix.str(); + appendIndexBits(SubtriePrefix, I, NumSlots); + if (Slot.isSubtrie()) { + SubtrieHandle S(Trie.getRegion(), Slot); + Subs.push_back(S); + Prefixes.push_back(SubtriePrefix); + } + if (auto Err = visitSlot(I, Node, SubtriePrefix, Slot)) + return Err; + } + for (size_t I = 0, E = Subs.size(); I != E; ++I) + if (auto Err = traverseTrieNode(Subs[I], Prefixes[I])) + return Err; + + return Error::success(); +} + +void TrieRawHashMapHandle::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { + OS << "hash-num-bits=" << getNumHashBits() + << " hash-size=" << getNumHashBytes() + << " record-data-size=" << getRecordDataSize() << "\n"; + + TriePrinter Printer(*this, OS, PrintRecordData); + if (auto Err = Printer.visit()) + OS << "error: " << toString(std::move(Err)) << "\n"; + + if (auto Err = Printer.printRecords()) + OS << "error: " << toString(std::move(Err)) << "\n"; + + return; +} + +Error TrieRawHashMapHandle::validate( + function_ref + RecordVerifier) const { + // Use the base TrieVisitor to identify the errors inside trie first. + TrieVisitor BasicVerifier(*this); + if (auto Err = BasicVerifier.visit()) + return Err; + + // If the trie data structure is sound, do a second pass to verify data and + // verifier function can assume the index is correct. However, there can be + // newly added bad entries that can still produce error. 
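The RecordVerifier handed to the second pass below is caller-supplied. A hedged sketch of such a callback, mirroring the one the unit test later in this patch installs (NumHashBytes and DataSize stand in for the caller's table configuration):

auto RecordVerify =
    [&](FileOffset Offset,
        OnDiskTrieRawHashMap::ConstValueProxy Proxy) -> Error {
  // Both widths are fixed at table-creation time, so any mismatch means
  // a corrupt or foreign record.
  if (Proxy.Hash.size() != NumHashBytes)
    return createStringError("wrong hash size");
  if (Proxy.Data.size() != DataSize)
    return createStringError("wrong data size");
  return Error::success();
};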
+ TrieVerifier Verifier(*this, RecordVerifier); + return Verifier.visit(); +} + +#else // !LLVM_ENABLE_ONDISK_CAS + +struct OnDiskTrieRawHashMap::ImplType {}; + +Expected +OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine, + size_t NumHashBits, uint64_t DataSize, + uint64_t MaxFileSize, + std::optional NewFileInitialSize, + std::optional NewTableNumRootBits, + std::optional NewTableNumSubtrieBits) { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +Expected +OnDiskTrieRawHashMap::insertLazy(ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct, + LazyInsertOnLeakCB OnLeak) { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +Expected +OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +OnDiskTrieRawHashMap::const_pointer +OnDiskTrieRawHashMap::find(ArrayRef Hash) const { + return const_pointer(); +} + +void OnDiskTrieRawHashMap::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { +} + +Error OnDiskTrieRawHashMap::validate( + function_ref + RecordVerifier) const { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +size_t OnDiskTrieRawHashMap::size() const { return 0; } +size_t OnDiskTrieRawHashMap::capacity() const { return 0; } + +#endif // LLVM_ENABLE_ONDISK_CAS + +OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(std::unique_ptr Impl) + : Impl(std::move(Impl)) {} +OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS) = + default; +OnDiskTrieRawHashMap & +OnDiskTrieRawHashMap::operator=(OnDiskTrieRawHashMap &&RHS) = default; +OnDiskTrieRawHashMap::~OnDiskTrieRawHashMap() = default; diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt index ab709e30369bf..0f8fcb9e98954 100644 --- a/llvm/unittests/CAS/CMakeLists.txt +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -1,7 +1,3 @@ -if (LLVM_ENABLE_ONDISK_CAS) - add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1) -endif() - set(LLVM_LINK_COMPONENTS Support CAS @@ -12,6 +8,7 @@ add_llvm_unittest(CASTests ActionCacheTest.cpp CASTestConfig.cpp ObjectStoreTest.cpp + OnDiskTrieRawHashMapTest.cpp ProgramTest.cpp ) diff --git a/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp new file mode 100644 index 0000000000000..7bedfe4b29e30 --- /dev/null +++ b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp @@ -0,0 +1,220 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskTrieRawHashMap.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +#if LLVM_ENABLE_ONDISK_CAS +using namespace llvm; +using namespace llvm::cas; + +namespace { + +struct OnDiskTrieRawHashMapTestFixture + : public ::testing::TestWithParam { + static constexpr size_t MB = 1024u * 1024u; + static constexpr size_t DataSize = 8; // Multiple of 8B. + + std::optional Temp; + size_t NumHashBytes; + + void SetUp() override { + Temp.emplace("trie-raw-hash-map", /*Unique=*/true); + NumHashBytes = GetParam(); + } + void TearDown() override { Temp.reset(); } + + Expected createTrie() { + size_t NumHashBits = NumHashBytes * 8; + return OnDiskTrieRawHashMap::create( + Temp->path((Twine(NumHashBytes) + "B").str()), "index", + /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB, + /*NewInitialFileSize=*/std::nullopt); + } +}; + +// Create tries with various sizes of hash and with data. +TEST_P(OnDiskTrieRawHashMapTestFixture, General) { + std::optional Trie1; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded()); + std::optional Trie2; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + + uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes); + auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes); + constexpr StringLiteral Data0v1Bytes = "data0.v1"; + constexpr StringLiteral Data0v2Bytes = "data0.v2"; + constexpr StringLiteral Data1Bytes = "data1..."; + static_assert(Data0v1Bytes.size() == DataSize, "math error"); + static_assert(Data0v2Bytes.size() == DataSize, "math error"); + static_assert(Data1Bytes.size() == DataSize, "math error"); + ArrayRef Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size()); + ArrayRef Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size()); + ArrayRef Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size()); + + // Lookup when trie is empty. + EXPECT_FALSE(Trie1->find(Hash0)); + + // Insert. + std::optional Offset; + std::optional> Data; + { + std::optional Insertion; + ASSERT_THAT_ERROR(Trie1->insert({Hash0, Data0v1}).moveInto(Insertion), + Succeeded()); + EXPECT_EQ(Hash0, (*Insertion)->Hash); + EXPECT_EQ(Data0v1, (*Insertion)->Data); + EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data())); + + Offset = Insertion->getOffset(); + Data = (*Insertion)->Data; + } + + // Find. + { + auto Lookup = Trie1->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find in a different instance of the same on-disk trie that existed + // before the insertion. + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Create a new instance and check that too. 
+ Trie2.reset(); + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Change the data. + llvm::copy(Data0v2, Data->data()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v2, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find different hash. + EXPECT_FALSE(Trie1->find(Hash1)); + EXPECT_FALSE(Trie2->find(Hash1)); + + // Recover from an offset. + { + OnDiskTrieRawHashMap::const_pointer Recovered; + ASSERT_THAT_ERROR(Trie1->recoverFromFileOffset(*Offset).moveInto(Recovered), + Succeeded()); + ASSERT_TRUE(Recovered); + EXPECT_EQ(Offset->get(), Recovered.getOffset().get()); + EXPECT_EQ(Hash0, Recovered->Hash); + EXPECT_EQ(Data0v2, Recovered->Data); + } + + // Recover from a bad offset. + { + FileOffset BadOffset(1); + OnDiskTrieRawHashMap::const_pointer Recovered; + ASSERT_THAT_ERROR( + Trie1->recoverFromFileOffset(BadOffset).moveInto(Recovered), Failed()); + } + + // Insert another thing. + { + std::optional Insertion; + ASSERT_THAT_ERROR(Trie1->insert({Hash1, Data1}).moveInto(Insertion), + Succeeded()); + EXPECT_EQ(Hash1, (*Insertion)->Hash); + EXPECT_EQ(Data1, (*Insertion)->Data); + EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data())); + + EXPECT_NE(Offset->get(), Insertion->getOffset().get()); + } + + // Validate. + { + auto RecordVerify = + [&](FileOffset Offset, + OnDiskTrieRawHashMap::ConstValueProxy Proxy) -> Error { + if (Proxy.Hash.size() != NumHashBytes) + return createStringError("wrong hash size"); + if (Proxy.Data.size() != DataSize) + return createStringError("wrong data size"); + + return Error::success(); + }; + ASSERT_THAT_ERROR(Trie1->validate(RecordVerify), Succeeded()); + ASSERT_THAT_ERROR(Trie2->validate(RecordVerify), Succeeded()); + } + + // Size and capacity. + { + EXPECT_EQ(Trie1->capacity(), MB); + EXPECT_EQ(Trie2->capacity(), MB); + EXPECT_LE(Trie1->size(), MB); + EXPECT_LE(Trie2->size(), MB); + } +} + +INSTANTIATE_TEST_SUITE_P(OnDiskTrieRawHashMapTest, + OnDiskTrieRawHashMapTestFixture, + ::testing::Values(1, 2, 4, 8)); + +TEST(OnDiskTrieRawHashMapTest, OutOfSpace) { + unittest::TempDir Temp("trie-raw-hash-map", /*Unique=*/true); + std::optional Trie; + + // Too small to create header. + ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create( + Temp.path("NoSpace1").str(), "index", + /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/8, + /*NewInitialFileSize=*/std::nullopt) + .moveInto(Trie), + Failed()); + + // Just enough for root node but not enough for any insertion. 
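Outside of a unit test, the failing insertion asserted on below would more likely be handled; a hedged sketch using the same insert() API (the recovery policy is illustrative):

// Tolerate allocator exhaustion instead of aborting.
if (auto Ins = Trie->insert({Hash0, Data0v1}); !Ins) {
  // One policy is log-and-drop; a caller could instead recreate the
  // table with a larger MaxFileSize, or fall back to a secondary store.
  llvm::consumeError(Ins.takeError());
}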
+ ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create( + Temp.path("NoSpace2").str(), "index", + /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/118, + /*NewInitialFileSize=*/std::nullopt, + /*NewTableNumRootBits=*/1, /*NewTableNumSubtrieBits=*/1) + .moveInto(Trie), + Succeeded()); + uint8_t Hash0Bytes[1] = {0}; + auto Hash0 = ArrayRef(Hash0Bytes); + constexpr StringLiteral Data0v1Bytes = "data0.v1"; + ArrayRef Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size()); + std::optional Insertion; + ASSERT_THAT_ERROR(Trie->insert({Hash0, Data0v1}).moveInto(Insertion), + Failed()); +} + +} // namespace + +#endif // LLVM_ENABLE_ONDISK_CAS From eef7a7663d2701c4fb073f749a6b9b7da1adc9b8 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 29 Sep 2025 21:05:37 +0100 Subject: [PATCH 179/878] [lldb][DWARFASTParserClang] Simplify obsolete error condition for malformed array member type offsets (#160132) First time check was introduced in `fa3ab4599d717feedbb83e08e7f654913942520b` to work around a debug-info generation bug in Clang. This bug was fixed in Clang-4. The check has since been adjusted (first in `808ff186f6a6ba1fd38cc7e00697cd82f4afe540`, and then most recently in `370db9c62910195e664e82dde6f0adb3e255a4fd`). This check is getting quite convoluted, and all it does is turn an `array[1]` into an `array[0]` type when it is deemed correct. At this point the workaround probably never fires, apart from actually valid codegen. This patch removes the special conditions and emits the error specifically in those cases where we know the DWARF is malformed. Added some shell tests for the error case. --- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 37 ------ ...ncomplete-member-beyond-parent-bounds.yaml | 104 +++++++++++++++++ .../DWARF/member-beyond-parent-bounds.yaml | 109 ++++++++++++++++++ .../DWARF/member-on-parent-bounds.yaml | 109 ++++++++++++++++++ .../DWARF/union-types-no-member-location.yaml | 4 - .../zero-sized-member-in-parent-bounds.yaml | 105 +++++++++++++++++ 6 files changed, 427 insertions(+), 41 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml create mode 100644 lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml create mode 100644 lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml create mode 100644 lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index f1e73d73a733b..82e9d867c3ac0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -3126,43 +3126,6 @@ void DWARFASTParserClang::ParseSingleMember( if (!member_clang_type.IsCompleteType()) member_clang_type.GetCompleteType(); - { - // Older versions of clang emit the same DWARF for array[0] and array[1]. If - // the current field is at the end of the structure, then there is - // definitely no room for extra elements and we override the type to - // array[0]. This was fixed by f454dfb6b5af. 
- CompilerType member_array_element_type; - uint64_t member_array_size; - bool member_array_is_incomplete; - - if (member_clang_type.IsArrayType(&member_array_element_type, - &member_array_size, - &member_array_is_incomplete) && - !member_array_is_incomplete) { - uint64_t parent_byte_size = - parent_die.GetAttributeValueAsUnsigned(DW_AT_byte_size, UINT64_MAX); - - // If the attrs.member_byte_offset is still set to UINT32_MAX this means - // that the DW_TAG_member didn't have a DW_AT_data_member_location, so - // don't emit an error if this is the case. - if (attrs.member_byte_offset != UINT32_MAX && - attrs.member_byte_offset >= parent_byte_size) { - if (member_array_size != 1 && - (member_array_size != 0 || - attrs.member_byte_offset > parent_byte_size)) { - module_sp->ReportError( - "{0:x8}: DW_TAG_member '{1}' refers to type {2:x16}" - " which extends beyond the bounds of {3:x8}", - die.GetID(), attrs.name, - attrs.encoding_form.Reference().GetOffset(), parent_die.GetID()); - } - - member_clang_type = - m_ast.CreateArrayType(member_array_element_type, 0, false); - } - } - } - TypeSystemClang::RequireCompleteType(member_clang_type); clang::FieldDecl *field_decl = TypeSystemClang::AddFieldToRecordType( diff --git a/lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml new file mode 100644 index 0000000000000..4e659d02b3cd4 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/incomplete-member-beyond-parent-bounds.yaml @@ -0,0 +1,104 @@ +# This is DWARF where we placed an incomplete type +# at an offset that is the parent DW_AT_byte_size. Check +# that we don't report an error in such cases. +# +# DW_TAG_compile_unit +# DW_AT_name ("main.cpp") +# DW_AT_language (DW_LANG_C) +# +# DW_TAG_structure_type +# DW_AT_name ("Incomplete") +# DW_AT_external (true) +# +# DW_TAG_structure_type +# DW_AT_name ("Foo") +# DW_AT_byte_size (0x04) +# +# DW_TAG_member +# DW_AT_name ("mem") +# DW_AT_data_member_location ("0x04") +# DW_AT_type (0x00000011 "Incomplete") +# +# NULL +# +# NULL + +# RUN: yaml2obj %s > %t +# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s + +# CHECK: Found 1 types: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_str: + - main.cpp + - Incomplete + - Foo + - mem + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x3 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x4 + Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + debug_info: + - Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 + - Value: 0x2 + - AbbrCode: 0x2 + Values: + - Value: 0x9 + - AbbrCode: 0x3 + Values: + - Value: 0x14 + - Value: 0x04 + - AbbrCode: 0x4 + Values: + - Value: 0x18 + - Value: 0x11 + - Value: 0x04 + 
- AbbrCode: 0x0 + - AbbrCode: 0x0 +... diff --git a/lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml new file mode 100644 index 0000000000000..2ac538ed1a851 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/member-beyond-parent-bounds.yaml @@ -0,0 +1,109 @@ +# This is malformed DWARF where we placed a non-zero sized type +# at an offset that is larger the parent DW_AT_byte_size. Check +# that we report an error in such cases. +# +# DW_TAG_compile_unit +# DW_AT_name ("main.cpp") +# DW_AT_language (DW_LANG_C) +# +# DW_TAG_base_type +# DW_AT_name ("int") +# DW_AT_encoding (DW_ATE_signed) +# DW_AT_byte_size (0x04) +# +# DW_TAG_structure_type +# DW_AT_name ("Foo") +# DW_AT_byte_size (0x04) +# +# DW_TAG_member +# DW_AT_name ("mem") +# DW_AT_data_member_location ("0x05") +# DW_AT_type (0x00000011 "int") +# +# NULL +# +# NULL + +# RUN: yaml2obj %s > %t +# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s + +# CHECK: Found 1 types: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_str: + - main.cpp + - int + - Foo + - mem + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x3 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x4 + Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + debug_info: + - Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 + - Value: 0x2 + - AbbrCode: 0x2 + Values: + - Value: 0x9 + - Value: 0x5 + - Value: 0x4 + - AbbrCode: 0x3 + Values: + - Value: 0x0d + - Value: 0x04 + - AbbrCode: 0x4 + Values: + - Value: 0x11 + - Value: 0x11 + - Value: 0x05 + - AbbrCode: 0x0 + - AbbrCode: 0x0 +... diff --git a/lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml new file mode 100644 index 0000000000000..736697c002ee6 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/member-on-parent-bounds.yaml @@ -0,0 +1,109 @@ +# This is malformed DWARF where we placed a non-zero sized type +# at an offset that is the parent DW_AT_byte_size. Check +# that we report an error in such cases. 
+# +# DW_TAG_compile_unit +# DW_AT_name ("main.cpp") +# DW_AT_language (DW_LANG_C) +# +# DW_TAG_base_type +# DW_AT_name ("int") +# DW_AT_encoding (DW_ATE_signed) +# DW_AT_byte_size (0x04) +# +# DW_TAG_structure_type +# DW_AT_name ("Foo") +# DW_AT_byte_size (0x04) +# +# DW_TAG_member +# DW_AT_name ("mem") +# DW_AT_data_member_location ("0x04") +# DW_AT_type (0x00000011 "int") +# +# NULL +# +# NULL + +# RUN: yaml2obj %s > %t +# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s + +# CHECK: Found 1 types: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_str: + - main.cpp + - int + - Foo + - mem + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x3 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x4 + Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + debug_info: + - Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 + - Value: 0x2 + - AbbrCode: 0x2 + Values: + - Value: 0x9 + - Value: 0x5 + - Value: 0x4 + - AbbrCode: 0x3 + Values: + - Value: 0x0d + - Value: 0x04 + - AbbrCode: 0x4 + Values: + - Value: 0x11 + - Value: 0x11 + - Value: 0x04 + - AbbrCode: 0x0 + - AbbrCode: 0x0 +... diff --git a/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml b/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml index fbdc626ed113f..1d1e129cdb7c0 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml +++ b/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml @@ -48,14 +48,10 @@ # RUN: yaml2obj %s > %t # RUN: lldb-test symbols --name=UnionType --find=type %t > %t.stdout # RUN: cat %t.stdout | FileCheck --check-prefix=STDOUT %s -# RUN: lldb-test symbols --name=UnionType --find=type %t 2> %t.stderr -# RUN: cat %t.stderr | FileCheck --allow-empty --check-prefix=STDERR %s # STDOUT: Found 1 types: # STDOUT: {{(0x)?[0-9a-fA-F]+}}: Type{0x0000002b} , name = "UnionType", size = 32, compiler_type = 0x{{[0-9a-fA-F]+}} union UnionType { -# STDERR-NOT: error: union-types-no-member-location.yaml.tmp 0x00000031: DW_TAG_member 'array' refers to type 0x000000000000001f which extends beyond the bounds of 0x0000002b - --- !ELF FileHeader: Class: ELFCLASS64 diff --git a/lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml b/lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml new file mode 100644 index 0000000000000..a98f62cd34056 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/zero-sized-member-in-parent-bounds.yaml @@ -0,0 +1,105 @@ +# This is DWARF where we placed a zero-sized type +# at an offset that is the parent DW_AT_byte_size. Check +# that we don't report an error in such cases. 
+# +# DW_TAG_compile_unit +# DW_AT_name ("main.cpp") +# DW_AT_language (DW_LANG_C) +# +# DW_TAG_structure_type +# DW_AT_name ("Bar") +# DW_AT_byte_size (0x00) +# +# DW_TAG_structure_type +# DW_AT_name ("Foo") +# DW_AT_byte_size (0x04) +# +# DW_TAG_member +# DW_AT_name ("mem") +# DW_AT_data_member_location ("0x04") +# DW_AT_type (0x00000011 "Bar") +# +# NULL +# +# NULL + +# RUN: yaml2obj %s > %t +# RUN: lldb-test symbols --name=Foo --find=type %t 2>&1 | FileCheck %s + +# CHECK: Found 1 types: + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_str: + - main.cpp + - Bar + - Foo + - mem + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x3 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x4 + Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + debug_info: + - Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 + - Value: 0x2 + - AbbrCode: 0x2 + Values: + - Value: 0x9 + - Value: 0x0 + - AbbrCode: 0x3 + Values: + - Value: 0x0d + - Value: 0x04 + - AbbrCode: 0x4 + Values: + - Value: 0x11 + - Value: 0x11 + - Value: 0x04 + - AbbrCode: 0x0 + - AbbrCode: 0x0 +... From b629981a6daf397375ff1fcadad286883dcaf0ea Mon Sep 17 00:00:00 2001 From: Morris Hafner Date: Mon, 29 Sep 2025 22:17:52 +0200 Subject: [PATCH 180/878] [CIR] Add virtual base support to getAddressOfBaseClass (#159162) This patch enables calling virtual functions of virtual base classes of a derived class. --- clang/lib/CIR/CodeGen/CIRGenClass.cpp | 42 +++++++++----- clang/test/CIR/CodeGen/vbase.cpp | 82 ++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 14 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index cb8fe6c8862dc..9d12a13dd79c0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -951,28 +951,37 @@ Address CIRGenFunction::getAddressOfBaseClass( bool nullCheckValue, SourceLocation loc) { assert(!path.empty() && "Base path should not be empty!"); + CastExpr::path_const_iterator start = path.begin(); + const CXXRecordDecl *vBase = nullptr; + if ((*path.begin())->isVirtual()) { - // The implementation here is actually complete, but let's flag this - // as an error until the rest of the virtual base class support is in place. - cgm.errorNYI(loc, "getAddrOfBaseClass: virtual base"); - return Address::invalid(); + vBase = (*start)->getType()->castAsCXXRecordDecl(); + ++start; } // Compute the static offset of the ultimate destination within its // allocating subobject (the virtual base, if there is one, or else // the "complete" object that we see). - CharUnits nonVirtualOffset = - cgm.computeNonVirtualBaseClassOffset(derived, path); + CharUnits nonVirtualOffset = cgm.computeNonVirtualBaseClassOffset( + vBase ? 
vBase : derived, {start, path.end()});
+
+  // If there's a virtual step, we can sometimes "devirtualize" it.
+  // For now, that's limited to when the derived type is final.
+  // TODO: "devirtualize" this for accesses to known-complete objects.
+  if (vBase && derived->hasAttr<FinalAttr>()) {
+    const ASTRecordLayout &layout = getContext().getASTRecordLayout(derived);
+    CharUnits vBaseOffset = layout.getVBaseClassOffset(vBase);
+    nonVirtualOffset += vBaseOffset;
+    vBase = nullptr; // we no longer have a virtual step
+  }
 
   // Get the base pointer type.
   mlir::Type baseValueTy = convertType((path.end()[-1])->getType());
   assert(!cir::MissingFeatures::addressSpace());
 
-  // The if statement here is redundant now, but it will be needed when we add
-  // support for virtual base classes.
   // If there is no virtual base, use cir.base_class_addr. It takes care of
   // the adjustment and the null pointer check.
-  if (nonVirtualOffset.isZero()) {
+  if (nonVirtualOffset.isZero() && !vBase) {
     assert(!cir::MissingFeatures::sanitizers());
     return builder.createBaseClassAddr(getLoc(loc), value, baseValueTy, 0,
                                        /*assumeNotNull=*/true);
@@ -980,10 +989,17 @@ Address CIRGenFunction::getAddressOfBaseClass(
 
   assert(!cir::MissingFeatures::sanitizers());
 
-  // Apply the offset
-  value = builder.createBaseClassAddr(getLoc(loc), value, baseValueTy,
-                                      nonVirtualOffset.getQuantity(),
-                                      /*assumeNotNull=*/true);
+  // Compute the virtual offset.
+  mlir::Value virtualOffset = nullptr;
+  if (vBase) {
+    virtualOffset = cgm.getCXXABI().getVirtualBaseClassOffset(
+        getLoc(loc), *this, value, derived, vBase);
+  }
+
+  // Apply both offsets.
+  value = applyNonVirtualAndVirtualOffset(
+      getLoc(loc), *this, value, nonVirtualOffset, virtualOffset, derived,
+      vBase, baseValueTy, not nullCheckValue);
 
   // Cast to the destination type.
   value = value.withElementType(builder, baseValueTy);
diff --git a/clang/test/CIR/CodeGen/vbase.cpp b/clang/test/CIR/CodeGen/vbase.cpp
index 91396518a40b0..4d57f8ea74e0c 100644
--- a/clang/test/CIR/CodeGen/vbase.cpp
+++ b/clang/test/CIR/CodeGen/vbase.cpp
@@ -13,19 +13,29 @@ class Base {
 
 class Derived : public virtual Base {};
 
-// This is just here to force the record types to be emitted.
 void f() {
   Derived d;
+  d.f();
+}
+
+class DerivedFinal final : public virtual Base {};
+
+void g() {
+  DerivedFinal df;
+  df.f();
 }
 
 // CIR: !rec_Base = !cir.record
 // CIR: !rec_Derived = !cir.record
+// CIR: !rec_DerivedFinal = !cir.record
 
 // LLVM: %class.Derived = type { %class.Base }
 // LLVM: %class.Base = type { ptr }
+// LLVM: %class.DerivedFinal = type { %class.Base }
 
 // OGCG: %class.Derived = type { %class.Base }
 // OGCG: %class.Base = type { ptr }
+// OGCG: %class.DerivedFinal = type { %class.Base }
 
 // Test the constructor handling for a class with a virtual base.
struct A { @@ -47,6 +57,76 @@ void ppp() { B b; } // OGCG: @_ZTV1B = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr inttoptr (i64 12 to ptr), ptr null, ptr @_ZTI1B] }, comdat, align 8 +// CIR: cir.func {{.*}}@_Z1fv() { +// CIR: %[[D:.+]] = cir.alloca !rec_Derived, !cir.ptr, ["d", init] +// CIR: cir.call @_ZN7DerivedC1Ev(%[[D]]) nothrow : (!cir.ptr) -> () +// CIR: %[[VPTR_PTR:.+]] = cir.vtable.get_vptr %[[D]] : !cir.ptr -> !cir.ptr +// CIR: %[[VPTR:.+]] = cir.load {{.*}} %[[VPTR_PTR]] : !cir.ptr, !cir.vptr +// CIR: %[[VPTR_I8:.+]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr +// CIR: %[[NEG32:.+]] = cir.const #cir.int<-32> : !s64i +// CIR: %[[ADJ_VPTR_I8:.+]] = cir.ptr_stride(%[[VPTR_I8]] : !cir.ptr, %[[NEG32]] : !s64i), !cir.ptr +// CIR: %[[OFFSET_PTR:.+]] = cir.cast(bitcast, %[[ADJ_VPTR_I8]] : !cir.ptr), !cir.ptr +// CIR: %[[OFFSET:.+]] = cir.load {{.*}} %[[OFFSET_PTR]] : !cir.ptr, !s64i +// CIR: %[[D_I8:.+]] = cir.cast(bitcast, %[[D]] : !cir.ptr), !cir.ptr +// CIR: %[[ADJ_THIS_I8:.+]] = cir.ptr_stride(%[[D_I8]] : !cir.ptr, %[[OFFSET]] : !s64i), !cir.ptr +// CIR: %[[ADJ_THIS_D:.+]] = cir.cast(bitcast, %[[ADJ_THIS_I8]] : !cir.ptr), !cir.ptr +// CIR: %[[BASE_THIS:.+]] = cir.cast(bitcast, %[[ADJ_THIS_D]] : !cir.ptr), !cir.ptr +// CIR: %[[BASE_VPTR_PTR:.+]] = cir.vtable.get_vptr %[[BASE_THIS]] : !cir.ptr -> !cir.ptr +// CIR: %[[BASE_VPTR:.+]] = cir.load {{.*}} %[[BASE_VPTR_PTR]] : !cir.ptr, !cir.vptr +// CIR: %[[SLOT_PTR:.+]] = cir.vtable.get_virtual_fn_addr %[[BASE_VPTR]][0] : !cir.vptr -> !cir.ptr)>>> +// CIR: %[[FN:.+]] = cir.load {{.*}} %[[SLOT_PTR]] : !cir.ptr)>>>, !cir.ptr)>> +// CIR: cir.call %[[FN]](%[[BASE_THIS]]) : (!cir.ptr)>>, !cir.ptr) -> () +// CIR: cir.return + +// CIR: cir.func {{.*}}@_Z1gv() { +// CIR: %[[DF:.+]] = cir.alloca !rec_DerivedFinal, !cir.ptr, ["df", init] +// CIR: cir.call @_ZN12DerivedFinalC1Ev(%[[DF]]) nothrow : (!cir.ptr) -> () +// CIR: %[[BASE_THIS_2:.+]] = cir.base_class_addr %[[DF]] : !cir.ptr nonnull [0] -> !cir.ptr +// CIR: %[[BASE_VPTR_PTR_2:.+]] = cir.vtable.get_vptr %[[BASE_THIS_2]] : !cir.ptr -> !cir.ptr +// CIR: %[[BASE_VPTR_2:.+]] = cir.load {{.*}} %[[BASE_VPTR_PTR_2]] : !cir.ptr, !cir.vptr +// CIR: %[[SLOT_PTR_2:.+]] = cir.vtable.get_virtual_fn_addr %[[BASE_VPTR_2]][0] : !cir.vptr -> !cir.ptr)>>> +// CIR: %[[FN_2:.+]] = cir.load {{.*}} %[[SLOT_PTR_2]] : !cir.ptr)>>>, !cir.ptr)>> +// CIR: cir.call %[[FN_2]](%[[BASE_THIS_2]]) : (!cir.ptr)>>, !cir.ptr) -> () +// CIR: cir.return + +// LLVM: define {{.*}}void @_Z1fv() +// LLVM: %[[D:.+]] = alloca {{.*}} +// LLVM: call void @_ZN7DerivedC1Ev(ptr %[[D]]) +// LLVM: %[[VPTR_ADDR:.+]] = load ptr, ptr %[[D]] +// LLVM: %[[NEG32_PTR:.+]] = getelementptr i8, ptr %[[VPTR_ADDR]], i64 -32 +// LLVM: %[[OFF:.+]] = load i64, ptr %[[NEG32_PTR]] +// LLVM: %[[ADJ_THIS:.+]] = getelementptr i8, ptr %[[D]], i64 %[[OFF]] +// LLVM: %[[VFN_TAB:.+]] = load ptr, ptr %[[ADJ_THIS]] +// LLVM: %[[SLOT0:.+]] = getelementptr inbounds ptr, ptr %[[VFN_TAB]], i32 0 +// LLVM: %[[VFN:.+]] = load ptr, ptr %[[SLOT0]] +// LLVM: call void %[[VFN]](ptr %[[ADJ_THIS]]) +// LLVM: ret void + +// LLVM: define {{.*}}void @_Z1gv() +// LLVM: %[[DF:.+]] = alloca {{.*}} +// LLVM: call void @_ZN12DerivedFinalC1Ev(ptr %[[DF]]) +// LLVM: %[[VPTR2:.+]] = load ptr, ptr %[[DF]] +// LLVM: %[[SLOT0_2:.+]] = getelementptr inbounds ptr, ptr %[[VPTR2]], i32 0 +// LLVM: %[[VFN2:.+]] = load ptr, ptr %[[SLOT0_2]] +// LLVM: call void %[[VFN2]](ptr %[[DF]]) +// LLVM: ret void + +// OGCG: define {{.*}}void @_Z1fv() +// OGCG: %[[D:.+]] = alloca 
{{.*}} +// OGCG: call void @_ZN7DerivedC1Ev(ptr {{.*}} %[[D]]) +// OGCG: %[[VTABLE:.+]] = load ptr, ptr %[[D]] +// OGCG: %[[NEG32_PTR:.+]] = getelementptr i8, ptr %[[VTABLE]], i64 -32 +// OGCG: %[[OFF:.+]] = load i64, ptr %[[NEG32_PTR]] +// OGCG: %[[ADJ_THIS:.+]] = getelementptr inbounds i8, ptr %[[D]], i64 %[[OFF]] +// OGCG: call void @_ZN4Base1fEv(ptr {{.*}} %[[ADJ_THIS]]) +// OGCG: ret void + +// OGCG: define {{.*}}void @_Z1gv() +// OGCG: %[[DF:.+]] = alloca {{.*}} +// OGCG: call void @_ZN12DerivedFinalC1Ev(ptr {{.*}} %[[DF]]) +// OGCG: call void @_ZN4Base1fEv(ptr {{.*}} %[[DF]]) +// OGCG: ret void + // Constructor for B // CIR: cir.func comdat linkonce_odr @_ZN1BC1Ev(%arg0: !cir.ptr // CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] From 0fcce4fb7b85ed42feb2f2291405fe4a2292b2b4 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 29 Sep 2025 15:17:55 -0500 Subject: [PATCH 181/878] [OpenMP] Mark problematic tests as XFAIL / UNSUPPORTED (#161267) Summary: Several of these tests have been failing for literal years. Ideally we make efforts to fix this, but keeping these broken has had serious consequences on our testing infrastructure where failures are the norm so almost all test failures are disregarded. I made a tracking issue for the ones that have been disabled. https://github.com/llvm/llvm-project/issues/161265 --- offload/test/mapping/lambda_by_value.cpp | 3 ++- offload/test/mapping/map_back_race.cpp | 3 +++ offload/test/mapping/map_both_pointer_pointee.c | 9 ++++----- offload/test/mapping/map_ptr_and_star_local.c | 5 ++++- offload/test/mapping/map_structptr_and_member_global.c | 5 ++++- offload/test/mapping/map_structptr_and_member_local.c | 5 ++++- offload/test/offloading/CUDA/basic_launch_multi_arg.cu | 8 ++++---- offload/test/offloading/bug51781.c | 5 ++++- .../test/offloading/fortran/declare-target-automap.f90 | 3 +++ offload/test/offloading/interop.c | 3 ++- .../offloading/single_threaded_for_barrier_hang_1.c | 3 +++ .../offloading/single_threaded_for_barrier_hang_2.c | 5 +++-- offload/test/offloading/spmdization.c | 3 ++- offload/test/sanitizer/ptr_outside_alloc_1.c | 10 ++++------ offload/test/sanitizer/ptr_outside_alloc_2.c | 10 ++++------ offload/test/sanitizer/use_after_free_1.c | 10 ++++------ offload/test/sanitizer/use_after_free_2.c | 10 ++++------ 17 files changed, 58 insertions(+), 42 deletions(-) diff --git a/offload/test/mapping/lambda_by_value.cpp b/offload/test/mapping/lambda_by_value.cpp index 5516dedd72a98..4c0278d405925 100644 --- a/offload/test/mapping/lambda_by_value.cpp +++ b/offload/test/mapping/lambda_by_value.cpp @@ -1,4 +1,5 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compileopt-generic -fno-exceptions +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic #include #include diff --git a/offload/test/mapping/map_back_race.cpp b/offload/test/mapping/map_back_race.cpp index 8a988d3be3b4f..49bbe87e2449d 100644 --- a/offload/test/mapping/map_back_race.cpp +++ b/offload/test/mapping/map_back_race.cpp @@ -2,6 +2,9 @@ // Taken from https://github.com/llvm/llvm-project/issues/54216 +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: gpu + #include #include #include diff --git a/offload/test/mapping/map_both_pointer_pointee.c b/offload/test/mapping/map_both_pointer_pointee.c index 7be1ba465e7db..1934b702dbbac 100644 --- a/offload/test/mapping/map_both_pointer_pointee.c +++ b/offload/test/mapping/map_both_pointer_pointee.c @@ -1,11 +1,10 @@ -// RUN: 
%libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda +// RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: unified_shared_memory // UNSUPPORTED: amdgcn-amd-amdhsa +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: nvidiagpu #pragma omp declare target int *ptr1; diff --git a/offload/test/mapping/map_ptr_and_star_local.c b/offload/test/mapping/map_ptr_and_star_local.c index cc826b3c0290b..97fa7cd53715f 100644 --- a/offload/test/mapping/map_ptr_and_star_local.c +++ b/offload/test/mapping/map_ptr_and_star_local.c @@ -1,6 +1,9 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: libc +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: gpu #include #include diff --git a/offload/test/mapping/map_structptr_and_member_global.c b/offload/test/mapping/map_structptr_and_member_global.c index 960eea419964f..f855e87d7218a 100644 --- a/offload/test/mapping/map_structptr_and_member_global.c +++ b/offload/test/mapping/map_structptr_and_member_global.c @@ -1,6 +1,9 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: libc +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: gpu #include #include diff --git a/offload/test/mapping/map_structptr_and_member_local.c b/offload/test/mapping/map_structptr_and_member_local.c index bd759407ef09c..bd9e2a89eb6f1 100644 --- a/offload/test/mapping/map_structptr_and_member_local.c +++ b/offload/test/mapping/map_structptr_and_member_local.c @@ -1,6 +1,9 @@ -// RUN: %libomptarget-compilexx-run-and-check-generic +// RUN: %libomptarget-compile-run-and-check-generic // REQUIRES: libc +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: gpu #include #include diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu index 1f84a0e1288d4..b2e1edf51e171 100644 --- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu +++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu @@ -5,10 +5,10 @@ // RUN: %t | %fcheck-generic // clang-format on -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: x86_64-unknown-linux-gnu -// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO +// REQUIRES: gpu +// +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// XFAIL: gpu #include diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c index 2f30b035afbbe..ff7fa51aafc2a 100644 --- a/offload/test/offloading/bug51781.c +++ b/offload/test/offloading/bug51781.c @@ -16,6 +16,7 @@ // the generic state machine. // // RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \ +// RUN: -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \ // RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom @@ -24,7 +25,9 @@ // Repeat with reduction clause, which has managed to break the custom state // machine in the past. 
// -// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt -DADD_REDUCTION \ +// RUN: %libomptarget-compile-generic -O2 -foffload-lto -Rpass=openmp-opt \ +// RUN: -DADD_REDUCTION \ +// RUN: -Xoffload-linker -mllvm=-openmp-opt-disable-spmdization \ // RUN: -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom diff --git a/offload/test/offloading/fortran/declare-target-automap.f90 b/offload/test/offloading/fortran/declare-target-automap.f90 index b9c2d34c834fa..b44c0b2815274 100644 --- a/offload/test/offloading/fortran/declare-target-automap.f90 +++ b/offload/test/offloading/fortran/declare-target-automap.f90 @@ -1,6 +1,9 @@ !Offloading test for AUTOMAP modifier in declare target enter ! REQUIRES: flang, amdgpu +! FIXME: https://github.com/llvm/llvm-project/issues/161265 +! XFAIL: amdgpu + ! RUN: %libomptarget-compile-fortran-run-and-check-generic program automap_program use iso_c_binding, only: c_loc diff --git a/offload/test/offloading/interop.c b/offload/test/offloading/interop.c index 26287e3ec5333..d9fa2ef883b9c 100644 --- a/offload/test/offloading/interop.c +++ b/offload/test/offloading/interop.c @@ -1,5 +1,6 @@ // RUN: %libomptarget-compile-run-and-check-generic -// REQUIRES: nvptx64-nvidia-cuda + +// XFAIL: * #include #include diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_1.c b/offload/test/offloading/single_threaded_for_barrier_hang_1.c index 8ee6b51fb6818..a007521a5c742 100644 --- a/offload/test/offloading/single_threaded_for_barrier_hang_1.c +++ b/offload/test/offloading/single_threaded_for_barrier_hang_1.c @@ -1,6 +1,9 @@ // RUN: %libomptarget-compile-run-and-check-generic // RUN: %libomptarget-compileopt-run-and-check-generic +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: gpu + #include #include diff --git a/offload/test/offloading/single_threaded_for_barrier_hang_2.c b/offload/test/offloading/single_threaded_for_barrier_hang_2.c index a98abd6922da7..cabd2ed3dde71 100644 --- a/offload/test/offloading/single_threaded_for_barrier_hang_2.c +++ b/offload/test/offloading/single_threaded_for_barrier_hang_2.c @@ -1,6 +1,7 @@ // RUN: %libomptarget-compile-run-and-check-generic -// FIXME: This fails with optimization enabled and prints b: 0 -// FIXME: RUN: %libomptarget-compileopt-run-and-check-generic + +// FIXME: https://github.com/llvm/llvm-project/issues/161265 +// UNSUPPORTED: gpu #include #include diff --git a/offload/test/offloading/spmdization.c b/offload/test/offloading/spmdization.c index 7f3f47d9ef32e..48627cd7dae1a 100644 --- a/offload/test/offloading/spmdization.c +++ b/offload/test/offloading/spmdization.c @@ -2,7 +2,8 @@ // RUN: %libomptarget-compileopt-generic // RUN: env LIBOMPTARGET_INFO=16 \ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,SPMD -// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization +// RUN: %libomptarget-compileopt-generic -mllvm --openmp-opt-disable-spmdization \ +// RUN: -Xoffload-linker -mllvm=--openmp-opt-disable-spmdization // RUN: env LIBOMPTARGET_INFO=16 \ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,GENERIC // clang-format on diff --git a/offload/test/sanitizer/ptr_outside_alloc_1.c b/offload/test/sanitizer/ptr_outside_alloc_1.c index bdd028352e403..b30ce12ef1ea2 100644 --- a/offload/test/sanitizer/ptr_outside_alloc_1.c +++ 
b/offload/test/sanitizer/ptr_outside_alloc_1.c
@@ -5,12 +5,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 #include 
diff --git a/offload/test/sanitizer/ptr_outside_alloc_2.c b/offload/test/sanitizer/ptr_outside_alloc_2.c
index 6a67962f9eb32..3bb8bdaca8b48 100644
--- a/offload/test/sanitizer/ptr_outside_alloc_2.c
+++ b/offload/test/sanitizer/ptr_outside_alloc_2.c
@@ -3,12 +3,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 #include 
diff --git a/offload/test/sanitizer/use_after_free_1.c b/offload/test/sanitizer/use_after_free_1.c
index c4783c5c36df9..acc1de373f9e3 100644
--- a/offload/test/sanitizer/use_after_free_1.c
+++ b/offload/test/sanitizer/use_after_free_1.c
@@ -5,12 +5,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK,TRACE
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 #include 
diff --git a/offload/test/sanitizer/use_after_free_2.c b/offload/test/sanitizer/use_after_free_2.c
index 1c1e09744a750..3d70fb7b3a3fc 100644
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@@ -3,12 +3,10 @@
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
 // clang-format on
 
-// UNSUPPORTED: aarch64-unknown-linux-gnu
-// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
-// UNSUPPORTED: x86_64-unknown-linux-gnu
-// UNSUPPORTED: x86_64-unknown-linux-gnu-LTO
-// UNSUPPORTED: s390x-ibm-linux-gnu
-// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+// FIXME: https://github.com/llvm/llvm-project/issues/161265
+// UNSUPPORTED: nvidiagpu
+//
+// REQUIRES: gpu
 
 // If offload memory pooling is enabled for a large allocation, reuse error is
 // not detected. UNSUPPORTED: large_allocation_memory_pool
 
 #include 
From 9df1099ba7d92f921333753941360a9b9f5ed0e6 Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 29 Sep 2025 13:29:29 -0700
Subject: [PATCH 182/878] [CAS] Fix a build failure on 32 bit system from #114100 (#161268)

Fix a build failure on 32-bit systems caused by a warning about narrowing
`uint64_t` to `size_t`.
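
For context, a minimal sketch of the narrowing this avoids; the struct and
field names below are illustrative stand-ins, not the real on-disk header.
On a 32-bit host `size_t` is 32 bits wide, so an accessor that hands out a
64-bit value directly forces every `size_t` consumer to truncate implicitly:

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative stand-ins for the mapped trie header and its handle.
struct TrieHeader {
  uint64_t NumHashBits; // fixed-width field in the mapped file
};

struct TrieHandle {
  const TrieHeader *H;
  // Returning uint64_t here made callers such as
  //   size_t Bytes = T.getNumHashBits() / 8;
  // warn about an implicit 64-to-32-bit truncation on 32-bit targets.
  // Returning size_t keeps the narrowing in one audited place instead.
  size_t getNumHashBits() const {
    return static_cast<size_t>(H->NumHashBits);
  }
};
```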
--- llvm/lib/CAS/OnDiskTrieRawHashMap.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp index 3000c0f0e46f1..9b382dd749ea5 100644 --- a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp +++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp @@ -271,8 +271,8 @@ class TrieRawHashMapHandle { MappedFileRegion &getRegion() const { return *Region; } size_t getFlags() const { return H->Flags; } - uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; } - uint64_t getNumHashBits() const { return H->NumHashBits; } + size_t getNumSubtrieBits() const { return H->NumSubtrieBits; } + size_t getNumHashBits() const { return H->NumHashBits; } size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); } size_t getRecordDataSize() const { return H->RecordDataSize; } size_t getRecordSize() const { From 045e09f22b6149bb8288e458a465f1b16cb88b77 Mon Sep 17 00:00:00 2001 From: Alan Zhao Date: Mon, 29 Sep 2025 13:37:06 -0700 Subject: [PATCH 183/878] [InstCombine] Set !prof metadata on Selects identified by add.ll test (#158743) These select instructions are created from non-branching instructions, so their branch weights are unknown. Tracking issue: #147390 --- llvm/include/llvm/IR/ProfDataUtils.h | 8 +++ .../Transforms/InstCombine/InstCombiner.h | 20 ++++---- llvm/lib/IR/ProfDataUtils.cpp | 7 +++ .../InstCombine/InstCombineAddSub.cpp | 4 +- .../InstCombine/InstCombineInternal.h | 18 +++++-- .../InstCombine/InstCombineShifts.cpp | 2 +- .../InstCombine/InstructionCombining.cpp | 6 +-- .../InstCombine/preserve-profile.ll | 50 +++++++++++++++++++ llvm/utils/profcheck-xfail.txt | 2 - 9 files changed, 97 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index de9675f48c79b..e97160e59c795 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -185,6 +185,14 @@ inline uint32_t scaleBranchCount(uint64_t Count, uint64_t Scale) { LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I, StringRef PassName); +/// Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch +/// weights in the new instruction if the parent function of the original +/// instruction has an entry count. This is to not confuse users by injecting +/// profile data into non-profiled functions. +LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, + Function &F, + StringRef PassName); + /// Analogous to setExplicitlyUnknownBranchWeights, but for functions and their /// entry counts. LLVM_ABI void setExplicitlyUnknownFunctionEntryCount(Function &F, diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h index fa313f5290773..d6c2d7fc48bda 100644 --- a/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombiner.h @@ -64,6 +64,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { /// A worklist of the instructions that need to be simplified. InstructionWorklist &Worklist; + Function &F; + // Mode in which we are running the combiner. 
const bool MinimizeSize; @@ -98,17 +100,17 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner { bool ComputedBackEdges = false; public: - InstCombiner(InstructionWorklist &Worklist, BuilderTy &Builder, - bool MinimizeSize, AAResults *AA, AssumptionCache &AC, - TargetLibraryInfo &TLI, TargetTransformInfo &TTI, - DominatorTree &DT, OptimizationRemarkEmitter &ORE, - BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, - ProfileSummaryInfo *PSI, const DataLayout &DL, + InstCombiner(InstructionWorklist &Worklist, BuilderTy &Builder, Function &F, + AAResults *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, + TargetTransformInfo &TTI, DominatorTree &DT, + OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, + BranchProbabilityInfo *BPI, ProfileSummaryInfo *PSI, + const DataLayout &DL, ReversePostOrderTraversal &RPOT) : TTIForTargetIntrinsicsOnly(TTI), Builder(Builder), Worklist(Worklist), - MinimizeSize(MinimizeSize), AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), - SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true, - /*CanUseUndef*/ true, &DC), + F(F), MinimizeSize(F.hasMinSize()), AA(AA), AC(AC), TLI(TLI), DT(DT), + DL(DL), SQ(DL, &TLI, &DT, &AC, nullptr, /*UseInstrInfo*/ true, + /*CanUseUndef*/ true, &DC), ORE(ORE), BFI(BFI), BPI(BPI), PSI(PSI), RPOT(RPOT) {} virtual ~InstCombiner() = default; diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index 5827292cee39b..99029c1719507 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -252,6 +252,13 @@ void setExplicitlyUnknownBranchWeights(Instruction &I, StringRef PassName) { MDB.createString(PassName)})); } +void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, Function &F, + StringRef PassName) { + if (std::optional EC = F.getEntryCount(); + EC && EC->getCount() > 0) + setExplicitlyUnknownBranchWeights(I, PassName); +} + void setExplicitlyUnknownFunctionEntryCount(Function &F, StringRef PassName) { MDBuilder MDB(F.getContext()); F.setMetadata( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index d1ca0a6a393c5..59e103cda0230 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -880,11 +880,11 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { // zext(bool) + C -> bool ? C + 1 : C if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1); + return createSelectInst(X, InstCombiner::AddOne(Op1C), Op1); // sext(bool) + C -> bool ? 
C - 1 : C if (match(Op0, m_SExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) - return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1); + return createSelectInst(X, InstCombiner::SubOne(Op1C), Op1); // ~X + C --> (C-1) - X if (match(Op0, m_Not(m_Value(X)))) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 7a979c16da501..4f94aa2d38541 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -23,6 +23,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Value.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" @@ -62,14 +63,14 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final public InstVisitor { public: InstCombinerImpl(InstructionWorklist &Worklist, BuilderTy &Builder, - bool MinimizeSize, AAResults *AA, AssumptionCache &AC, + Function &F, AAResults *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, BranchProbabilityInfo *BPI, ProfileSummaryInfo *PSI, const DataLayout &DL, ReversePostOrderTraversal &RPOT) - : InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE, - BFI, BPI, PSI, DL, RPOT) {} + : InstCombiner(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, BPI, + PSI, DL, RPOT) {} virtual ~InstCombinerImpl() = default; @@ -469,6 +470,17 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable, unsigned Depth = 0); + SelectInst *createSelectInst(Value *C, Value *S1, Value *S2, + const Twine &NameStr = "", + InsertPosition InsertBefore = nullptr, + Instruction *MDFrom = nullptr) { + SelectInst *SI = + SelectInst::Create(C, S1, S2, NameStr, InsertBefore, MDFrom); + if (!MDFrom) + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, F, DEBUG_TYPE); + return SI; + } + public: /// Create and insert the idiom we use to indicate a block is unreachable /// without having to rewrite the CFG from within InstCombine. 
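
The `createSelectInst` wrapper above is the whole mechanism: every fold that
materializes a `select` from straight-line arithmetic routes through it, so
the new instruction is tagged whenever the function carries profile data. A
hedged sketch of the same idea in isolation follows; the fold and function
names are illustrative, while `setExplicitlyUnknownBranchWeightsIfProfiled`
is the API this patch adds to `ProfDataUtils.h`:

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/ProfDataUtils.h"

using namespace llvm;

// zext(i1 X) + C  -->  select X, C+1, C
// The select replaces non-branching code, so no real branch profile
// exists for it; mark its weights as explicitly unknown instead.
static Value *foldZExtBoolAdd(IRBuilder<> &B, Function &F, Value *X,
                              ConstantInt *C) {
  Constant *CPlus1 = ConstantInt::get(C->getContext(), C->getValue() + 1);
  Value *Sel = B.CreateSelect(X, CPlus1, C);
  // CreateSelect may constant-fold, so only tag a genuine SelectInst.
  if (auto *SI = dyn_cast<SelectInst>(Sel))
    setExplicitlyUnknownBranchWeightsIfProfiled(*SI, F, "instcombine");
  return Sel;
}
```

This is what produces the `!prof !{!"unknown", !"instcombine"}` metadata the
preserve-profile.ll test below checks for.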
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index 550f095b26ba4..d457e0c7dd1c4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -1253,7 +1253,7 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) { // shl (zext i1 X), C1 --> select (X, 1 << C1, 0) if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { auto *NewC = Builder.CreateShl(ConstantInt::get(Ty, 1), C1); - return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty)); + return createSelectInst(X, NewC, ConstantInt::getNullValue(Ty)); } } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index f0ddd5ca94c5a..8fbaf68dfcc43 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1735,7 +1735,7 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) { Constant *Zero = ConstantInt::getNullValue(BO.getType()); Value *TVal = Builder.CreateBinOp(BO.getOpcode(), Ones, C); Value *FVal = Builder.CreateBinOp(BO.getOpcode(), Zero, C); - return SelectInst::Create(X, TVal, FVal); + return createSelectInst(X, TVal, FVal); } static Value *simplifyOperationIntoSelectOperand(Instruction &I, SelectInst *SI, @@ -5934,8 +5934,8 @@ static bool combineInstructionsOverFunction( LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); - InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT, - ORE, BFI, BPI, PSI, DL, RPOT); + InstCombinerImpl IC(Worklist, Builder, F, AA, AC, TLI, TTI, DT, ORE, BFI, + BPI, PSI, DL, RPOT); IC.MaxArraySizeForCombine = MaxArraySize; bool MadeChangeInThisIteration = IC.prepareWorklist(F); MadeChangeInThisIteration |= IC.run(); diff --git a/llvm/test/Transforms/InstCombine/preserve-profile.ll b/llvm/test/Transforms/InstCombine/preserve-profile.ll index dd83805ed3397..8cb3e685ae302 100644 --- a/llvm/test/Transforms/InstCombine/preserve-profile.ll +++ b/llvm/test/Transforms/InstCombine/preserve-profile.ll @@ -46,9 +46,59 @@ define i32 @NegBin(i1 %C) !prof !0 { ret i32 %V } +define i32 @select_C_minus_1_or_C_from_bool(i1 %x) !prof !0 { +; CHECK-LABEL: define i32 @select_C_minus_1_or_C_from_bool( +; CHECK-SAME: i1 [[X:%.*]]) !prof [[PROF0]] { +; CHECK-NEXT: [[ADD:%.*]] = select i1 [[X]], i32 41, i32 42, !prof [[PROF2:![0-9]+]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %ext = sext i1 %x to i32 + %add = add i32 %ext, 42 + ret i32 %add +} + +define i5 @and_add(i1 %x, i1 %y) !prof !0 { +; CHECK-LABEL: define i5 @and_add( +; CHECK-SAME: i1 [[X:%.*]], i1 [[Y:%.*]]) !prof [[PROF0]] { +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[X]], true +; CHECK-NEXT: [[TMP2:%.*]] = and i1 [[Y]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = select i1 [[TMP2]], i5 -2, i5 0, !prof [[PROF2]] +; CHECK-NEXT: ret i5 [[R]] +; + %xz = zext i1 %x to i5 + %ys = sext i1 %y to i5 + %sub = add i5 %xz, %ys + %r = and i5 %sub, 30 + ret i5 %r +} + +define i32 @add_zext_zext_i1(i1 %a) !prof !0 { +; CHECK-LABEL: define i32 @add_zext_zext_i1( +; CHECK-SAME: i1 [[A:%.*]]) !prof [[PROF0]] { +; CHECK-NEXT: [[ADD:%.*]] = select i1 [[A]], i32 2, i32 0, !prof [[PROF2]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %zext = zext i1 %a to i32 + %add = add i32 %zext, %zext + ret i32 %add +} + +define i32 @no_count_no_branch_weights(i1 %a) { +; CHECK-LABEL: define i32 
@no_count_no_branch_weights(
+; CHECK-SAME: i1 [[A:%.*]]) {
+; CHECK-NEXT:    [[ADD:%.*]] = select i1 [[A]], i32 2, i32 0
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %zext = zext i1 %a to i32
+  %add = add i32 %zext, %zext
+  ret i32 %add
+}
+
+
 !0 = !{!"function_entry_count", i64 1000}
 !1 = !{!"branch_weights", i32 2, i32 3}
 ;.
 ; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
 ; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
+; CHECK: [[PROF2]] = !{!"unknown", !"instcombine"}
 ;.
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index a2b9e56e93e0e..08c89441ec855 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -838,8 +838,6 @@ Transforms/InstCombine/2011-02-14-InfLoop.ll
 Transforms/InstCombine/AArch64/sve-intrinsic-sel.ll
 Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll
 Transforms/InstCombine/AArch64/sve-intrinsic-simplify-shift.ll
-Transforms/InstCombine/add2.ll
-Transforms/InstCombine/add.ll
 Transforms/InstCombine/add-mask.ll
 Transforms/InstCombine/add-shl-mul-umax.ll
 Transforms/InstCombine/add-shl-sdiv-to-srem.ll
From 07f8f088b4b3d1c58e73f13a288ff2c088d15ad6 Mon Sep 17 00:00:00 2001
From: David Salinas
Date: Mon, 29 Sep 2025 17:16:29 -0400
Subject: [PATCH 184/878] Add --offloading option to llvm-readobj (#143342)

Utilize new extensions to the LLVM Offloading API to handle offloading
fatbin bundles. The tool will output a list of available offload bundles
using URI syntax.

---------

Co-authored-by: dsalinas_amdeng
---
 llvm/docs/CommandGuide/llvm-readelf.rst       |  4 +++
 llvm/docs/CommandGuide/llvm-readobj.rst       |  4 +++
 llvm/include/llvm/Object/OffloadBundle.h      |  2 +-
 llvm/lib/Object/OffloadBundle.cpp             | 18 ++++++------
 .../ELF/AMDGPU/offloading-fail.test           | 26 ++++++++++++++++++
 .../llvm-readobj/ELF/AMDGPU/offloading.test   | 27 +++++++++++++++++++
 llvm/tools/llvm-readobj/ObjDumper.cpp         | 12 +++++++++
 llvm/tools/llvm-readobj/ObjDumper.h           |  2 ++
 llvm/tools/llvm-readobj/Opts.td               |  1 +
 llvm/tools/llvm-readobj/llvm-readobj.cpp      |  5 ++++
 10 files changed, 91 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test
 create mode 100644 llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test
diff --git a/llvm/docs/CommandGuide/llvm-readelf.rst b/llvm/docs/CommandGuide/llvm-readelf.rst
index 284c3aa470a6f..5403fea60d5ee 100644
--- a/llvm/docs/CommandGuide/llvm-readelf.rst
+++ b/llvm/docs/CommandGuide/llvm-readelf.rst
@@ -143,6 +143,10 @@ OPTIONS
 
  Display all notes.
 
+.. option:: --offloading
+
+ Display a list of HIP offload bundles.
+
 .. option:: --pretty-print
 
  When used with :option:`--elf-output-style`, JSON output will be formatted in
diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst
index 8bd29eafbbfcf..0d05b947a6b3e 100644
--- a/llvm/docs/CommandGuide/llvm-readobj.rst
+++ b/llvm/docs/CommandGuide/llvm-readobj.rst
@@ -104,6 +104,10 @@ file formats.
  Do not demangle symbol names in the output. This option is only for ELF and
  XCOFF file formats. The option is enabled by default.
 
+.. option:: --offloading
+
+ Display a list of HIP offload bundles.
+
 .. option:: --relocations, --relocs, -r
 
  Display the relocation entries in the file.
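
The URI syntax the tool prints, and which the new tests below check, is
`file://<path>#offset=<bytes>&size=<bytes>`. As a rough illustration of how
a consumer could parse that string back, here is a small standalone sketch;
it is not the `OffloadBundleURI` parser from `OffloadBundle.h`, and the
helper names are assumptions:

```cpp
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

struct BundleURI {
  std::string Path;    // object file containing the bundle
  uint64_t Offset = 0; // byte offset of the entry within that file
  uint64_t Size = 0;   // entry size in bytes
};

// Parses "file://<path>#offset=<N>&size=<M>"; returns std::nullopt on
// any syntax error instead of emitting a diagnostic like the real tool.
static std::optional<BundleURI> parseBundleURI(const std::string &S) {
  static const std::string Scheme = "file://", Off = "#offset=",
                           Sz = "&size=";
  if (S.compare(0, Scheme.size(), Scheme) != 0)
    return std::nullopt;
  size_t OffPos = S.find(Off, Scheme.size());
  if (OffPos == std::string::npos)
    return std::nullopt;
  size_t SzPos = S.find(Sz, OffPos);
  if (SzPos == std::string::npos)
    return std::nullopt;
  BundleURI U;
  U.Path = S.substr(Scheme.size(), OffPos - Scheme.size());
  try {
    U.Offset = std::stoull(
        S.substr(OffPos + Off.size(), SzPos - (OffPos + Off.size())));
    U.Size = std::stoull(S.substr(SzPos + Sz.size()));
  } catch (...) {
    return std::nullopt;
  }
  return U;
}

int main() {
  // Values mirror the expectations in offloading.test below.
  if (auto U = parseBundleURI("file:///tmp/a.out#offset=8192&size=4048"))
    std::cout << U->Path << " @" << U->Offset << " +" << U->Size << "\n";
}
```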
diff --git a/llvm/include/llvm/Object/OffloadBundle.h b/llvm/include/llvm/Object/OffloadBundle.h index f4d5a1d878b8d..18be62b10c518 100644 --- a/llvm/include/llvm/Object/OffloadBundle.h +++ b/llvm/include/llvm/Object/OffloadBundle.h @@ -161,7 +161,7 @@ struct OffloadBundleURI { OffsetStr.getAsInteger(10, O); Str = Str.drop_front(OffsetStr.size()); - if (Str.consume_front("&size=")) + if (!Str.consume_front("&size=")) return createStringError(object_error::parse_failed, "Reading 'size' in URI"); diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp index 1e1042ce2bc21..0dd378e65fd81 100644 --- a/llvm/lib/Object/OffloadBundle.cpp +++ b/llvm/lib/Object/OffloadBundle.cpp @@ -89,17 +89,17 @@ Error OffloadBundleFatBin::readEntries(StringRef Buffer, uint64_t EntryIDSize; StringRef EntryID; - if (auto EC = Reader.readInteger(EntryOffset)) - return errorCodeToError(object_error::parse_failed); + if (Error Err = Reader.readInteger(EntryOffset)) + return Err; - if (auto EC = Reader.readInteger(EntrySize)) - return errorCodeToError(object_error::parse_failed); + if (Error Err = Reader.readInteger(EntrySize)) + return Err; - if (auto EC = Reader.readInteger(EntryIDSize)) - return errorCodeToError(object_error::parse_failed); + if (Error Err = Reader.readInteger(EntryIDSize)) + return Err; - if (auto EC = Reader.readFixedString(EntryID, EntryIDSize)) - return errorCodeToError(object_error::parse_failed); + if (Error Err = Reader.readFixedString(EntryID, EntryIDSize)) + return Err; auto Entry = std::make_unique( EntryOffset + SectionOffset, EntrySize, EntryIDSize, EntryID); @@ -125,7 +125,7 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset, // Read the Bundle Entries Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset); if (Err) - return errorCodeToError(object_error::parse_failed); + return Err; return std::unique_ptr(TheBundle); } diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test new file mode 100644 index 0000000000000..391b7ee3facce --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading-fail.test @@ -0,0 +1,26 @@ +## Test that --offloading with a fatbin works correctly. +# REQUIRES: amdgpu-registered-target + +# RUN: yaml2obj %s -o %t.elf +# RUN: llvm-readobj --offloading %t.elf 2>&1 | \ +# RUN: FileCheck %s --check-prefix=WARN -DFILE_NAME=%t.elf + +# WARN: warning: '{{.*}}': Stream Error: The stream is too short to perform the requested operation. 
+ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +Sections: + - Name: .hip_fatbin + Type: SHT_PROGBITS + AddressAlign: 0x1000 + Content: 5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B0000000000000075782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D676678393038000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72
656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E20323431373620663935303039613166393032313232343865313036333964653837653635636163616338643961372900000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E726
56C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000 + - Name: .hipFatBinSegment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x202FD0 + AddressAlign: 0x8 + Content: '465049480100000000102000000000000000000000000000' +... diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test new file mode 100644 index 0000000000000..21ee60d2ea829 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/offloading.test @@ -0,0 +1,27 @@ +## Test that --offloading with a fatbin works correctly. 
+# REQUIRES: amdgpu-registered-target + +# RUN: yaml2obj %s -o %t.elf +# RUN: llvm-readobj --offloading %t.elf | \ +# RUN: FileCheck %s -DFILE_NAME=%t.elf + +# CHECK: host-x86_64-unknown-linux-- file://[[FILE_NAME]]#offset=8192&size=0 +# CHECK-NEXT: hipv4-amdgcn-amd-amdhsa--gfx908 file://[[FILE_NAME]]#offset=8192&size=4048 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC +Sections: + - Name: .hip_fatbin + Type: SHT_PROGBITS + AddressAlign: 0x1000 + Content: 5F5F434C414E475F4F46464C4F41445F42554E444C455F5F0200000000000000001000000000000000000000000000001B00000000000000686F73742D7838365F36342D756E6B6E6F776E2D6C696E75782D2D0010000000000000D00F0000000000001F0000000000000068697076342D616D6467636E2D616D642D616D646873612D2D6766783930380000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007F454C460201014003000000000000000300E0000100000000000000000000004000000000000000100C0000000000003005000040003800090040000F000D000600000004000000400000000000000040000000000000004000000000000000F801000000000000F80100000000000008000000000000000100000004000000000000000000000000000000000000000000000000000000C008000000000000C008000000000000001000000000000001000000050000000009000000000000001900000000000000190000000000006C000000000000006C00000000000000001000000000000001000000060000007009000000000000702900000000000070290000000000007000000000000000900600000000000000100000000000000100000006000000E009000000000000E039000000000000E039000000000000000000000000000001000000000000000010000000000000020000000600000070090000000000007029000000000000702900000000000070000000000000007000000000000000080000000000000052E574640400000070090000000000007029000000000000702900000000000070000000000000009006000000000000010000000000000051E57464060000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000004000000380200000000000038020000000000003802000000000000340500000000000034050000000000000400000000000000070000001D05000020000000414D44475055000083AE616D646873612E6B65726E656C7391DE0012AB2E616770725F636F756E7400A52E61726773DC001085AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA415F642E636F65726365A72E6F666673657400A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657285AE2E616464726573735F7370616365A6676C6F62616CA52E6E616D65AA425F642E636F65726365A72E6F666673657408A52E73697A6508AB2E76616C75655F6B696E64AD676C6F62616C5F62756666657284A52E6E616D65A14EA72E6F666673657410A52E73697A6508AB2E76616C75655F6B696E64A862795F76616C756583A72E6F666673657418A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7883A72E6F66667365741CA52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7983A72E6F666673657420A52E73697A6504AB2E76616C75655F6B696E64B468696464656E5F626C6F636B5F636F756E745F7
A83A72E6F666673657424A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7883A72E6F666673657426A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7983A72E6F666673657428A52E73697A6502AB2E76616C75655F6B696E64B368696464656E5F67726F75705F73697A655F7A83A72E6F66667365742AA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7883A72E6F66667365742CA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7983A72E6F66667365742EA52E73697A6502AB2E76616C75655F6B696E64B268696464656E5F72656D61696E6465725F7A83A72E6F666673657440A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7883A72E6F666673657448A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7983A72E6F666673657450A52E73697A6508AB2E76616C75655F6B696E64B668696464656E5F676C6F62616C5F6F66667365745F7A83A72E6F666673657458A52E73697A6502AB2E76616C75655F6B696E64B068696464656E5F677269645F64696D73B92E67726F75705F7365676D656E745F66697865645F73697A6500B62E6B65726E6172675F7365676D656E745F616C69676E08B52E6B65726E6172675F7365676D656E745F73697A65CD0118A92E6C616E6775616765A84F70656E434C2043B12E6C616E67756167655F76657273696F6E920200B82E6D61785F666C61745F776F726B67726F75705F73697A65CD0400A52E6E616D65B25F5A3973696D706C65416464506A504B6A6DBB2E707269766174655F7365676D656E745F66697865645F73697A6500AB2E736770725F636F756E740CB12E736770725F7370696C6C5F636F756E7400A72E73796D626F6CB55F5A3973696D706C65416464506A504B6A6D2E6B64B82E756E69666F726D5F776F726B5F67726F75705F73697A6501B32E757365735F64796E616D69635F737461636BC2AB2E766770725F636F756E7404B12E766770725F7370696C6C5F636F756E7400AF2E7761766566726F6E745F73697A6540AD616D646873612E746172676574B9616D6467636E2D616D642D616D646873612D2D676678393038AE616D646873612E76657273696F6E92010200000000000000000000000000000000000000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E03900000000000001000000000000000100000001000000010000001A000000000008400000D20001000000360A4A7A5238A4D3F113F4DD04000000040000000200000001000000000000000300000000000000000000000000000000000000005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F623730363264386333326134613933330000000000000000000000000000000000000000000000000000000000000000000000180100000000000080100000000000000000000000000000000000000000000000000000000000004000AF008C000000090000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C20102C02400000002000AC0000000008002027E7FC08CBF07FF0486FFFF0000060406920600006800008FD2820002000302067E0200043203030638008050DC02007F020102067E0000003203030238008050DC00007F03700F8CBF03050468008070DC00027F00000081BF00000000060000000000000070070000000000000B000000000000001800000000000000050000000000000020080000000000000A000000000000004600000000000000F5FEFF6F00000000D0070000000000000400000000000000F807000000000000000000000000000000000000000000004C696E6B65723A20414D44204C4C442031392E302E3000414D4420636C616E672076657273696F6E2031392E302E306769742028202032343231322063393630313665636534313337356462646438663037356266333762643666633333323230376233290000414D4420636C616E672076657273696F6E2031382E302E3067697420287373683A2F2F6765727269746769742F6C696768746E696E672F65632F6C6C766D2D70726F6A65637420616D642D6D61696E6C696E652D6F70656E203234313736206639353030396131663930323132323438653130363339646538376536356361636163386439613729
00000000000000000000000000000000000000000000000000460000000002080070290000000000000000000000000000010000001203070000190000000000006C000000000000001400000011030600800800000000000040000000000000002A00000011000A00E0390000000000000100000000000000002E6E6F7465002E64796E73796D002E676E752E68617368002E68617368002E64796E737472002E726F64617461002E74657874002E64796E616D6963002E72656C726F5F70616464696E67002E627373002E636F6D6D656E74002E73796D746162002E7368737472746162002E73747274616200005F5A3973696D706C65416464506A504B6A6D005F5A3973696D706C65416464506A504B6A6D2E6B64005F5F6869705F637569645F62373036326438633332613461393333005F44594E414D494300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000070000000200000000000000380200000000000038020000000000003405000000000000000000000000000004000000000000000000000000000000070000000B00000002000000000000007007000000000000700700000000000060000000000000000500000001000000080000000000000018000000000000000F000000F6FFFF6F0200000000000000D007000000000000D007000000000000280000000000000002000000000000000800000000000000000000000000000019000000050000000200000000000000F807000000000000F80700000000000028000000000000000200000000000000040000000000000004000000000000001F000000030000000200000000000000200800000000000020080000000000004600000000000000000000000000000001000000000000000000000000000000270000000100000002000000000000008008000000000000800800000000000040000000000000000000000000000000400000000000000000000000000000002F000000010000000600000000000000001900000000000000090000000000006C00000000000000000000000000000000010000000000000000000000000000350000000600000003000000000000007029000000000000700900000000000070000000000000000500000000000000080000000000000010000000000000003E000000080000000300000000000000E029000000000000E00900000000000020060000000000000000000000000000010000000000000000000000000000004D000000080000000300000000000000E039000000000000E0090000000000000100000000000000000000000000000001000000000000000000000000000000520000000100000030000000000000000000000000000000E009000000000000F0000000000000000000000000000000010000000000000001000000000000005B0000000200000000000000000000000000000000000000D00A00000000000078000000000000000E0000000200000008000000000000001800000000000000630000000300000000000000000000000000000000000000480B00000000000075000000000000000000000000000000010000000000000000000000000000006D0000000300000000000000000000000000000000000000BD0B0000000000004F00000000000000000000000000000001000000000000000000000000000000 + - Name: .hipFatBinSegment + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x202FD0 + AddressAlign: 0x8 + Content: '465049480100000000102000000000000000000000000000' +... 
diff --git a/llvm/tools/llvm-readobj/ObjDumper.cpp b/llvm/tools/llvm-readobj/ObjDumper.cpp
index bd670aeab9ed8..0b59dd48d4203 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.cpp
+++ b/llvm/tools/llvm-readobj/ObjDumper.cpp
@@ -16,6 +16,8 @@
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/Decompressor.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/OffloadBinary.h"
+#include "llvm/Object/OffloadBundle.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/ScopedPrinter.h"
@@ -230,4 +232,14 @@ void ObjDumper::printSectionsAsHex(const object::ObjectFile &Obj,
   }
 }

+void ObjDumper::printOffloading(const object::ObjectFile &Obj) {
+  SmallVector<llvm::object::OffloadBundleFatBin> Bundles;
+  if (Error Err = object::extractOffloadBundleFatBinary(Obj, Bundles))
+    reportWarning(std::move(Err), Obj.getFileName());
+
+  // Print out all the FatBin Bundles that are contained in this buffer.
+  for (const auto &[Index, Bundle] : llvm::enumerate(Bundles))
+    Bundle.printEntriesAsURI();
+}
+
 } // namespace llvm
diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h
index a654078a770ff..d26439435a82b 100644
--- a/llvm/tools/llvm-readobj/ObjDumper.h
+++ b/llvm/tools/llvm-readobj/ObjDumper.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/OffloadBinary.h"
 #include "llvm/Support/CommandLine.h"

 #include <functional>
@@ -188,6 +189,7 @@ class ObjDumper {
   std::function<void(Error)> WarningHandler;
   void reportUniqueWarning(Error Err) const;
   void reportUniqueWarning(const Twine &Msg) const;
+  void printOffloading(const object::ObjectFile &Obj);

 protected:
   ScopedPrinter &W;
diff --git a/llvm/tools/llvm-readobj/Opts.td b/llvm/tools/llvm-readobj/Opts.td
index 711522c4acb14..97d5d7f96dc32 100644
--- a/llvm/tools/llvm-readobj/Opts.td
+++ b/llvm/tools/llvm-readobj/Opts.td
@@ -32,6 +32,7 @@ def file_header : FF<"file-header", "Display file header">;
 def headers : FF<"headers", "Equivalent to setting: --file-header, --program-headers, --section-headers">;
 defm hex_dump : Eq<"hex-dump", "Display the specified section(s) as hexadecimal bytes">, MetaVarName<"<name or index>">;
 def pretty_print : FF<"pretty-print", "Pretty print JSON output">;
+def offloading : FF<"offloading", "Display the content of the offloading section">;
 def relocs : FF<"relocs", "Display the relocation entries in the file">;
 def section_data : FF<"section-data", "Display section data for each section shown.
This option has no effect for GNU style output">;
 def section_details : FF<"section-details", "Display the section details">;
diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp
index 2b34761b2cc6c..5327731805010 100644
--- a/llvm/tools/llvm-readobj/llvm-readobj.cpp
+++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp
@@ -135,6 +135,7 @@ static bool HashHistogram;
 static bool Memtag;
 static bool NeededLibraries;
 static bool Notes;
+static bool Offloading;
 static bool ProgramHeaders;
 static bool SectionGroups;
 static std::vector<std::string> SFrame;
@@ -274,6 +275,7 @@ static void parseOptions(const opt::InputArgList &Args) {
   opts::Memtag = Args.hasArg(OPT_memtag);
   opts::NeededLibraries = Args.hasArg(OPT_needed_libs);
   opts::Notes = Args.hasArg(OPT_notes);
+  opts::Offloading = Args.hasArg(OPT_offloading);
   opts::PrettyPrint = Args.hasArg(OPT_pretty_print);
   opts::ProgramHeaders = Args.hasArg(OPT_program_headers);
   opts::SectionGroups = Args.hasArg(OPT_section_groups);
@@ -459,6 +461,8 @@ static void dumpObject(ObjectFile &Obj, ScopedPrinter &Writer,
     Dumper->printGnuHashTable();
   if (opts::VersionInfo)
     Dumper->printVersionInfo();
+  if (opts::Offloading)
+    Dumper->printOffloading(Obj);
   if (opts::StringTable)
     Dumper->printStringTable();
   if (Obj.isELF()) {
@@ -707,6 +711,7 @@ int llvm_readobj_main(int argc, char **argv, const llvm::ToolContext &) {
     opts::DynamicTable = true;
     opts::Notes = true;
     opts::VersionInfo = true;
+    opts::Offloading = true;
     opts::UnwindInfo = true;
     opts::SectionGroups = true;
     opts::HashHistogram = true;

From 12a5854a51c99635dd72b26a73d6ff89d6a5bc81 Mon Sep 17 00:00:00 2001
From: Akira Hatanaka
Date: Mon, 29 Sep 2025 14:22:49 -0700
Subject: [PATCH 185/878] [compiler-rt] Disable tests for unavailable builtins
 (#161275)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The builtins `__fixunstfdi` and `__multc3` may be removed by the
preprocessor depending on configuration flags. When this happens, the
corresponding tests fail at link time due to missing definitions.
Disable these tests when the builtins are not available.

Also remove the XFAILs for aarch64 windows. As these tests are now
no-ops on platforms that lack CRT_HAS_128BIT or CRT_HAS_F128 (aarch64
windows lacks the latter), they no longer fail.

This reapplies e9e166e54354330c474457711a8e7a7ca2efd731 and
656707086e5f6fccd2eb57f5aaf987c328c0f4f1 after fixing declarations of
the builtins in the tests in b54250940c2cd70f911386b02239b50c165e5354.

rdar://159705803
rdar://159705705

---------

Co-authored-by: Martin Storsjö
---
 compiler-rt/test/builtins/Unit/fixunstfdi_test.c | 8 +++-----
 compiler-rt/test/builtins/Unit/multc3_test.c     | 7 +++----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c
index 14f0f7f1565a4..526ba5ca80cf6 100644
--- a/compiler-rt/test/builtins/Unit/fixunstfdi_test.c
+++ b/compiler-rt/test/builtins/Unit/fixunstfdi_test.c
@@ -1,16 +1,14 @@
-// XFAIL: target=aarch64-{{.*}}-windows-{{.*}}
 // RUN: %clang_builtins %s %librt -o %t && %run %t
 // REQUIRES: librt_has_fixunstfdi

 #include <stdio.h>
+#include "int_lib.h"

-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_TF_MODE)
 #define QUAD_PRECISION
 #include "fp_lib.h"

-#include "int_lib.h"
-
 // Returns: convert a to a unsigned long long, rounding toward zero.
 // Negative values all become zero.
@@ -38,7 +36,7 @@ char assumption_3[sizeof(fp_t)*CHAR_BIT == 128] = {0};

 int main()
 {
-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_TF_MODE)

     if (test__fixunstfdi(0.0, 0))
         return 1;
diff --git a/compiler-rt/test/builtins/Unit/multc3_test.c b/compiler-rt/test/builtins/Unit/multc3_test.c
index 5eec56dc43033..18561cc344437 100644
--- a/compiler-rt/test/builtins/Unit/multc3_test.c
+++ b/compiler-rt/test/builtins/Unit/multc3_test.c
@@ -1,15 +1,14 @@
-// XFAIL: target=aarch64-{{.*}}-windows-{{.*}}
 // RUN: %clang_builtins %s %librt -o %t && %run %t
 // REQUIRES: librt_has_multc3

 #include <stdio.h>
+#include "int_lib.h"

-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)
 #define QUAD_PRECISION
 #include "fp_lib.h"

-#include "int_lib.h"
 #include <math.h>
 #include <complex.h>
@@ -351,7 +350,7 @@ fp_t x[][2] =

 int main()
 {
-#if _ARCH_PPC || __aarch64__ || __arm64ec__
+#if defined(CRT_HAS_128BIT) && defined(CRT_HAS_F128)
     const unsigned N = sizeof(x) / sizeof(x[0]);
     unsigned i, j;
     for (i = 0; i < N; ++i)

From cbfe89f0ecf6c9dfc047b39f3b12f0532d8ddeb2 Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Mon, 29 Sep 2025 14:37:18 -0700
Subject: [PATCH 186/878] [llvm][clang] Use the VFS in `GCOVProfilerPass`
 (#161260)

This PR starts using the correct VFS in `GCOVProfilerPass` instead of
using the real FS directly. This matches the compiler's behavior for
other input files.
---
 clang/lib/CodeGen/BackendUtil.cpp             |  5 ++--
 .../Transforms/Instrumentation/GCOVProfiler.h |  6 +++-
 .../Instrumentation/GCOVProfiling.cpp         | 30 +++++++++++--------
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 57db20f70801b..64f1917739e12 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1090,8 +1090,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline(
   if (std::optional<GCOVOptions> Options = getGCOVOptions(CodeGenOpts, LangOpts))
     PB.registerPipelineStartEPCallback(
-        [Options](ModulePassManager &MPM, OptimizationLevel Level) {
-          MPM.addPass(GCOVProfilerPass(*Options));
+        [this, Options](ModulePassManager &MPM, OptimizationLevel Level) {
+          MPM.addPass(
+              GCOVProfilerPass(*Options, CI.getVirtualFileSystemPtr()));
         });
   if (std::optional<InstrProfOptions> Options = getInstrProfOptions(CodeGenOpts, LangOpts))
diff --git a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
index 4d3ead29af0d2..f53cee26739a4 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h
@@ -20,11 +20,15 @@ namespace llvm {
 /// The gcov-style instrumentation pass
 class GCOVProfilerPass : public PassInfoMixin<GCOVProfilerPass> {
 public:
-  GCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()) : GCOVOpts(Options) { }
+  GCOVProfilerPass(
+      const GCOVOptions &Options = GCOVOptions::getDefault(),
+      IntrusiveRefCntPtr<vfs::FileSystem> VFS = vfs::getRealFileSystem())
+      : GCOVOpts(Options), VFS(std::move(VFS)) {}
   LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);

 private:
   GCOVOptions GCOVOpts;
+  IntrusiveRefCntPtr<vfs::FileSystem> VFS;
 };

 } // namespace llvm
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index e5bf2d1187a89..d8422755c28b8 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
"llvm/Support/Path.h" #include "llvm/Support/Regex.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation/CFGMST.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" @@ -92,8 +93,10 @@ class GCOVFunction; class GCOVProfiler { public: - GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {} - GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {} + GCOVProfiler() + : GCOVProfiler(GCOVOptions::getDefault(), *vfs::getRealFileSystem()) {} + GCOVProfiler(const GCOVOptions &Opts, vfs::FileSystem &VFS) + : Options(Opts), VFS(VFS) {} bool runOnModule(Module &M, function_ref GetBFI, function_ref GetBPI, @@ -110,6 +113,7 @@ class GCOVProfiler { os->write_zeros(4 - s.size() % 4); } void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); } + vfs::FileSystem &getVirtualFileSystem() const { return VFS; } private: // Create the .gcno files for the Module based on DebugInfo. @@ -166,6 +170,7 @@ class GCOVProfiler { std::vector ExcludeRe; DenseSet ExecBlocks; StringMap InstrumentedFiles; + vfs::FileSystem &VFS; }; struct BBInfo { @@ -214,10 +219,10 @@ static StringRef getFunctionName(const DISubprogram *SP) { /// Prefer relative paths in the coverage notes. Clang also may split /// up absolute paths into a directory and filename component. When /// the relative path doesn't exist, reconstruct the absolute path. -static SmallString<128> getFilename(const DIScope *SP) { +static SmallString<128> getFilename(const DIScope *SP, vfs::FileSystem &VFS) { SmallString<128> Path; StringRef RelPath = SP->getFilename(); - if (sys::fs::exists(RelPath)) + if (VFS.exists(RelPath)) Path = RelPath; else sys::path::append(Path, SP->getDirectory(), SP->getFilename()); @@ -357,7 +362,7 @@ namespace { void writeOut(uint32_t CfgChecksum) { write(GCOV_TAG_FUNCTION); - SmallString<128> Filename = getFilename(SP); + SmallString<128> Filename = getFilename(SP, P->getVirtualFileSystem()); uint32_t BlockLen = 3 + wordsOfString(getFunctionName(SP)); BlockLen += 1 + wordsOfString(Filename) + 4; @@ -455,7 +460,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) { if (FilterRe.empty() && ExcludeRe.empty()) { return true; } - SmallString<128> Filename = getFilename(F.getSubprogram()); + SmallString<128> Filename = getFilename(F.getSubprogram(), VFS); auto It = InstrumentedFiles.find(Filename); if (It != InstrumentedFiles.end()) { return It->second; @@ -467,7 +472,7 @@ bool GCOVProfiler::isFunctionInstrumented(const Function &F) { // Path can be // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for // such a case we must get the real_path. - if (sys::fs::real_path(Filename, RealPath)) { + if (VFS.getRealPath(Filename, RealPath)) { // real_path can fail with path like "foo.c". RealFilename = Filename; } else { @@ -524,9 +529,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU, SmallString<128> Filename = CU->getFilename(); sys::path::replace_extension(Filename, Notes ? 
"gcno" : "gcda"); StringRef FName = sys::path::filename(Filename); - SmallString<128> CurPath; - if (sys::fs::current_path(CurPath)) + ErrorOr CWD = VFS.getCurrentWorkingDirectory(); + if (!CWD) return std::string(FName); + SmallString<128> CurPath{*CWD}; sys::path::append(CurPath, FName); return std::string(CurPath); } @@ -554,7 +560,7 @@ bool GCOVProfiler::runOnModule( PreservedAnalyses GCOVProfilerPass::run(Module &M, ModuleAnalysisManager &AM) { - GCOVProfiler Profiler(GCOVOpts); + GCOVProfiler Profiler(GCOVOpts, *VFS); FunctionAnalysisManager &FAM = AM.getResult(M).getManager(); @@ -789,7 +795,7 @@ bool GCOVProfiler::emitProfileNotes( // Add the function line number to the lines of the entry block // to have a counter for the function definition. uint32_t Line = SP->getLine(); - auto Filename = getFilename(SP); + auto Filename = getFilename(SP, VFS); BranchProbabilityInfo *BPI = GetBPI(F); BlockFrequencyInfo *BFI = GetBFI(F); @@ -881,7 +887,7 @@ bool GCOVProfiler::emitProfileNotes( if (SP != getDISubprogram(Scope)) continue; - GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope())); + GCOVLines &Lines = Block.getFile(getFilename(Loc->getScope(), VFS)); Lines.addLine(Loc.getLine()); } Line = 0; From af5c1a696cffb48712526cc862dc1cafc9ce6e3c Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Mon, 29 Sep 2025 17:40:56 -0400 Subject: [PATCH 187/878] [CIR] fix enumeration value 'OMPFuseDirectiveClass' not handled in switch (#161278) --- clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index e842892d085d2..644c383693e37 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -216,6 +216,7 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, case Stmt::OMPSimdDirectiveClass: case Stmt::OMPTileDirectiveClass: case Stmt::OMPUnrollDirectiveClass: + case Stmt::OMPFuseDirectiveClass: case Stmt::OMPForDirectiveClass: case Stmt::OMPForSimdDirectiveClass: case Stmt::OMPSectionsDirectiveClass: From d481e5f9b7f4bde74fc4909a8a67bbd758991b33 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Mon, 29 Sep 2025 22:50:15 +0100 Subject: [PATCH 188/878] [AMDGPU][SPIRV] Use SPIR-V syncscopes for some AMDGCN BIs (#154867) AMDGCN flavoured SPIR-V allows AMDGCN specific builtins, including those for scoped fences and some specific RMWs. However, at present we don't map syncscopes to their SPIR-V equivalents, but rather use the AMDGCN ones. This ends up pessimising the resulting code as system scope is used instead of device (agent) or subgroup (wavefront), so we correct the behaviour, to ensure that we do the right thing during reverse translation. 
--- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 27 +- .../builtin-amdgcn-atomic-inc-dec.cpp | 423 +++++++++++++++++- .../test/CodeGenCXX/builtin-amdgcn-fence.cpp | 103 ++++- .../CodeGenOpenCL/builtins-amdgcn-gfx11.cl | 21 +- .../test/CodeGenOpenCL/builtins-amdgcn-vi.cl | 28 +- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 3 +- 6 files changed, 576 insertions(+), 29 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 07cf08c54985a..6596ec06199dc 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -192,9 +192,17 @@ static Value *emitFPIntBuiltin(CodeGenFunction &CGF, return CGF.Builder.CreateCall(F, {Src0, Src1}); } +static inline StringRef mapScopeToSPIRV(StringRef AMDGCNScope) { + if (AMDGCNScope == "agent") + return "device"; + if (AMDGCNScope == "wavefront") + return "subgroup"; + return AMDGCNScope; +} + // For processing memory ordering and memory scope arguments of various // amdgcn builtins. -// \p Order takes a C++11 comptabile memory-ordering specifier and converts +// \p Order takes a C++11 compatible memory-ordering specifier and converts // it into LLVM's memory ordering specifier using atomic C ABI, and writes // to \p AO. \p Scope takes a const char * and converts it into AMDGCN // specific SyncScopeID and writes it to \p SSID. @@ -227,6 +235,8 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope, // Some of the atomic builtins take the scope as a string name. StringRef scp; if (llvm::getConstantStringInfo(Scope, scp)) { + if (getTarget().getTriple().isSPIRV()) + scp = mapScopeToSPIRV(scp); SSID = getLLVMContext().getOrInsertSyncScopeID(scp); return; } @@ -238,13 +248,19 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope, SSID = llvm::SyncScope::System; break; case 1: // __MEMORY_SCOPE_DEVICE - SSID = getLLVMContext().getOrInsertSyncScopeID("agent"); + if (getTarget().getTriple().isSPIRV()) + SSID = getLLVMContext().getOrInsertSyncScopeID("device"); + else + SSID = getLLVMContext().getOrInsertSyncScopeID("agent"); break; case 2: // __MEMORY_SCOPE_WRKGRP SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup"); break; case 3: // __MEMORY_SCOPE_WVFRNT - SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront"); + if (getTarget().getTriple().isSPIRV()) + SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup"); + else + SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront"); break; case 4: // __MEMORY_SCOPE_SINGLE SSID = llvm::SyncScope::SingleThread; @@ -1510,7 +1526,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, // // The global/flat cases need to use agent scope to consistently produce // the native instruction instead of a cmpxchg expansion. - SSID = getLLVMContext().getOrInsertSyncScopeID("agent"); + if (getTarget().getTriple().isSPIRV()) + SSID = getLLVMContext().getOrInsertSyncScopeID("device"); + else + SSID = getLLVMContext().getOrInsertSyncScopeID("agent"); AO = AtomicOrdering::Monotonic; // The v2bf16 builtin uses i16 instead of a natural bfloat type. 
diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp index 5920ceda4a811..137a49beee9a6 100644 --- a/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp +++ b/clang/test/CodeGenCXX/builtin-amdgcn-atomic-inc-dec.cpp @@ -1,7 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target // RUN: %clang_cc1 %s -x hip -fcuda-is-device -emit-llvm -O0 -o - \ -// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s +// RUN: -triple=amdgcn-amd-amdhsa | FileCheck --check-prefix=GCN %s +// RUN: %clang_cc1 %s -x hip -fcuda-is-device -emit-llvm -O0 -o - \ +// RUN: -triple=spirv64-amd-amdhsa | FileCheck --check-prefix=AMDGCNSPIRV %s // CHECK-LABEL: @_Z29test_non_volatile_parameter32Pj( // CHECK-NEXT: entry: @@ -21,6 +24,43 @@ // CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z29test_non_volatile_parameter32Pj( +// GCN-NEXT: entry: +// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// GCN-NEXT: [[RES:%.*]] = alloca i32, align 4, addrspace(5) +// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr +// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr +// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]] +// GCN-NEXT: store i32 [[TMP3]], ptr [[RES_ASCAST]], align 4 +// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z29test_non_volatile_parameter32Pj( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5:![0-9]+]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 4 +// 
AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) [[TMP5]], align 4 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 4 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_non_volatile_parameter32(__UINT32_TYPE__ *ptr) { __UINT32_TYPE__ res; @@ -47,6 +87,43 @@ __attribute__((device)) void test_non_volatile_parameter32(__UINT32_TYPE__ *ptr) // CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z29test_non_volatile_parameter64Py( +// GCN-NEXT: entry: +// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// GCN-NEXT: [[RES:%.*]] = alloca i64, align 8, addrspace(5) +// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr +// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr +// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP3]], ptr [[RES_ASCAST]], align 8 +// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z29test_non_volatile_parameter64Py( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i64, align 8 +// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) [[TMP1]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: 
[[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(4) [[TMP5]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_non_volatile_parameter64(__UINT64_TYPE__ *ptr) { __UINT64_TYPE__ res; @@ -73,6 +150,43 @@ __attribute__((device)) void test_non_volatile_parameter64(__UINT64_TYPE__ *ptr) // CHECK-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z25test_volatile_parameter32PVj( +// GCN-NEXT: entry: +// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// GCN-NEXT: [[RES:%.*]] = alloca i32, align 4, addrspace(5) +// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr +// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr +// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP2:%.*]] = load volatile i32, ptr [[TMP1]], align 4 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP3]], ptr [[RES_ASCAST]], align 4 +// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP6:%.*]] = load volatile i32, ptr [[TMP5]], align 4 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP7]], ptr [[RES_ASCAST]], align 4 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z25test_volatile_parameter32PVj( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load volatile i32, ptr addrspace(4) [[TMP1]], align 4 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr addrspace(4) [[TMP0]], i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 4 +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) 
[[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load volatile i32, ptr addrspace(4) [[TMP5]], align 4 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr addrspace(4) [[TMP4]], i32 [[TMP6]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 4 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_volatile_parameter32(volatile __UINT32_TYPE__ *ptr) { __UINT32_TYPE__ res; @@ -99,6 +213,43 @@ __attribute__((device)) void test_volatile_parameter32(volatile __UINT32_TYPE__ // CHECK-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z25test_volatile_parameter64PVy( +// GCN-NEXT: entry: +// GCN-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// GCN-NEXT: [[RES:%.*]] = alloca i64, align 8, addrspace(5) +// GCN-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[PTR_ADDR]] to ptr +// GCN-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RES]] to ptr +// GCN-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP2:%.*]] = load volatile i64, ptr [[TMP1]], align 8 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP3]], ptr [[RES_ASCAST]], align 8 +// GCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP5:%.*]] = load ptr, ptr [[PTR_ADDR_ASCAST]], align 8 +// GCN-NEXT: [[TMP6:%.*]] = load volatile i64, ptr [[TMP5]], align 8 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP7]], ptr [[RES_ASCAST]], align 8 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z25test_volatile_parameter64PVy( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[PTR_ADDR:%.*]] = alloca ptr addrspace(4), align 8 +// AMDGCNSPIRV-NEXT: [[RES:%.*]] = alloca i64, align 8 +// AMDGCNSPIRV-NEXT: [[PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr [[PTR_ADDR]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[RES_ASCAST:%.*]] = addrspacecast ptr [[RES]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: store ptr addrspace(4) [[PTR:%.*]], ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load volatile i64, ptr addrspace(4) [[TMP1]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw volatile uinc_wrap ptr addrspace(4) [[TMP0]], i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[RES_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[PTR_ADDR_ASCAST]], align 8 +// 
AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load volatile i64, ptr addrspace(4) [[TMP5]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw volatile udec_wrap ptr addrspace(4) [[TMP4]], i64 [[TMP6]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) [[RES_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_volatile_parameter64(volatile __UINT64_TYPE__ *ptr) { __UINT64_TYPE__ res; @@ -116,6 +267,25 @@ __attribute__((device)) void test_volatile_parameter64(volatile __UINT64_TYPE__ // CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z13test_shared32v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr), align 4 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z13test_shared32v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_shared32() { __attribute__((shared)) __UINT32_TYPE__ val; @@ -134,6 +304,25 @@ __attribute__((device)) void test_shared32() { // CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i64 [[TMP3]], ptr 
addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z13test_shared64v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr), align 8 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z13test_shared64v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ13test_shared64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_shared64() { __attribute__((shared)) __UINT64_TYPE__ val; @@ -153,6 +342,25 @@ __attribute__((device)) __UINT32_TYPE__ global_val32; // CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z13test_global32v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4 +// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// 
GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val32 to ptr), align 4 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z13test_global32v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val32 to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_global32() { global_val32 = __builtin_amdgcn_atomic_inc32(&global_val32, global_val32, __ATOMIC_SEQ_CST, "workgroup"); @@ -170,6 +378,25 @@ __attribute__((device)) __UINT64_TYPE__ global_val64; // CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z13test_global64v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8 +// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(1) @global_val64 to ptr), align 8 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z13test_global64v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) 
@global_val64 to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(1) @global_val64 to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_global64() { global_val64 = __builtin_amdgcn_atomic_inc64(&global_val64, global_val64, __ATOMIC_SEQ_CST, "workgroup"); @@ -189,6 +416,29 @@ __attribute__((constant)) __UINT32_TYPE__ cval32; // CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i32 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 4 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z15test_constant32v( +// GCN-NEXT: entry: +// GCN-NEXT: [[LOCAL_VAL:%.*]] = alloca i32, align 4, addrspace(5) +// GCN-NEXT: [[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LOCAL_VAL]] to ptr +// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), align 4 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP1]], ptr [[LOCAL_VAL_ASCAST]], align 4 +// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), align 4 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval32 to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 4 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z15test_constant32v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[LOCAL_VAL:%.*]] = alloca i32, align 4 +// AMDGCNSPIRV-NEXT: [[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr [[LOCAL_VAL]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 4 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval32 to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 4 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_constant32() { __UINT32_TYPE__ local_val; @@ -210,6 +460,29 @@ __attribute__((constant)) __UINT64_TYPE__ cval64; // CHECK-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i64 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 8 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z15test_constant64v( +// GCN-NEXT: entry: +// GCN-NEXT: [[LOCAL_VAL:%.*]] = alloca i64, align 8, addrspace(5) +// GCN-NEXT: 
[[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LOCAL_VAL]] to ptr +// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), align 8 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP1]], ptr [[LOCAL_VAL_ASCAST]], align 8 +// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), align 8 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(4) @cval64 to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP3]], ptr [[LOCAL_VAL_ASCAST]], align 8 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z15test_constant64v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[LOCAL_VAL:%.*]] = alloca i64, align 8 +// AMDGCNSPIRV-NEXT: [[LOCAL_VAL_ASCAST:%.*]] = addrspacecast ptr [[LOCAL_VAL]] to ptr addrspace(4) +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(1) @cval64 to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) [[LOCAL_VAL_ASCAST]], align 8 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_constant64() { __UINT64_TYPE__ local_val; @@ -240,6 +513,49 @@ __attribute__((device)) void test_constant64() { // CHECK-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP10]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i32 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z12test_order32v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP0]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) 
@_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP4]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP6:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP6]] syncscope("workgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP8:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP8]] syncscope("workgroup") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP9]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP10:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), i32 [[TMP10]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr), align 4 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z12test_order32v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP0]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP4]] syncscope("workgroup") acquire, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) 
@_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP6]] syncscope("workgroup") release, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP8]] syncscope("workgroup") acq_rel, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP9]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), i32 [[TMP10]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP11]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_order32() { __attribute__((shared)) __UINT32_TYPE__ val; @@ -278,6 +594,49 @@ __attribute__((device)) void test_order32() { // CHECK-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP10]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i64 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z12test_order64v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP0]] syncscope("workgroup") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP4:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP4]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP6:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) 
@_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP6]] syncscope("workgroup") release, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP8:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP8]] syncscope("workgroup") acq_rel, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP9]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP10:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), i64 [[TMP10]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP11]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr), align 8 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z12test_order64v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP0]] syncscope("workgroup") monotonic, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP4]] syncscope("workgroup") acquire, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP6]] syncscope("workgroup") release, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// 
AMDGCNSPIRV-NEXT: [[TMP8:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP9:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP8]] syncscope("workgroup") acq_rel, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP9]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP10:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP11:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), i64 [[TMP10]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP11]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_order64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_order64() { __attribute__((shared)) __UINT64_TYPE__ val; @@ -310,6 +669,37 @@ __attribute__((device)) void test_order64() { // CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP6]] syncscope("wavefront") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i32 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z12test_scope32v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP0]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP2:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP3]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP4:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP4]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP6:%.*]] = load i32, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), i32 [[TMP6]] syncscope("wavefront") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i32 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr), align 4 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z12test_scope32v( +// AMDGCNSPIRV-NEXT: entry: +// 
AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP0]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP2]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP4]] syncscope("device") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), i32 [[TMP6]] syncscope("subgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i32 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope32vE3val to ptr addrspace(4)), align 4 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_scope32() { __attribute__((shared)) __UINT32_TYPE__ val; @@ -338,6 +728,37 @@ __attribute__((device)) void test_scope32() { // CHECK-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP6]] syncscope("wavefront") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] // CHECK-NEXT: store i64 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 // CHECK-NEXT: ret void +// GCN-LABEL: @_Z12test_scope64v( +// GCN-NEXT: entry: +// GCN-NEXT: [[TMP0:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP0]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP1]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP2:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP3]], ptr addrspacecast 
(ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP4:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP4]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP5]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP6:%.*]] = load i64, ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), i64 [[TMP6]] syncscope("wavefront") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META4]] +// GCN-NEXT: store i64 [[TMP7]], ptr addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr), align 8 +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: @_Z12test_scope64v( +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP1:%.*]] = atomicrmw uinc_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP0]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP1]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP3:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP2]] syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP3]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP5:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP4]] syncscope("device") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP5]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), i64 [[TMP6]] syncscope("subgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META5]] +// AMDGCNSPIRV-NEXT: store i64 [[TMP7]], ptr addrspace(4) addrspacecast (ptr addrspace(3) @_ZZ12test_scope64vE3val to ptr addrspace(4)), align 8 +// AMDGCNSPIRV-NEXT: ret void // __attribute__((device)) void test_scope64() { __attribute__((shared)) __UINT64_TYPE__ val; diff --git a/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp b/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp index 1e977dd6420f4..dd1ca459d68b5 100644 --- a/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp +++ 
b/clang/test/CodeGenCXX/builtin-amdgcn-fence.cpp @@ -1,7 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // REQUIRES: amdgpu-registered-target +// REQUIRES: spirv-registered-target // RUN: %clang_cc1 %s -emit-llvm -O0 -o - \ -// RUN: -triple=amdgcn-amd-amdhsa | FileCheck %s +// RUN: -triple=amdgcn-amd-amdhsa | FileCheck --check-prefix=GCN %s +// RUN: %clang_cc1 %s -emit-llvm -O0 -o - \ +// RUN: -triple=spirv64-amd-amdhsa | FileCheck --check-prefix=AMDGCNSPIRV %s // CHECK-LABEL: define dso_local void @_Z25test_memory_fence_successv( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -12,6 +15,25 @@ // CHECK-NEXT: fence syncscope("agent") acq_rel // CHECK-NEXT: fence syncscope("workgroup") release // CHECK-NEXT: ret void +// GCN-LABEL: define dso_local void @_Z25test_memory_fence_successv( +// GCN-SAME: ) #[[ATTR0:[0-9]+]] { +// GCN-NEXT: entry: +// GCN-NEXT: fence syncscope("workgroup") seq_cst +// GCN-NEXT: fence syncscope("agent") acquire +// GCN-NEXT: fence seq_cst +// GCN-NEXT: fence syncscope("agent") acq_rel +// GCN-NEXT: fence syncscope("workgroup") release +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @_Z25test_memory_fence_successv( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0:[0-9]+]] { +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst +// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire +// AMDGCNSPIRV-NEXT: fence seq_cst +// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release +// AMDGCNSPIRV-NEXT: ret void // void test_memory_fence_success() { @@ -35,6 +57,25 @@ void test_memory_fence_success() { // CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]] // CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] // CHECK-NEXT: ret void +// GCN-LABEL: define dso_local void @_Z10test_localv( +// GCN-SAME: ) #[[ATTR0]] { +// GCN-NEXT: entry: +// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]] +// GCN-NEXT: fence syncscope("agent") acquire, !mmra [[META3]] +// GCN-NEXT: fence seq_cst, !mmra [[META3]] +// GCN-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]] +// GCN-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @_Z10test_localv( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3:![0-9]+]] +// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: fence seq_cst, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: ret void // void test_local() { __builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local"); @@ -58,6 +99,25 @@ void test_local() { // CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META4]] // CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META4]] // CHECK-NEXT: ret void +// GCN-LABEL: define dso_local void @_Z11test_globalv( +// GCN-SAME: ) #[[ATTR0]] { +// GCN-NEXT: entry: +// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]] +// GCN-NEXT: fence syncscope("agent") acquire, !mmra [[META4]] +// GCN-NEXT: fence seq_cst, !mmra [[META4]] +// GCN-NEXT: fence syncscope("agent") acq_rel, !mmra [[META4]] +// GCN-NEXT: fence syncscope("workgroup") release, !mmra 
[[META4]] +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @_Z11test_globalv( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META4:![0-9]+]] +// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire, !mmra [[META4]] +// AMDGCNSPIRV-NEXT: fence seq_cst, !mmra [[META4]] +// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel, !mmra [[META4]] +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release, !mmra [[META4]] +// AMDGCNSPIRV-NEXT: ret void // void test_global() { __builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "global"); @@ -80,6 +140,25 @@ void test_global() { // CHECK-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]] // CHECK-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] // CHECK-NEXT: ret void +// GCN-LABEL: define dso_local void @_Z10test_imagev( +// GCN-SAME: ) #[[ATTR0]] { +// GCN-NEXT: entry: +// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3]] +// GCN-NEXT: fence syncscope("agent") acquire, !mmra [[META3]] +// GCN-NEXT: fence seq_cst, !mmra [[META3]] +// GCN-NEXT: fence syncscope("agent") acq_rel, !mmra [[META3]] +// GCN-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @_Z10test_imagev( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: fence syncscope("device") acquire, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: fence seq_cst, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: fence syncscope("device") acq_rel, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") release, !mmra [[META3]] +// AMDGCNSPIRV-NEXT: ret void // void test_image() { __builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local"); @@ -99,13 +178,33 @@ void test_image() { // CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]] // CHECK-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]] // CHECK-NEXT: ret void +// GCN-LABEL: define dso_local void @_Z10test_mixedv( +// GCN-SAME: ) #[[ATTR0]] { +// GCN-NEXT: entry: +// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]] +// GCN-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]] +// GCN-NEXT: ret void +// +// AMDGCNSPIRV-LABEL: define spir_func void @_Z10test_mixedv( +// AMDGCNSPIRV-SAME: ) addrspace(4) #[[ATTR0]] { +// AMDGCNSPIRV-NEXT: entry: +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5:![0-9]+]] +// AMDGCNSPIRV-NEXT: fence syncscope("workgroup") seq_cst, !mmra [[META5]] +// AMDGCNSPIRV-NEXT: ret void // void test_mixed() { __builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "global"); __builtin_amdgcn_fence( __ATOMIC_SEQ_CST, "workgroup", "local", "local", "global", "local", "local"); } -//. // CHECK: [[META3]] = !{!"amdgpu-synchronize-as", !"local"} // CHECK: [[META4]] = !{!"amdgpu-synchronize-as", !"global"} // CHECK: [[META5]] = !{[[META4]], [[META3]]} //. +// GCN: [[META3]] = !{!"amdgpu-synchronize-as", !"local"} +// GCN: [[META4]] = !{!"amdgpu-synchronize-as", !"global"} +// GCN: [[META5]] = !{[[META4]], [[META3]]} +//. +// AMDGCNSPIRV: [[META3]] = !{!"amdgpu-synchronize-as", !"local"} +// AMDGCNSPIRV: [[META4]] = !{!"amdgpu-synchronize-as", !"global"} +// AMDGCNSPIRV: [[META5]] = !{[[META4]], [[META3]]} +//. 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl index 19ab6562e52b9..7cd3f1417844c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl @@ -1,13 +1,13 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1101 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1102 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1152 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1153 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1101 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1102 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1152 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1153 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s typedef unsigned int uint; typedef unsigned long ulong; @@ -50,7 +50,8 @@ void test_s_wait_event_export_ready() { } // CHECK-LABEL: @test_global_add_f32 -// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} +// GCN: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} +// AMDGCNSPIRV: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("device") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} #if !defined(__SPIRV__) void test_global_add_f32(float *rtn, global float *addr, float x) { #else diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl index 5f202baa8a592..6bb20bff436fb 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -1,9 +1,9 @@ // REQUIRES: amdgpu-registered-target -// RUN: 
%clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s +// RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s #pragma OPENCL EXTENSION cl_khr_fp16 : enable @@ -252,9 +252,11 @@ void test_update_dpp_const_int(global int* out, int arg1) // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}} #if !defined(__SPIRV__) @@ -293,9 +295,11 @@ void test_ds_faddf(local float *out, float src) { // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} @@ 
-334,9 +338,11 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) {

 // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
 // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
-// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}}
 // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
-// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
+// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}}
 // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
 // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 039d03237b530..ab0b0b936abdc 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -1231,7 +1231,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
 // CHECK: atomicrmw udec_wrap ptr addrspace(3) %lptr, i32 %val syncscope("workgroup") seq_cst, align 4
   res = __builtin_amdgcn_atomic_dec32(lptr, val, __ATOMIC_SEQ_CST, "workgroup");

-  // CHECK: atomicrmw uinc_wrap ptr addrspace(1) %gptr, i32 %val syncscope("agent") seq_cst, align 4
+  // CHECK-AMDGCN: atomicrmw uinc_wrap ptr addrspace(1) %gptr, i32 %val syncscope("agent") seq_cst, align 4
+  // CHECK-SPIRV: atomicrmw uinc_wrap ptr addrspace(1) %gptr, i32 %val syncscope("device") seq_cst, align 4
   res = __builtin_amdgcn_atomic_inc32(gptr, val, __ATOMIC_SEQ_CST, "agent");

   // CHECK: atomicrmw udec_wrap ptr addrspace(1) %gptr, i32 %val seq_cst, align 4

From 63e45504abebeccb93365e089916abea1bd53eae Mon Sep 17 00:00:00 2001
From: Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
Date: Tue, 30 Sep 2025 00:05:28 +0200
Subject: [PATCH 189/878] [clang-reorder-fields] Check for flexible array
 member (#160262)

A flexible array member must remain the last field in the struct.
---
 .../ReorderFieldsAction.cpp | 20 +++++++++++++++++--
 .../FlexibleArrayMember.c   | 10 ++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c

diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
index affa276a0c550..5770bf767bc3c 100644
--- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
+++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
@@ -164,6 +164,22 @@ getNewFieldsOrder(const RecordDecl *Definition,
   return NewFieldsOrder;
 }

+static bool isOrderValid(const RecordDecl *RD, ArrayRef<unsigned> FieldOrder) {
+  if (FieldOrder.empty())
+    return false;
+
+  // If there is a flexible array member in the struct, it must remain the last
+  // field.
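+  // For example, given `struct S { int a; int b; char tail[]; };`, the order
+  // b,a,tail is accepted, while any order that moves `tail` off the end
+  // (such as tail,a,b) is rejected below.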
+  if (RD->hasFlexibleArrayMember() &&
+      FieldOrder.back() != FieldOrder.size() - 1) {
+    llvm::errs()
+        << "Flexible array member must remain the last field in the struct\n";
+    return false;
+  }
+
+  return true;
+}
+
 struct ReorderedStruct {
 public:
   ReorderedStruct(const RecordDecl *Decl, ArrayRef<unsigned> NewFieldsOrder)
@@ -662,7 +678,7 @@ class ReorderingConsumer : public ASTConsumer {
       return;
     SmallVector<unsigned, 4> NewFieldsOrder =
         getNewFieldsOrder(RD, DesiredFieldsOrder);
-    if (NewFieldsOrder.empty())
+    if (!isOrderValid(RD, NewFieldsOrder))
       return;

     ReorderedStruct RS{RD, NewFieldsOrder};
@@ -699,7 +715,7 @@ class ReorderingConsumer : public ASTConsumer {

 std::unique_ptr<ASTConsumer> ReorderFieldsAction::newASTConsumer() {
   return std::make_unique<ReorderingConsumer>(RecordName, DesiredFieldsOrder,
-                                             Replacements);
+                                              Replacements);
 }

 } // namespace reorder_fields

diff --git a/clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c b/clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c
new file mode 100644
index 0000000000000..ef64350fd08e6
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/FlexibleArrayMember.c
@@ -0,0 +1,10 @@
+// RUN: clang-reorder-fields -record-name Foo -fields-order z,y,x %s -- 2>&1 | FileCheck --check-prefix=CHECK-BAD %s
+// RUN: clang-reorder-fields -record-name Foo -fields-order y,x,z %s -- | FileCheck --check-prefix=CHECK-GOOD %s
+
+// CHECK-BAD: {{^Flexible array member must remain the last field in the struct}}
+
+struct Foo {
+  int x; // CHECK-GOOD: {{^  int y;}}
+  int y; // CHECK-GOOD-NEXT: {{^  int x;}}
+  int z[]; // CHECK-GOOD-NEXT: {{^  int z\[\];}}
+};

From 0ab9ffe1bf557cd3a7ebb0c03beb3cb45dcbe077 Mon Sep 17 00:00:00 2001
From: Steven Wu
Date: Mon, 29 Sep 2025 15:11:09 -0700
Subject: [PATCH 190/878] [CAS][CMake] Fix rhel bots missing symbol failure
 from #114100 (#161283)

Link LLVM_PTHREAD_LIB from LLVMCAS library to fix rhel bots.
---
 llvm/lib/CAS/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt
index cc866f25f3240..7ae5f7e46418e 100644
--- a/llvm/lib/CAS/CMakeLists.txt
+++ b/llvm/lib/CAS/CMakeLists.txt
@@ -12,6 +12,9 @@ add_llvm_component_library(LLVMCAS
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS

+  LINK_LIBS
+  ${LLVM_PTHREAD_LIB}
+
   LINK_COMPONENTS
   Support
 )

From d28c07b7550af47ff7adc068d6078388cdeed61d Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig
Date: Tue, 30 Sep 2025 03:42:55 +0530
Subject: [PATCH 191/878] [clang-scan-deps] Remove const from ModuleDeps loop
 to enable move. (#161109)

This changes the iteration from const to non-const so that std::move
results in a true move rather than a copy.
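As a rough illustration (a minimal sketch with placeholder types, not the
scanner code itself): applying `std::move` through a `const` reference yields
a `const T&&`, which can only bind to the copy constructor, so the "move"
silently degrades to a copy:

    #include <string>
    #include <utility>
    #include <vector>

    std::vector<std::string> Out;

    void drain(std::vector<std::string> &Graph) {
      for (const std::string &S : Graph)
        Out.push_back(std::move(S)); // still copies: S is const
      for (std::string &S : Graph)
        Out.push_back(std::move(S)); // actually moves
    }

Dropping the `const` in the loop binding is what lets the later `std::move`
in mergeDeps take effect.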
---
 clang/tools/clang-scan-deps/ClangScanDeps.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index 0e2758d123edc..e41f4eb7999ae 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -420,7 +420,7 @@ class FullDeps {
     std::vector<ModuleDeps *> NewMDs;
     {
       std::unique_lock<std::mutex> ul(Lock);
-      for (const ModuleDeps &MD : Graph) {
+      for (ModuleDeps &MD : Graph) {
         auto I = Modules.find({MD.ID, 0});
         if (I != Modules.end()) {
           I->first.InputIndex = std::min(I->first.InputIndex, InputIndex);

From 5d739cf4186e333770a11d1376eb2ea947cc70e8 Mon Sep 17 00:00:00 2001
From: Renaud Kauffmann
Date: Mon, 29 Sep 2025 15:32:03 -0700
Subject: [PATCH 192/878] Create function declaration in the proper module
 (#161281)

Using `memref.dealloc` in the gpu module would add a function declaration
for `@free` in the top-level module instead of the gpu module.

The fix is to do what is already done for memref.alloc, which is to use
`op->getParentWithTrait<mlir::OpTrait::SymbolTable>()` instead of
`op->getParentOfType<ModuleOp>()` to create the declaration in the proper
module.
---
 .../Conversion/MemRefToLLVM/MemRefToLLVM.cpp |  8 ++---
 mlir/test/Dialect/GPU/memref-to-llvm.mlir    | 33 +++++++++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)
 create mode 100644 mlir/test/Dialect/GPU/memref-to-llvm.mlir

diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
index 262e0e7a30c63..cc6314cbd1ffe 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
@@ -48,8 +48,8 @@ static bool isStaticStrideOrOffset(int64_t strideOrOffset) {
 }

 static FailureOr<LLVM::LLVMFuncOp>
-getFreeFn(OpBuilder &b, const LLVMTypeConverter *typeConverter, ModuleOp module,
-          SymbolTableCollection *symbolTables) {
+getFreeFn(OpBuilder &b, const LLVMTypeConverter *typeConverter,
+          Operation *module, SymbolTableCollection *symbolTables) {
   bool useGenericFn = typeConverter->getOptions().useGenericFunctions;

   if (useGenericFn)
@@ -483,8 +483,8 @@ class DeallocOpLowering : public ConvertOpToLLVMPattern<memref::DeallocOp> {
                   ConversionPatternRewriter &rewriter) const override {
     // Insert the `free` declaration if it is not already present.
     FailureOr<LLVM::LLVMFuncOp> freeFunc =
-        getFreeFn(rewriter, getTypeConverter(), op->getParentOfType<ModuleOp>(),
-                  symbolTables);
+        getFreeFn(rewriter, getTypeConverter(),
+                  op->getParentWithTrait<mlir::OpTrait::SymbolTable>(), symbolTables);
     if (failed(freeFunc))
       return failure();
     Value allocatedPtr;
diff --git a/mlir/test/Dialect/GPU/memref-to-llvm.mlir b/mlir/test/Dialect/GPU/memref-to-llvm.mlir
new file mode 100644
index 0000000000000..81a96bf29e84f
--- /dev/null
+++ b/mlir/test/Dialect/GPU/memref-to-llvm.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt --convert-to-llvm %s | FileCheck %s
+
+// Checking that malloc and free are declared in the proper module.
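+// (Both the top-level builtin module and the gpu.module carry the
+// SymbolTable trait, so getParentWithTrait resolves to the nearest
+// enclosing symbol table.)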
+
+// CHECK: module attributes {gpu.container_module} {
+// CHECK: llvm.func @free(!llvm.ptr)
+// CHECK: llvm.func @malloc(i64) -> !llvm.ptr
+// CHECK: gpu.module @kernels {
+// CHECK: llvm.func @free(!llvm.ptr)
+// CHECK: llvm.func @malloc(i64) -> !llvm.ptr
+// CHECK: gpu.func @kernel_1
+// CHECK: llvm.call @malloc({{.*}}) : (i64) -> !llvm.ptr
+// CHECK: llvm.call @free({{.*}}) : (!llvm.ptr) -> ()
+// CHECK: gpu.return
+// CHECK: }
+// CHECK: }
+// CHECK: }
+module attributes {gpu.container_module} {
+
+  gpu.module @kernels {
+    gpu.func @kernel_1() kernel {
+      %memref_a = memref.alloc() : memref<8x16xf32>
+      memref.dealloc %memref_a : memref<8x16xf32>
+      gpu.return
+    }
+  }
+
+  func.func @main() {
+    %memref_a = memref.alloc() : memref<8x16xf32>
+    memref.dealloc %memref_a : memref<8x16xf32>
+    return
+  }
+}

From d23f78175ca64ce4b6d92dead490970e64ca2f4c Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Mon, 29 Sep 2025 15:45:48 -0700
Subject: [PATCH 193/878] [lld][WebAssembly] Fix visibility of
 `__stack_pointer` global (#161284)

The stack pointer should be global, not hidden / dso-local. Marking it
as global allows it to be exported from the main module and imported
into side modules.
---
 lld/test/wasm/archive-export.test  |  3 +++
 lld/test/wasm/comdats.ll           |  3 +++
 lld/test/wasm/visibility-hidden.ll |  3 +++
 lld/wasm/Driver.cpp                | 15 ++++++++++-----
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/lld/test/wasm/archive-export.test b/lld/test/wasm/archive-export.test
index 9a76d60d63d91..c67e500e46dd2 100644
--- a/lld/test/wasm/archive-export.test
+++ b/lld/test/wasm/archive-export.test
@@ -14,6 +14,9 @@ CHECK: Exports:
CHECK-NEXT: - Name: memory
CHECK-NEXT: Kind: MEMORY
CHECK-NEXT: Index: 0
+CHECK-NEXT: - Name: __stack_pointer
+CHECK-NEXT: Kind: GLOBAL
+CHECK-NEXT: Index: 0
CHECK-NEXT: - Name: foo
CHECK-NEXT: Kind: FUNCTION
CHECK-NEXT: Index: 1
diff --git a/lld/test/wasm/comdats.ll b/lld/test/wasm/comdats.ll
index 8fc301e9a10e0..2dd687fbad1ef 100644
--- a/lld/test/wasm/comdats.ll
+++ b/lld/test/wasm/comdats.ll
@@ -35,6 +35,9 @@ entry:
; CHECK-NEXT: - Name: memory
; CHECK-NEXT: Kind: MEMORY
; CHECK-NEXT: Index: 0
+; CHECK-NEXT: - Name: __stack_pointer
+; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: Index: 0
; CHECK-NEXT: - Name: _start
; CHECK-NEXT: Kind: FUNCTION
; CHECK-NEXT: Index: 1
diff --git a/lld/test/wasm/visibility-hidden.ll b/lld/test/wasm/visibility-hidden.ll
index 36c29a8e47385..6ed7ba3afdc02 100644
--- a/lld/test/wasm/visibility-hidden.ll
+++ b/lld/test/wasm/visibility-hidden.ll
@@ -43,6 +43,9 @@ entry:
; CHECK-NEXT: - Name: memory
; CHECK-NEXT: Kind: MEMORY
; CHECK-NEXT: Index: 0
+; CHECK-NEXT: - Name: __stack_pointer
+; CHECK-NEXT: Kind: GLOBAL
+; CHECK-NEXT: Index: 0
; CHECK-NEXT: - Name: objectDefault
; CHECK-NEXT: Kind: FUNCTION
; CHECK-NEXT: Index: 1
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 9b85b6c00b26d..46c848d5c1232 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -914,9 +914,10 @@ static InputGlobal *createGlobal(StringRef name, bool isMutable) {
   return make<InputGlobal>(wasmGlobal, nullptr);
 }

-static GlobalSymbol *createGlobalVariable(StringRef name, bool isMutable) {
+static GlobalSymbol *createGlobalVariable(StringRef name, bool isMutable,
+                                          uint32_t flags = 0) {
   InputGlobal *g = createGlobal(name, isMutable);
-  return symtab->addSyntheticGlobal(name, WASM_SYMBOL_VISIBILITY_HIDDEN, g);
+  return symtab->addSyntheticGlobal(name, flags, g);
 }

 static GlobalSymbol *createOptionalGlobal(StringRef name, bool isMutable) {
@@ -966,9 +967,13 @@ static void 
createSyntheticSymbols() {
 }

   if (ctx.arg.sharedMemory) {
-    ctx.sym.tlsBase = createGlobalVariable("__tls_base", true);
-    ctx.sym.tlsSize = createGlobalVariable("__tls_size", false);
-    ctx.sym.tlsAlign = createGlobalVariable("__tls_align", false);
+    // TLS symbols are all hidden/dso-local
+    ctx.sym.tlsBase =
+        createGlobalVariable("__tls_base", true, WASM_SYMBOL_VISIBILITY_HIDDEN);
+    ctx.sym.tlsSize = createGlobalVariable("__tls_size", false,
+                                           WASM_SYMBOL_VISIBILITY_HIDDEN);
+    ctx.sym.tlsAlign = createGlobalVariable("__tls_align", false,
+                                            WASM_SYMBOL_VISIBILITY_HIDDEN);
     ctx.sym.initTLS = symtab->addSyntheticFunction(
         "__wasm_init_tls", WASM_SYMBOL_VISIBILITY_HIDDEN,
         make<SyntheticFunction>(is64 ? i64ArgSignature : i32ArgSignature,

From 1d614a9702973aa9b099a61a6a5992c1de1d8de1 Mon Sep 17 00:00:00 2001
From: Andy Kaylor
Date: Mon, 29 Sep 2025 16:11:33 -0700
Subject: [PATCH 194/878] [CIR] Add GlobalOp ctor and dtor regions (#160779)

This adds support for ctor and dtor regions in cir::GlobalOp. These
regions are used to capture the code that initializes and cleans up the
variable, keeping this initialization and cleanup code with the
variable definition.

This change only adds the CIR dialect support for these regions.
Support for generating the code in these regions from source and
lowering these to LLVM IR will be added in a later change, as will
LoweringPrepare support to move the code into the
__cxx_global_var_init() function.
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td |  39 ++++--
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp      | 128 ++++++++++++++++---
 clang/test/CIR/IR/global-init.cir            |  48 +++++++
 3 files changed, 182 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/CIR/IR/global-init.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index bb394440bf8d8..e1be08c1bbbbd 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -683,8 +683,8 @@ def CIR_ConditionOp : CIR_Op<"condition", [
 //===----------------------------------------------------------------------===//

 defvar CIR_YieldableScopes = [
-  "ArrayCtor", "ArrayDtor", "CaseOp", "DoWhileOp", "ForOp", "IfOp", "ScopeOp",
-  "SwitchOp", "TernaryOp", "WhileOp"
+  "ArrayCtor", "ArrayDtor", "CaseOp", "DoWhileOp", "ForOp", "GlobalOp", "IfOp",
+  "ScopeOp", "SwitchOp", "TernaryOp", "WhileOp"
 ];

 def CIR_YieldOp : CIR_Op<"yield", [
@@ -1776,7 +1776,9 @@ def CIR_GlobalLinkageKind : CIR_I32EnumAttr<
 // is upstreamed.

 def CIR_GlobalOp : CIR_Op<"global", [
-  DeclareOpInterfaceMethods<CIRGlobalValueInterface>
+  DeclareOpInterfaceMethods<RegionBranchOpInterface>,
+  DeclareOpInterfaceMethods<CIRGlobalValueInterface>,
+  NoRegionArguments
 ]> {
   let summary = "Declare or define a global variable";
   let description = [{
@@ -1807,6 +1809,9 @@ def CIR_GlobalOp : CIR_Op<"global", [
                        UnitAttr:$dso_local,
                        OptionalAttr<I64Attr>:$alignment);

+  let regions = (region MaxSizedRegion<1>:$ctorRegion,
+                        MaxSizedRegion<1>:$dtorRegion);
+
   let assemblyFormat = [{
     ($sym_visibility^)?
     (`` $global_visibility^)?
    (`constant` $constant^)?
    $linkage
    (`comdat` $comdat^)?
    (`dso_local` $dso_local^)?
$sym_name - custom($sym_type, $initial_value) + custom($sym_type, $initial_value, + $ctorRegion, $dtorRegion) attr-dict }]; let extraClassDeclaration = [{ - bool isDeclaration() { return !getInitialValue(); } + bool isDeclaration() { + return !getInitialValue() && getCtorRegion().empty() && getDtorRegion().empty(); + } bool hasInitializer() { return !isDeclaration(); } }]; let skipDefaultBuilders = 1; - let builders = [OpBuilder<(ins - "llvm::StringRef":$sym_name, - "mlir::Type":$sym_type, - CArg<"bool", "false">:$isConstant, - // CIR defaults to external linkage. - CArg<"cir::GlobalLinkageKind", - "cir::GlobalLinkageKind::ExternalLinkage">:$linkage)>]; + let builders = [ + OpBuilder<(ins + "llvm::StringRef":$sym_name, + "mlir::Type":$sym_type, + CArg<"bool", "false">:$isConstant, + // CIR defaults to external linkage. + CArg<"cir::GlobalLinkageKind", + "cir::GlobalLinkageKind::ExternalLinkage">:$linkage, + CArg<"llvm::function_ref", + "nullptr">:$ctorBuilder, + CArg<"llvm::function_ref", + "nullptr">:$dtorBuilder) + > + ]; let hasVerifier = 1; diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 58ef500446aa7..fb87036fdfe21 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1355,9 +1355,11 @@ mlir::LogicalResult cir::GlobalOp::verify() { return success(); } -void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState, - llvm::StringRef sym_name, mlir::Type sym_type, - bool isConstant, cir::GlobalLinkageKind linkage) { +void cir::GlobalOp::build( + OpBuilder &odsBuilder, OperationState &odsState, llvm::StringRef sym_name, + mlir::Type sym_type, bool isConstant, cir::GlobalLinkageKind linkage, + function_ref ctorBuilder, + function_ref dtorBuilder) { odsState.addAttribute(getSymNameAttrName(odsState.name), odsBuilder.getStringAttr(sym_name)); odsState.addAttribute(getSymTypeAttrName(odsState.name), @@ -1370,26 +1372,88 @@ void cir::GlobalOp::build(OpBuilder &odsBuilder, OperationState &odsState, cir::GlobalLinkageKindAttr::get(odsBuilder.getContext(), linkage); odsState.addAttribute(getLinkageAttrName(odsState.name), linkageAttr); + Region *ctorRegion = odsState.addRegion(); + if (ctorBuilder) { + odsBuilder.createBlock(ctorRegion); + ctorBuilder(odsBuilder, odsState.location); + } + + Region *dtorRegion = odsState.addRegion(); + if (dtorBuilder) { + odsBuilder.createBlock(dtorRegion); + dtorBuilder(odsBuilder, odsState.location); + } + odsState.addAttribute(getGlobalVisibilityAttrName(odsState.name), cir::VisibilityAttr::get(odsBuilder.getContext())); } +/// Given the region at `index`, or the parent operation if `index` is None, +/// return the successor regions. These are the regions that may be selected +/// during the flow of control. `operands` is a set of optional attributes that +/// correspond to a constant value for each operand, or null if that operand is +/// not a constant. +void cir::GlobalOp::getSuccessorRegions( + mlir::RegionBranchPoint point, SmallVectorImpl ®ions) { + // The `ctor` and `dtor` regions always branch back to the parent operation. + if (!point.isParent()) { + regions.push_back(RegionSuccessor()); + return; + } + + // Don't consider the ctor region if it is empty. + Region *ctorRegion = &this->getCtorRegion(); + if (ctorRegion->empty()) + ctorRegion = nullptr; + + // Don't consider the dtor region if it is empty. 
+  Region *dtorRegion = &this->getDtorRegion();
+  if (dtorRegion->empty())
+    dtorRegion = nullptr;
+
+  // Any non-empty ctor or dtor region may be executed.
+  if (ctorRegion)
+    regions.push_back(RegionSuccessor(ctorRegion));
+  if (dtorRegion)
+    regions.push_back(RegionSuccessor(dtorRegion));
+}
+
 static void printGlobalOpTypeAndInitialValue(OpAsmPrinter &p, cir::GlobalOp op,
-                                             TypeAttr type,
-                                             Attribute initAttr) {
+                                             TypeAttr type, Attribute initAttr,
+                                             mlir::Region &ctorRegion,
+                                             mlir::Region &dtorRegion) {
+  auto printType = [&]() { p << ": " << type; };
   if (!op.isDeclaration()) {
     p << "= ";
-    // This also prints the type...
-    if (initAttr)
-      printConstant(p, initAttr);
+    if (!ctorRegion.empty()) {
+      p << "ctor ";
+      printType();
+      p << " ";
+      p.printRegion(ctorRegion,
+                    /*printEntryBlockArgs=*/false,
+                    /*printBlockTerminators=*/false);
+    } else {
+      // This also prints the type...
+      if (initAttr)
+        printConstant(p, initAttr);
+    }
+
+    if (!dtorRegion.empty()) {
+      p << " dtor ";
+      p.printRegion(dtorRegion,
+                    /*printEntryBlockArgs=*/false,
+                    /*printBlockTerminators=*/false);
+    }
   } else {
-    p << ": " << type;
+    printType();
   }
 }
 
-static ParseResult
-parseGlobalOpTypeAndInitialValue(OpAsmParser &parser, TypeAttr &typeAttr,
-                                 Attribute &initialValueAttr) {
+static ParseResult parseGlobalOpTypeAndInitialValue(OpAsmParser &parser,
+                                                    TypeAttr &typeAttr,
+                                                    Attribute &initialValueAttr,
+                                                    mlir::Region &ctorRegion,
+                                                    mlir::Region &dtorRegion) {
   mlir::Type opTy;
   if (parser.parseOptionalEqual().failed()) {
     // Absence of equal means a declaration, so we need to parse the type.
@@ -1397,16 +1461,38 @@ parseGlobalOpTypeAndInitialValue(OpAsmParser &parser, TypeAttr &typeAttr,
     if (parser.parseColonType(opTy))
       return failure();
   } else {
-    // Parse constant with initializer, examples:
-    //  cir.global @y = #cir.fp<1.250000e+00> : !cir.double
-    //  cir.global @rgb = #cir.const_array<[...] : !cir.array>
-    if (parseConstantValue(parser, initialValueAttr).failed())
-      return failure();
+    // Parse constructor, example:
+    //  cir.global @rgb = ctor : type { ... }
+    if (!parser.parseOptionalKeyword("ctor")) {
+      if (parser.parseColonType(opTy))
+        return failure();
+      auto parseLoc = parser.getCurrentLocation();
+      if (parser.parseRegion(ctorRegion, /*arguments=*/{}, /*argTypes=*/{}))
+        return failure();
+      if (ensureRegionTerm(parser, ctorRegion, parseLoc).failed())
+        return failure();
+    } else {
+      // Parse constant with initializer, examples:
+      //  cir.global @y = 3.400000e+00 : f32
+      //  cir.global @rgb = #cir.const_array<[...] : !cir.array>
+      if (parseConstantValue(parser, initialValueAttr).failed())
+        return failure();
+
+      assert(mlir::isa(initialValueAttr) &&
+             "Non-typed attrs shouldn't appear here.");
+      auto typedAttr = mlir::cast(initialValueAttr);
+      opTy = typedAttr.getType();
+    }
 
-    assert(mlir::isa(initialValueAttr) &&
-           "Non-typed attrs shouldn't appear here.");
-    auto typedAttr = mlir::cast(initialValueAttr);
-    opTy = typedAttr.getType();
+    // Parse destructor, example:
+    //   dtor { ... 
}
+    if (!parser.parseOptionalKeyword("dtor")) {
+      auto parseLoc = parser.getCurrentLocation();
+      if (parser.parseRegion(dtorRegion, /*arguments=*/{}, /*argTypes=*/{}))
+        return failure();
+      if (ensureRegionTerm(parser, dtorRegion, parseLoc).failed())
+        return failure();
+    }
   }
 
   typeAttr = TypeAttr::get(opTy);
diff --git a/clang/test/CIR/IR/global-init.cir b/clang/test/CIR/IR/global-init.cir
new file mode 100644
index 0000000000000..727c067e25472
--- /dev/null
+++ b/clang/test/CIR/IR/global-init.cir
@@ -0,0 +1,48 @@
+// RUN: cir-opt --verify-roundtrip %s -o - | FileCheck %s
+
+!u8i = !cir.int
+
+!rec_NeedsCtor = !cir.record
+!rec_NeedsDtor = !cir.record
+!rec_NeedsCtorDtor = !cir.record
+
+module attributes {cir.triple = "x86_64-unknown-linux-gnu"} {
+  cir.func private @_ZN9NeedsCtorC1Ev(!cir.ptr)
+  cir.global external @needsCtor = ctor : !rec_NeedsCtor {
+    %0 = cir.get_global @needsCtor : !cir.ptr
+    cir.call @_ZN9NeedsCtorC1Ev(%0) : (!cir.ptr) -> ()
+  }
+  // CHECK: cir.global external @needsCtor = ctor : !rec_NeedsCtor {
+  // CHECK:   %0 = cir.get_global @needsCtor : !cir.ptr
+  // CHECK:   cir.call @_ZN9NeedsCtorC1Ev(%0) : (!cir.ptr) -> ()
+  // CHECK: }
+
+  cir.func private @_ZN9NeedsDtorD1Ev(!cir.ptr)
+  cir.global external dso_local @needsDtor = #cir.zero : !rec_NeedsDtor dtor {
+    %0 = cir.get_global @needsDtor : !cir.ptr
+    cir.call @_ZN9NeedsDtorD1Ev(%0) : (!cir.ptr) -> ()
+  }
+  // CHECK: cir.global external dso_local @needsDtor = #cir.zero : !rec_NeedsDtor dtor {
+  // CHECK:   %0 = cir.get_global @needsDtor : !cir.ptr
+  // CHECK:   cir.call @_ZN9NeedsDtorD1Ev(%0) : (!cir.ptr) -> ()
+  // CHECK: }
+
+  cir.func private @_ZN13NeedsCtorDtorC1Ev(!cir.ptr)
+  cir.func private @_ZN13NeedsCtorDtorD1Ev(!cir.ptr)
+  cir.global external dso_local @needsCtorDtor = ctor : !rec_NeedsCtorDtor {
+    %0 = cir.get_global @needsCtorDtor : !cir.ptr
+    cir.call @_ZN13NeedsCtorDtorC1Ev(%0) : (!cir.ptr) -> ()
+  } dtor {
+    %0 = cir.get_global @needsCtorDtor : !cir.ptr
+    cir.call @_ZN13NeedsCtorDtorD1Ev(%0) : (!cir.ptr) -> ()
+  }
+  // CHECK: cir.func private @_ZN13NeedsCtorDtorC1Ev(!cir.ptr)
+  // CHECK: cir.func private @_ZN13NeedsCtorDtorD1Ev(!cir.ptr)
+  // CHECK: cir.global external dso_local @needsCtorDtor = ctor : !rec_NeedsCtorDtor {
+  // CHECK:   %0 = cir.get_global @needsCtorDtor : !cir.ptr
+  // CHECK:   cir.call @_ZN13NeedsCtorDtorC1Ev(%0) : (!cir.ptr) -> ()
+  // CHECK: } dtor {
+  // CHECK:   %0 = cir.get_global @needsCtorDtor : !cir.ptr
+  // CHECK:   cir.call @_ZN13NeedsCtorDtorD1Ev(%0) : (!cir.ptr) -> ()
+  // CHECK: }
+}

From 7166bc7dbf299b92de193424edfb8d1841fd1ea0 Mon Sep 17 00:00:00 2001
From: "Oleksandr T."
Date: Tue, 30 Sep 2025 02:16:01 +0300
Subject: [PATCH 195/878] [Clang] Avoid null deref in lambda attribute compat
 warning (#161096)

Fixes #161070

---

This PR addresses the issue in `ext_decl_attrs_on_lambda` by using
`%0` = _selector_ and `%1` = _attribute name_, which prevents a null
`IdentifierInfo*`.
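
A rough reproducer, adapted from the new GH161070 tests below (the
wrapper function is illustrative only): before this change, emitting the
pre-C++23 warning for this input dereferenced a null `IdentifierInfo*`,
since `[[` carries no identifier token.

    void repro() {
      int a = [] [[assume(true)]]; // expected: extension warning, no crash
    }
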
https://github.com/llvm/llvm-project/blob/48a6f2f85c8269d8326c185016801a4eb8d5dfd6/clang/lib/Parse/ParseExprCXX.cpp#L1299-L1302 https://github.com/llvm/llvm-project/blob/48a6f2f85c8269d8326c185016801a4eb8d5dfd6/clang/include/clang/Basic/DiagnosticParseKinds.td#L1143-L1145 https://github.com/llvm/llvm-project/blob/48a6f2f85c8269d8326c185016801a4eb8d5dfd6/clang/include/clang/Basic/DiagnosticParseKinds.td#L1149-L1152 --- clang/docs/ReleaseNotes.rst | 1 + .../clang/Basic/DiagnosticParseKinds.td | 2 +- clang/lib/Parse/ParseExprCXX.cpp | 2 +- clang/test/Parser/cxx2b-lambdas-ext-warns.cpp | 32 +++++++++++++++---- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6521fc3e9a9da..79dc0b2728e98 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -434,6 +434,7 @@ Bug Fixes to C++ Support - Suppress ``-Wdouble-promotion`` when explicitly asked for with C++ list initialization (#GH33409). - Fix the result of `__builtin_is_implicit_lifetime` for types with a user-provided constructor. (#GH160610) - Correctly deduce return types in ``decltype`` expressions. (#GH160497) (#GH56652) (#GH116319) (#GH161196) +- Fixed a crash in the pre-C++23 warning for attributes before a lambda declarator (#GH161070). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 4d9e123eb4ef1..c724136a7fdaf 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1141,7 +1141,7 @@ def warn_cxx23_compat_binding_pack : Warning< def err_capture_default_first : Error< "capture default must be first">; def ext_decl_attrs_on_lambda : ExtWarn< - "%select{an attribute specifier sequence|%0}1 in this position " + "%select{an attribute specifier sequence|%1}0 in this position " "is a C++23 extension">, InGroup; def ext_lambda_missing_parens : ExtWarn< "lambda without a parameter clause is a C++23 extension">, diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 8605ba2cdb49b..a2c69578d5087 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -1299,7 +1299,7 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( Diag(Tok, getLangOpts().CPlusPlus23 ? 
diag::warn_cxx20_compat_decl_attrs_on_lambda : diag::ext_decl_attrs_on_lambda) - << Tok.getIdentifierInfo() << Tok.isRegularKeywordAttribute(); + << Tok.isRegularKeywordAttribute() << Tok.getIdentifierInfo(); MaybeParseCXX11Attributes(D); } diff --git a/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp b/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp index 7ffb7aae9d391..8c7a77815d47c 100644 --- a/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp +++ b/clang/test/Parser/cxx2b-lambdas-ext-warns.cpp @@ -1,9 +1,7 @@ -// RUN: %clang_cc1 -std=c++20 %s -verify=cxx20 -// RUN: %clang_cc1 -std=c++23 %s -verify=cxx23 -// RUN: %clang_cc1 -std=c++23 -Wpre-c++23-compat %s -verify=precxx23 -// RUN: %clang_cc1 -std=c++23 -pedantic %s -verify=cxx23 - -//cxx23-no-diagnostics +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++20 %s -verify=cxx20 +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++23 %s -verify=cxx23 +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++23 -Wpre-c++23-compat %s -verify=precxx23 +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -target-feature +sme -std=c++23 -pedantic %s -verify=cxx23 auto L1 = [] constexpr {}; // cxx20-warning@-1 {{lambda without a parameter clause is a C++23 extension}} @@ -14,3 +12,25 @@ auto L3 = [] static {}; // cxx20-warning@-1 {{lambda without a parameter clause is a C++23 extension}} // cxx20-warning@-2 {{static lambdas are a C++23 extension}} // precxx23-warning@-3 {{static lambdas are incompatible with C++ standards before C++23}} + +namespace GH161070 { +void t1() { int a = [] __arm_streaming; } +// precxx23-error@-1 {{'__arm_streaming' cannot be applied to a declaration}} +// precxx23-error@-2 {{expected body of lambda expression}} +// cxx23-error@-3 {{'__arm_streaming' cannot be applied to a declaration}} +// cxx23-error@-4 {{expected body of lambda expression}} +// cxx20-error@-5 {{'__arm_streaming' cannot be applied to a declaration}} +// cxx20-error@-6 {{expected body of lambda expression}} +// cxx20-warning@-7 {{'__arm_streaming' in this position is a C++23 extension}} +// precxx23-warning@-8 {{'__arm_streaming' in this position is incompatible with C++ standards before C++23}} + +void t2() { int a = [] [[assume(true)]]; } +// precxx23-error@-1 {{'assume' attribute cannot be applied to a declaration}} +// precxx23-error@-2 {{expected body of lambda expression}} +// cxx23-error@-3 {{'assume' attribute cannot be applied to a declaration}} +// cxx23-error@-4 {{expected body of lambda expression}} +// cxx20-error@-5 {{'assume' attribute cannot be applied to a declaration}} +// cxx20-error@-6 {{expected body of lambda expression}} +// cxx20-warning@-7 {{an attribute specifier sequence in this position is a C++23 extension}} +// precxx23-warning@-8 {{an attribute specifier sequence in this position is incompatible with C++ standards before C++23}} +} From 18136c249610ce8bb1e4b2b543a605b1221730bd Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 29 Sep 2025 16:28:40 -0700 Subject: [PATCH 196/878] [lld][macho][NFC] Add release note for #158720 (#161295) I forgot to add a release note for https://github.com/llvm/llvm-project/pull/158720 so I'll add it here. 
--- lld/docs/ReleaseNotes.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 6ea1ea0fd6c2f..566dde6e08115 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -44,6 +44,9 @@ MinGW Improvements MachO Improvements ------------------ +* ``--separate-cstring-literal-sections`` emits cstring literal sections into sections defined by their section name. + (`#158720 `_) + WebAssembly Improvements ------------------------ From 1fcf481631b5521732d99963bcb69a81d19e9f79 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 29 Sep 2025 17:00:00 -0700 Subject: [PATCH 197/878] [llvm][mustache] Support setting delimiters in templates (#159187) The base mustache spec allows setting custom delimiters, which slightly change parsing of partials. This patch implements that feature by adding a new token type, and changing the tokenizer's behavior to allow setting custom delimiters. --- llvm/lib/Support/Mustache.cpp | 210 ++++++++++++------ llvm/unittests/Support/MustacheTest.cpp | 22 +- .../llvm-test-mustache-spec.cpp | 16 -- 3 files changed, 158 insertions(+), 90 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 8da6fdb7beff9..9834bf6f9361f 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -7,9 +7,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Mustache.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #include +#define DEBUG_TYPE "mustache" + using namespace llvm; using namespace llvm::mustache; @@ -62,6 +66,7 @@ class Token { InvertSectionOpen, UnescapeVariable, Comment, + SetDelimiter, }; Token(std::string Str) @@ -102,6 +107,8 @@ class Token { return Type::Partial; case '&': return Type::UnescapeVariable; + case '=': + return Type::SetDelimiter; default: return Type::Variable; } @@ -189,27 +196,27 @@ class ASTNode { }; // A wrapper for arena allocator for ASTNodes -AstPtr createRootNode(llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes) { +static AstPtr createRootNode(llvm::StringMap &Partials, + llvm::StringMap &Lambdas, + llvm::StringMap &SectionLambdas, + EscapeMap &Escapes) { return std::make_unique(Partials, Lambdas, SectionLambdas, Escapes); } -AstPtr createNode(ASTNode::Type T, Accessor A, ASTNode *Parent, - llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes) { +static AstPtr createNode(ASTNode::Type T, Accessor A, ASTNode *Parent, + llvm::StringMap &Partials, + llvm::StringMap &Lambdas, + llvm::StringMap &SectionLambdas, + EscapeMap &Escapes) { return std::make_unique(T, std::move(A), Parent, Partials, Lambdas, SectionLambdas, Escapes); } -AstPtr createTextNode(std::string Body, ASTNode *Parent, - llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes) { +static AstPtr createTextNode(std::string Body, ASTNode *Parent, + llvm::StringMap &Partials, + llvm::StringMap &Lambdas, + llvm::StringMap &SectionLambdas, + EscapeMap &Escapes) { return std::make_unique(std::move(Body), Parent, Partials, Lambdas, SectionLambdas, Escapes); } @@ -226,7 +233,7 @@ AstPtr createTextNode(std::string Body, ASTNode *Parent, // and the current token is the second token. 
// For example: // "{{#Section}}" -bool hasTextBehind(size_t Idx, const ArrayRef &Tokens) { +static bool hasTextBehind(size_t Idx, const ArrayRef &Tokens) { if (Idx == 0) return true; @@ -242,7 +249,7 @@ bool hasTextBehind(size_t Idx, const ArrayRef &Tokens) { // Function to check if there's no meaningful text ahead. // We determine if a token has text ahead if the left of previous // token does not start with a newline. -bool hasTextAhead(size_t Idx, const ArrayRef &Tokens) { +static bool hasTextAhead(size_t Idx, const ArrayRef &Tokens) { if (Idx >= Tokens.size() - 1) return true; @@ -255,11 +262,11 @@ bool hasTextAhead(size_t Idx, const ArrayRef &Tokens) { return !TokenBody.starts_with("\r\n") && !TokenBody.starts_with("\n"); } -bool requiresCleanUp(Token::Type T) { +static bool requiresCleanUp(Token::Type T) { // We must clean up all the tokens that could contain child nodes. return T == Token::Type::SectionOpen || T == Token::Type::InvertSectionOpen || T == Token::Type::SectionClose || T == Token::Type::Comment || - T == Token::Type::Partial; + T == Token::Type::Partial || T == Token::Type::SetDelimiter; } // Adjust next token body if there is no text ahead. @@ -268,7 +275,7 @@ bool requiresCleanUp(Token::Type T) { // "{{! Comment }} \nLine 2" // would be considered as no text ahead and should be rendered as // " Line 2" -void stripTokenAhead(SmallVectorImpl &Tokens, size_t Idx) { +static void stripTokenAhead(SmallVectorImpl &Tokens, size_t Idx) { Token &NextToken = Tokens[Idx + 1]; StringRef NextTokenBody = NextToken.TokenBody; // Cut off the leading newline which could be \n or \r\n. @@ -294,57 +301,128 @@ void stripTokenBefore(SmallVectorImpl &Tokens, size_t Idx, CurrentToken.setIndentation(Indentation); } +struct Tag { + enum class Kind { + None, + Normal, // {{...}} + Triple, // {{{...}}} + }; + + Kind TagKind = Kind::None; + StringRef Content; // The content between the delimiters. + StringRef FullMatch; // The entire tag, including delimiters. + size_t StartPosition = StringRef::npos; +}; + +static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, + StringRef Close) { + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); + + size_t NormalOpenPos = Template.find(Open, StartPos); + size_t TripleOpenPos = Template.find(TripleOpen, StartPos); + + Tag Result; + + // Determine which tag comes first. + if (TripleOpenPos != StringRef::npos && + (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { + // Found a triple mustache tag. + size_t EndPos = + Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); + if (EndPos == StringRef::npos) + return Result; // No closing tag found. + + Result.TagKind = Tag::Kind::Triple; + Result.StartPosition = TripleOpenPos; + size_t ContentStart = TripleOpenPos + TripleOpen.size(); + Result.Content = Template.substr(ContentStart, EndPos - ContentStart); + Result.FullMatch = Template.substr( + TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); + } else if (NormalOpenPos != StringRef::npos) { + // Found a normal mustache tag. + size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); + if (EndPos == StringRef::npos) + return Result; // No closing tag found. 
+ + Result.TagKind = Tag::Kind::Normal; + Result.StartPosition = NormalOpenPos; + size_t ContentStart = NormalOpenPos + Open.size(); + Result.Content = Template.substr(ContentStart, EndPos - ContentStart); + Result.FullMatch = + Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); + } + + return Result; +} + +static void processTag(const Tag &T, SmallVectorImpl &Tokens, + SmallString<8> &Open, SmallString<8> &Close) { + LLVM_DEBUG(dbgs() << " Found tag: \"" << T.FullMatch << "\", Content: \"" + << T.Content << "\"\n"); + if (T.TagKind == Tag::Kind::Triple) { + Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); + LLVM_DEBUG(dbgs() << " Created UnescapeVariable token.\n"); + return; + } + StringRef Interpolated = T.Content; + std::string RawBody = T.FullMatch.str(); + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(RawBody, Interpolated.str(), Front); + LLVM_DEBUG(dbgs() << " Created tag token of type '" << Front << "'\n"); + return; + } + Tokens.emplace_back(RawBody, Interpolated.str(), '='); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + Open = NewOpen; + Close = NewClose; + + LLVM_DEBUG(dbgs() << " Found Set Delimiter tag. NewOpen='" << Open + << "', NewClose='" << Close << "'\n"); +} + // Simple tokenizer that splits the template into tokens. // The mustache spec allows {{{ }}} to unescape variables, // but we don't support that here. An unescape variable // is represented only by {{& variable}}. -SmallVector tokenize(StringRef Template) { +static SmallVector tokenize(StringRef Template) { + LLVM_DEBUG(dbgs() << "Tokenizing template: \"" << Template << "\"\n"); SmallVector Tokens; - StringLiteral Open("{{"); - StringLiteral Close("}}"); - StringLiteral TripleOpen("{{{"); - StringLiteral TripleClose("}}}"); + SmallString<8> Open("{{"); + SmallString<8> Close("}}"); size_t Start = 0; - size_t DelimiterStart = Template.find(Open); - if (DelimiterStart == StringRef::npos) { - Tokens.emplace_back(Template.str()); - return Tokens; - } - while (DelimiterStart != StringRef::npos) { - if (DelimiterStart != Start) - Tokens.emplace_back(Template.substr(Start, DelimiterStart - Start).str()); - - if (Template.substr(DelimiterStart).starts_with(TripleOpen)) { - size_t DelimiterEnd = Template.find(TripleClose, DelimiterStart); - if (DelimiterEnd == StringRef::npos) - break; - size_t BodyStart = DelimiterStart + TripleOpen.size(); - std::string Body = - Template.substr(BodyStart, DelimiterEnd - BodyStart).str(); - std::string RawBody = - Template.substr(DelimiterStart, DelimiterEnd - DelimiterStart + 3) - .str(); - Tokens.emplace_back(RawBody, "&" + Body, '&'); - Start = DelimiterEnd + TripleClose.size(); - } else { - size_t DelimiterEnd = Template.find(Close, DelimiterStart); - if (DelimiterEnd == StringRef::npos) - break; - - // Extract the Interpolated variable without delimiters. 
- size_t InterpolatedStart = DelimiterStart + Open.size(); - size_t InterpolatedEnd = DelimiterEnd - DelimiterStart - Close.size(); - std::string Interpolated = - Template.substr(InterpolatedStart, InterpolatedEnd).str(); - std::string RawBody = Open.str() + Interpolated + Close.str(); - Tokens.emplace_back(RawBody, Interpolated, Interpolated[0]); - Start = DelimiterEnd + Close.size(); + + while (Start < Template.size()) { + LLVM_DEBUG(dbgs() << "Loop start. Start=" << Start << ", Open='" << Open + << "', Close='" << Close << "'\n"); + Tag T = findNextTag(Template, Start, Open, Close); + + if (T.TagKind == Tag::Kind::None) { + // No more tags, the rest is text. + Tokens.emplace_back(Template.substr(Start).str()); + LLVM_DEBUG(dbgs() << " No more tags. Created final Text token: \"" + << Template.substr(Start) << "\"\n"); + break; + } + + // Add the text before the tag. + if (T.StartPosition > Start) { + StringRef Text = Template.substr(Start, T.StartPosition - Start); + Tokens.emplace_back(Text.str()); + LLVM_DEBUG(dbgs() << " Created Text token: \"" << Text << "\"\n"); } - DelimiterStart = Template.find(Open, Start); - } - if (Start < Template.size()) - Tokens.emplace_back(Template.substr(Start).str()); + processTag(T, Tokens, Open, Close); + + // Move past the tag. + Start = T.StartPosition + T.FullMatch.size(); + } // Fix up white spaces for: // - open sections @@ -386,6 +464,7 @@ SmallVector tokenize(StringRef Template) { if ((!HasTextBehind && !HasTextAhead) || (!HasTextBehind && Idx == LastIdx)) stripTokenBefore(Tokens, Idx, CurrentToken, CurrentType); } + LLVM_DEBUG(dbgs() << "Tokenizing finished.\n"); return Tokens; } @@ -563,13 +642,14 @@ void Parser::parseMustache(ASTNode *Parent, llvm::StringMap &Partials, break; } case Token::Type::Comment: + case Token::Type::SetDelimiter: break; case Token::Type::SectionClose: return; } } } -void toMustacheString(const json::Value &Data, raw_ostream &OS) { +static void toMustacheString(const json::Value &Data, raw_ostream &OS) { switch (Data.kind()) { case json::Value::Null: return; @@ -602,6 +682,8 @@ void toMustacheString(const json::Value &Data, raw_ostream &OS) { } void ASTNode::render(const json::Value &CurrentCtx, raw_ostream &OS) { + if (Ty != Root && Ty != Text && AccessorValue.empty()) + return; // Set the parent context to the incoming context so that we // can walk up the context tree correctly in findContext(). 
ParentContext = &CurrentCtx; @@ -801,3 +883,5 @@ Template &Template::operator=(Template &&Other) noexcept { return *this; } } // namespace llvm::mustache + +#undef DEBUG_TYPE diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp index 0ebbc58e023cc..83f6e9afd1e71 100644 --- a/llvm/unittests/Support/MustacheTest.cpp +++ b/llvm/unittests/Support/MustacheTest.cpp @@ -1335,7 +1335,7 @@ TEST(MustacheDelimiters, PairBehavior) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("(Hey!)", Out); + EXPECT_EQ("(Hey!)", Out); } TEST(MustacheDelimiters, SpecialCharacters) { @@ -1344,7 +1344,7 @@ TEST(MustacheDelimiters, SpecialCharacters) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("(It worked!)", Out); + EXPECT_EQ("(It worked!)", Out); } TEST(MustacheDelimiters, Sections) { @@ -1355,7 +1355,7 @@ TEST(MustacheDelimiters, Sections) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " + EXPECT_EQ("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " "interpolated.\n]\n", Out); } @@ -1368,7 +1368,7 @@ TEST(MustacheDelimiters, InvertedSections) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " + EXPECT_EQ("[\n I got interpolated.\n |data|\n\n {{data}}\n I got " "interpolated.\n]\n", Out); } @@ -1380,7 +1380,7 @@ TEST(MustacheDelimiters, PartialInheritence) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[ .yes. ]\n[ .yes. ]\n", Out); + EXPECT_EQ("[ .yes. ]\n[ .yes. ]\n", Out); } TEST(MustacheDelimiters, PostPartialBehavior) { @@ -1390,7 +1390,7 @@ TEST(MustacheDelimiters, PostPartialBehavior) { std::string Out; raw_string_ostream OS(Out); T.render(D, OS); - EXPECT_NE("[ .yes. .yes. ]\n[ .yes. .|value|. ]\n", Out); + EXPECT_EQ("[ .yes. .yes. ]\n[ .yes. .|value|. 
]\n", Out);
 }
 
 TEST(MustacheDelimiters, SurroundingWhitespace) {
@@ -1417,7 +1417,7 @@ TEST(MustacheDelimiters, StandaloneTag) {
   std::string Out;
   raw_string_ostream OS(Out);
   T.render(D, OS);
-  EXPECT_NE("Begin.\nEnd.\n", Out);
+  EXPECT_EQ("Begin.\nEnd.\n", Out);
 }
 
 TEST(MustacheDelimiters, IndentedStandaloneTag) {
@@ -1426,7 +1426,7 @@ TEST(MustacheDelimiters, IndentedStandaloneTag) {
   std::string Out;
   raw_string_ostream OS(Out);
   T.render(D, OS);
-  EXPECT_NE("Begin.\nEnd.\n", Out);
+  EXPECT_EQ("Begin.\nEnd.\n", Out);
 }
 
 TEST(MustacheDelimiters, StandaloneLineEndings) {
@@ -1435,7 +1435,7 @@ TEST(MustacheDelimiters, StandaloneLineEndings) {
   std::string Out;
   raw_string_ostream OS(Out);
   T.render(D, OS);
-  EXPECT_NE("|\r\n|", Out);
+  EXPECT_EQ("|\r\n|", Out);
 }
 
 TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) {
@@ -1444,7 +1444,7 @@ TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) {
   std::string Out;
   raw_string_ostream OS(Out);
   T.render(D, OS);
-  EXPECT_NE("=", Out);
+  EXPECT_EQ("=", Out);
 }
 
 TEST(MustacheDelimiters, StandaloneWithoutNewline) {
@@ -1453,7 +1453,7 @@ TEST(MustacheDelimiters, StandaloneWithoutNewline) {
   std::string Out;
   raw_string_ostream OS(Out);
   T.render(D, OS);
-  EXPECT_NE("=\n", Out);
+  EXPECT_EQ("=\n", Out);
 }
 
 TEST(MustacheDelimiters, PairwithPadding) {
diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
index ea1395b2646f6..bdcef376547fb 100644
--- a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
+++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
@@ -54,20 +54,6 @@ static int NumXFail = 0;
 static int NumSuccess = 0;
 
 static const StringMap> XFailTestNames = {{
-    {"delimiters.json",
-     {
-         "Pair Behavior",
-         "Special Characters",
-         "Sections",
-         "Inverted Sections",
-         "Partial Inheritence",
-         "Post-Partial Behavior",
-         "Standalone Tag",
-         "Indented Standalone Tag",
-         "Standalone Line Endings",
-         "Standalone Without Previous Line",
-         "Standalone Without Newline",
-     }},
     {"~dynamic-names.json",
      {
          "Basic Behavior - Partial",
@@ -113,7 +99,6 @@ static const StringMap> XFailTestNames = {{
         "Block reindentation",
         "Intrinsic indentation",
         "Nested block reindentation",
-
     }},
     {"~lambdas.json",
     {
@@ -126,7 +111,6 @@ static const StringMap> XFailTestNames = {{
         "Section - Expansion",
         "Section - Alternate Delimiters",
         "Section - Multiple Calls",
-
    }},
     {"partials.json", {"Standalone Indentation"}},
 }};

From 39f292ffa13d7ca0d1edff27ac8fd55024bb4d19 Mon Sep 17 00:00:00 2001
From: Jin Huang
Date: Mon, 29 Sep 2025 17:24:50 -0700
Subject: [PATCH 198/878] [profcheck] Add unknown branch weight for inlined
 memchr calls. (#160964)

The memchr inliner creates new switch branches but was failing to add
profile metadata. This patch fixes the issue by explicitly adding unknown
branch weights to these branches.
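
As a rough illustration (hedged; this is the source-level shape, not the
pass's exact IR output): the fold rewrites a memchr over a short constant
string into a switch over the needle byte, and that new switch is the
branch that now receives `!prof !{!"unknown", !"aggressive-instcombine"}`
whenever the enclosing function carries a `function_entry_count`.

    #include <string.h>

    // A call like this is expanded into a switch with one case per byte
    // of the constant string ("01234" here is an illustrative needle set).
    const char *find_digit(int c) {
      return (const char *)memchr("01234", c, 5);
    }
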
Issue [#147390](https://github.com/llvm/llvm-project/issues/147390) --- .../AggressiveInstCombine.cpp | 4 ++ .../AggressiveInstCombine/memchr.ll | 51 ++++++++++++------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index ee1fec0da3d73..805bdb41737c1 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -1350,6 +1350,10 @@ static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, BB->getTerminator()->eraseFromParent(); SwitchInst *SI = IRB.CreateSwitch( IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N); + // We can't know the precise weights here, as they would depend on the value + // distribution of Call->getArgOperand(1). So we just mark it as "unknown". + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, *Call->getFunction(), + DEBUG_TYPE); Type *IndexTy = DL.getIndexType(Call->getType()); SmallVector Updates; diff --git a/llvm/test/Transforms/AggressiveInstCombine/memchr.ll b/llvm/test/Transforms/AggressiveInstCombine/memchr.ll index b26320be634b8..6fbe960109098 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/memchr.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/memchr.ll @@ -6,9 +6,10 @@ declare ptr @memchr(ptr, i32, i64) -define i1 @test_memchr_null(i32 %x) { +define i1 @test_memchr_null(i32 %x) !prof !0 { ; CHECK-LABEL: define i1 @test_memchr_null( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0:![0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 ; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ @@ -40,9 +41,10 @@ entry: ret i1 %isnull } -define ptr @test_memchr(i32 %x) { +define ptr @test_memchr(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 ; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ @@ -72,16 +74,17 @@ entry: ret ptr %memchr } -define ptr @test_memchr_smaller_n(i32 %x) { +define ptr @test_memchr_smaller_n(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_smaller_n( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[X]] to i8 ; CHECK-NEXT: switch i8 [[TMP0]], label %[[ENTRY_SPLIT:.*]] [ ; CHECK-NEXT: i8 48, label %[[MEMCHR_CASE:.*]] ; CHECK-NEXT: i8 49, label %[[MEMCHR_CASE1:.*]] ; CHECK-NEXT: i8 0, label %[[MEMCHR_CASE2:.*]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF_1:![0-9]+]] ; CHECK: [[MEMCHR_CASE]]: ; CHECK-NEXT: br label %[[MEMCHR_SUCCESS:.*]] ; CHECK: [[MEMCHR_CASE1]]: @@ -103,9 +106,10 @@ entry: ; negative tests -define ptr @test_memchr_larger_n(i32 %x) { +define ptr @test_memchr_larger_n(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_larger_n( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i64 6) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -115,9 +119,10 @@ entry: ret ptr %memchr } -define ptr @test_memchr_non_constant(i32 %x, ptr %str) { +define ptr @test_memchr_non_constant(i32 %x, ptr %str) !prof !0 { ; CHECK-LABEL: define ptr 
@test_memchr_non_constant( -; CHECK-SAME: i32 [[X:%.*]], ptr [[STR:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]], ptr [[STR:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr [[STR]], i32 [[X]], i64 5) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -127,8 +132,9 @@ entry: ret ptr %memchr } -define ptr @test_memchr_constant_ch() { -; CHECK-LABEL: define ptr @test_memchr_constant_ch() { +define ptr @test_memchr_constant_ch() !prof !0 { +; CHECK-LABEL: define ptr @test_memchr_constant_ch() +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 49, i64 5) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -138,9 +144,10 @@ entry: ret ptr %memchr } -define ptr @test_memchr_dynamic_n(i32 %x, i32 %y) { +define ptr @test_memchr_dynamic_n(i32 %x, i32 %y) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_dynamic_n( -; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i32 [[Y]]) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -150,9 +157,10 @@ entry: ret ptr %memchr } -define ptr @test_memchr_long(i32 %x) { +define ptr @test_memchr_long(i32 %x) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_long( -; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str_long, i32 [[X]], i64 8) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -163,9 +171,10 @@ entry: } ; We want to check that the compiler still calls memchr if the length is non-constant: -define ptr @test_memchr_non_constant_length2(i32 %x, i64 %len) { +define ptr @test_memchr_non_constant_length2(i32 %x, i64 %len) !prof !0 { ; CHECK-LABEL: define ptr @test_memchr_non_constant_length2( -; CHECK-SAME: i32 [[X:%.*]], i64 [[LEN:%.*]]) { +; CHECK-SAME: i32 [[X:%.*]], i64 [[LEN:%.*]]) +; CHECK: !prof [[PROF_0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[MEMCHR:%.*]] = call ptr @memchr(ptr @str, i32 [[X]], i64 [[LEN]]) ; CHECK-NEXT: ret ptr [[MEMCHR]] @@ -174,3 +183,7 @@ entry: %memchr = call ptr @memchr(ptr @str, i32 %x, i64 %len) ret ptr %memchr } + +!0 = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF_1]] = !{!"unknown", !"aggressive-instcombine"} \ No newline at end of file From 6ffacae996b7eecf3859d7c83c95e7d7d95bc03f Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 29 Sep 2025 17:55:08 -0700 Subject: [PATCH 199/878] [llvm][mustache] Refactor tokenizer for clarity (#159188) This patch refactors the Mustache tokenizer by breaking the logic up with helper functions to improve clarity and simplify the code. 
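
For context, a minimal usage sketch of the API this tokenizer serves,
assuming the public `llvm::mustache::Template` constructor and `render`
signature; the template below is the spec's "Pair Behavior" case, which
the preceding delimiter patch made pass:

    #include "llvm/Support/JSON.h"
    #include "llvm/Support/Mustache.h"
    #include "llvm/Support/raw_ostream.h"

    void demo() {
      llvm::mustache::Template T("{{=<% %>=}}(<%text%>)");
      llvm::json::Value D = llvm::json::Object{{"text", "Hey!"}};
      std::string Out;
      llvm::raw_string_ostream OS(Out);
      T.render(D, OS); // Out == "(Hey!)"
    }
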
--- llvm/lib/Support/Mustache.cpp | 49 +++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 9834bf6f9361f..597288f26c4ac 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -560,11 +560,34 @@ class Parser { llvm::StringMap &SectionLambdas, EscapeMap &Escapes); + void parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A, + llvm::StringMap &Partials, + llvm::StringMap &Lambdas, + llvm::StringMap &SectionLambdas, + EscapeMap &Escapes); + SmallVector Tokens; size_t CurrentPtr; StringRef TemplateStr; }; +void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A, + llvm::StringMap &Partials, + llvm::StringMap &Lambdas, + llvm::StringMap &SectionLambdas, + EscapeMap &Escapes) { + AstPtr CurrentNode = + createNode(Ty, A, Parent, Partials, Lambdas, SectionLambdas, Escapes); + size_t Start = CurrentPtr; + parseMustache(CurrentNode.get(), Partials, Lambdas, SectionLambdas, Escapes); + const size_t End = CurrentPtr - 1; + std::string RawBody; + for (std::size_t I = Start; I < End; I++) + RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(std::move(RawBody)); + Parent->addChild(std::move(CurrentNode)); +} + AstPtr Parser::parse(llvm::StringMap &Partials, llvm::StringMap &Lambdas, llvm::StringMap &SectionLambdas, @@ -614,31 +637,13 @@ void Parser::parseMustache(ASTNode *Parent, llvm::StringMap &Partials, break; } case Token::Type::SectionOpen: { - CurrentNode = createNode(ASTNode::Section, A, Parent, Partials, Lambdas, - SectionLambdas, Escapes); - size_t Start = CurrentPtr; - parseMustache(CurrentNode.get(), Partials, Lambdas, SectionLambdas, - Escapes); - const size_t End = CurrentPtr - 1; - std::string RawBody; - for (std::size_t I = Start; I < End; I++) - RawBody += Tokens[I].RawBody; - CurrentNode->setRawBody(std::move(RawBody)); - Parent->addChild(std::move(CurrentNode)); + parseSection(Parent, ASTNode::Section, A, Partials, Lambdas, + SectionLambdas, Escapes); break; } case Token::Type::InvertSectionOpen: { - CurrentNode = createNode(ASTNode::InvertSection, A, Parent, Partials, - Lambdas, SectionLambdas, Escapes); - size_t Start = CurrentPtr; - parseMustache(CurrentNode.get(), Partials, Lambdas, SectionLambdas, - Escapes); - const size_t End = CurrentPtr - 1; - std::string RawBody; - for (size_t Idx = Start; Idx < End; Idx++) - RawBody += Tokens[Idx].RawBody; - CurrentNode->setRawBody(std::move(RawBody)); - Parent->addChild(std::move(CurrentNode)); + parseSection(Parent, ASTNode::InvertSection, A, Partials, Lambdas, + SectionLambdas, Escapes); break; } case Token::Type::Comment: From dc6e4e97fe5b9414b9597587670b5dfdd7c49b55 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Tue, 30 Sep 2025 09:19:40 +0800 Subject: [PATCH 200/878] [LoongArch][NFC] Pre-commit tests for `xvinsve0.{w/d}` (#160829) --- .../ir-instruction/shuffle-as-xvinsve0.ll | 353 ++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll new file mode 100644 index 0000000000000..b5d5c9c15d7c8 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll @@ -0,0 +1,353 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc 
--mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 + +;; xvinsve0.w +define void @xvinsve0_v8i32_l_0(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8i32_l_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0) +; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI0_0) +; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 +; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8i32_l_4(ptr %d, ptr %a, ptr %b) nounwind { +; LA32-LABEL: xvinsve0_v8i32_l_4: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a2, $a2, 0 +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 5 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 6 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 7 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA32-NEXT: xvst $xr2, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvinsve0_v8i32_l_4: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a2, 0 +; LA64-NEXT: xvld $xr1, $a1, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 5 +; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 1 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 6 +; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 2 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 7 +; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 3 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 1 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 2 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 3 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 +; LA64-NEXT: xvst $xr2, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8f32_l(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8f32_l: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI2_0) +; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI2_0) +; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 +; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x float>, ptr %a + %vb = load <8 x float>, ptr %b + %vc = shufflevector <8 x float> %va, <8 x float> %vb, <8 x i32> + store <8 x float> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8i32_h_1(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8i32_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) +; CHECK-NEXT: xvld $xr2, $a1, 
%pc_lo12(.LCPI3_0) +; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 +; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8i32_h_6(ptr %d, ptr %a, ptr %b) nounwind { +; LA32-LABEL: xvinsve0_v8i32_h_6: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 4 +; LA32-NEXT: ld.w $a1, $a1, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 5 +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 1 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 7 +; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA32-NEXT: xvst $xr2, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvinsve0_v8i32_h_6: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a2, 0 +; LA64-NEXT: xvld $xr1, $a1, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 4 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 5 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 0 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 7 +; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 0 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 1 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 +; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvst $xr1, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %vc = shufflevector <8 x i32> %va, <8 x i32> %vb, <8 x i32> + store <8 x i32> %vc, ptr %d + ret void +} + +define void @xvinsve0_v8f32_h(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v8f32_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI5_0) +; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI5_0) +; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 +; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x float>, ptr %a + %vb = load <8 x float>, ptr %b + %vc = shufflevector <8 x float> %va, <8 x float> %vb, <8 x i32> + store <8 x float> %vc, ptr %d + ret void +} + +;; xvinsve0.d +define void @xvinsve0_v4i64_l_1(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4i64_l_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI6_0) +; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI6_0) +; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0 +; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32> + store <4 x i64> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4i64_l_2(ptr %d, ptr %a, ptr %b) nounwind { +; LA32-LABEL: xvinsve0_v4i64_l_2: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a2, 0 +; LA32-NEXT: xvpickve2gr.w 
$a2, $xr0, 0 +; LA32-NEXT: xvld $xr1, $a1, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 6 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 7 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 1 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 2 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 3 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr2, 2 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvinsve0_v4i64_l_2: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a2, $a2, 0 +; LA64-NEXT: xvld $xr0, $a1, 0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3 +; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1 +; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA64-NEXT: xvst $xr2, $a0, 0 +; LA64-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32> + store <4 x i64> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4f64_l(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4f64_l: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI8_0) +; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI8_0) +; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0 +; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x double>, ptr %a + %vb = load <4 x double>, ptr %b + %vc = shufflevector <4 x double> %va, <4 x double> %vb, <4 x i32> + store <4 x double> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4i64_h_0(ptr %d, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: xvinsve0_v4i64_h_0: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) +; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI9_0) +; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0 +; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32> + store <4 x i64> %vc, ptr %d + ret void +} + +define void @xvinsve0_v4i64_h_2(ptr %d, ptr %a, ptr %b) nounwind { +; LA32-LABEL: xvinsve0_v4i64_h_2: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 0 +; LA32-NEXT: xvld $xr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 1 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 6 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 7 +; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 0 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 1 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 1 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 2 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 3 +; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 3 +; LA32-NEXT: xvpermi.q $xr0, $xr2, 2 +; LA32-NEXT: xvst $xr0, $a0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvinsve0_v4i64_h_2: +; LA64: # %bb.0: # %entry +; LA64-NEXT: 
ld.d $a1, $a1, 0
+; LA64-NEXT:    xvld $xr0, $a2, 0
+; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 0
+; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 3
+; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
+; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 0
+; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 0
+; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 1
+; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 1
+; LA64-NEXT:    xvpermi.q $xr2, $xr1, 2
+; LA64-NEXT:    xvst $xr2, $a0, 0
+; LA64-NEXT:    ret
+entry:
+  %va = load <4 x i64>, ptr %a
+  %vb = load <4 x i64>, ptr %b
+  %vc = shufflevector <4 x i64> %va, <4 x i64> %vb, <4 x i32>
+  store <4 x i64> %vc, ptr %d
+  ret void
+}
+
+define void @xvinsve0_v4f64_h(ptr %d, ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: xvinsve0_v4f64_h:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI11_0)
+; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI11_0)
+; CHECK-NEXT:    xvshuf.d $xr2, $xr1, $xr0
+; CHECK-NEXT:    xvst $xr2, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %va = load <4 x double>, ptr %a
+  %vb = load <4 x double>, ptr %b
+  %vc = shufflevector <4 x double> %va, <4 x double> %vb, <4 x i32>
+  store <4 x double> %vc, ptr %d
+  ret void
+}

From d2e3389abb3a2e4ee27e40f353c5eec476a229f6 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Mon, 29 Sep 2025 18:21:24 -0700
Subject: [PATCH 201/878] [mlir][GPU] Generalize gpu.printf to not need
 gpu.module (#161266)

In order to make the gpu.printf => [various LLVM calls] passes less
order-dependent and to allow downstreams that don't use gpu.module to
use gpu.printf, allow the lowerings for such prints to target the
nearest `SymbolTable` instead.

---
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   | 64 ++++++++++---------
 .../lib/Conversion/GPUCommon/GPUOpsLowering.h |  7 +-
 .../GPUToROCDL/gpu-to-rocdl-hip.mlir          | 36 +++++++++++
 3 files changed, 74 insertions(+), 33 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index a73afbcb6474b..2285d2695db4e 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -20,20 +20,20 @@

 using namespace mlir;

-LLVM::LLVMFuncOp mlir::getOrDefineFunction(gpu::GPUModuleOp moduleOp,
-                                           Location loc, OpBuilder &b,
-                                           StringRef name,
+LLVM::LLVMFuncOp mlir::getOrDefineFunction(Operation *moduleOp, Location loc,
+                                           OpBuilder &b, StringRef name,
                                            LLVM::LLVMFunctionType type) {
-  LLVM::LLVMFuncOp ret;
-  if (!(ret = moduleOp.template lookupSymbol(name))) {
-    OpBuilder::InsertionGuard guard(b);
-    b.setInsertionPointToStart(moduleOp.getBody());
-    ret = LLVM::LLVMFuncOp::create(b, loc, name, type, LLVM::Linkage::External);
-  }
-  return ret;
+  auto existing = dyn_cast_or_null(
+      SymbolTable::lookupSymbolIn(moduleOp, name));
+  if (existing)
+    return existing;
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(&moduleOp->getRegion(0).front());
+  return LLVM::LLVMFuncOp::create(b, loc, name, type, LLVM::Linkage::External);
 }

-static SmallString<16> getUniqueSymbolName(gpu::GPUModuleOp moduleOp,
+static SmallString<16> getUniqueSymbolName(Operation *moduleOp,
                                            StringRef prefix) {
   // Get a unique global name. 
unsigned stringNumber = 0;
@@ -41,15 +41,16 @@ static SmallString<16> getUniqueSymbolName(gpu::GPUModuleOp moduleOp,
   do {
     stringConstName.clear();
     (prefix + Twine(stringNumber++)).toStringRef(stringConstName);
-  } while (moduleOp.lookupSymbol(stringConstName));
+  } while (SymbolTable::lookupSymbolIn(moduleOp, stringConstName));
   return stringConstName;
 }
 
-LLVM::GlobalOp
-mlir::getOrCreateStringConstant(OpBuilder &b, Location loc,
-                                gpu::GPUModuleOp moduleOp, Type llvmI8,
-                                StringRef namePrefix, StringRef str,
-                                uint64_t alignment, unsigned addrSpace) {
+LLVM::GlobalOp mlir::getOrCreateStringConstant(OpBuilder &b, Location loc,
+                                               Operation *moduleOp, Type llvmI8,
+                                               StringRef namePrefix,
+                                               StringRef str,
+                                               uint64_t alignment,
+                                               unsigned addrSpace) {
   llvm::SmallString<20> nullTermStr(str);
   nullTermStr.push_back('\0'); // Null terminate for C
   auto globalType =
@@ -57,7 +58,7 @@ mlir::getOrCreateStringConstant(OpBuilder &b, Location loc,
   StringAttr attr = b.getStringAttr(nullTermStr);
 
   // Try to find existing global.
-  for (auto globalOp : moduleOp.getOps<LLVM::GlobalOp>())
+  for (auto globalOp : moduleOp->getRegion(0).getOps<LLVM::GlobalOp>())
     if (globalOp.getGlobalType() == globalType && globalOp.getConstant() &&
         globalOp.getValueAttr() == attr &&
         globalOp.getAlignment().value_or(0) == alignment &&
@@ -66,7 +67,7 @@ mlir::getOrCreateStringConstant(OpBuilder &b, Location loc,
 
   // Not found: create new global.
   OpBuilder::InsertionGuard guard(b);
-  b.setInsertionPointToStart(moduleOp.getBody());
+  b.setInsertionPointToStart(&moduleOp->getRegion(0).front());
   SmallString<16> name = getUniqueSymbolName(moduleOp, namePrefix);
   return LLVM::GlobalOp::create(b, loc, globalType,
                                 /*isConstant=*/true, LLVM::Linkage::Internal,
@@ -396,10 +397,11 @@ LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
   auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
   mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
   mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
-  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
-  // This ensures that global constants and declarations are placed within
-  // the device code, not the host code
-  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
+
+  Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>();
+  if (!moduleOp)
+    return rewriter.notifyMatchFailure(gpuPrintfOp,
+                                       "Couldn't find a parent module");
 
   auto ocklBegin =
       getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin",
@@ -496,10 +498,10 @@ LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
   mlir::Type ptrType =
       LLVM::LLVMPointerType::get(rewriter.getContext(), addressSpace);
 
-  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
-  // This ensures that global constants and declarations are placed within
-  // the device code, not the host code
-  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
+  Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>();
+  if (!moduleOp)
+    return rewriter.notifyMatchFailure(gpuPrintfOp,
+                                       "Couldn't find a parent module");
 
   auto printfType =
       LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {ptrType},
@@ -541,10 +543,10 @@ LogicalResult GPUPrintfOpToVPrintfLowering::matchAndRewrite(
   mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
   mlir::Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
 
-  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
-  // This ensures that global constants and declarations are placed within
-  // the device code, not the host code
-  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
+  Operation *moduleOp = gpuPrintfOp->getParentWithTrait<OpTrait::SymbolTable>();
+  if (!moduleOp)
+    return rewriter.notifyMatchFailure(gpuPrintfOp,
+                                       "Couldn't find a parent module");
 
   // Create a valid global location removing any metadata attached to the
   // location as debug info metadata inside of a function cannot be used outside
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index e17b06379988c..66d3bb40a8f5a 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -18,15 +18,18 @@ namespace mlir {
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
+/// Note that these functions don't take a `SymbolTable` because GPU module
+/// lowerings can have name collisions as an intermediate state.
+
 /// Find or create an external function declaration in the given module.
-LLVM::LLVMFuncOp getOrDefineFunction(gpu::GPUModuleOp moduleOp, Location loc,
+LLVM::LLVMFuncOp getOrDefineFunction(Operation *moduleOp, Location loc,
                                      OpBuilder &b, StringRef name,
                                      LLVM::LLVMFunctionType type);
 
 /// Create a global that contains the given string. If a global with the same
 /// string already exists in the module, return that global.
 LLVM::GlobalOp getOrCreateStringConstant(OpBuilder &b, Location loc,
-                                         gpu::GPUModuleOp moduleOp, Type llvmI8,
+                                         Operation *moduleOp, Type llvmI8,
                                          StringRef namePrefix, StringRef str,
                                          uint64_t alignment = 0,
                                          unsigned addrSpace = 0);
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
index 2dc6a5ab2a86c..32da31202b688 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
@@ -1,5 +1,6 @@
 // RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP' -split-input-file | FileCheck %s
 
+// CHECK-LABEL: gpu.module @test_module
 gpu.module @test_module {
   // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")
   // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00")
@@ -40,3 +41,38 @@ gpu.module @test_module {
     gpu.return
   }
 }
+
+// -----
+
+// The builtin.module we're targeting is wrapped in a fake gpu.module
+// because the convert-gpu-to-rocdl pass only runs on `gpu.module` ops,
+// even though the printf patterns could run in other contexts.
+ +// CHECK-LABEL: gpu.module @fake_gpu_module_for_test +// CHECK-LABEL: builtin.module @test_module +gpu.module @fake_gpu_module_for_test { +builtin.module @test_module { + // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00") + // CHECK-DAG: llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 + // CHECK-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64 + // CHECK-DAG: llvm.func @__ockl_printf_begin(i64) -> i64 + + // CHECK-LABEL: llvm.func @test_printf + // CHECK: (%[[ARG0:.*]]: i32) + llvm.func @test_printf(%arg0: i32) { + // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64 + // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr + // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<11 x i8> + // CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(11 : i64) : i64 + // CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64 + // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64 + // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 + gpu.printf "Hello: %d\n", %arg0 : i32 + llvm.return + } +} +} From 781baf76fb4d9fbfc24bafe5f46946043c265074 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 29 Sep 2025 18:48:33 -0700 Subject: [PATCH 202/878] [llvm][mustache] Refactor template rendering (#159189) Move the rendering logic into the ASTNode, and break the logic down into individual methods. 
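To make the shape of the refactor concrete, here is a minimal, self-contained
sketch of the pattern (a hypothetical `Node` type, not the actual `ASTNode`
class): render() keeps only the dispatch, and the per-kind logic moves into
named methods.

```cpp
#include <iostream>
#include <string>

struct Node {
  enum Kind { Text, Variable } Ty;
  std::string Body;

  // The entry point only dispatches; it no longer carries per-kind logic.
  void render(std::ostream &OS) const {
    switch (Ty) {
    case Text:
      renderText(OS);
      return;
    case Variable:
      renderVariable(OS);
      return;
    }
  }

private:
  void renderText(std::ostream &OS) const { OS << Body; }
  void renderVariable(std::ostream &OS) const { OS << '{' << Body << '}'; }
};

int main() {
  Node N{Node::Text, "hello"};
  N.render(std::cout); // prints "hello"
  std::cout << '\n';
}
```

Giving each node kind its own method is also what lets the later patches in
this series shrink the parameter lists around rendering.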
--- llvm/lib/Support/Mustache.cpp | 132 ++++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 597288f26c4ac..274bd9eff0ffa 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -180,6 +180,14 @@ class ASTNode { const llvm::json::Value *findContext(); + void renderRoot(const json::Value &CurrentCtx, raw_ostream &OS); + void renderText(raw_ostream &OS); + void renderPartial(const json::Value &CurrentCtx, raw_ostream &OS); + void renderVariable(const json::Value &CurrentCtx, raw_ostream &OS); + void renderUnescapeVariable(const json::Value &CurrentCtx, raw_ostream &OS); + void renderSection(const json::Value &CurrentCtx, raw_ostream &OS); + void renderInvertSection(const json::Value &CurrentCtx, raw_ostream &OS); + StringMap &Partials; StringMap &Lambdas; StringMap &SectionLambdas; @@ -686,76 +694,96 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { } } +void ASTNode::renderRoot(const json::Value &CurrentCtx, raw_ostream &OS) { + renderChild(CurrentCtx, OS); +} + +void ASTNode::renderText(raw_ostream &OS) { OS << Body; } + +void ASTNode::renderPartial(const json::Value &CurrentCtx, raw_ostream &OS) { + auto Partial = Partials.find(AccessorValue[0]); + if (Partial != Partials.end()) + renderPartial(CurrentCtx, OS, Partial->getValue().get()); +} + +void ASTNode::renderVariable(const json::Value &CurrentCtx, raw_ostream &OS) { + auto Lambda = Lambdas.find(AccessorValue[0]); + if (Lambda != Lambdas.end()) { + renderLambdas(CurrentCtx, OS, Lambda->getValue()); + } else if (const json::Value *ContextPtr = findContext()) { + EscapeStringStream ES(OS, Escapes); + toMustacheString(*ContextPtr, ES); + } +} + +void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx, + raw_ostream &OS) { + auto Lambda = Lambdas.find(AccessorValue[0]); + if (Lambda != Lambdas.end()) { + renderLambdas(CurrentCtx, OS, Lambda->getValue()); + } else if (const json::Value *ContextPtr = findContext()) { + toMustacheString(*ContextPtr, OS); + } +} + +void ASTNode::renderSection(const json::Value &CurrentCtx, raw_ostream &OS) { + auto SectionLambda = SectionLambdas.find(AccessorValue[0]); + if (SectionLambda != SectionLambdas.end()) { + renderSectionLambdas(CurrentCtx, OS, SectionLambda->getValue()); + return; + } + + const json::Value *ContextPtr = findContext(); + if (isContextFalsey(ContextPtr)) + return; + + if (const json::Array *Arr = ContextPtr->getAsArray()) { + for (const json::Value &V : *Arr) + renderChild(V, OS); + return; + } + renderChild(*ContextPtr, OS); +} + +void ASTNode::renderInvertSection(const json::Value &CurrentCtx, + raw_ostream &OS) { + bool IsLambda = SectionLambdas.contains(AccessorValue[0]); + const json::Value *ContextPtr = findContext(); + if (isContextFalsey(ContextPtr) && !IsLambda) { + renderChild(CurrentCtx, OS); + } +} + void ASTNode::render(const json::Value &CurrentCtx, raw_ostream &OS) { if (Ty != Root && Ty != Text && AccessorValue.empty()) return; // Set the parent context to the incoming context so that we // can walk up the context tree correctly in findContext(). ParentContext = &CurrentCtx; - const json::Value *ContextPtr = Ty == Root ? 
ParentContext : findContext(); switch (Ty) { case Root: - renderChild(CurrentCtx, OS); + renderRoot(CurrentCtx, OS); return; case Text: - OS << Body; + renderText(OS); return; - case Partial: { - auto Partial = Partials.find(AccessorValue[0]); - if (Partial != Partials.end()) - renderPartial(CurrentCtx, OS, Partial->getValue().get()); + case Partial: + renderPartial(CurrentCtx, OS); return; - } - case Variable: { - auto Lambda = Lambdas.find(AccessorValue[0]); - if (Lambda != Lambdas.end()) { - renderLambdas(CurrentCtx, OS, Lambda->getValue()); - } else if (ContextPtr) { - EscapeStringStream ES(OS, Escapes); - toMustacheString(*ContextPtr, ES); - } + case Variable: + renderVariable(CurrentCtx, OS); return; - } - case UnescapeVariable: { - auto Lambda = Lambdas.find(AccessorValue[0]); - if (Lambda != Lambdas.end()) { - renderLambdas(CurrentCtx, OS, Lambda->getValue()); - } else if (ContextPtr) { - toMustacheString(*ContextPtr, OS); - } + case UnescapeVariable: + renderUnescapeVariable(CurrentCtx, OS); return; - } - case Section: { - auto SectionLambda = SectionLambdas.find(AccessorValue[0]); - bool IsLambda = SectionLambda != SectionLambdas.end(); - - if (IsLambda) { - renderSectionLambdas(CurrentCtx, OS, SectionLambda->getValue()); - return; - } - - if (isContextFalsey(ContextPtr)) - return; - - if (const json::Array *Arr = ContextPtr->getAsArray()) { - for (const json::Value &V : *Arr) - renderChild(V, OS); - return; - } - renderChild(*ContextPtr, OS); + case Section: + renderSection(CurrentCtx, OS); return; - } - case InvertSection: { - bool IsLambda = SectionLambdas.contains(AccessorValue[0]); - if (isContextFalsey(ContextPtr) && !IsLambda) { - // The context for the children remains unchanged from the parent's, so - // we pass this node's original incoming context. - renderChild(CurrentCtx, OS); - } + case InvertSection: + renderInvertSection(CurrentCtx, OS); return; } - } llvm_unreachable("Invalid ASTNode type"); } From 0f80f18977060cdb315f50d0365507d8497a1a03 Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Tue, 30 Sep 2025 09:49:19 +0800 Subject: [PATCH 203/878] [LoongArch] Custom legalize vector_shuffle to `xvinsve0.{w/d}` when possible (#161156) --- .../LoongArch/LoongArchISelLowering.cpp | 51 ++++ .../Target/LoongArch/LoongArchISelLowering.h | 1 + .../LoongArch/LoongArchLASXInstrInfo.td | 9 + .../ir-instruction/shuffle-as-xvinsve0.ll | 248 ++++-------------- 4 files changed, 107 insertions(+), 202 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 098bcfa67d1d3..4cfbfca45d359 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2319,6 +2319,53 @@ static SDValue lowerVECTOR_SHUFFLE_XVPICKOD(const SDLoc &DL, ArrayRef Mask, return DAG.getNode(LoongArchISD::VPICKOD, DL, VT, V2, V1); } +/// Lower VECTOR_SHUFFLE into XVINSVE0 (if possible). +static SDValue +lowerVECTOR_SHUFFLE_XVINSVE0(const SDLoc &DL, ArrayRef Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { + // LoongArch LASX only supports xvinsve0.{w/d}. 
+ if (VT != MVT::v8i32 && VT != MVT::v8f32 && VT != MVT::v4i64 && + VT != MVT::v4f64) + return SDValue(); + + MVT GRLenVT = Subtarget.getGRLenVT(); + int MaskSize = Mask.size(); + assert(MaskSize == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + // Check if exactly one element of the Mask is replaced by 'Replaced', while + // all other elements are either 'Base + i' or undef (-1). On success, return + // the index of the replaced element. Otherwise, just return -1. + auto checkReplaceOne = [&](int Base, int Replaced) -> int { + int Idx = -1; + for (int i = 0; i < MaskSize; ++i) { + if (Mask[i] == Base + i || Mask[i] == -1) + continue; + if (Mask[i] != Replaced) + return -1; + if (Idx == -1) + Idx = i; + else + return -1; + } + return Idx; + }; + + // Case 1: the lowest element of V2 replaces one element in V1. + int Idx = checkReplaceOne(0, MaskSize); + if (Idx != -1) + return DAG.getNode(LoongArchISD::XVINSVE0, DL, VT, V1, V2, + DAG.getConstant(Idx, DL, GRLenVT)); + + // Case 2: the lowest element of V1 replaces one element in V2. + Idx = checkReplaceOne(MaskSize, 0); + if (Idx != -1) + return DAG.getNode(LoongArchISD::XVINSVE0, DL, VT, V2, V1, + DAG.getConstant(Idx, DL, GRLenVT)); + + return SDValue(); +} + /// Lower VECTOR_SHUFFLE into XVSHUF (if possible). static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, @@ -2595,6 +2642,9 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget, Zeroable))) return Result; + if ((Result = + lowerVECTOR_SHUFFLE_XVINSVE0(DL, Mask, VT, V1, V2, DAG, Subtarget))) + return Result; if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; @@ -7453,6 +7503,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(XVPERM) NODE_NAME_CASE(XVREPLVE0) NODE_NAME_CASE(XVREPLVE0Q) + NODE_NAME_CASE(XVINSVE0) NODE_NAME_CASE(VPICK_SEXT_ELT) NODE_NAME_CASE(VPICK_ZEXT_ELT) NODE_NAME_CASE(VREPLVE) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 9b60a9fd53726..8a4d7748467c7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -151,6 +151,7 @@ enum NodeType : unsigned { XVPERM, XVREPLVE0, XVREPLVE0Q, + XVINSVE0, // Extended vector element extraction VPICK_SEXT_ELT, diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index bbc0489620193..5143d53bad719 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -20,6 +20,7 @@ def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>; def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>; def loongarch_xvreplve0: SDNode<"LoongArchISD::XVREPLVE0", SDT_LoongArchXVREPLVE0>; def loongarch_xvreplve0q: SDNode<"LoongArchISD::XVREPLVE0Q", SDT_LoongArchXVREPLVE0>; +def loongarch_xvinsve0 : SDNode<"LoongArchISD::XVINSVE0", SDT_LoongArchV2RUimm>; def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>; @@ -1708,6 +1709,14 @@ def : Pat<(vector_insert v4f64:$xd, (f64(bitconvert i64:$rj)), 
uimm2:$imm), (XVINSGR2VR_D v4f64:$xd, GPR:$rj, uimm2:$imm)>; // XVINSVE0_{W/D} +def : Pat<(loongarch_xvinsve0 v8i32:$xd, v8i32:$xj, uimm3:$imm), + (XVINSVE0_W v8i32:$xd, v8i32:$xj, uimm3:$imm)>; +def : Pat<(loongarch_xvinsve0 v4i64:$xd, v4i64:$xj, uimm2:$imm), + (XVINSVE0_D v4i64:$xd, v4i64:$xj, uimm2:$imm)>; +def : Pat<(loongarch_xvinsve0 v8f32:$xd, v8f32:$xj, uimm3:$imm), + (XVINSVE0_W v8f32:$xd, v8f32:$xj, uimm3:$imm)>; +def : Pat<(loongarch_xvinsve0 v4f64:$xd, v4f64:$xj, uimm2:$imm), + (XVINSVE0_D v4f64:$xd, v4f64:$xj, uimm2:$imm)>; def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), (XVINSVE0_W v8f32:$xd, (SUBREG_TO_REG(i64 0), FPR32:$fj, sub_32), uimm3:$imm)>; diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll index b5d5c9c15d7c8..e1784f81c2a07 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvinsve0.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ;; xvinsve0.w define void @xvinsve0_v8i32_l_0(ptr %d, ptr %a, ptr %b) nounwind { @@ -8,10 +8,8 @@ define void @xvinsve0_v8i32_l_0(ptr %d, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0) -; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI0_0) -; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 -; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <8 x i32>, ptr %a @@ -22,52 +20,13 @@ entry: } define void @xvinsve0_v8i32_l_4(ptr %d, ptr %a, ptr %b) nounwind { -; LA32-LABEL: xvinsve0_v8i32_l_4: -; LA32: # %bb.0: # %entry -; LA32-NEXT: ld.w $a2, $a2, 0 -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 5 -; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 1 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 6 -; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 7 -; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 0 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 1 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 -; LA32-NEXT: xvpermi.q $xr2, $xr1, 2 -; LA32-NEXT: xvst $xr2, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: xvinsve0_v8i32_l_4: -; LA64: # %bb.0: # %entry -; LA64-NEXT: xvld $xr0, $a2, 0 -; LA64-NEXT: xvld $xr1, $a1, 0 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 0 -; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 0 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 5 -; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 1 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 6 -; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 2 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 7 -; LA64-NEXT: vinsgr2vr.w $vr0, $a1, 3 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 0 -; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 1 -; LA64-NEXT: vinsgr2vr.w 
$vr2, $a1, 1 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 2 -; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 3 -; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 -; LA64-NEXT: xvpermi.q $xr2, $xr0, 2 -; LA64-NEXT: xvst $xr2, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: xvinsve0_v8i32_l_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 4 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %va = load <8 x i32>, ptr %a %vb = load <8 x i32>, ptr %b @@ -81,10 +40,8 @@ define void @xvinsve0_v8f32_l(ptr %d, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI2_0) -; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI2_0) -; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 -; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: xvinsve0.w $xr0, $xr1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <8 x float>, ptr %a @@ -99,10 +56,8 @@ define void @xvinsve0_v8i32_h_1(ptr %d, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI3_0) -; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 -; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 1 +; CHECK-NEXT: xvst $xr1, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <8 x i32>, ptr %a @@ -113,52 +68,13 @@ entry: } define void @xvinsve0_v8i32_h_6(ptr %d, ptr %a, ptr %b) nounwind { -; LA32-LABEL: xvinsve0_v8i32_h_6: -; LA32: # %bb.0: # %entry -; LA32-NEXT: xvld $xr0, $a2, 0 -; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 4 -; LA32-NEXT: ld.w $a1, $a1, 0 -; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 -; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 5 -; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 1 -; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 2 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 7 -; LA32-NEXT: vinsgr2vr.w $vr1, $a1, 3 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 0 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 0 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 1 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 -; LA32-NEXT: xvpermi.q $xr2, $xr1, 2 -; LA32-NEXT: xvst $xr2, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: xvinsve0_v8i32_h_6: -; LA64: # %bb.0: # %entry -; LA64-NEXT: xvld $xr0, $a2, 0 -; LA64-NEXT: xvld $xr1, $a1, 0 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 4 -; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 0 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 5 -; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 1 -; LA64-NEXT: xvpickve2gr.w $a1, $xr1, 0 -; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 2 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 7 -; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 0 -; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 1 -; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2 -; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2 -; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3 -; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3 -; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 -; LA64-NEXT: xvst $xr1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: xvinsve0_v8i32_h_6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 6 +; CHECK-NEXT: xvst $xr1, $a0, 0 +; CHECK-NEXT: ret entry: %va = load <8 x i32>, ptr %a %vb = load <8 x i32>, ptr %b 
@@ -172,10 +88,8 @@ define void @xvinsve0_v8f32_h(ptr %d, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI5_0) -; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI5_0) -; CHECK-NEXT: xvshuf.w $xr2, $xr1, $xr0 -; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: xvinsve0.w $xr1, $xr0, 0 +; CHECK-NEXT: xvst $xr1, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <8 x float>, ptr %a @@ -191,10 +105,8 @@ define void @xvinsve0_v4i64_l_1(ptr %d, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI6_0) -; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI6_0) -; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0 -; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <4 x i64>, ptr %a @@ -205,44 +117,13 @@ entry: } define void @xvinsve0_v4i64_l_2(ptr %d, ptr %a, ptr %b) nounwind { -; LA32-LABEL: xvinsve0_v4i64_l_2: -; LA32: # %bb.0: # %entry -; LA32-NEXT: xvld $xr0, $a2, 0 -; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 0 -; LA32-NEXT: xvld $xr1, $a1, 0 -; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 0 -; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 1 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 1 -; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 6 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 2 -; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 7 -; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3 -; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 0 -; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 0 -; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 1 -; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 1 -; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 2 -; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 2 -; LA32-NEXT: xvpickve2gr.w $a1, $xr1, 3 -; LA32-NEXT: vinsgr2vr.w $vr0, $a1, 3 -; LA32-NEXT: xvpermi.q $xr0, $xr2, 2 -; LA32-NEXT: xvst $xr0, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: xvinsve0_v4i64_l_2: -; LA64: # %bb.0: # %entry -; LA64-NEXT: ld.d $a2, $a2, 0 -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 0 -; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3 -; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1 -; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0 -; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0 -; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1 -; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1 -; LA64-NEXT: xvpermi.q $xr2, $xr1, 2 -; LA64-NEXT: xvst $xr2, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: xvinsve0_v4i64_l_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %va = load <4 x i64>, ptr %a %vb = load <4 x i64>, ptr %b @@ -256,10 +137,8 @@ define void @xvinsve0_v4f64_l(ptr %d, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI8_0) -; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI8_0) -; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0 -; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: %va = load <4 x double>, ptr %a @@ -274,10 +153,8 @@ define void @xvinsve0_v4i64_h_0(ptr %d, ptr %a, ptr %b) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvld $xr1, $a2, 0 -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) -; CHECK-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI9_0) -; CHECK-NEXT: xvshuf.d $xr2, $xr1, $xr0 -; CHECK-NEXT: xvst $xr2, $a0, 0 +; CHECK-NEXT: xvinsve0.d $xr1, 
$xr0, 0
+; CHECK-NEXT:    xvst $xr1, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
@@ -288,44 +165,13 @@ entry:
 }
 
 define void @xvinsve0_v4i64_h_2(ptr %d, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: xvinsve0_v4i64_h_2:
-; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 1
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr1, 6
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr1, 7
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr1, 0
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr1, 1
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 1
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr1, 2
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 2
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr1, 3
-; LA32-NEXT:    vinsgr2vr.w $vr0, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr0, $xr2, 2
-; LA32-NEXT:    xvst $xr0, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: xvinsve0_v4i64_h_2:
-; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    ld.d $a1, $a1, 0
-; LA64-NEXT:    xvld $xr0, $a2, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT:    xvst $xr2, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: xvinsve0_v4i64_h_2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvinsve0.d $xr1, $xr0, 2
+; CHECK-NEXT:    xvst $xr1, $a0, 0
+; CHECK-NEXT:    ret
 entry:
   %va = load <4 x i64>, ptr %a
   %vb = load <4 x i64>, ptr %b
@@ -339,10 +185,8 @@ define void @xvinsve0_v4f64_h(ptr %d, ptr %a, ptr %b) nounwind {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
-; CHECK-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI11_0)
-; CHECK-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI11_0)
-; CHECK-NEXT:    xvshuf.d $xr2, $xr1, $xr0
-; CHECK-NEXT:    xvst $xr2, $a0, 0
+; CHECK-NEXT:    xvinsve0.d $xr1, $xr0, 0
+; CHECK-NEXT:    xvst $xr1, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
   %va = load <4 x double>, ptr %a

From a8034d1809cb39c977f47bb25e190c04b243dfd2 Mon Sep 17 00:00:00 2001
From: Kelvin Li
Date: Mon, 29 Sep 2025 21:55:23 -0400
Subject: [PATCH 204/878] [analyzer] Use sed from the ToolBox on AIX (NFC) (#161242)

The change in commit 30402c7 breaks the tests on AIX. This patch changes
the tests to use the `sed` from the AIX Toolbox instead of the default
one, which does not support `-r` and `-E`.
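For context, `-r`/`-E` switch sed from POSIX basic regular expressions (BRE)
to extended regular expressions (ERE), and the default AIX sed only speaks
BRE. A small illustrative C++ program (not part of the patch) showing how the
two grammars disagree on the same pattern:

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
  const std::string Input = "aaa";
  // Under the BRE grammar (a plain `sed` script), '+' is a literal plus
  // sign, so "a+" only matches the two-character string "a+".
  std::regex Basic("a+", std::regex::basic);
  // Under the ERE grammar (what `sed -r`/`-E` enable), '+' means "one or
  // more of the preceding atom", so "a+" matches "aaa".
  std::regex Extended("a+", std::regex::extended);
  std::cout << "BRE matches \"aaa\": " << std::regex_match(Input, Basic)
            << '\n'; // 0
  std::cout << "ERE matches \"aaa\": " << std::regex_match(Input, Extended)
            << '\n'; // 1
}
```

A script written with `-E` in mind therefore misparses under a BRE-only sed,
which is why the lit config below points AIX at the Toolbox sed.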
--- .../expected-sarif/sarif-multi-file-diagnostics.c.sarif | 2 +- clang/test/Analysis/lit.local.cfg | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif index 85e710fc7bac3..501d27ca22361 100644 --- a/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif +++ b/clang/test/Analysis/diagnostics/Inputs/expected-sarif/sarif-multi-file-diagnostics.c.sarif @@ -141,4 +141,4 @@ } ], "version": "[SARIF version]" -} \ No newline at end of file +} diff --git a/clang/test/Analysis/lit.local.cfg b/clang/test/Analysis/lit.local.cfg index 3d60a16405ea6..03ab418a5a4f7 100644 --- a/clang/test/Analysis/lit.local.cfg +++ b/clang/test/Analysis/lit.local.cfg @@ -17,11 +17,13 @@ config.substitutions.append( ) ) +sed_cmd = "/opt/freeware/bin/sed" if "system-aix" in config.available_features else "sed" + # Filtering command for testing SARIF output against reference output. config.substitutions.append( ( "%normalize_sarif", - "sed -r '%s;%s;%s;%s'" + f"{sed_cmd} -r '%s;%s;%s;%s'" % ( # Replace version strings that are likely to change. r's/"version": ".* version .*"/"version": "[clang version]"/', From 6b23f4fc4d7cf035dbf9a1af419148b991ddaf2c Mon Sep 17 00:00:00 2001 From: wanglei Date: Tue, 30 Sep 2025 09:58:12 +0800 Subject: [PATCH 205/878] [LoongArch] Add R_LARCH_MARK_LA relocation for la.abs Match gas behavior: generate `R_LARCH_MARK_LA` relocation for `la.abs`. Reviewers: heiher, SixWeining Reviewed By: SixWeining, heiher Pull Request: https://github.com/llvm/llvm-project/pull/161062 --- llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp | 4 ++++ llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 3 +++ llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp | 4 +++- .../Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp | 1 + .../LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp | 7 +++++++ llvm/test/MC/LoongArch/Macros/macros-la.s | 1 + 6 files changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp index f23fb346c55f9..5f956b1e6a517 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_loongarch.cpp @@ -365,6 +365,10 @@ class ELFLinkGraphBuilder_loongarch : public ELFLinkGraphBuilder { uint32_t Type = Rel.getType(false); int64_t Addend = Rel.r_addend; + // ignore + if (Type == ELF::R_LARCH_MARK_LA) + return Error::success(); + if (Type == ELF::R_LARCH_RELAX) { if (BlockToFix.edges_empty()) return make_error( diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index d6268037dea86..dd1b1d3b2e943 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -781,6 +781,9 @@ void RuntimeDyldELF::resolveLoongArch64Relocation(const SectionEntry &Section, default: report_fatal_error("Relocation type not implemented yet!"); break; + case ELF::R_LARCH_MARK_LA: + // ignore + break; case ELF::R_LARCH_32: support::ulittle32_t::ref{TargetPtr} = static_cast(Value + Addend); diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 5be4713b349ee..9b11201d0312d 100644 --- 
a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -957,8 +957,10 @@ void LoongArchAsmParser::emitLoadAddressAbs(MCInst &Inst, SMLoc IDLoc, : Inst.getOperand(2).getExpr(); InstSeq Insts; + // To distinguish between la.abs and %abs_hi20, la.abs will generate + // R_LARCH_MARK_LA and R_LARCH_ABS_HI20 relocations. Insts.push_back( - LoongArchAsmParser::Inst(LoongArch::LU12I_W, ELF::R_LARCH_ABS_HI20)); + LoongArchAsmParser::Inst(LoongArch::LU12I_W, ELF::R_LARCH_MARK_LA)); Insts.push_back( LoongArchAsmParser::Inst(LoongArch::ORI, ELF::R_LARCH_ABS_LO12)); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp index 0d7761777cb7d..8ecb62d0ea7bb 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp @@ -32,6 +32,7 @@ static StringRef getLoongArchSpecifierName(uint16_t S) { return "b16"; case ELF::R_LARCH_B21: return "b21"; + case ELF::R_LARCH_MARK_LA: case ELF::R_LARCH_ABS_HI20: return "abs_hi20"; case ELF::R_LARCH_ABS_LO12: diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index b7ead5e61ab81..f0e2bc4855187 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -161,6 +161,13 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, case ELF::R_LARCH_B26: FixupKind = LoongArch::fixup_loongarch_b26; break; + case ELF::R_LARCH_MARK_LA: + // Match gas behavior: generate `R_LARCH_MARK_LA` relocation when using + // `la.abs`. + Fixups.push_back( + MCFixup::create(0, MCConstantExpr::create(0, Ctx), + FirstLiteralRelocationKind + ELF::R_LARCH_MARK_LA)); + [[fallthrough]]; case ELF::R_LARCH_ABS_HI20: FixupKind = LoongArch::fixup_loongarch_abs_hi20; break; diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s index a732988ef1f1a..8022d5b038880 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -26,6 +26,7 @@ la.abs $a0, sym_abs # ABS-NEXT: lu32i.d $a0, %abs64_lo20(sym_abs) # ABS-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_abs) # ABS-EMPTY: +# RELOC-NEXT: R_LARCH_MARK_LA - 0x0 # RELOC-NEXT: R_LARCH_ABS_HI20 sym_abs 0x0 # RELOC-NEXT: R_LARCH_ABS_LO12 sym_abs 0x0 # RELOC-NEXT: R_LARCH_ABS64_LO20 sym_abs 0x0 From 978644c29f40536909b4ecabb58468e25fe6d14d Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 29 Sep 2025 19:37:33 -0700 Subject: [PATCH 206/878] [llvm][mustache] Remove out parameters from processTags() (#159190) We can construct the return values directly and simplify the interface. 
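The interface change follows a common pattern: replace reference
out-parameters with an optional return value, so "no update" is expressed in
the type rather than by leaving the arguments untouched. A minimal sketch
(hypothetical `parseSetDelimiters()`, standing in for `processTag()`):

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <tuple>
#include <utility>

// Returns the new (open, close) delimiter pair, or std::nullopt when the
// spec is malformed and the caller should keep its current delimiters.
std::optional<std::pair<std::string, std::string>>
parseSetDelimiters(const std::string &Spec) {
  auto Space = Spec.find(' ');
  if (Space == std::string::npos)
    return std::nullopt;
  return std::make_pair(Spec.substr(0, Space), Spec.substr(Space + 1));
}

int main() {
  std::string Open = "{{", Close = "}}";
  if (auto NewDelims = parseSetDelimiters("<% %>"))
    std::tie(Open, Close) = *NewDelims;
  std::cout << Open << " ... " << Close << '\n'; // prints "<% ... %>"
}
```

The caller's `std::tie(Open, Close) = *NewDelims;` mirrors the hunk below.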
--- llvm/lib/Support/Mustache.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 274bd9eff0ffa..f52ad3bb2f10a 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -10,6 +10,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #define DEBUG_TYPE "mustache" @@ -364,14 +365,14 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, return Result; } -static void processTag(const Tag &T, SmallVectorImpl &Tokens, - SmallString<8> &Open, SmallString<8> &Close) { +static std::optional> +processTag(const Tag &T, SmallVectorImpl &Tokens) { LLVM_DEBUG(dbgs() << " Found tag: \"" << T.FullMatch << "\", Content: \"" << T.Content << "\"\n"); if (T.TagKind == Tag::Kind::Triple) { Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&'); LLVM_DEBUG(dbgs() << " Created UnescapeVariable token.\n"); - return; + return std::nullopt; } StringRef Interpolated = T.Content; std::string RawBody = T.FullMatch.str(); @@ -379,7 +380,7 @@ static void processTag(const Tag &T, SmallVectorImpl &Tokens, char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); Tokens.emplace_back(RawBody, Interpolated.str(), Front); LLVM_DEBUG(dbgs() << " Created tag token of type '" << Front << "'\n"); - return; + return std::nullopt; } Tokens.emplace_back(RawBody, Interpolated.str(), '='); StringRef DelimSpec = Interpolated.trim(); @@ -387,12 +388,10 @@ static void processTag(const Tag &T, SmallVectorImpl &Tokens, DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); DelimSpec = DelimSpec.trim(); - auto [NewOpen, NewClose] = DelimSpec.split(' '); - Open = NewOpen; - Close = NewClose; - - LLVM_DEBUG(dbgs() << " Found Set Delimiter tag. NewOpen='" << Open - << "', NewClose='" << Close << "'\n"); + std::pair Ret = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << " Found Set Delimiter tag. NewOpen='" << Ret.first + << "', NewClose='" << Ret.second << "'\n"); + return Ret; } // Simple tokenizer that splits the template into tokens. @@ -426,7 +425,9 @@ static SmallVector tokenize(StringRef Template) { LLVM_DEBUG(dbgs() << " Created Text token: \"" << Text << "\"\n"); } - processTag(T, Tokens, Open, Close); + if (auto NewDelims = processTag(T, Tokens)) { + std::tie(Open, Close) = *NewDelims; + } // Move past the tag. 
Start = T.StartPosition + T.FullMatch.size();

From eb1960c4812ca8ed4ef0e413f9b68178789c0f7a Mon Sep 17 00:00:00 2001
From: woruyu <1214539920@qq.com>
Date: Tue, 30 Sep 2025 10:45:20 +0800
Subject: [PATCH 207/878] [sanitizer] Handle nullptr name in prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME) (#160824)

### Summary
This PR resolves https://github.com/llvm/llvm-project/issues/160562
---
 .../lib/sanitizer_common/sanitizer_common_interceptors.inc   | 2 +-
 compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp  | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index a96d325d08983..b10ce7fa44afc 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -1326,7 +1326,7 @@ PRCTL_INTERCEPTOR(int, prctl, int option, unsigned long arg2,
   static const int PR_SET_SECCOMP = 22;
   static const int SECCOMP_MODE_FILTER = 2;
 # endif
-  if (option == PR_SET_VMA && arg2 == 0UL) {
+  if (option == PR_SET_VMA && arg2 == 0UL && arg5 != 0UL) {
     char *name = (char *)arg5;
     COMMON_INTERCEPTOR_READ_RANGE(ctx, name, internal_strlen(name) + 1);
   }
diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp
index dab1d1b48f868..afce9dc03dada 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp
@@ -88,5 +88,8 @@ int main() {
   res = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &pr);
   assert(res == -1);
 
+  unsigned long name = reinterpret_cast<unsigned long>(nullptr);
+  prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, 0, nullptr, name);
+
   return 0;
 }

From d96c32cdaf588903917a9e7db172729759e34c9d Mon Sep 17 00:00:00 2001
From: "Luo, Yuanke"
Date: Tue, 30 Sep 2025 10:48:14 +0800
Subject: [PATCH 208/878] [CUDA] Enable variadic argument support in front-end (#161305)

Variadic argument support for NVPTX was added in
https://github.com/llvm/llvm-project/commit/486d00eca6b6ab470e8324b52cdf9f32023c1c9a.
We can enable it in the front-end.

Co-authored-by: Yuanke Luo
---
 clang/lib/Sema/SemaExpr.cpp   | 5 ++---
 clang/test/SemaCUDA/vararg.cu | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 3302bfce193a2..06b2529011c74 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -16791,12 +16791,11 @@ ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc, Expr *E,
   Expr *OrigExpr = E;
   bool IsMS = false;
 
-  // CUDA device code does not support varargs.
+  // CUDA device global functions do not support varargs.
   if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) {
     if (const FunctionDecl *F = dyn_cast<FunctionDecl>(CurContext)) {
       CUDAFunctionTarget T = CUDA().IdentifyTarget(F);
-      if (T == CUDAFunctionTarget::Global || T == CUDAFunctionTarget::Device ||
-          T == CUDAFunctionTarget::HostDevice)
+      if (T == CUDAFunctionTarget::Global)
         return ExprError(Diag(E->getBeginLoc(), diag::err_va_arg_in_device));
     }
   }
diff --git a/clang/test/SemaCUDA/vararg.cu b/clang/test/SemaCUDA/vararg.cu
index 34ef367d89820..0238f42dc40a9 100644
--- a/clang/test/SemaCUDA/vararg.cu
+++ b/clang/test/SemaCUDA/vararg.cu
@@ -10,7 +10,7 @@
 #include <stdarg.h>
 #include "Inputs/cuda.h"
 
-__device__ void foo() {
+__global__ void foo() {
   va_list list;
   va_arg(list, int);
 #ifdef EXPECT_VA_ARG_ERR

From 2f9ae0b603c6d2ccd8e5792e6d16e99a33bd02ff Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Mon, 29 Sep 2025 20:33:41 -0700
Subject: [PATCH 209/878] ELF: Rename Relocations.cpp functions and rewrite the file-level comment. NFC

Pull Request: https://github.com/llvm/llvm-project/pull/161229
---
 lld/ELF/Relocations.cpp | 73 ++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 44 deletions(-)

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index bd96c051d160d..9d514fe258af6 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -6,37 +6,22 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains platform-independent functions to process relocations.
-// I'll describe the overview of this file here.
+// This file implements the core relocation processing logic. It analyzes
+// relocations and determines what auxiliary data structures (GOT, PLT, copy
+// relocations) need to be created during linking.
 //
-// Simple relocations are easy to handle for the linker. For example,
-// for R_X86_64_PC64 relocs, the linker just has to fix up locations
-// with the relative offsets to the target symbols. It would just be
-// reading records from relocation sections and applying them to output.
+// The main entry point is scanRelocations(), which calls scanSection()
+// to process all relocations within an input section. For each relocation,
+// scan() analyzes the type and target, and determines whether a synthetic
+// section entry or dynamic relocation is needed.
 //
-// But not all relocations are that easy to handle. For example, for
-// R_386_GOTOFF relocs, the linker has to create new GOT entries for
-// symbols if they don't exist, and fix up locations with GOT entry
-// offsets from the beginning of GOT section. So there is more than
-// fixing addresses in relocation processing.
-// -// - create GOT/PLT entries -// - create new relocations in .dynsym to let the dynamic linker resolve -// them at runtime (since ELF supports dynamic linking, not all -// relocations can be resolved at link-time) -// - create COPY relocs and reserve space in .bss -// - replace expensive relocs (in terms of runtime cost) with cheap ones -// - error out infeasible combinations such as PIC and non-relative relocs -// -// Note that the functions in this file don't actually apply relocations -// because it doesn't know about the output file nor the output file buffer. -// It instead stores Relocation objects to InputSection's Relocations -// vector to let it apply later in InputSection::writeTo. +// In addition, this file implements the core Thunk creation logic, called +// during finalizeAddressDependentContent(). // //===----------------------------------------------------------------------===// @@ -466,14 +451,14 @@ class RelocationScanner { int64_t computeMipsAddend(const RelTy &rel, RelExpr expr, bool isLocal) const; bool isStaticLinkTimeConstant(RelExpr e, RelType type, const Symbol &sym, uint64_t relOff) const; - void processAux(RelExpr expr, RelType type, uint64_t offset, Symbol &sym, - int64_t addend) const; + void process(RelExpr expr, RelType type, uint64_t offset, Symbol &sym, + int64_t addend) const; unsigned handleTlsRelocation(RelExpr expr, RelType type, uint64_t offset, Symbol &sym, int64_t addend); template - void scanOne(typename Relocs::const_iterator &i); - template void scan(Relocs rels); + void scan(typename Relocs::const_iterator &i); + template void scanSectionImpl(Relocs rels); }; } // namespace @@ -961,7 +946,7 @@ static bool canDefineSymbolInExecutable(Ctx &ctx, Symbol &sym) { } // Returns true if a given relocation can be computed at link-time. -// This only handles relocation types expected in processAux. +// This only handles relocation types expected in process(). // // For instance, we know the offset from a relocation to its target at // link-time if the relocation is PC-relative and refers a @@ -1052,8 +1037,8 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, // sections. Given that it is ro, we will need an extra PT_LOAD. This // complicates things for the dynamic linker and means we would have to reserve // space for the extra PT_LOAD even if we end up not using it. -void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, - Symbol &sym, int64_t addend) const { +void RelocationScanner::process(RelExpr expr, RelType type, uint64_t offset, + Symbol &sym, int64_t addend) const { // If non-ifunc non-preemptible, change PLT to direct call and optimize GOT // indirection. const bool isIfunc = sym.isGnuIFunc(); @@ -1493,7 +1478,7 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, } template -void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { +void RelocationScanner::scan(typename Relocs::const_iterator &i) { const RelTy &rel = *i; uint32_t symIndex = rel.getSymbol(ctx.arg.isMips64EL); Symbol &sym = sec->getFile()->getSymbol(symIndex); @@ -1587,7 +1572,7 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { } // Process TLS relocations, including TLS optimizations. Note that - // R_TPREL and R_TPREL_NEG relocations are resolved in processAux. + // R_TPREL and R_TPREL_NEG relocations are resolved in process(). // // Some RISCV TLSDESC relocations reference a local NOTYPE symbol, // but we need to process them in handleTlsRelocation. 
@@ -1599,7 +1584,7 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { } } - processAux(expr, type, offset, sym, addend); + process(expr, type, offset, sym, addend); } // R_PPC64_TLSGD/R_PPC64_TLSLD is required to mark `bl __tls_get_addr` for @@ -1642,7 +1627,7 @@ static void checkPPC64TLSRelax(InputSectionBase &sec, Relocs rels) { } template -void RelocationScanner::scan(Relocs rels) { +void RelocationScanner::scanSectionImpl(Relocs rels) { // Not all relocations end up in Sec->Relocations, but a lot do. sec->relocations.reserve(rels.size()); @@ -1660,12 +1645,12 @@ void RelocationScanner::scan(Relocs rels) { if constexpr (RelTy::IsCrel) { for (auto i = rels.begin(); i != rels.end();) - scanOne(i); + scan(i); } else { // The non-CREL code path has additional check for PPC64 TLS. end = static_cast(rels.end()); for (auto i = rels.begin(); i != end;) - scanOne(i); + scan(i); } // Sort relocations by offset for more efficient searching for @@ -1686,11 +1671,11 @@ void RelocationScanner::scanSection(InputSectionBase &s, bool isEH) { getter = OffsetGetter(s); const RelsOrRelas rels = s.template relsOrRelas(!isEH); if (rels.areRelocsCrel()) - scan(rels.crels); + scanSectionImpl(rels.crels); else if (rels.areRelocsRel()) - scan(rels.rels); + scanSectionImpl(rels.rels); else - scan(rels.relas); + scanSectionImpl(rels.relas); } template void elf::scanRelocations(Ctx &ctx) { From 3554c78d973477835b0b062a8aa16ecbc62446dd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 29 Sep 2025 20:39:49 -0700 Subject: [PATCH 210/878] ELF: Use preprocessed relocations for EhInputSection scanning .eh_frame sections require special sub-section processing, specifically, CIEs are de-duplicated and FDEs are garbage collected. Create a specialized scanEhSection() function utilizing the just-added EhInputSection::rels. OffsetGetter is moved to scanEhSection. This improves separation of concerns between InputSection and EhInputSection processing. This removes another `relsOrRelas` call using `supportsCrel=false`. DWARF.cpp now has the last call. Pull Request: https://github.com/llvm/llvm-project/pull/161091 --- lld/ELF/InputSection.cpp | 2 ++ lld/ELF/Relocations.cpp | 58 +++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 1270f27a8d96e..ff7ef2dce5c79 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1357,6 +1357,8 @@ SyntheticSection *EhInputSection::getParent() const { // .eh_frame is a sequence of CIE or FDE records. // This function splits an input section into records and returns them. +// In rare cases (.eh_frame pieces are reordered by a linker script), the +// relocations may be unordered. template void EhInputSection::split() { const RelsOrRelas elfRels = relsOrRelas(); if (elfRels.areRelocsCrel()) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 9d514fe258af6..84b9b5e983662 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -390,22 +390,17 @@ namespace { class OffsetGetter { public: OffsetGetter() = default; - explicit OffsetGetter(InputSectionBase &sec) { - if (auto *eh = dyn_cast(&sec)) { - cies = eh->cies; - fdes = eh->fdes; - i = cies.begin(); - j = fdes.begin(); - } + explicit OffsetGetter(EhInputSection &sec) { + cies = sec.cies; + fdes = sec.fdes; + i = cies.begin(); + j = fdes.begin(); } // Translates offsets in input sections to offsets in output sections. // Given offset must increase monotonically. 
We assume that Piece is // sorted by inputOff. uint64_t get(Ctx &ctx, uint64_t off) { - if (cies.empty()) - return off; - while (j != fdes.end() && j->inputOff <= off) ++j; auto it = j; @@ -435,13 +430,12 @@ class OffsetGetter { class RelocationScanner { public: RelocationScanner(Ctx &ctx) : ctx(ctx) {} - template - void scanSection(InputSectionBase &s, bool isEH = false); + template void scanSection(InputSectionBase &s); + template void scanEhSection(EhInputSection &s); private: Ctx &ctx; InputSectionBase *sec; - OffsetGetter getter; // End of relocations, used by Mips/PPC64. const void *end = nullptr; @@ -1496,9 +1490,7 @@ void RelocationScanner::scan(typename Relocs::const_iterator &i) { } } // Get an offset in an output section this relocation is applied to. - uint64_t offset = getter.get(ctx, rel.r_offset); - if (offset == uint64_t(-1)) - return; + uint64_t offset = rel.r_offset; RelExpr expr = ctx.target->getRelExpr(type, sym, sec->content().data() + offset); @@ -1634,13 +1626,10 @@ void RelocationScanner::scanSectionImpl(Relocs rels) { if (ctx.arg.emachine == EM_PPC64) checkPPC64TLSRelax(*sec, rels); - // For EhInputSection, OffsetGetter expects the relocations to be sorted by - // r_offset. In rare cases (.eh_frame pieces are reordered by a linker - // script), the relocations may be unordered. // On SystemZ, all sections need to be sorted by r_offset, to allow TLS // relaxation to be handled correctly - see SystemZ::getTlsGdRelaxSkip. SmallVector storage; - if (isa(sec) || ctx.arg.emachine == EM_S390) + if (ctx.arg.emachine == EM_S390) rels = sortRels(rels, storage); if constexpr (RelTy::IsCrel) { @@ -1665,11 +1654,9 @@ void RelocationScanner::scanSectionImpl(Relocs rels) { }); } -template -void RelocationScanner::scanSection(InputSectionBase &s, bool isEH) { +template void RelocationScanner::scanSection(InputSectionBase &s) { sec = &s; - getter = OffsetGetter(s); - const RelsOrRelas rels = s.template relsOrRelas(!isEH); + const RelsOrRelas rels = s.template relsOrRelas(); if (rels.areRelocsCrel()) scanSectionImpl(rels.crels); else if (rels.areRelocsRel()) @@ -1678,6 +1665,27 @@ void RelocationScanner::scanSection(InputSectionBase &s, bool isEH) { scanSectionImpl(rels.relas); } +template void RelocationScanner::scanEhSection(EhInputSection &s) { + sec = &s; + OffsetGetter getter(s); + auto rels = s.rels; + s.relocations.reserve(rels.size()); + for (auto &r : rels) { + // Ignore R_*_NONE and other marker relocations. + if (r.expr == R_NONE) + continue; + uint64_t offset = getter.get(ctx, r.offset); + // Skip if the relocation offset is within a dead piece. + if (offset == uint64_t(-1)) + continue; + Symbol *sym = r.sym; + if (sym->isUndefined() && + maybeReportUndefined(ctx, cast(*sym), *sec, offset)) + continue; + process(r.expr, r.type, offset, *sym, r.addend); + } +} + template void elf::scanRelocations(Ctx &ctx) { // Scan all relocations. 
Each relocation goes through a series of tests to // determine if it needs special treatment, such as creating GOT, PLT, @@ -1710,7 +1718,7 @@ template void elf::scanRelocations(Ctx &ctx) { RelocationScanner scanner(ctx); for (Partition &part : ctx.partitions) { for (EhInputSection *sec : part.ehFrame->sections) - scanner.template scanSection(*sec, /*isEH=*/true); + scanner.template scanEhSection(*sec); if (part.armExidx && part.armExidx->isLive()) for (InputSection *sec : part.armExidx->exidxSections) if (sec->isLive()) From aa42b6455ac1836f4f08c67f1f0870cb178dc02a Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Mon, 29 Sep 2025 20:40:11 -0700 Subject: [PATCH 211/878] [llvm][mustache] Introduce MustacheContext to simplify mustache APIs (#159191) --- llvm/include/llvm/Support/Mustache.h | 13 +- llvm/lib/Support/Mustache.cpp | 181 ++++++++++----------------- 2 files changed, 76 insertions(+), 118 deletions(-) diff --git a/llvm/include/llvm/Support/Mustache.h b/llvm/include/llvm/Support/Mustache.h index 781ec557950ec..ee9f40638fd12 100644 --- a/llvm/include/llvm/Support/Mustache.h +++ b/llvm/include/llvm/Support/Mustache.h @@ -85,6 +85,14 @@ using SectionLambda = std::function; class ASTNode; using AstPtr = std::unique_ptr; +using EscapeMap = DenseMap; + +struct MustacheContext { + StringMap Partials; + StringMap Lambdas; + StringMap SectionLambdas; + EscapeMap Escapes; +}; // A Template represents the container for the AST and the partials // and Lambdas that are registered with it. @@ -118,10 +126,7 @@ class Template { LLVM_ABI void overrideEscapeCharacters(DenseMap Escapes); private: - StringMap Partials; - StringMap Lambdas; - StringMap SectionLambdas; - DenseMap Escapes; + MustacheContext Ctx; AstPtr Tree; }; } // namespace llvm::mustache diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index f52ad3bb2f10a..646d7a0ff9c0e 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -138,26 +138,17 @@ class ASTNode { InvertSection, }; - ASTNode(llvm::StringMap &Partials, llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, EscapeMap &Escapes) - : Partials(Partials), Lambdas(Lambdas), SectionLambdas(SectionLambdas), - Escapes(Escapes), Ty(Type::Root), Parent(nullptr), - ParentContext(nullptr) {} + ASTNode(MustacheContext &Ctx) + : Ctx(Ctx), Ty(Type::Root), Parent(nullptr), ParentContext(nullptr) {} - ASTNode(std::string Body, ASTNode *Parent, llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, EscapeMap &Escapes) - : Partials(Partials), Lambdas(Lambdas), SectionLambdas(SectionLambdas), - Escapes(Escapes), Ty(Type::Text), Body(std::move(Body)), Parent(Parent), + ASTNode(MustacheContext &Ctx, std::string Body, ASTNode *Parent) + : Ctx(Ctx), Ty(Type::Text), Body(std::move(Body)), Parent(Parent), ParentContext(nullptr) {} // Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes - ASTNode(Type Ty, Accessor Accessor, ASTNode *Parent, - llvm::StringMap &Partials, llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, EscapeMap &Escapes) - : Partials(Partials), Lambdas(Lambdas), SectionLambdas(SectionLambdas), - Escapes(Escapes), Ty(Ty), Parent(Parent), - AccessorValue(std::move(Accessor)), ParentContext(nullptr) {} + ASTNode(MustacheContext &Ctx, Type Ty, Accessor Accessor, ASTNode *Parent) + : Ctx(Ctx), Ty(Ty), Parent(Parent), AccessorValue(std::move(Accessor)), + ParentContext(nullptr) {} void addChild(AstPtr Child) { Children.emplace_back(std::move(Child)); }; @@ -189,10 +180,7 @@ 
class ASTNode { void renderSection(const json::Value &CurrentCtx, raw_ostream &OS); void renderInvertSection(const json::Value &CurrentCtx, raw_ostream &OS); - StringMap &Partials; - StringMap &Lambdas; - StringMap &SectionLambdas; - EscapeMap &Escapes; + MustacheContext &Ctx; Type Ty; size_t Indentation = 0; std::string RawBody; @@ -205,29 +193,18 @@ class ASTNode { }; // A wrapper for arena allocator for ASTNodes -static AstPtr createRootNode(llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes) { - return std::make_unique(Partials, Lambdas, SectionLambdas, Escapes); +static AstPtr createRootNode(MustacheContext &Ctx) { + return std::make_unique(Ctx); } -static AstPtr createNode(ASTNode::Type T, Accessor A, ASTNode *Parent, - llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes) { - return std::make_unique(T, std::move(A), Parent, Partials, Lambdas, - SectionLambdas, Escapes); +static AstPtr createNode(MustacheContext &Ctx, ASTNode::Type T, Accessor A, + ASTNode *Parent) { + return std::make_unique(Ctx, T, std::move(A), Parent); } -static AstPtr createTextNode(std::string Body, ASTNode *Parent, - llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes) { - return std::make_unique(std::move(Body), Parent, Partials, Lambdas, - SectionLambdas, Escapes); +static AstPtr createTextNode(MustacheContext &Ctx, std::string Body, + ASTNode *Parent) { + return std::make_unique(Ctx, std::move(Body), Parent); } // Function to check if there is meaningful text behind. @@ -556,39 +533,26 @@ class AddIndentationStringStream : public raw_ostream { class Parser { public: - Parser(StringRef TemplateStr) : TemplateStr(TemplateStr) {} + Parser(StringRef TemplateStr, MustacheContext &Ctx) + : Ctx(Ctx), TemplateStr(TemplateStr) {} - AstPtr parse(llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes); + AstPtr parse(); private: - void parseMustache(ASTNode *Parent, llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes); - - void parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A, - llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes); + void parseMustache(ASTNode *Parent); + void parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A); + MustacheContext &Ctx; SmallVector Tokens; size_t CurrentPtr; StringRef TemplateStr; }; -void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A, - llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap &Escapes) { - AstPtr CurrentNode = - createNode(Ty, A, Parent, Partials, Lambdas, SectionLambdas, Escapes); +void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, + const Accessor &A) { + AstPtr CurrentNode = createNode(Ctx, Ty, A, Parent); size_t Start = CurrentPtr; - parseMustache(CurrentNode.get(), Partials, Lambdas, SectionLambdas, Escapes); + parseMustache(CurrentNode.get()); const size_t End = CurrentPtr - 1; std::string RawBody; for (std::size_t I = Start; I < End; I++) @@ -597,21 +561,15 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, const Accessor &A, Parent->addChild(std::move(CurrentNode)); } -AstPtr Parser::parse(llvm::StringMap &Partials, - llvm::StringMap &Lambdas, - llvm::StringMap &SectionLambdas, - EscapeMap 
+AstPtr Parser::parse() {
   Tokens = tokenize(TemplateStr);
   CurrentPtr = 0;
-  AstPtr RootNode = createRootNode(Partials, Lambdas, SectionLambdas, Escapes);
-  parseMustache(RootNode.get(), Partials, Lambdas, SectionLambdas, Escapes);
+  AstPtr RootNode = createRootNode(Ctx);
+  parseMustache(RootNode.get());
   return RootNode;
 }
 
-void Parser::parseMustache(ASTNode *Parent, llvm::StringMap<AstPtr> &Partials,
-                           llvm::StringMap<Lambda> &Lambdas,
-                           llvm::StringMap<SectionLambda> &SectionLambdas,
-                           EscapeMap &Escapes) {
+void Parser::parseMustache(ASTNode *Parent) {
   while (CurrentPtr < Tokens.size()) {
     Token CurrentToken = Tokens[CurrentPtr];
 
@@ -621,38 +579,34 @@ void Parser::parseMustache(ASTNode *Parent, llvm::StringMap<AstPtr> &Partials,
     switch (CurrentToken.getType()) {
     case Token::Type::Text: {
-      CurrentNode = createTextNode(std::move(CurrentToken.TokenBody), Parent,
-                                   Partials, Lambdas, SectionLambdas, Escapes);
+      CurrentNode =
+          createTextNode(Ctx, std::move(CurrentToken.TokenBody), Parent);
       Parent->addChild(std::move(CurrentNode));
       break;
     }
     case Token::Type::Variable: {
-      CurrentNode = createNode(ASTNode::Variable, std::move(A), Parent,
-                               Partials, Lambdas, SectionLambdas, Escapes);
+      CurrentNode = createNode(Ctx, ASTNode::Variable, std::move(A), Parent);
       Parent->addChild(std::move(CurrentNode));
       break;
     }
     case Token::Type::UnescapeVariable: {
-      CurrentNode = createNode(ASTNode::UnescapeVariable, std::move(A), Parent,
-                               Partials, Lambdas, SectionLambdas, Escapes);
+      CurrentNode =
+          createNode(Ctx, ASTNode::UnescapeVariable, std::move(A), Parent);
       Parent->addChild(std::move(CurrentNode));
       break;
    }
    case Token::Type::Partial: {
-      CurrentNode = createNode(ASTNode::Partial, std::move(A), Parent, Partials,
-                               Lambdas, SectionLambdas, Escapes);
+      CurrentNode = createNode(Ctx, ASTNode::Partial, std::move(A), Parent);
      CurrentNode->setIndentation(CurrentToken.getIndentation());
      Parent->addChild(std::move(CurrentNode));
      break;
    }
    case Token::Type::SectionOpen: {
-      parseSection(Parent, ASTNode::Section, A, Partials, Lambdas,
-                   SectionLambdas, Escapes);
+      parseSection(Parent, ASTNode::Section, A);
      break;
    }
    case Token::Type::InvertSectionOpen: {
-      parseSection(Parent, ASTNode::InvertSection, A, Partials, Lambdas,
-                   SectionLambdas, Escapes);
+      parseSection(Parent, ASTNode::InvertSection, A);
      break;
    }
    case Token::Type::Comment:
@@ -702,25 +656,25 @@ void ASTNode::renderRoot(const json::Value &CurrentCtx, raw_ostream &OS) {
 void ASTNode::renderText(raw_ostream &OS) { OS << Body; }
 
 void ASTNode::renderPartial(const json::Value &CurrentCtx, raw_ostream &OS) {
-  auto Partial = Partials.find(AccessorValue[0]);
-  if (Partial != Partials.end())
+  auto Partial = Ctx.Partials.find(AccessorValue[0]);
+  if (Partial != Ctx.Partials.end())
     renderPartial(CurrentCtx, OS, Partial->getValue().get());
 }
 
 void ASTNode::renderVariable(const json::Value &CurrentCtx, raw_ostream &OS) {
-  auto Lambda = Lambdas.find(AccessorValue[0]);
-  if (Lambda != Lambdas.end()) {
+  auto Lambda = Ctx.Lambdas.find(AccessorValue[0]);
+  if (Lambda != Ctx.Lambdas.end()) {
     renderLambdas(CurrentCtx, OS, Lambda->getValue());
   } else if (const json::Value *ContextPtr = findContext()) {
-    EscapeStringStream ES(OS, Escapes);
+    EscapeStringStream ES(OS, Ctx.Escapes);
     toMustacheString(*ContextPtr, ES);
   }
 }
 
 void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx,
                                      raw_ostream &OS) {
-  auto Lambda = Lambdas.find(AccessorValue[0]);
-  if (Lambda != Lambdas.end()) {
+  auto Lambda = Ctx.Lambdas.find(AccessorValue[0]);
+  if (Lambda != Ctx.Lambdas.end()) {
     renderLambdas(CurrentCtx, OS, Lambda->getValue());
  } else if (const json::Value *ContextPtr = findContext()) {
     toMustacheString(*ContextPtr, OS);
@@ -728,8 +682,8 @@ void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx,
 }
 
 void ASTNode::renderSection(const json::Value &CurrentCtx, raw_ostream &OS) {
-  auto SectionLambda = SectionLambdas.find(AccessorValue[0]);
-  if (SectionLambda != SectionLambdas.end()) {
+  auto SectionLambda = Ctx.SectionLambdas.find(AccessorValue[0]);
+  if (SectionLambda != Ctx.SectionLambdas.end()) {
     renderSectionLambdas(CurrentCtx, OS, SectionLambda->getValue());
     return;
   }
@@ -748,7 +702,7 @@ void ASTNode::renderSection(const json::Value &CurrentCtx, raw_ostream &OS) {
 
 void ASTNode::renderInvertSection(const json::Value &CurrentCtx,
                                   raw_ostream &OS) {
-  bool IsLambda = SectionLambdas.contains(AccessorValue[0]);
+  bool IsLambda = Ctx.SectionLambdas.contains(AccessorValue[0]);
   const json::Value *ContextPtr = findContext();
   if (isContextFalsey(ContextPtr) && !IsLambda) {
     renderChild(CurrentCtx, OS);
@@ -844,10 +798,10 @@ void ASTNode::renderLambdas(const json::Value &Contexts, llvm::raw_ostream &OS,
   std::string LambdaStr;
   raw_string_ostream Output(LambdaStr);
   toMustacheString(LambdaResult, Output);
-  Parser P = Parser(LambdaStr);
-  AstPtr LambdaNode = P.parse(Partials, Lambdas, SectionLambdas, Escapes);
+  Parser P(LambdaStr, Ctx);
+  AstPtr LambdaNode = P.parse();
 
-  EscapeStringStream ES(OS, Escapes);
+  EscapeStringStream ES(OS, Ctx.Escapes);
   if (Ty == Variable) {
     LambdaNode->render(Contexts, ES);
     return;
@@ -863,8 +817,8 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts,
   std::string LambdaStr;
   raw_string_ostream Output(LambdaStr);
   toMustacheString(Return, Output);
-  Parser P = Parser(LambdaStr);
-  AstPtr LambdaNode = P.parse(Partials, Lambdas, SectionLambdas, Escapes);
+  Parser P(LambdaStr, Ctx);
+  AstPtr LambdaNode = P.parse();
   LambdaNode->render(Contexts, OS);
 }
 
@@ -873,22 +827,26 @@ void Template::render(const json::Value &Data, llvm::raw_ostream &OS) {
 }
 
 void Template::registerPartial(std::string Name, std::string Partial) {
-  Parser P = Parser(Partial);
-  AstPtr PartialTree = P.parse(Partials, Lambdas, SectionLambdas, Escapes);
-  Partials.insert(std::make_pair(Name, std::move(PartialTree)));
+  Parser P(Partial, Ctx);
+  AstPtr PartialTree = P.parse();
+  Ctx.Partials.insert(std::make_pair(Name, std::move(PartialTree)));
 }
 
-void Template::registerLambda(std::string Name, Lambda L) { Lambdas[Name] = L; }
+void Template::registerLambda(std::string Name, Lambda L) {
+  Ctx.Lambdas[Name] = L;
+}
 
 void Template::registerLambda(std::string Name, SectionLambda L) {
-  SectionLambdas[Name] = L;
+  Ctx.SectionLambdas[Name] = L;
 }
 
-void Template::overrideEscapeCharacters(EscapeMap E) { Escapes = std::move(E); }
+void Template::overrideEscapeCharacters(EscapeMap E) {
+  Ctx.Escapes = std::move(E);
+}
 
 Template::Template(StringRef TemplateStr) {
-  Parser P = Parser(TemplateStr);
-  Tree = P.parse(Partials, Lambdas, SectionLambdas, Escapes);
+  Parser P(TemplateStr, Ctx);
+  Tree = P.parse();
   // The default behavior is to escape html entities.
  const EscapeMap HtmlEntities = {{'&', "&amp;"},
                                  {'<', "&lt;"},
@@ -899,18 +857,13 @@ Template::Template(StringRef TemplateStr) {
 }
 
 Template::Template(Template &&Other) noexcept
-    : Partials(std::move(Other.Partials)), Lambdas(std::move(Other.Lambdas)),
-      SectionLambdas(std::move(Other.SectionLambdas)),
-      Escapes(std::move(Other.Escapes)), Tree(std::move(Other.Tree)) {}
+    : Ctx(std::move(Other.Ctx)), Tree(std::move(Other.Tree)) {}
 
 Template::~Template() = default;
 
 Template &Template::operator=(Template &&Other) noexcept {
   if (this != &Other) {
-    Partials = std::move(Other.Partials);
-    Lambdas = std::move(Other.Lambdas);
-    SectionLambdas = std::move(Other.SectionLambdas);
-    Escapes = std::move(Other.Escapes);
+    Ctx = std::move(Other.Ctx);
     Tree = std::move(Other.Tree);
     Other.Tree = nullptr;
   }

From c6d3b517ee41c05d99aee6b90a862d6121116d1d Mon Sep 17 00:00:00 2001
From: paperchalice
Date: Tue, 30 Sep 2025 11:44:34 +0800
Subject: [PATCH 212/878] [DAGCombiner] Remove most `NoSignedZerosFPMath` uses (#161180)

The two remaining uses are related to fneg and foldFPToIntToFP; some
AMDGPU tests are duplicated and regenerated.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   13 +-
 llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll   | 1421 +++++++++--------
 llvm/test/CodeGen/AMDGPU/fmax_legacy.ll       |  226 ++-
 .../CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll |  281 +++-
 llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll   | 1421 +++++++++--------
 llvm/test/CodeGen/AMDGPU/fmin_legacy.ll       |  302 +++-
 llvm/test/CodeGen/NVPTX/bug22322.ll           |    6 +-
 llvm/test/CodeGen/PowerPC/scalar-min-max.ll   |  232 ++-
 llvm/test/CodeGen/VE/Scalar/max.ll            |  176 +-
 llvm/test/CodeGen/VE/Scalar/min.ll            |  178 +--
 10 files changed, 2310 insertions(+), 1946 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 77df4b4598c48..204e1f0c75e00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11849,9 +11847,7 @@ static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
   if (!VT.isFloatingPoint())
     return false;
 
-  const TargetOptions &Options = DAG.getTarget().Options;
-
-  return (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) &&
+  return Flags.hasNoSignedZeros() &&
          TLI.isProfitableToCombineMinNumMaxNum(VT) &&
          (Flags.hasNoNaNs() ||
           (DAG.isKnownNeverNaN(RHS) && DAG.isKnownNeverNaN(LHS)));
@@ -17351,7 +17349,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   // Always prefer FMAD to FMA for precision.
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
-  bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
+  bool NoSignedZero = Flags.hasNoSignedZeros();
 
   // Is the node an FMUL and contractable either due to global flags or
   // SDNodeFlags.
@@ -18327,11 +18325,9 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
     return matcher.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2);
   }
 
-  // FIXME: use fast math flags instead of Options.UnsafeFPMath
-  // TODO: Finally migrate away from global TargetOptions.
if ((Options.NoNaNsFPMath && Options.NoInfsFPMath) || (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs())) { - if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros() || + if (N->getFlags().hasNoSignedZeros() || (N2CFP && !N2CFP->isExactlyValue(-0.0))) { if (N0CFP && N0CFP->isZero()) return N2; @@ -18636,8 +18632,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } // Fold X/Sqrt(X) -> Sqrt(X) - if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && - Flags.hasAllowReassociation()) + if (Flags.hasNoSignedZeros() && Flags.hasAllowReassociation()) if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0)) return N1; diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll index ed48999e6d1e7..bd28f72bb8913 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll @@ -1,734 +1,759 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; 
VI-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-TRUE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-NNAN-TRUE16: ; %bb.0: -; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l -; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-FAKE16-LABEL: test_fmax_legacy_ugt_f16: -; GFX11-NNAN-FAKE16: ; %bb.0: -; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_f16: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val } +define half @test_fmax_legacy_ugt_f16_fast(half %a, half %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_f16_fast: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: 
test_fmax_legacy_ugt_v2f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v2, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v3, v1 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v2 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v1.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v1 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, 
v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_max_legacy_f32_e32 v0, v2, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val } +define <2 x half> @test_fmax_legacy_ugt_v2f16_fast(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-NEXT: v_max_f32_e32 v1, v1, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v2f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX9-SAFE: ; %bb.0: -; 
GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v3, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v4, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v5, v2 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v3 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v4 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v5 -; 
SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 
v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_max_legacy_f32_e32 v0, v3, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v4, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val } +define <3 x half> @test_fmax_legacy_ugt_v3f16_fast(<3 x half> %a, <3 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v4 +; SI-NEXT: v_max_f32_e32 v2, v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v3f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v1, 
v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <3 x half> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x half> %a, <3 x half> %b + ret <3 x half> %val +} + define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: 
v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.h, v3.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, 
v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_max_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_max_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 
v0.h, v2.h, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val } +define <4 x half> @test_fmax_legacy_ugt_v4f16_fast(<4 x half> %a, <4 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v1, v1, v3 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v4 +; SI-NEXT: v_max_f32_e32 v1, v1, v5 +; SI-NEXT: v_max_f32_e32 v2, v2, v6 +; SI-NEXT: v_max_f32_e32 v3, v3, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v4f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + 
ret <4 x half> %val +} + define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX9-NNAN-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX9-NNAN-NEXT: v_pk_max_f16 v2, v2, v6 -; GFX9-NNAN-NEXT: v_pk_max_f16 v3, v3, v7 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SAFE-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: 
v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_max_f16_e32 v3, v3, v7 -; VI-NNAN-NEXT: v_max_f16_e32 v2, v2, v6 -; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5 -; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v0, v8, v0 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_max_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; 
SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v8 -; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_max_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_max_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_max_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v4.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v1.h, v5.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v2.h, v6.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v3.h, v7.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s3, v0.l, v4.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 
v9, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_max_f16 v0, v0, v4 -; GFX11-NNAN-NEXT: v_pk_max_f16 v1, v1, v5 -; GFX11-NNAN-NEXT: v_pk_max_f16 v2, v2, v6 -; GFX11-NNAN-NEXT: v_pk_max_f16 v3, v3, v7 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v14, v3, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v15, v14 +; VI-NEXT: 
v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v13, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v11, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v2, v6 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_cmp_nle_f16_e32 vcc, v0, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_max_legacy_f32_e32 v0, v8, v0 +; SI-NEXT: v_max_legacy_f32_e32 v1, v9, v1 +; SI-NEXT: v_max_legacy_f32_e32 v2, v10, v2 +; SI-NEXT: v_max_legacy_f32_e32 v3, v11, v3 +; SI-NEXT: v_max_legacy_f32_e32 v4, v12, v4 +; SI-NEXT: v_max_legacy_f32_e32 v5, v13, v5 +; SI-NEXT: v_max_legacy_f32_e32 v6, v14, v6 +; SI-NEXT: v_max_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v2.h, v6.h +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v3.h, v7.h +; GFX11-TRUE16-NEXT: 
v_cmp_nle_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ugt <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val } +define <8 x half> @test_fmax_legacy_ugt_v8f16_fast(<8 x half> %a, <8 x half> %b) #0 { +; GFX9-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; 
VI-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_f16_e32 v3, v3, v7 +; VI-NEXT: v_max_f16_e32 v2, v2, v6 +; VI-NEXT: v_max_f16_e32 v1, v1, v5 +; VI-NEXT: v_max_f16_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_max_f32_e32 v0, v0, v8 +; SI-NEXT: v_max_f32_e32 v1, v1, v9 +; SI-NEXT: v_max_f32_e32 v2, v2, v10 +; SI-NEXT: v_max_f32_e32 v3, v3, v11 +; SI-NEXT: v_max_f32_e32 v4, v4, v12 +; SI-NEXT: v_max_f32_e32 v5, v5, v13 +; SI-NEXT: v_max_f32_e32 v6, v6, v14 +; SI-NEXT: v_max_f32_e32 v7, v7, v15 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmax_legacy_ugt_v8f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_max_f16 v2, v2, v6 +; GFX11-NEXT: v_pk_max_f16 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ugt <8 x half> %a, %b + %val = select nnan nsz <8 x i1> %cmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %val +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll index eee2bd1b3725d..f3a84e6e45260 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -1,8 +1,6 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,FUNC %s ; 
RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s @@ -12,12 +10,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] - -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_uge_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -34,18 +30,38 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_uge_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp uge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]] ; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]] +; VI: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -64,16 +80,40 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(ptr addrspace(1) %o ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]] +; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] + +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + + %cmp = fcmp uge float %a.nnan, %b.nnan + %val = select nnan nsz i1 %cmp, float %a.nnan, float %b.nnan + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: 
{{^}}test_fmax_legacy_oge_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; VI: v_cmp_ge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_oge_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -89,17 +129,35 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32: +; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_oge_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp oge float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nle_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ugt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -115,16 +173,35 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(ptr addrspace(1) %out, ptr a ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ugt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp ugt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] +; VI: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 
v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -140,17 +217,35 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ogt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp ogt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI: v_cmp_gt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; EG: MAX define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -166,23 +261,39 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; EG: MAX +define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <1 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <1 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <1 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ogt <1 x float> %a, %b + %val = select nnan nsz <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32: -; SI-SAFE: v_max_legacy_f32_e32 -; SI-SAFE: v_max_legacy_f32_e32 -; SI-SAFE: v_max_legacy_f32_e32 - -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_gt_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE-NOT: v_cmp -; VI-SAFE-NOT: v_cndmask - -; GCN-NONAN: v_max_f32_e32 -; GCN-NONAN: v_max_f32_e32 -; GCN-NONAN: v_max_f32_e32 +; SI: v_max_legacy_f32_e32 +; SI: v_max_legacy_f32_e32 +; SI: v_max_legacy_f32_e32 + +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_gt_f32_e32 +; VI: v_cndmask_b32_e32 +; VI-NOT: v_cmp +; VI-NOT: v_cndmask ; GCN-NOT: v_max define amdgpu_kernel void 
@test_fmax_legacy_ogt_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { @@ -199,6 +310,27 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32_fast: + +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 + +; GCN-NOT: v_max +define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <3 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load <3 x float>, ptr addrspace(1) %gep.0 + %b = load <3 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ogt <3 x float> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll index 2ac5891773d73..37f077d53cf94 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll @@ -1,16 +1,12 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,VI-NNAN %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s ; GCN-LABEL: {{^}}min_fneg_select_regression_0: ; GCN-NOT: v_mul -; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 - -; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc +; VI: v_cmp_nle_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, 1.0 @@ -18,15 +14,23 @@ define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}min_fneg_select_regression_0_fast: +; GCN-NOT: v_mul + +define amdgpu_ps float @min_fneg_select_regression_0_fast(float %a, float %b) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ult float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0: ; GCN-NOT: v_mul ; SI: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +; VI: v_cmp_nle_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ult float %a, -1.0 @@ -34,15 +38,24 @@ define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 ret float %min.a } -; GCN-LABEL: 
{{^}}max_fneg_select_regression_0: +; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0_fast: ; GCN-NOT: v_mul -; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 +; VI: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0 +define amdgpu_ps float @min_fneg_select_regression_posk_0_fast(float %a, float %b) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ult float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + +; GCN-LABEL: {{^}}max_fneg_select_regression_0: +; GCN-NOT: v_mul -; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc +; SI: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0 -; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 +; VI: v_cmp_nge_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, 1.0 @@ -50,15 +63,24 @@ define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 { ret float %min.a } -; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0: +; GCN-LABEL: {{^}}max_fneg_select_regression_0_fast: ; GCN-NOT: v_mul -; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 +; GCN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0 +define amdgpu_ps float @max_fneg_select_regression_0_fast(float %a) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ugt float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + +; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0: +; GCN-NOT: v_mul -; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc +; SI: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0 -; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 +; VI: v_cmp_nge_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { %fneg.a = fsub float -0.0, %a %cmp.a = fcmp ugt float %a, -1.0 @@ -66,13 +88,22 @@ define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 { ret float %min.a } +; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0_fast: +; GCN-NOT: v_mul + +; GCN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0 +define amdgpu_ps float @max_fneg_select_regression_posk_0_fast(float %a) #0 { + %fneg.a = fsub float -0.0, %a + %cmp.a = fcmp ugt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1: ; SI: v_min_legacy_f32_e64 v0, 1.0, -v0 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc - -; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0 +; VI: v_cmp_nge_f32_e32 vcc, -1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, -1.0 @@ -80,13 +111,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg1_fast: + +; VI: v_min_f32_e64 v0, -v0, 1.0 +define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ugt float %a, -1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1: ; SI: v_max_legacy_f32_e64 v0, 1.0, -v0 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0 -; VI-SAFE-NEXT: 
v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-
-; VI-NNAN: v_max_f32_e64 v0, -v0, 1.0
+; VI: v_cmp_nle_f32_e32 vcc, -1.0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
 define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0 {
   %fneg.a = fneg float %a
   %cmp.a = fcmp ult float %a, -1.0
@@ -94,13 +133,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1(float %a, float %b) #0
   ret float %min.a
 }
 
+; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg1_fast:
+
+; VI: v_max_f32_e64 v0, -v0, 1.0
+define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg1_fast(float %a, float %b) #0 {
+  %fneg.a = fneg float %a
+  %cmp.a = fcmp ult float %a, -1.0
+  %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0
+  ret float %min.a
+}
+
 ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1:
 ; SI: v_min_legacy_f32_e64 v0, -v0, 1.0
 
-; VI-SAFE: v_cmp_lt_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-
-; VI-NNAN: v_min_f32_e64 v0, -v0, 1.0
+; VI: v_cmp_lt_f32_e32 vcc, -1.0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
 define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0 {
   %fneg.a = fneg float %a
   %cmp.a = fcmp ogt float %a, -1.0
@@ -108,13 +155,21 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1(float %a, float %b) #0
   ret float %min.a
 }
 
+; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg1_fast:
+
+; VI: v_min_f32_e64 v0, -v0, 1.0
+define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg1_fast(float %a, float %b) #0 {
+  %fneg.a = fneg float %a
+  %cmp.a = fcmp ogt float %a, -1.0
+  %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0
+  ret float %min.a
+}
+
 ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1:
 ; SI: v_max_legacy_f32_e64 v0, -v0, 1.0
 
-; VI-SAFE: v_cmp_gt_f32_e32 vcc, -1.0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
-
-; VI-NANN: v_max_f32_e64 v0, -v0, 1.0
+; VI: v_cmp_gt_f32_e32 vcc, -1.0, v0
+; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, -v0, vcc
 define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0 {
   %fneg.a = fneg float %a
   %cmp.a = fcmp olt float %a, -1.0
@@ -122,17 +177,24 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1(float %a, float %b) #0
   ret float %min.a
 }
 
+; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg1_fast:
+
+; VI: v_max_f32_e64 v0, -v0, 1.0
+define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg1_fast(float %a, float %b) #0 {
+  %fneg.a = fneg float %a
+  %cmp.a = fcmp olt float %a, -1.0
+  %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 1.0
+  ret float %min.a
+}
+
 ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8:
 ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000
 ; SI-NEXT: v_min_legacy_f32_e64 v0, [[K]], -v0
 
-; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
-; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
-; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[K0]], v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
-
-; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000
-; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]]
+; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000
+; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000
+; VI: v_cmp_nge_f32_e32 vcc, [[K0]], v0
+; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc
 define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0 {
   %fneg.a = fneg float %a
   %cmp.a = fcmp ugt float %a, -8.0
@@ -140,17 +202,25 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8(float %a, float %b) #0
   ret float %min.a
 }
 
+; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ugt_a_neg8_fast:
+
+; VI: 
s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_min_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ugt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ugt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_max_legacy_f32_e64 v0, [[K]], -v0 -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_nle_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, -8.0 @@ -158,17 +228,25 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ult_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_max_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ult_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ult float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_min_legacy_f32_e64 v0, -v0, [[K]] -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_min_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_lt_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ogt float %a, -8.0 @@ -176,18 +254,26 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_ogt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_min_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_ogt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp ogt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 ; SI-NEXT: v_max_legacy_f32_e64 v0, -v0, [[K]] -; VI-SAFE-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 -; VI-SAFE-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[K0]], v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc - -; VI-NNAN: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; VI-NNAN-NEXT: v_max_f32_e64 v0, -v0, [[K]] +; VI-DAG: s_mov_b32 [[K0:s[0-9]+]], 0xc1000000 +; VI-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41000000 +; VI: v_cmp_gt_f32_e32 vcc, [[K0]], v0 +; VI-NEXT: v_cndmask_b32_e64 v0, [[K1]], -v0, vcc define amdgpu_ps float 
@select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, -8.0 @@ -195,13 +281,22 @@ define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_q_cmp_olt_a_neg8_fast: + +; VI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 +; VI-NEXT: v_max_f32_e64 v0, -v0, [[K]] +define amdgpu_ps float @select_fneg_a_or_q_cmp_olt_a_neg8_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, -8.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float 8.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1: ; SI: v_max_legacy_f32_e64 v0, -v0, -1.0 -; VI-SAFE: v_cmp_gt_f32_e32 vcc, 1.0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc - -; VI-NNAN: v_max_f32_e64 v0, -v0, -1.0 +; VI: v_cmp_gt_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -1.0, -v0, vcc define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp olt float %a, 1.0 @@ -209,15 +304,22 @@ define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1(float %a, float %b) #0 ret float %min.a } +; GCN-LABEL: {{^}}select_fneg_a_or_neg1_cmp_olt_a_1_fast: + +; VI: v_max_f32_e64 v0, -v0, -1.0 +define amdgpu_ps float @select_fneg_a_or_neg1_cmp_olt_a_1_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp olt float %a, 1.0 + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float -1.0 + ret float %min.a +} + ; GCN-LABEL: {{^}}ult_a_select_fneg_a_b: ; SI: v_cmp_nge_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; VI-SAFE: v_cmp_nge_f32_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - -; VI-NNAN: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI: v_cmp_nge_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ult float %a, %b @@ -225,15 +327,23 @@ define amdgpu_ps float @ult_a_select_fneg_a_b(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}ult_a_select_fneg_a_b_fast: + +; VI: v_cmp_lt_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +define amdgpu_ps float @ult_a_select_fneg_a_b_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp nnan nsz ult float %a, %b + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float %b + ret float %min.a +} + ; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b: ; SI: v_cmp_nle_f32_e32 vcc, v0, v1 ; SI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc -; VI-SAFE: v_cmp_nle_f32_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc - -; VI-NNAN: v_cmp_gt_f32_e32 vcc, v0, v1 -; VI-NNAN-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +; VI: v_cmp_nle_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { %fneg.a = fneg float %a %cmp.a = fcmp ugt float %a, %b @@ -241,5 +351,16 @@ define amdgpu_ps float @ugt_a_select_fneg_a_b(float %a, float %b) #0 { ret float %min.a } +; GCN-LABEL: {{^}}ugt_a_select_fneg_a_b_fast: + +; VI: v_cmp_gt_f32_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, -v0, vcc +define amdgpu_ps float @ugt_a_select_fneg_a_b_fast(float %a, float %b) #0 { + %fneg.a = fneg float %a + %cmp.a = fcmp nnan nsz ugt float %a, %b + %min.a = select nnan nsz i1 %cmp.a, float %fneg.a, float %b + ret float %min.a +} + attributes #0 = { nounwind } 
attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll index 34cb0b1ba29b7..40c2ec0a39f51 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll @@ -1,735 +1,760 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NNAN %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=VI %s -; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s +; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -enable-var-scope -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: 
v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v1, v0 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-TRUE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-NNAN-TRUE16: ; %bb.0: -; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l -; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-FAKE16-LABEL: test_fmin_legacy_ule_f16: -; GFX11-NNAN-FAKE16: ; %bb.0: -; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule half %a, %b %val = select i1 %cmp, half %a, half %b ret half %val } +define half @test_fmin_legacy_ule_f16_fast(half %a, half %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
test_fmin_legacy_ule_f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_f16_fast: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule half %a, %b + %val = select nnan nsz i1 %cmp, half %a, half %b + ret half %val +} + define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v2, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v2, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v3, v1 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: 
test_fmin_legacy_ule_v2f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v2 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v2 +; VI-NEXT: v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_min_legacy_f32_e32 v0, v2, v0 +; 
SI-NEXT: v_min_legacy_f32_e32 v1, v3, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v1.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <2 x half> %a, %b %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b ret <2 x half> %val } +define <2 x half> @test_fmin_legacy_ule_v2f16_fast(<2 x half> %a, <2 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v2f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v2f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v2f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min_f32_e32 v1, v1, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v2f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <2 x half> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x half> %a, <2 x half> %b + ret <2 x half> %val +} + define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: 
test_fmin_legacy_ule_v3f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v3, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v4, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v5, v2 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v3 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v4 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v5 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 -; 
GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_min_legacy_f32_e32 v0, v3, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v4, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v5, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, 
v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <3 x half> %a, %b %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b ret <3 x half> %val } +define <3 x half> @test_fmin_legacy_ule_v3f16_fast(<3 x half> %a, <3 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v3f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v3f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_min_f16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v3f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v4 +; SI-NEXT: v_min_f32_e32 v2, v2, v5 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v3f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <3 x half> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x half> %a, <3 x half> %b + ret <3 x half> %val +} + define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX9-SAFE-NEXT: 
v_cmp_ngt_f16_e32 vcc, v7, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v4, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v6, v1, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v4, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v5, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v6, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v7, v3 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v4 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v6 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.h, v2.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v0.l, v2.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v1.l, v3.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v2 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v3 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v7, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v4f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_min_legacy_f32_e32 v0, v4, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v5, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v6, v2 +; SI-NEXT: v_min_legacy_f32_e32 v3, v7, v3 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.h, v2.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.h, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; 
GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <4 x half> %a, %b %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %val } +define <4 x half> @test_fmin_legacy_ule_v4f16_fast(<4 x half> %a, <4 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v4f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v4f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v1, v1, v3 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v4f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v4 +; SI-NEXT: v_min_f32_e32 v1, v1, v5 +; SI-NEXT: v_min_f32_e32 v2, v2, v6 +; SI-NEXT: v_min_f32_e32 v3, v3, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v4f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <4 x half> %a, %b + %val = select nnan nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b + ret <4 x half> %val +} + define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 { -; GFX9-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; GFX9-SAFE: ; %bb.0: -; GFX9-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 
-; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX9-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 -; GFX9-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SAFE-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-SAFE-NEXT: v_perm_b32 v0, v8, v0, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v1, v10, v1, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v2, v12, v2, s4 -; GFX9-SAFE-NEXT: v_perm_b32 v3, v14, v3, s4 -; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; GFX9-NNAN: ; %bb.0: -; GFX9-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NNAN-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX9-NNAN-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX9-NNAN-NEXT: v_pk_min_f16 v2, v2, v6 -; GFX9-NNAN-NEXT: v_pk_min_f16 v3, v3, v7 -; GFX9-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; VI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v7 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v5 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; VI-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; VI-SAFE-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; VI-SAFE-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; VI-SAFE-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: v_lshlrev_b32_e32 v4, 16, v14 -; VI-SAFE-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; VI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; VI-NNAN: ; %bb.0: -; VI-NNAN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; VI-NNAN-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NNAN-NEXT: v_min_f16_e32 v3, v3, v7 -; VI-NNAN-NEXT: v_min_f16_e32 v2, v2, v6 -; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5 -; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4 -; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8 -; VI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, v8, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v1, v9, v1 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v2, v10, v2 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v3, v11, v3 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v4, v12, v4 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v5, v13, v5 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v6, v14, v6 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v7, v15, v7 -; SI-SAFE-NEXT: s_setpc_b64 s[30:31] -; -; SI-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; SI-NNAN: ; %bb.0: -; SI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; 
SI-NNAN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NNAN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v8 -; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v9 -; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v10 -; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v11 -; SI-NNAN-NEXT: v_min_f32_e32 v4, v4, v12 -; SI-NNAN-NEXT: v_min_f32_e32 v5, v5, v13 -; SI-NNAN-NEXT: v_min_f32_e32 v6, v6, v14 -; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15 -; SI-NNAN-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-SAFE-TRUE16: ; %bb.0: -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v4.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v1.h, v5.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v2.h, v6.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v3.h, v7.h -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 -; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-SAFE-FAKE16: ; %bb.0: -; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; 
GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 -; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16: -; GFX11-NNAN: ; %bb.0: -; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NNAN-NEXT: v_pk_min_f16 v0, v0, v4 -; GFX11-NNAN-NEXT: v_pk_min_f16 v1, v1, v5 -; GFX11-NNAN-NEXT: v_pk_min_f16 v2, v2, v6 -; GFX11-NNAN-NEXT: v_pk_min_f16 v3, v3, v7 -; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_fmin_legacy_ule_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v8, v0, s4 +; GFX9-NEXT: v_perm_b32 v1, v10, v1, s4 +; GFX9-NEXT: v_perm_b32 v2, v12, v2, s4 +; GFX9-NEXT: v_perm_b32 v3, v14, v3, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v15, v14 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v13, v12 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; VI-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v11, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v9, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v3, v7 +; VI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 
vcc, v2, v6 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v1, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, v0, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v8f16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_min_legacy_f32_e32 v0, v8, v0 +; SI-NEXT: v_min_legacy_f32_e32 v1, v9, v1 +; SI-NEXT: v_min_legacy_f32_e32 v2, v10, v2 +; SI-NEXT: v_min_legacy_f32_e32 v3, v11, v3 +; SI-NEXT: v_min_legacy_f32_e32 v4, v12, v4 +; SI-NEXT: v_min_legacy_f32_e32 v5, v13, v5 +; SI-NEXT: v_min_legacy_f32_e32 v6, v14, v6 +; SI-NEXT: v_min_legacy_f32_e32 v7, v15, v7 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v2.h, v6.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v3.h, v7.h +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.h, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: test_fmin_legacy_ule_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule <8 x half> %a, %b %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %val } +define <8 x half> @test_fmin_legacy_ule_v8f16_fast(<8 x half> %a, <8 x half> %b) #0 { +; GFX9-LABEL: test_fmin_legacy_ule_v8f16_fast: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX9-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX9-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: test_fmin_legacy_ule_v8f16_fast: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_f16_sdwa v8, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v9, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_min_f16_e32 v3, v3, v7 +; VI-NEXT: v_min_f16_e32 v2, v2, v6 +; VI-NEXT: v_min_f16_e32 v1, v1, v5 +; VI-NEXT: v_min_f16_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_or_b32_e32 v3, v3, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: test_fmin_legacy_ule_v8f16_fast: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_min_f32_e32 v0, v0, v8 +; SI-NEXT: v_min_f32_e32 v1, v1, v9 +; SI-NEXT: v_min_f32_e32 v2, v2, v10 +; SI-NEXT: v_min_f32_e32 v3, v3, v11 +; SI-NEXT: v_min_f32_e32 v4, v4, v12 +; SI-NEXT: v_min_f32_e32 v5, v5, v13 +; SI-NEXT: v_min_f32_e32 v6, v6, v14 +; SI-NEXT: v_min_f32_e32 v7, v7, v15 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_fmin_legacy_ule_v8f16_fast: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX11-NEXT: v_pk_min_f16 v2, v2, v6 +; GFX11-NEXT: v_pk_min_f16 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ule <8 x half> %a, %b + %val = select nnan nsz <8 x i1> %cmp, <8 x half> %a, <8 x half> %b + ret <8 x half> %val +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll index ec4dd858b92ea..defcffa641e64 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -1,8 +1,6 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN,FUNC %s -; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,FUNC %s -; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s +; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,FUNC %s ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope --check-prefixes=EG,FUNC %s @@ -14,13 +12,9 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32: ; EG: MIN * -; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; SI: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} - -; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} - -; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; VI: 
v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(ptr addrspace(1) %out, <4 x float> %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 @@ -30,22 +24,32 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(ptr addrspace(1) ret void } -; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32_fast: -; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] +; SI: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] +; VI: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32_fast(ptr addrspace(1) %out, <4 x float> %reg0) #0 { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp nnan nsz uge float %r0, %r1 + %r3 = select nnan nsz i1 %r2, float %r1, float %r0 + store float %r3, ptr addrspace(1) %out + ret void +} -; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] +; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; SI: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[#LOAD + 3]], [[VA]] +; VI: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] -; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[#LOAD + 2]], [[VB]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[#LOAD + 3]], [[VA]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]] +; VI: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] +; VI: v_cmp_ngt_f32_e32 vcc, s[[#LOAD + 2]], [[VB]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b @@ -53,6 +57,19 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, flo ret void } +; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32_fast: +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] + +; GCN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]] +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out, float %a, float %b) #0 { + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; Nsz also needed ; FIXME: Should separate tests ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src: @@ -61,12 +78,10 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(ptr addrspace(1) %out, flo ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0 -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] - -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] -; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] +; VI: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]] +; VI: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], 
vcc define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1) %out, float %a, float %b) #0 { %a.nnan = fadd nnan float %a, 1.0 %b.nnan = fadd nnan float %b, 2.0 @@ -76,16 +91,32 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(ptr addrspace(1) ret void } +; Nsz also needed +; FIXME: Should separate tests +; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src_fast: +; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} + +; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0 +; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0 + +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]] +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src_fast(ptr addrspace(1) %out, float %a, float %b) #0 { + %a.nnan = fadd nnan float %a, 1.0 + %b.nnan = fadd nnan float %b, 2.0 + %cmp = fcmp ule float %a.nnan, %b.nnan + %val = select nnan nsz i1 %cmp, float %a.nnan, float %b.nnan + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] - -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ule_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -100,16 +131,33 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ule_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; VI-SAFE: v_cmp_le_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %cmp = fcmp ule float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + +; VI: v_cmp_le_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ole_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -124,16 +172,33 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32: 
+; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ole_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 + + %cmp = fcmp ole float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +; VI: v_cmp_lt_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_olt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -148,16 +213,33 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_olt_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + %cmp = fcmp olt float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ult_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid @@ -172,16 +254,33 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(ptr addrspace(1) %out, ptr a ret void } -; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32_fast: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void 
@test_fmin_legacy_ult_f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 -; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] -; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] + %a = load volatile float, ptr addrspace(1) %gep.0, align 4 + %b = load volatile float, ptr addrspace(1) %gep.1, align 4 -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] + %cmp = fcmp ult float %a, %b + %val = select nnan nsz i1 %cmp, float %a, float %b + store float %val, ptr addrspace(1) %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; SI: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] + +; VI: v_cmp_nge_f32_e32 vcc, [[A]], [[B]] +; VI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid @@ -196,19 +295,35 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32_fast: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] + +; GCN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]] +define amdgpu_kernel void @test_fmin_legacy_ult_v1f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <1 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <1 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <1 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <1 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <1 x float> %a, %b + %val = select nnan nsz <1 x i1> %cmp, <1 x float> %a, <1 x float> %b + store <1 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32: ; GCN: {{buffer|flat}}_load_dwordx2 ; GCN: {{buffer|flat}}_load_dwordx2 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 - -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %tid @@ -223,25 +338,40 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32_fast: +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 + +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_legacy_ult_v2f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <2 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <2 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load volatile <2 x float>, ptr addrspace(1) %gep.0 + %b = load volatile <2 x float>, ptr addrspace(1) %gep.1 + + 
%cmp = fcmp ult <2 x float> %a, %b + %val = select nnan nsz <2 x i1> %cmp, <2 x float> %a, <2 x float> %b + store <2 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32: -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE: v_min_legacy_f32_e32 -; SI-SAFE-NOT: v_min_ - -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 -; VI-SAFE: v_cmp_nge_f32_e32 -; VI-SAFE: v_cndmask_b32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 +; SI: v_min_legacy_f32_e32 +; SI-NOT: v_min_ + +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 +; VI: v_cmp_nge_f32_e32 +; VI: v_cndmask_b32_e32 ; VI-NOT: v_cmp ; VI-NOT: v_cndmask - -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN: v_min_f32_e32 -; GCN-NONAN-NOT: v_min_ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid @@ -256,6 +386,28 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(ptr addrspace(1) %out, ptr ret void } +; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32_fast: +; VI-NOT: v_cmp +; VI-NOT: v_cndmask + +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN-NOT: v_min_ +define amdgpu_kernel void @test_fmin_legacy_ult_v3f32_fast(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %gep.0 = getelementptr <3 x float>, ptr addrspace(1) %in, i32 %tid + %gep.1 = getelementptr <3 x float>, ptr addrspace(1) %gep.0, i32 1 + + %a = load <3 x float>, ptr addrspace(1) %gep.0 + %b = load <3 x float>, ptr addrspace(1) %gep.1 + + %cmp = fcmp ult <3 x float> %a, %b + %val = select nnan nsz <3 x i1> %cmp, <3 x float> %a, <3 x float> %b + store <3 x float> %val, ptr addrspace(1) %out + ret void +} + ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] diff --git a/llvm/test/CodeGen/NVPTX/bug22322.ll b/llvm/test/CodeGen/NVPTX/bug22322.ll index 055c512401b4c..71e180b39fcf9 100644 --- a/llvm/test/CodeGen/NVPTX/bug22322.ll +++ b/llvm/test/CodeGen/NVPTX/bug22322.ll @@ -20,12 +20,12 @@ _ZL11compute_vecRK6float3jb.exit: call void @llvm.lifetime.start.p0(i64 4, ptr %ret_vec.sroa.8.i) %6 = and i32 %4, 15 %7 = icmp eq i32 %6, 0 - %8 = select i1 %7, float 0.000000e+00, float -1.000000e+00 + %8 = select nnan nsz i1 %7, float 0.000000e+00, float -1.000000e+00 store float %8, ptr %ret_vec.sroa.8.i, align 4 ; CHECK: max.f32 %r{{[0-9]+}}, %r{{[0-9]+}}, 0f00000000 %9 = fcmp olt float %8, 0.000000e+00 %ret_vec.sroa.8.i.val = load float, ptr %ret_vec.sroa.8.i, align 4 - %10 = select i1 %9, float 0.000000e+00, float %ret_vec.sroa.8.i.val + %10 = select nnan nsz i1 %9, float 0.000000e+00, float %ret_vec.sroa.8.i.val call void @llvm.lifetime.end.p0(i64 4, ptr %ret_vec.sroa.8.i) %11 = getelementptr inbounds %class.float3, ptr %dst, i64 %5, i32 0 store float 0.000000e+00, ptr %11, align 4 @@ -51,7 +51,7 @@ declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #2 ; Function Attrs: nounwind declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #2 -attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "no-signed-zeros-fp-math"="true" 
"use-soft-float"="false" } +attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/PowerPC/scalar-min-max.ll b/llvm/test/CodeGen/PowerPC/scalar-min-max.ll index 216d498e85411..5f637e3ecddd3 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-min-max.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-min-max.ll @@ -1,36 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ -; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ -; RUN: --enable-no-nans-fp-math \ -; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s -; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names --enable-unsafe-fp-math \ -; RUN: -verify-machineinstrs --enable-no-signed-zeros-fp-math \ -; RUN: --enable-no-nans-fp-math \ -; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s ; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -verify-machineinstrs \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ -; RUN: --check-prefix=NO-FAST-P9 +; RUN: --check-prefix=P9 ; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -verify-machineinstrs \ ; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ -; RUN: --check-prefix=NO-FAST-P8 +; RUN: --check-prefix=P8 define dso_local float @testfmax(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmax: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmax: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmax: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: bgtlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmax: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxcdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testfmax: +; P8: # %bb.0: # %entry +; P8-NEXT: fcmpu cr0, f1, f2 +; P8-NEXT: bgtlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp ogt float %a, %b %cond = select i1 %cmp, float %a, float %b @@ -38,23 +25,18 @@ entry: } define dso_local double @testdmax(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmax: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmax: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmax: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: bgtlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmax: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxcdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmax: +; P8: # %bb.0: # %entry +; P8-NEXT: xscmpudp cr0, f1, f2 +; P8-NEXT: bgtlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp ogt double %a, %b %cond = select i1 %cmp, double %a, double %b @@ -62,23 +44,18 @@ entry: } define dso_local float @testfmin(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmin: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmin: -; 
NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmin: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: bltlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmin: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmincdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testfmin: +; P8: # %bb.0: # %entry +; P8-NEXT: fcmpu cr0, f1, f2 +; P8-NEXT: bltlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp olt float %a, %b %cond = select i1 %cmp, float %a, float %b @@ -86,23 +63,18 @@ entry: } define dso_local double @testdmin(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmin: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmin: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmin: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: bltlr cr0 -; NO-FAST-P8-NEXT: # %bb.1: # %entry -; NO-FAST-P8-NEXT: fmr f1, f2 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmin: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmincdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmin: +; P8: # %bb.0: # %entry +; P8-NEXT: xscmpudp cr0, f1, f2 +; P8-NEXT: bltlr cr0 +; P8-NEXT: # %bb.1: # %entry +; P8-NEXT: fmr f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp olt double %a, %b %cond = select i1 %cmp, double %a, double %b @@ -110,86 +82,62 @@ entry: } define dso_local float @testfmax_fast(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmax_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmax_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmax_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubsp f0, f2, f1 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmax_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testfmax_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmaxdp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf ogt float %a, %b - %cond = select i1 %cmp, float %a, float %b + %cond = select nnan nsz i1 %cmp, float %a, float %b ret float %cond } define dso_local double @testdmax_fast(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmax_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmaxdp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmax_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmaxcdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmax_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubdp f0, f2, f1 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmax_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmaxdp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmax_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmaxdp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf ogt double %a, %b - %cond = select i1 %cmp, double %a, double %b + %cond = select nnan nsz i1 %cmp, double %a, double %b ret double %cond } define dso_local float @testfmin_fast(float %a, float %b) local_unnamed_addr { -; CHECK-LABEL: testfmin_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; 
CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testfmin_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testfmin_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubsp f0, f1, f2 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testfmin_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmindp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testfmin_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmindp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf olt float %a, %b - %cond = select i1 %cmp, float %a, float %b + %cond = select nnan nsz i1 %cmp, float %a, float %b ret float %cond } define dso_local double @testdmin_fast(double %a, double %b) local_unnamed_addr { -; CHECK-LABEL: testdmin_fast: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xsmindp f1, f1, f2 -; CHECK-NEXT: blr -; -; NO-FAST-P9-LABEL: testdmin_fast: -; NO-FAST-P9: # %bb.0: # %entry -; NO-FAST-P9-NEXT: xsmincdp f1, f1, f2 -; NO-FAST-P9-NEXT: blr -; -; NO-FAST-P8-LABEL: testdmin_fast: -; NO-FAST-P8: # %bb.0: # %entry -; NO-FAST-P8-NEXT: xssubdp f0, f1, f2 -; NO-FAST-P8-NEXT: fsel f1, f0, f2, f1 -; NO-FAST-P8-NEXT: blr +; P9-LABEL: testdmin_fast: +; P9: # %bb.0: # %entry +; P9-NEXT: xsmindp f1, f1, f2 +; P9-NEXT: blr +; +; P8-LABEL: testdmin_fast: +; P8: # %bb.0: # %entry +; P8-NEXT: xsmindp f1, f1, f2 +; P8-NEXT: blr entry: %cmp = fcmp nnan ninf olt double %a, %b - %cond = select i1 %cmp, double %a, double %b + %cond = select nnan nsz i1 %cmp, double %a, double %b ret double %cond } diff --git a/llvm/test/CodeGen/VE/Scalar/max.ll b/llvm/test/CodeGen/VE/Scalar/max.ll index 51da557c6c49f..7950842670afb 100644 --- a/llvm/test/CodeGen/VE/Scalar/max.ll +++ b/llvm/test/CodeGen/VE/Scalar/max.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s -; RUN: llc < %s -mtriple=ve-unknown-unknown -enable-no-signed-zeros-fp-math \ -; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=OPT define double @maxf64(double, double) { ; CHECK-LABEL: maxf64: @@ -10,16 +8,21 @@ define double @maxf64(double, double) { ; CHECK-NEXT: cmov.d.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxf64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ogt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @maxf64_fast(double, double) { +; CHECK-LABEL: maxf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ogt double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @max2f64(double, double) { ; CHECK-LABEL: max2f64: ; CHECK: # %bb.0: @@ -27,16 +30,21 @@ define double @max2f64(double, double) { ; CHECK-NEXT: cmov.d.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2f64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp oge double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @max2f64_fast(double, double) { +; CHECK-LABEL: max2f64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp oge double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + ; VE has no max for unordered comparison define double @maxuf64(double, double) { ; CHECK-LABEL: maxuf64: @@ -45,16 
+53,21 @@ define double @maxuf64(double, double) { ; CHECK-NEXT: cmov.d.gtnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxuf64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ugt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @maxuf64_fast(double, double) { +; CHECK-LABEL: maxuf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ugt double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + ; VE has no max for unordered comparison define double @max2uf64(double, double) { ; CHECK-LABEL: max2uf64: @@ -63,16 +76,21 @@ define double @max2uf64(double, double) { ; CHECK-NEXT: cmov.d.genan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2uf64: -; OPT: # %bb.0: -; OPT-NEXT: fmax.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp uge double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @max2uf64_fast(double, double) { +; CHECK-LABEL: max2uf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp uge double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define float @maxf32(float, float) { ; CHECK-LABEL: maxf32: ; CHECK: # %bb.0: @@ -80,16 +98,21 @@ define float @maxf32(float, float) { ; CHECK-NEXT: cmov.s.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxf32: -; OPT: # %bb.0: -; OPT-NEXT: fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ogt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @maxf32_fast(float, float) { +; CHECK-LABEL: maxf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ogt float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @max2f32(float, float) { ; CHECK-LABEL: max2f32: ; CHECK: # %bb.0: @@ -97,16 +120,21 @@ define float @max2f32(float, float) { ; CHECK-NEXT: cmov.s.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2f32: -; OPT: # %bb.0: -; OPT-NEXT: fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp oge float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @max2f32_fast(float, float) { +; CHECK-LABEL: max2f32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp oge float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @maxuf32(float, float) { ; CHECK-LABEL: maxuf32: ; CHECK: # %bb.0: @@ -114,16 +142,21 @@ define float @maxuf32(float, float) { ; CHECK-NEXT: cmov.s.gtnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxuf32: -; OPT: # %bb.0: -; OPT-NEXT: fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ugt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @maxuf32_fast(float, float) { +; CHECK-LABEL: maxuf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ugt float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @max2uf32(float, float) { ; CHECK-LABEL: max2uf32: ; CHECK: # %bb.0: @@ -131,26 +164,26 @@ define float @max2uf32(float, float) { ; CHECK-NEXT: cmov.s.genan %s1, %s0, %s2 ; CHECK-NEXT: 
or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2uf32: -; OPT: # %bb.0: -; OPT-NEXT: fmax.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp uge float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @max2uf32_fast(float, float) { +; CHECK-LABEL: max2uf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmax.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp uge float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define i64 @maxi64(i64, i64) { ; CHECK-LABEL: maxi64: ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxi64: -; OPT: # %bb.0: -; OPT-NEXT: maxs.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sgt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -161,11 +194,6 @@ define i64 @max2i64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2i64: -; OPT: # %bb.0: -; OPT-NEXT: maxs.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sge i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -178,13 +206,6 @@ define i64 @maxu64(i64, i64) { ; CHECK-NEXT: cmov.l.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxu64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.gt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ugt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -197,13 +218,6 @@ define i64 @max2u64(i64, i64) { ; CHECK-NEXT: cmov.l.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2u64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.ge %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp uge i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -214,11 +228,6 @@ define i32 @maxi32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxi32: -; OPT: # %bb.0: -; OPT-NEXT: maxs.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sgt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -229,11 +238,6 @@ define i32 @max2i32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: maxs.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2i32: -; OPT: # %bb.0: -; OPT-NEXT: maxs.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sge i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -246,13 +250,6 @@ define i32 @maxu32(i32, i32) { ; CHECK-NEXT: cmov.w.gt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxu32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.gt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ugt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -265,13 +262,6 @@ define i32 @max2u32(i32, i32) { ; CHECK-NEXT: cmov.w.ge %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: max2u32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.ge %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp uge i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -283,12 +273,6 @@ define zeroext i1 @maxi1(i1 zeroext, i1 zeroext) { ; CHECK-NEXT: or %s0, %s0, %s1 ; CHECK-NEXT: and %s0, 1, %s0 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: maxi1: -; OPT: # %bb.0: -; OPT-NEXT: or %s0, %s0, %s1 -; OPT-NEXT: and %s0, 1, %s0 
-; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %1, true %4 = and i1 %3, %0 %5 = select i1 %4, i1 %0, i1 %1 diff --git a/llvm/test/CodeGen/VE/Scalar/min.ll b/llvm/test/CodeGen/VE/Scalar/min.ll index e8f4939f9149e..36a2e06a2c9d4 100644 --- a/llvm/test/CodeGen/VE/Scalar/min.ll +++ b/llvm/test/CodeGen/VE/Scalar/min.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s -; RUN: llc < %s -mtriple=ve-unknown-unknown -enable-no-signed-zeros-fp-math \ -; RUN: -enable-no-nans-fp-math | FileCheck %s -check-prefix=OPT define double @minf64(double, double) { ; CHECK-LABEL: minf64: @@ -10,16 +8,21 @@ define double @minf64(double, double) { ; CHECK-NEXT: cmov.d.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minf64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp olt double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @minf64_fast(double, double) { +; CHECK-LABEL: minf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp olt double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @min2f64(double, double) { ; CHECK-LABEL: min2f64: ; CHECK: # %bb.0: @@ -27,16 +30,21 @@ define double @min2f64(double, double) { ; CHECK-NEXT: cmov.d.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2f64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ole double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @min2f64_fast(double, double) { +; CHECK-LABEL: min2f64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ole double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @minuf64(double, double) { ; CHECK-LABEL: minuf64: ; CHECK: # %bb.0: @@ -44,16 +52,21 @@ define double @minuf64(double, double) { ; CHECK-NEXT: cmov.d.ltnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minuf64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ult double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @minuf64_fast(double, double) { +; CHECK-LABEL: minuf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ult double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define double @min2uf64(double, double) { ; CHECK-LABEL: min2uf64: ; CHECK: # %bb.0: @@ -61,16 +74,21 @@ define double @min2uf64(double, double) { ; CHECK-NEXT: cmov.d.lenan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2uf64: -; OPT: # %bb.0: -; OPT-NEXT: fmin.d %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ule double %0, %1 %4 = select i1 %3, double %0, double %1 ret double %4 } +define double @min2uf64_fast(double, double) { +; CHECK-LABEL: min2uf64_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.d %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ule double %0, %1 + %4 = select nnan nsz i1 %3, double %0, double %1 + ret double %4 +} + define float @minf32(float, float) { ; CHECK-LABEL: minf32: ; CHECK: # %bb.0: @@ -78,16 +96,21 @@ define float @minf32(float, float) { ; CHECK-NEXT: cmov.s.lt %s1, %s0, %s2 ; CHECK-NEXT: or 
%s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minf32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp olt float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @minf32_fast(float, float) { +; CHECK-LABEL: minf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp olt float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @min2f32(float, float) { ; CHECK-LABEL: min2f32: ; CHECK: # %bb.0: @@ -95,16 +118,21 @@ define float @min2f32(float, float) { ; CHECK-NEXT: cmov.s.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2f32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ole float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @min2f32_fast(float, float) { +; CHECK-LABEL: min2f32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ole float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @minuf32(float, float) { ; CHECK-LABEL: minuf32: ; CHECK: # %bb.0: @@ -112,16 +140,21 @@ define float @minuf32(float, float) { ; CHECK-NEXT: cmov.s.ltnan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minuf32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ult float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @minuf32_fast(float, float) { +; CHECK-LABEL: minuf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ult float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define float @min2uf32(float, float) { ; CHECK-LABEL: min2uf32: ; CHECK: # %bb.0: @@ -129,26 +162,26 @@ define float @min2uf32(float, float) { ; CHECK-NEXT: cmov.s.lenan %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2uf32: -; OPT: # %bb.0: -; OPT-NEXT: fmin.s %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = fcmp ule float %0, %1 %4 = select i1 %3, float %0, float %1 ret float %4 } +define float @min2uf32_fast(float, float) { +; CHECK-LABEL: min2uf32_fast: +; CHECK: # %bb.0: +; CHECK-NEXT: fmin.s %s0, %s0, %s1 +; CHECK-NEXT: b.l.t (, %s10) + %3 = fcmp ule float %0, %1 + %4 = select nnan nsz i1 %3, float %0, float %1 + ret float %4 +} + define i64 @mini64(i64, i64) { ; CHECK-LABEL: mini64: ; CHECK: # %bb.0: ; CHECK-NEXT: mins.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: mini64: -; OPT: # %bb.0: -; OPT-NEXT: mins.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp slt i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -159,11 +192,6 @@ define i64 @min2i64(i64, i64) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.l %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2i64: -; OPT: # %bb.0: -; OPT-NEXT: mins.l %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sle i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -176,13 +204,6 @@ define i64 @minu64(i64, i64) { ; CHECK-NEXT: cmov.l.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minu64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.lt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ult i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -195,13 +216,6 @@ 
define i64 @min2u64(i64, i64) { ; CHECK-NEXT: cmov.l.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2u64: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.l %s2, %s0, %s1 -; OPT-NEXT: cmov.l.le %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ule i64 %0, %1 %4 = select i1 %3, i64 %0, i64 %1 ret i64 %4 @@ -212,11 +226,6 @@ define i32 @mini32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: mini32: -; OPT: # %bb.0: -; OPT-NEXT: mins.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp slt i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -227,11 +236,6 @@ define i32 @min2i32(i32, i32) { ; CHECK: # %bb.0: ; CHECK-NEXT: mins.w.sx %s0, %s0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2i32: -; OPT: # %bb.0: -; OPT-NEXT: mins.w.sx %s0, %s0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp sle i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -244,13 +248,6 @@ define i32 @minu32(i32, i32) { ; CHECK-NEXT: cmov.w.lt %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: minu32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.lt %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ult i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -263,13 +260,6 @@ define i32 @min2u32(i32, i32) { ; CHECK-NEXT: cmov.w.le %s1, %s0, %s2 ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: min2u32: -; OPT: # %bb.0: -; OPT-NEXT: cmpu.w %s2, %s0, %s1 -; OPT-NEXT: cmov.w.le %s1, %s0, %s2 -; OPT-NEXT: or %s0, 0, %s1 -; OPT-NEXT: b.l.t (, %s10) %3 = icmp ule i32 %0, %1 %4 = select i1 %3, i32 %0, i32 %1 ret i32 %4 @@ -283,14 +273,6 @@ define zeroext i1 @mini1(i1 zeroext, i1 zeroext) { ; CHECK-NEXT: cmov.w.ne %s0, %s1, %s2 ; CHECK-NEXT: adds.w.zx %s0, %s0, (0)1 ; CHECK-NEXT: b.l.t (, %s10) -; -; OPT-LABEL: mini1: -; OPT: # %bb.0: -; OPT-NEXT: and %s2, 1, %s0 -; OPT-NEXT: and %s0, %s1, %s0 -; OPT-NEXT: cmov.w.ne %s0, %s1, %s2 -; OPT-NEXT: adds.w.zx %s0, %s0, (0)1 -; OPT-NEXT: b.l.t (, %s10) %3 = xor i1 %0, true %4 = and i1 %3, %1 %5 = select i1 %4, i1 %0, i1 %1 From 56d920d5868596f48a3a779dc88825594a646af3 Mon Sep 17 00:00:00 2001 From: owenca Date: Mon, 29 Sep 2025 21:11:20 -0700 Subject: [PATCH 213/878] [clang-format] Fix a bug in wrapping { after else (#161048) Fixes #160775 --- clang/lib/Format/FormatToken.h | 2 +- clang/lib/Format/TokenAnnotator.cpp | 44 +++++++++++++++------------ clang/unittests/Format/FormatTest.cpp | 21 +++++++++++++ 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index e04b0e7af10c0..a28446a540633 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -55,7 +55,7 @@ namespace format { TYPE(ConflictAlternative) \ TYPE(ConflictEnd) \ TYPE(ConflictStart) \ - /* l_brace of if/for/while */ \ + /* l_brace of if/for/while/switch/catch */ \ TYPE(ControlStatementLBrace) \ TYPE(ControlStatementRBrace) \ TYPE(CppCastLParen) \ diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 67066a104d738..0c9c88a426c89 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4021,29 +4021,28 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } - if (IsCpp && - (LineIsFunctionDeclaration || - (FirstNonComment && 
FirstNonComment->is(TT_CtorDtorDeclName))) && - Line.endsWith(tok::semi, tok::r_brace)) { - auto *Tok = Line.Last->Previous; - while (Tok->isNot(tok::r_brace)) - Tok = Tok->Previous; - if (auto *LBrace = Tok->MatchingParen; LBrace && LBrace->is(TT_Unknown)) { - assert(LBrace->is(tok::l_brace)); - Tok->setBlockKind(BK_Block); - LBrace->setBlockKind(BK_Block); - LBrace->setFinalizedType(TT_FunctionLBrace); + if (IsCpp) { + if ((LineIsFunctionDeclaration || + (FirstNonComment && FirstNonComment->is(TT_CtorDtorDeclName))) && + Line.endsWith(tok::semi, tok::r_brace)) { + auto *Tok = Line.Last->Previous; + while (Tok->isNot(tok::r_brace)) + Tok = Tok->Previous; + if (auto *LBrace = Tok->MatchingParen; LBrace && LBrace->is(TT_Unknown)) { + assert(LBrace->is(tok::l_brace)); + Tok->setBlockKind(BK_Block); + LBrace->setBlockKind(BK_Block); + LBrace->setFinalizedType(TT_FunctionLBrace); + } } - } - if (IsCpp && SeenName && AfterLastAttribute && - mustBreakAfterAttributes(*AfterLastAttribute, Style)) { - AfterLastAttribute->MustBreakBefore = true; - if (LineIsFunctionDeclaration) - Line.ReturnTypeWrapped = true; - } + if (SeenName && AfterLastAttribute && + mustBreakAfterAttributes(*AfterLastAttribute, Style)) { + AfterLastAttribute->MustBreakBefore = true; + if (LineIsFunctionDeclaration) + Line.ReturnTypeWrapped = true; + } - if (IsCpp) { if (!LineIsFunctionDeclaration) { // Annotate */&/&& in `operator` function calls as binary operators. for (const auto *Tok = FirstNonComment; Tok; Tok = Tok->Next) { @@ -4089,6 +4088,11 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } + if (First->is(TT_ElseLBrace)) { + First->CanBreakBefore = true; + First->MustBreakBefore = true; + } + bool InFunctionDecl = Line.MightBeFunctionDecl; bool InParameterList = false; for (auto *Current = First->Next; Current; Current = Current->Next) { diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 6a3385a56f53e..fef70365b5e18 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -1364,6 +1364,27 @@ TEST_F(FormatTest, FormatIfWithoutCompoundStatementButElseWith) { AllowsMergedIf); } +TEST_F(FormatTest, WrapMultipleStatementIfAndElseBraces) { + auto Style = getLLVMStyle(); + Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Always; + Style.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_AllIfsAndElse; + Style.BreakBeforeBraces = FormatStyle::BS_Custom; + Style.BraceWrapping.AfterControlStatement = FormatStyle::BWACS_Always; + Style.BraceWrapping.BeforeElse = true; + + verifyFormat("if (x)\n" + "{\n" + " ++x;\n" + " --y;\n" + "}\n" + "else\n" + "{\n" + " --x;\n" + " ++y;\n" + "}", + Style); +} + TEST_F(FormatTest, FormatLoopsWithoutCompoundStatement) { verifyFormat("while (true)\n" " ;"); From 32baec4483d8723f594e04918d6a22d9d1423606 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 30 Sep 2025 06:33:29 +0200 Subject: [PATCH 214/878] [clang][Diags] Automatically format AP(S)Int values with separators (#161047) This adds an `operator<<` overload for `StreamingDiagnostic` that takes an `APInt`/`APSInt` and formats it with default options, including adding separators. This is still an opt-in mechanism since all callers that want to use this feature need to be changed from ```c++ Diag() << toString(MyInt, 10); ``` to ```c++ Diag() << MyInt; ``` This patch contains one example of a diagnostic making use of this. 
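As a rough sketch of what adopting the overload looks like at a call site (the surrounding variables here are hypothetical; only the diagnostic ID and the separator behavior come from this patch):

```c++
// Hypothetical call site: streaming the llvm::APSInt directly instead of
// pre-stringifying it with toString(Value, 10) routes through the new
// overload, which formats the value with digit separators.
llvm::APSInt Width(llvm::APInt(64, 4294967297ULL), /*isUnsigned=*/true);
S.Diag(FieldLoc, diag::warn_bitfield_width_exceeds_type_width)
    << FieldName << Width << (unsigned)TypeWidth;
// The value now renders as 4'294'967'297 instead of 4294967297.
```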
--- clang/include/clang/Basic/Diagnostic.h | 17 +++++++++++++++++ clang/lib/Sema/SemaDecl.cpp | 3 +-- clang/test/SemaCXX/bitfield-layout.cpp | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index af26a04d94889..e540040ddc524 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -25,6 +25,7 @@ #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Compiler.h" #include @@ -1366,6 +1367,22 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, return DB; } +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const llvm::APSInt &Int) { + DB.AddString(toString(Int, /*Radix=*/10, Int.isSigned(), + /*formatAsCLiteral=*/false, + /*UpperCase=*/true, /*InsertSeparators=*/true)); + return DB; +} + +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const llvm::APInt &Int) { + DB.AddString(toString(Int, /*Radix=*/10, /*Signed=*/false, + /*formatAsCLiteral=*/false, + /*UpperCase=*/true, /*InsertSeparators=*/true)); + return DB; +} + inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, int I) { DB.AddTaggedVal(I, DiagnosticsEngine::ak_sint); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 9ef7a2698913d..0069b08f1991a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -18909,8 +18909,7 @@ ExprResult Sema::VerifyBitField(SourceLocation FieldLoc, // 'bool'. if (BitfieldIsOverwide && !FieldTy->isBooleanType() && FieldName) { Diag(FieldLoc, diag::warn_bitfield_width_exceeds_type_width) - << FieldName << toString(Value, 10) - << (unsigned)TypeWidth; + << FieldName << Value << (unsigned)TypeWidth; } } diff --git a/clang/test/SemaCXX/bitfield-layout.cpp b/clang/test/SemaCXX/bitfield-layout.cpp index 7efd1d38c682f..f30218be01c56 100644 --- a/clang/test/SemaCXX/bitfield-layout.cpp +++ b/clang/test/SemaCXX/bitfield-layout.cpp @@ -35,7 +35,7 @@ CHECK_SIZE(Test4, 8); CHECK_ALIGN(Test4, 8); struct Test5 { - char c : 0x100000001; // expected-warning {{width of bit-field 'c' (4294967297 bits) exceeds the width of its type; value will be truncated to 8 bits}} + char c : 0x100000001; // expected-warning {{width of bit-field 'c' (4'294'967'297 bits) exceeds the width of its type; value will be truncated to 8 bits}} }; // Size and align don't really matter here, just make sure we don't crash. CHECK_SIZE(Test5, 1); From 46ea03997da6b07dda451be7e15ec7f13ebbe3ea Mon Sep 17 00:00:00 2001 From: quic_hchandel Date: Tue, 30 Sep 2025 10:44:37 +0530 Subject: [PATCH 215/878] [RISCV] Add commutative support for Qualcomm uC Xqcicm extension (#160653) This is a follow-up to #145643. See https://github.com/llvm/llvm-project/pull/145643#issuecomment-3009300419. 
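For reference, commuting one of these conditional moves swaps the incoming destination value (operand 1) with `rs3` (operand 4) while inverting the condition. A minimal sketch, with registers taken from the test updates below and `TII`/`MI` assumed to be in scope:

```c++
// qc.mvnei a2, a0, 0, a1   // a2 = (a0 != 0) ? a1 : a2
// commutes (with the opcode inverted) to
// qc.mveqi a1, a0, 0, a2   // a1 = (a0 == 0) ? a2 : a1
// so the generic commuting machinery can now request the swap:
MachineInstr *Commuted =
    TII->commuteInstruction(MI, /*NewMI=*/false, /*OpIdx1=*/1, /*OpIdx2=*/4);
```

This gives the register allocator the freedom to keep the result in whichever register avoids a copy, which is what the test churn below reflects.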
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 61 +++ llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 4 +- llvm/test/CodeGen/RISCV/select-bare.ll | 8 +- llvm/test/CodeGen/RISCV/select-cc.ll | 50 +- llvm/test/CodeGen/RISCV/select-cond.ll | 288 +++++------ llvm/test/CodeGen/RISCV/select.ll | 40 +- llvm/test/CodeGen/RISCV/xqcicm.ll | 501 +++++++++++++++----- llvm/test/CodeGen/RISCV/xqcics.ll | 32 +- 8 files changed, 645 insertions(+), 339 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 6d418fda82534..70b6c7ea35f82 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1023,6 +1023,37 @@ static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target, Cond.push_back(LastInst.getOperand(1)); } +static unsigned getInverseXqcicmOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("Unexpected Opcode"); + case RISCV::QC_MVEQ: + return RISCV::QC_MVNE; + case RISCV::QC_MVNE: + return RISCV::QC_MVEQ; + case RISCV::QC_MVLT: + return RISCV::QC_MVGE; + case RISCV::QC_MVGE: + return RISCV::QC_MVLT; + case RISCV::QC_MVLTU: + return RISCV::QC_MVGEU; + case RISCV::QC_MVGEU: + return RISCV::QC_MVLTU; + case RISCV::QC_MVEQI: + return RISCV::QC_MVNEI; + case RISCV::QC_MVNEI: + return RISCV::QC_MVEQI; + case RISCV::QC_MVLTI: + return RISCV::QC_MVGEI; + case RISCV::QC_MVGEI: + return RISCV::QC_MVLTI; + case RISCV::QC_MVLTUI: + return RISCV::QC_MVGEUI; + case RISCV::QC_MVGEUI: + return RISCV::QC_MVLTUI; + } +} + unsigned RISCVCC::getBrCond(RISCVCC::CondCode CC, unsigned SelectOpc) { switch (SelectOpc) { default: @@ -3762,6 +3793,19 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, return false; // Operands 1 and 2 are commutable, if we switch the opcode. return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + case RISCV::QC_MVEQ: + case RISCV::QC_MVNE: + case RISCV::QC_MVLT: + case RISCV::QC_MVGE: + case RISCV::QC_MVLTU: + case RISCV::QC_MVGEU: + case RISCV::QC_MVEQI: + case RISCV::QC_MVNEI: + case RISCV::QC_MVLTI: + case RISCV::QC_MVGEI: + case RISCV::QC_MVLTUI: + case RISCV::QC_MVGEUI: + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 4); case RISCV::TH_MULA: case RISCV::TH_MULAW: case RISCV::TH_MULAH: @@ -3974,6 +4018,23 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, OpIdx2); } + case RISCV::QC_MVEQ: + case RISCV::QC_MVNE: + case RISCV::QC_MVLT: + case RISCV::QC_MVGE: + case RISCV::QC_MVLTU: + case RISCV::QC_MVGEU: + case RISCV::QC_MVEQI: + case RISCV::QC_MVNEI: + case RISCV::QC_MVLTI: + case RISCV::QC_MVGEI: + case RISCV::QC_MVLTUI: + case RISCV::QC_MVGEUI: { + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(getInverseXqcicmOpcode(MI.getOpcode()))); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, + OpIdx2); + } case RISCV::PseudoCCMOVGPRNoX0: case RISCV::PseudoCCMOVGPR: { // CCMOV can be commuted by inverting the condition. 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 13b02d1b2d6db..ff4a0406799b1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -604,7 +604,7 @@ class QCILICC funct3, bits<2> funct2, DAGOperand InTyRs2, string opcodes let Inst{31-25} = {simm, funct2}; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCIMVCC funct3, string opcodestr> : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, GPRNoX0:$rs3), @@ -612,7 +612,7 @@ class QCIMVCC funct3, string opcodestr> let Constraints = "$rd = $rd_wb"; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCIMVCCI funct3, string opcodestr, DAGOperand immType> : RVInstR4<0b10, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, immType:$imm, GPRNoX0:$rs3), diff --git a/llvm/test/CodeGen/RISCV/select-bare.ll b/llvm/test/CodeGen/RISCV/select-bare.ll index 796121ac572ce..44028a7651b95 100644 --- a/llvm/test/CodeGen/RISCV/select-bare.ll +++ b/llvm/test/CodeGen/RISCV/select-bare.ll @@ -26,8 +26,8 @@ define i32 @bare_select(i1 %a, i32 %b, i32 %c) nounwind { ; RV32IXQCI-LABEL: bare_select: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %1 = select i1 %a, i32 %b, i32 %c ret i32 %1 @@ -53,8 +53,8 @@ define float @bare_select_float(i1 %a, float %b, float %c) nounwind { ; RV32IXQCI-LABEL: bare_select_float: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %1 = select i1 %a, float %b, float %c ret float %1 diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll index 14055dff40d42..b57f625cb867f 100644 --- a/llvm/test/CodeGen/RISCV/select-cc.ll +++ b/llvm/test/CodeGen/RISCV/select-cc.ll @@ -87,40 +87,40 @@ define signext i32 @foo(i32 signext %a, ptr %b) nounwind { ; ; RV32IXQCI-LABEL: foo: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: lw a5, 0(a1) ; RV32IXQCI-NEXT: lw a2, 0(a1) ; RV32IXQCI-NEXT: lw a4, 0(a1) ; RV32IXQCI-NEXT: lw t5, 0(a1) ; RV32IXQCI-NEXT: lw t4, 0(a1) +; RV32IXQCI-NEXT: lw t3, 0(a1) ; RV32IXQCI-NEXT: lw t2, 0(a1) -; RV32IXQCI-NEXT: lw t1, 0(a1) ; RV32IXQCI-NEXT: lw t0, 0(a1) ; RV32IXQCI-NEXT: lw a7, 0(a1) ; RV32IXQCI-NEXT: lw a6, 0(a1) -; RV32IXQCI-NEXT: lw t3, 0(a1) ; RV32IXQCI-NEXT: lw a3, 0(a1) -; RV32IXQCI-NEXT: bltz t3, .LBB0_2 +; RV32IXQCI-NEXT: lw t1, 0(a1) +; RV32IXQCI-NEXT: lw a5, 0(a1) +; RV32IXQCI-NEXT: bltz t1, .LBB0_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: li t6, 0 -; RV32IXQCI-NEXT: qc.mveq a5, a0, a5, a0 -; RV32IXQCI-NEXT: qc.mvne a2, a5, a2, a5 -; RV32IXQCI-NEXT: qc.mvltu a4, a4, a2, a2 -; RV32IXQCI-NEXT: qc.mvgeu t5, a4, t5, a4 -; RV32IXQCI-NEXT: qc.mvltu t4, t5, t4, t5 -; RV32IXQCI-NEXT: qc.mvgeu t2, t2, t4, t4 -; RV32IXQCI-NEXT: qc.mvlt t1, t1, t2, t2 -; RV32IXQCI-NEXT: qc.mvge t0, t1, t0, t1 -; RV32IXQCI-NEXT: qc.mvlt a7, t0, a7, t0 -; RV32IXQCI-NEXT: qc.mvge a6, a6, a7, a7 -; RV32IXQCI-NEXT: mv a3, t3 -; RV32IXQCI-NEXT: qc.mvge a3, t6, t3, a6 +; RV32IXQCI-NEXT: li a5, 0 +; RV32IXQCI-NEXT: qc.mveq 
a2, a0, a2, a0 +; RV32IXQCI-NEXT: qc.mvne a4, a2, a4, a2 +; RV32IXQCI-NEXT: qc.mvltu t5, t5, a4, a4 +; RV32IXQCI-NEXT: qc.mvgeu t4, t5, t4, t5 +; RV32IXQCI-NEXT: qc.mvltu t3, t4, t3, t4 +; RV32IXQCI-NEXT: qc.mvgeu t2, t2, t3, t3 +; RV32IXQCI-NEXT: qc.mvlt t0, t0, t2, t2 +; RV32IXQCI-NEXT: qc.mvge a7, t0, a7, t0 +; RV32IXQCI-NEXT: qc.mvlt a6, a7, a6, a7 +; RV32IXQCI-NEXT: qc.mvge a3, a3, a6, a6 +; RV32IXQCI-NEXT: qc.mvlt a3, a5, t1, t1 +; RV32IXQCI-NEXT: mv a5, a3 ; RV32IXQCI-NEXT: .LBB0_2: ; RV32IXQCI-NEXT: lw a2, 0(a1) ; RV32IXQCI-NEXT: lw a0, 0(a1) ; RV32IXQCI-NEXT: li a1, 1024 -; RV32IXQCI-NEXT: qc.mvlt a2, a1, a2, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a1, a2, a5 ; RV32IXQCI-NEXT: li a1, 2046 -; RV32IXQCI-NEXT: qc.mvltu a0, a1, t3, a2 +; RV32IXQCI-NEXT: qc.mvltu a0, a1, t1, a2 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: foo: @@ -417,8 +417,8 @@ define i32 @select_sge_int16min(i32 signext %x, i32 signext %y, i32 signext %z) ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: lui a3, 1048560 ; RV32IXQCI-NEXT: addi a3, a3, -1 -; RV32IXQCI-NEXT: qc.mvlt a2, a3, a0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mvge a1, a3, a0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: select_sge_int16min: @@ -471,10 +471,10 @@ define i64 @select_sge_int32min(i64 %x, i64 %y, i64 %z) { ; RV32IXQCI-NEXT: srli a0, a1, 31 ; RV32IXQCI-NEXT: xori a0, a0, 1 ; RV32IXQCI-NEXT: qc.mveqi a0, a1, -1, a6 -; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32IXQCI-NEXT: qc.mvnei a5, a0, 0, a3 -; RV32IXQCI-NEXT: mv a0, a4 -; RV32IXQCI-NEXT: mv a1, a5 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32IXQCI-NEXT: qc.mveqi a3, a0, 0, a5 +; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: mv a1, a3 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: select_sge_int32min: diff --git a/llvm/test/CodeGen/RISCV/select-cond.ll b/llvm/test/CodeGen/RISCV/select-cond.ll index b88fe9aae18ec..3ca0f46e8c02f 100644 --- a/llvm/test/CodeGen/RISCV/select-cond.ll +++ b/llvm/test/CodeGen/RISCV/select-cond.ll @@ -35,8 +35,8 @@ define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 sign ; RV32-XQCICM-LABEL: select_i32_trunc: ; RV32-XQCICM: # %bb.0: ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32-XQCICM-NEXT: mv a0, a2 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32-XQCICM-NEXT: mv a0, a1 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_trunc: @@ -48,8 +48,8 @@ define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 sign ; RV32IXQCI-LABEL: select_i32_trunc: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_trunc: @@ -93,8 +93,8 @@ define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signe ; RV32-XQCICM-LABEL: select_i32_param: ; RV32-XQCICM: # %bb.0: ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32-XQCICM-NEXT: mv a0, a2 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32-XQCICM-NEXT: mv a0, a1 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_param: @@ -106,8 +106,8 @@ define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signe ; RV32IXQCI-LABEL: select_i32_param: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; 
RV64-LABEL: select_i32_param: @@ -148,8 +148,8 @@ define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32-XQCICM-LABEL: select_i32_eq: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mveq a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvne a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_eq: @@ -163,8 +163,8 @@ define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32IXQCI-LABEL: select_i32_eq: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mveq a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvne a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_eq: @@ -205,8 +205,8 @@ define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32-XQCICM-LABEL: select_i32_ne: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvne a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mveq a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ne: @@ -220,8 +220,8 @@ define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x ; ; RV32IXQCI-LABEL: select_i32_ne: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvne a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveq a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ne: @@ -262,8 +262,8 @@ define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_ugt: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ugt: @@ -277,8 +277,8 @@ define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_ugt: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ugt: @@ -319,8 +319,8 @@ define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_uge: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_uge: @@ -334,8 +334,8 @@ define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_uge: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_uge: @@ -376,8 +376,8 @@ define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_ult: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ult: @@ -391,8 +391,8 @@ define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_ult: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvltu a3, a0, a1, a2 -; 
RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ult: @@ -433,8 +433,8 @@ define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_ule: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_ule: @@ -448,8 +448,8 @@ define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_ule: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_ule: @@ -490,8 +490,8 @@ define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_sgt: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvge a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_sgt: @@ -505,8 +505,8 @@ define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_sgt: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_sgt: @@ -547,8 +547,8 @@ define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_sge: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvge a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_sge: @@ -562,8 +562,8 @@ define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_sge: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvge a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_sge: @@ -604,8 +604,8 @@ define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_slt: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvge a2, a0, a1, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_slt: @@ -619,8 +619,8 @@ define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32IXQCI-LABEL: select_i32_slt: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_slt: @@ -661,8 +661,8 @@ define signext i32 @select_i32_sle(i32 signext %a, i32 signext %b, i32 signext % ; ; RV32-XQCICM-LABEL: select_i32_sle: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: qc.mvge a3, a1, a0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 +; RV32-XQCICM-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i32_sle: @@ -676,8 +676,8 @@ define signext i32 @select_i32_sle(i32 signext %a, i32 signext %b, 
i32 signext % ; ; RV32IXQCI-LABEL: select_i32_sle: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: qc.mvge a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i32_sle: @@ -723,11 +723,11 @@ define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind { ; ; RV32-XQCICM-LABEL: select_i64_trunc: ; RV32-XQCICM: # %bb.0: -; RV32-XQCICM-NEXT: mv a1, a5 +; RV32-XQCICM-NEXT: mv a1, a3 ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32-XQCICM-NEXT: qc.mvnei a1, a0, 0, a3 -; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a5 +; RV32-XQCICM-NEXT: mv a0, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_trunc: @@ -740,11 +740,11 @@ define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind { ; ; RV32IXQCI-LABEL: select_i64_trunc: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: mv a1, a5 +; RV32IXQCI-NEXT: mv a1, a3 ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a3 -; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a5 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_trunc: @@ -792,10 +792,10 @@ define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-LABEL: select_i64_param: ; RV32-XQCICM: # %bb.0: ; RV32-XQCICM-NEXT: andi a0, a0, 1 -; RV32-XQCICM-NEXT: qc.mvnei a3, a0, 0, a1 -; RV32-XQCICM-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32-XQCICM-NEXT: mv a0, a3 -; RV32-XQCICM-NEXT: mv a1, a4 +; RV32-XQCICM-NEXT: qc.mveqi a1, a0, 0, a3 +; RV32-XQCICM-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32-XQCICM-NEXT: mv a0, a1 +; RV32-XQCICM-NEXT: mv a1, a2 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_param: @@ -810,10 +810,10 @@ define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind { ; RV32IXQCI-LABEL: select_i64_param: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 0, a1 -; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a3 -; RV32IXQCI-NEXT: mv a1, a4 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 0, a4 +; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: mv a1, a2 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_param: @@ -866,10 +866,10 @@ define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: xor a1, a1, a3 ; RV32-XQCICM-NEXT: xor a0, a0, a2 ; RV32-XQCICM-NEXT: or a0, a0, a1 -; RV32-XQCICM-NEXT: qc.mveqi a6, a0, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a0, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a0, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a0, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_eq: @@ -887,10 +887,10 @@ define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: xor a1, a1, a3 ; RV32IXQCI-NEXT: xor a0, a0, a2 ; RV32IXQCI-NEXT: or a0, a0, a1 -; RV32IXQCI-NEXT: qc.mveqi a6, a0, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a0, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a0, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a0, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_eq: @@ -943,10 +943,10 @@ define i64 @select_i64_ne(i64 
%a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: xor a1, a1, a3 ; RV32-XQCICM-NEXT: xor a0, a0, a2 ; RV32-XQCICM-NEXT: or a0, a0, a1 -; RV32-XQCICM-NEXT: qc.mvnei a6, a0, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a0, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a0, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a0, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ne: @@ -964,10 +964,10 @@ define i64 @select_i64_ne(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: xor a1, a1, a3 ; RV32IXQCI-NEXT: xor a0, a0, a2 ; RV32IXQCI-NEXT: or a0, a0, a1 -; RV32IXQCI-NEXT: qc.mvnei a6, a0, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a0, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a0, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a0, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ne: @@ -1025,10 +1025,10 @@ define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: sltu a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ugt: @@ -1050,10 +1050,10 @@ define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: sltu a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ugt: @@ -1111,10 +1111,10 @@ define i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: sltu a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_uge: @@ -1136,10 +1136,10 @@ define i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: sltu a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_uge: @@ -1197,10 +1197,10 @@ define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: sltu a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 
-; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ult: @@ -1222,10 +1222,10 @@ define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: sltu a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ult: @@ -1283,10 +1283,10 @@ define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: sltu a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_ule: @@ -1308,10 +1308,10 @@ define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: sltu a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_ule: @@ -1369,10 +1369,10 @@ define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: slt a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_sgt: @@ -1394,10 +1394,10 @@ define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: slt a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_sgt: @@ -1455,10 +1455,10 @@ define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: slt a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; 
RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_sge: @@ -1480,10 +1480,10 @@ define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: slt a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_sge: @@ -1541,10 +1541,10 @@ define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a0, a2 ; RV32-XQCICM-NEXT: slt a2, a1, a3 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_slt: @@ -1566,10 +1566,10 @@ define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a0, a2 ; RV32IXQCI-NEXT: slt a2, a1, a3 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mvnei a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mvnei a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mveqi a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mveqi a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_slt: @@ -1627,10 +1627,10 @@ define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32-XQCICM-NEXT: sltu a0, a2, a0 ; RV32-XQCICM-NEXT: slt a2, a3, a1 ; RV32-XQCICM-NEXT: qc.mveq a2, a1, a3, a0 -; RV32-XQCICM-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32-XQCICM-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32-XQCICM-NEXT: mv a0, a6 -; RV32-XQCICM-NEXT: mv a1, a7 +; RV32-XQCICM-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32-XQCICM-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32-XQCICM-NEXT: mv a0, a4 +; RV32-XQCICM-NEXT: mv a1, a5 ; RV32-XQCICM-NEXT: ret ; ; RV32-XQCICS-LABEL: select_i64_sle: @@ -1652,10 +1652,10 @@ define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; RV32IXQCI-NEXT: sltu a0, a2, a0 ; RV32IXQCI-NEXT: slt a2, a3, a1 ; RV32IXQCI-NEXT: qc.mveq a2, a1, a3, a0 -; RV32IXQCI-NEXT: qc.mveqi a6, a2, 0, a4 -; RV32IXQCI-NEXT: qc.mveqi a7, a2, 0, a5 -; RV32IXQCI-NEXT: mv a0, a6 -; RV32IXQCI-NEXT: mv a1, a7 +; RV32IXQCI-NEXT: qc.mvnei a4, a2, 0, a6 +; RV32IXQCI-NEXT: qc.mvnei a5, a2, 0, a7 +; RV32IXQCI-NEXT: mv a0, a4 +; RV32IXQCI-NEXT: mv a1, a5 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_i64_sle: diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 19fade67afc3d..8273c65bf512e 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -1153,8 +1153,8 @@ define i32 @select_sub_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_sub_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: sub a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = sub i32 %a, %b @@ -1301,9 +1301,9 @@ define i32 @select_sub_4(i1 zeroext %cond, i32 %x) { ; ; RV32IXQCI-LABEL: select_sub_4: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: addi 
a1, a1, -128 -; RV32IXQCI-NEXT: li a2, 128 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a2 +; RV32IXQCI-NEXT: addi a2, a1, -128 +; RV32IXQCI-NEXT: li a1, 128 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 ; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %add = sub i32 %x, 128 @@ -1348,8 +1348,8 @@ define i32 @select_and_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_and_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: and a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = and i32 %a, %b @@ -1493,8 +1493,8 @@ define i32 @select_udiv_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_udiv_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: divu a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = udiv i32 %a, %b @@ -1682,8 +1682,8 @@ define i32 @select_shl_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_shl_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: sll a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = shl i32 %a, %b @@ -1798,8 +1798,8 @@ define i32 @select_ashr_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_ashr_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: sra a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = ashr i32 %a, %b @@ -1914,8 +1914,8 @@ define i32 @select_lshr_1(i1 zeroext %cond, i32 %a, i32 %b) { ; RV32IXQCI-LABEL: select_lshr_1: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: srl a1, a1, a2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = lshr i32 %a, %b @@ -2371,9 +2371,9 @@ define i32 @select_cst5(i1 zeroext %cond) { ; RV32IXQCI-LABEL: select_cst5: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: lui a1, 1 -; RV32IXQCI-NEXT: addi a1, a1, -2047 -; RV32IXQCI-NEXT: li a2, 2047 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a2 +; RV32IXQCI-NEXT: addi a2, a1, -2047 +; RV32IXQCI-NEXT: li a1, 2047 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 ; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %ret = select i1 %cond, i32 2047, i32 2049 @@ -2870,8 +2870,8 @@ define void @select_redundant_czero_eqz1(ptr %0, ptr %1) { ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: lui a2, %hi(select_redundant_czero_eqz_data) ; RV32IXQCI-NEXT: addi a2, a2, %lo(select_redundant_czero_eqz_data) -; RV32IXQCI-NEXT: qc.mveqi a0, a0, 0, a2 -; RV32IXQCI-NEXT: sw a0, 0(a1) +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a0 +; RV32IXQCI-NEXT: sw a2, 0(a1) ; RV32IXQCI-NEXT: ret entry: %3 = icmp eq ptr %0, null diff --git a/llvm/test/CodeGen/RISCV/xqcicm.ll b/llvm/test/CodeGen/RISCV/xqcicm.ll index 1741be742323d..fb48301b1d8e8 100644 --- a/llvm/test/CodeGen/RISCV/xqcicm.ll +++ b/llvm/test/CodeGen/RISCV/xqcicm.ll @@ -23,15 +23,15 @@ define i32 @select_example(i32 %cond, i32 %x, i32 %y) { ; RV32IXQCICM-LABEL: select_example: ; RV32IXQCICM: # %bb.0: # %entry ; RV32IXQCICM-NEXT: andi a0, a0, 1 -; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCICM-NEXT: mv a0, a2 +; RV32IXQCICM-NEXT: qc.mveqi a1, a0, 0, a2 +; 
RV32IXQCICM-NEXT: mv a0, a1 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_example: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a0, a0, 1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %cond_trunc = trunc i32 %cond to i1 @@ -52,14 +52,14 @@ define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, 11 @@ -80,14 +80,14 @@ define i32 @select_cc_example_eq1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 11, %a @@ -108,14 +108,14 @@ define i32 @select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, 11 @@ -136,14 +136,14 @@ define i32 @select_cc_example_ne1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 11, %a @@ -164,14 +164,14 @@ define i32 @select_cc_example_slt(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_slt: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_slt: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp slt i32 %a, 11 @@ -192,14 +192,14 @@ define i32 @select_cc_example_slt1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; 
RV32IXQCICM-LABEL: select_cc_example_slt1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_slt1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp slt i32 11, %a @@ -220,14 +220,14 @@ define i32 @select_cc_example_sle(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sle: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sle: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sle i32 %a, 11 @@ -248,14 +248,14 @@ define i32 @select_cc_example_sle1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sle1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sle1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlti a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sle i32 11, %a @@ -276,14 +276,14 @@ define i32 @select_cc_example_sgt(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sgt: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sgt: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlti a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sgt i32 %a, 11 @@ -304,14 +304,14 @@ define i32 @select_cc_example_sgt1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sgt1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sgt1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sgt i32 11, %a @@ -332,14 +332,14 @@ define i32 @select_cc_example_sge(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sge: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlti a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sge: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlti a2, a0, 11, a3 +; 
RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sge i32 %a, 11 @@ -360,14 +360,14 @@ define i32 @select_cc_example_sge1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sge1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sge1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlti a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgei a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sge i32 11, %a @@ -388,14 +388,14 @@ define i32 @select_cc_example_ule(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ule: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 %a, 11 @@ -416,14 +416,14 @@ define i32 @select_cc_example_ule1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ule1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 11, %a @@ -444,14 +444,14 @@ define i32 @select_cc_example_ugt(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ugt: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ugt: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ugt i32 %a, 11 @@ -472,14 +472,14 @@ define i32 @select_cc_example_ugt1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ugt1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ugt1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ugt i32 11, %a @@ -500,14 +500,14 @@ define i32 @select_cc_example_ult(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ult: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret 
; ; RV32IXQCI-LABEL: select_cc_example_ult: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ult i32 %a, 11 @@ -528,14 +528,14 @@ define i32 @select_cc_example_ult1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ult1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ult1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ult i32 11, %a @@ -556,14 +556,14 @@ define i32 @select_cc_example_uge(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_uge: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_uge: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeui a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltui a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp uge i32 %a, 11 @@ -584,14 +584,14 @@ define i32 @select_cc_example_uge1(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_uge1: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_uge1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltui a3, a0, 12, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeui a2, a0, 12, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp uge i32 11, %a @@ -611,14 +611,14 @@ define i32 @select_cc_example_eq_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveq a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvne a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveq a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvne a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, %b @@ -638,14 +638,14 @@ define i32 @select_cc_example_ne_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvne a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveq a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvne a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveq a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, %b @@ -665,14 +665,14 @@ define i32 @select_cc_example_slt_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_slt_reg: ; 
RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvge a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_slt_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlt a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp slt i32 %a, %b @@ -692,14 +692,14 @@ define i32 @select_cc_example_sge_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sge_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvge a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sge_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvge a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sge i32 %a, %b @@ -719,14 +719,14 @@ define i32 @select_cc_example_sgt_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sgt_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvge a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sgt_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvlt a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvge a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sgt i32 %a, %b @@ -746,14 +746,14 @@ define i32 @select_cc_example_sle_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_sle_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvge a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_sle_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvge a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvlt a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp sle i32 %a, %b @@ -773,14 +773,14 @@ define i32 @select_cc_example_ugt_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ugt_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ugt_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ugt i32 %a, %b @@ -800,14 +800,14 @@ define i32 @select_cc_example_ult_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ult_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ult_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a0, a1, a3 +; 
RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ult i32 %a, %b @@ -827,14 +827,14 @@ define i32 @select_cc_example_uge_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_uge_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_uge_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp uge i32 %a, %b @@ -854,14 +854,14 @@ define i32 @select_cc_example_ule_reg(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ule_reg: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule_reg: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvgeu a3, a1, a0, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvltu a2, a1, a0, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 %a, %b @@ -883,18 +883,263 @@ define i32 @select_cc_example_ule_neg(i32 %a, i32 %b, i32 %x, i32 %y) { ; RV32IXQCICM-LABEL: select_cc_example_ule_neg: ; RV32IXQCICM: # %bb.0: # %entry ; RV32IXQCICM-NEXT: li a1, -10 -; RV32IXQCICM-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ule_neg: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: li a1, -10 -; RV32IXQCI-NEXT: qc.mvltu a3, a0, a1, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvgeu a2, a0, a1, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ule i32 %a, -11 %sel = select i1 %cmp, i32 %x, i32 %y ret i32 %sel } + +define i32 @select_cc_example_eq_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a2, a1, .LBB32_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB32_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_eq_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvne a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_eq_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvne a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp eq i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_lt_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_lt_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: blt a2, a1, .LBB33_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB33_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_lt_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvge a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_lt_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvge a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp slt i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ge_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: 
select_cc_example_ge_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bge a2, a1, .LBB34_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB34_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ge_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvlt a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ge_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvlt a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp sge i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ult_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ult_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bltu a2, a1, .LBB35_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB35_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ult_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvgeu a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ult_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvgeu a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp ult i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_uge_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_uge_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bgeu a2, a1, .LBB36_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB36_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_uge_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvltu a0, a2, a1, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_uge_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvltu a0, a2, a1, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp uge i32 %x, %b + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_eq_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: beq a2, a1, .LBB37_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB37_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_eq_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvnei a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_eq_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvnei a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp eq i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_lt_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_lt_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: blt a2, a1, .LBB38_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB38_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_lt_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvgei a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_lt_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvgei a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp slt i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ge_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ge_imm_mv: +; RV32I: # 
%bb.0: # %entry +; RV32I-NEXT: li a1, 10 +; RV32I-NEXT: blt a1, a2, .LBB39_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB39_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ge_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvlti a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ge_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvlti a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp sge i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_ult_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ult_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 11 +; RV32I-NEXT: bltu a2, a1, .LBB40_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB40_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ult_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvgeui a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ult_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvgeui a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp ult i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} + +define i32 @select_cc_example_uge_imm_mv(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_uge_imm_mv: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a1, 10 +; RV32I-NEXT: bltu a1, a2, .LBB41_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: mv a0, a3 +; RV32I-NEXT: .LBB41_2: # %entry +; RV32I-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_uge_imm_mv: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.mvltui a0, a2, 11, a3 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_uge_imm_mv: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.mvltui a0, a2, 11, a3 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp uge i32 %x, 11 + %sel = select i1 %cmp, i32 %a, i32 %y + ret i32 %sel +} diff --git a/llvm/test/CodeGen/RISCV/xqcics.ll b/llvm/test/CodeGen/RISCV/xqcics.ll index 38de8fbd78b36..5b7ca9e7fedb8 100644 --- a/llvm/test/CodeGen/RISCV/xqcics.ll +++ b/llvm/test/CodeGen/RISCV/xqcics.ll @@ -134,14 +134,14 @@ define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 %a, 11 @@ -167,14 +167,14 @@ define i32 @select_cc_example_eq_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_eq_c: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_eq_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mveqi a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mvnei a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp eq i32 11, %a @@ -200,14 +200,14 @@ define i32 
@select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 %a, 11 @@ -233,14 +233,14 @@ define i32 @select_cc_example_ne_c(i32 %a, i32 %b, i32 %x, i32 %y) { ; ; RV32IXQCICM-LABEL: select_cc_example_ne_c: ; RV32IXQCICM: # %bb.0: # %entry -; RV32IXQCICM-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCICM-NEXT: mv a0, a3 +; RV32IXQCICM-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCICM-NEXT: mv a0, a2 ; RV32IXQCICM-NEXT: ret ; ; RV32IXQCI-LABEL: select_cc_example_ne_c: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: qc.mvnei a3, a0, 11, a2 -; RV32IXQCI-NEXT: mv a0, a3 +; RV32IXQCI-NEXT: qc.mveqi a2, a0, 11, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %cmp = icmp ne i32 11, %a From e911eba3ef6990da2c4620c573b99a4f20f53da4 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 30 Sep 2025 01:37:54 -0400 Subject: [PATCH 216/878] [MLIR][Python] Fix stubgen/PYTHONPATH collision/bug (#161307) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If `PYTHONPATH` is set and points to the build location of the python bindings package then when stubgen runs, `_mlir` will get imported twice and bad things will happen (e.g., `Assertion !instance && “PyGlobals already constructed”’`). This happens because `mlir` is a namespace package and the importer/loader can't distinguish between `mlir._mlir_libs._mlir` and `_mlir_libs._mlir` imported from `CWD`. Or something like that. The fix is to filter out any entries in `PYTHONPATH` that end in `MLIR_BINDINGS_PYTHON_INSTALL_PREFIX/..` (e.g., `python_packages/mlir_core/`). --- mlir/cmake/modules/AddMLIRPython.cmake | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index 208cbdd1dd535..fa6aec8a603a9 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -173,9 +173,32 @@ function(mlir_generate_type_stubs) if(ARG_VERBOSE) message(STATUS "Generating type-stubs outputs ${_generated_type_stubs}") endif() + + # If PYTHONPATH is set and points to the build location of the python package then when stubgen runs, _mlir will get + # imported twice and bad things will happen (e.g., Assertion `!instance && “PyGlobals already constructed”’). + # This happens because mlir is a namespace package and the importer/loader can't distinguish between + # mlir._mlir_libs._mlir and _mlir_libs._mlir imported from CWD. + # So try to filter out any entries in PYTHONPATH that end in "MLIR_BINDINGS_PYTHON_INSTALL_PREFIX/.." + # (e.g., python_packages/mlir_core/). + set(_pythonpath "$ENV{PYTHONPATH}") + cmake_path(CONVERT "${MLIR_BINDINGS_PYTHON_INSTALL_PREFIX}/.." 
TO_NATIVE_PATH_LIST _install_prefix NORMALIZE) + if(WIN32) + set(_path_sep ";") + set(_trailing_sep "\\") + else() + set(_path_sep ":") + set(_trailing_sep "/") + # `;` is the CMake list delimiter so Windows paths are automatically lists + # and Unix paths can be made into lists by replacing `:` with `;` + string(REPLACE "${_path_sep}" ";" _pythonpath "${_pythonpath}") + endif() + string(REGEX REPLACE "${_trailing_sep}$" "" _install_prefix "${_install_prefix}") + list(FILTER _pythonpath EXCLUDE REGEX "(${_install_prefix}|${_install_prefix}${_trailing_sep})$") + # Note, ${_pythonpath} is a list but "${_pythonpath}" is not a list - it's a string with ";" chars in it. + string(JOIN "${_path_sep}" _pythonpath ${_pythonpath}) add_custom_command( OUTPUT ${_generated_type_stubs} - COMMAND ${_nb_stubgen_cmd} + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH="${_pythonpath}" ${_nb_stubgen_cmd} WORKING_DIRECTORY "${CMAKE_CURRENT_FUNCTION_LIST_DIR}" DEPENDS "${ARG_DEPENDS_TARGETS}" DEPFILE "${_depfile}" From b9e41ae6f9efb068e1fe137357c3d8ded31cdd02 Mon Sep 17 00:00:00 2001 From: halbi2 Date: Tue, 30 Sep 2025 02:29:59 -0400 Subject: [PATCH 217/878] [clang][libc++] Fix spelling of "synthesize" (#158523) There is a tradition to use U.S. English spellings for APIs. For example, it's uninitialized_fill and not uninitialised_fill, specialization not specialisation, etcetera. --- clang/docs/LanguageExtensions.rst | 6 +- clang/docs/ReleaseNotes.rst | 4 +- clang/include/clang/Basic/TokenKinds.def | 8 +- clang/lib/Sema/SemaTypeTraits.cpp | 16 +- ...type-trait-synthesizes-from-spaceship.cpp} | 142 +++++++++--------- .../__utility/default_three_way_comparator.h | 4 +- libcxx/include/string | 2 +- .../has_default_three_way.compile.pass.cpp | 2 +- 8 files changed, 92 insertions(+), 92 deletions(-) rename clang/test/SemaCXX/{type-trait-synthesises-from-spaceship.cpp => type-trait-synthesizes-from-spaceship.cpp} (57%) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index b503283559db4..6bb99c757cd19 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2065,9 +2065,9 @@ The following type trait primitives are supported by Clang. Those traits marked Returns true if a reference ``T`` can be copy-initialized from a temporary of type a non-cv-qualified ``U``. * ``__underlying_type`` (C++, GNU, Microsoft) -* ``__builtin_lt_synthesises_from_spaceship``, ``__builtin_gt_synthesises_from_spaceship``, - ``__builtin_le_synthesises_from_spaceship``, ``__builtin_ge_synthesises_from_spaceship`` (Clang): - These builtins can be used to determine whether the corresponding operator is synthesised from a spaceship operator. +* ``__builtin_lt_synthesizes_from_spaceship``, ``__builtin_gt_synthesizes_from_spaceship``, + ``__builtin_le_synthesizes_from_spaceship``, ``__builtin_ge_synthesizes_from_spaceship`` (Clang): + These builtins can be used to determine whether the corresponding operator is synthesized from a spaceship operator. In addition, the following expression traits are supported: diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 79dc0b2728e98..3b269ccd57718 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -142,8 +142,8 @@ What's New in Clang |release|? C++ Language Changes -------------------- -- A new family of builtins ``__builtin_*_synthesises_from_spaceship`` has been added. 
These can be queried to know - whether the ``<`` (``lt``), ``>`` (``gt``), ``<=`` (``le``), or ``>=`` (``ge``) operators are synthesised from a +- A new family of builtins ``__builtin_*_synthesizes_from_spaceship`` has been added. These can be queried to know + whether the ``<`` (``lt``), ``>`` (``gt``), ``<=`` (``le``), or ``>=`` (``ge``) operators are synthesized from a ``<=>``. This makes it possible to optimize certain facilities by using the ``<=>`` operation directly instead of doing multiple comparisons. diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 9d1a23d1af218..564d6010181cc 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -552,10 +552,10 @@ TYPE_TRAIT_1(__can_pass_in_regs, CanPassInRegs, KEYCXX) TYPE_TRAIT_2(__reference_binds_to_temporary, ReferenceBindsToTemporary, KEYCXX) TYPE_TRAIT_2(__reference_constructs_from_temporary, ReferenceConstructsFromTemporary, KEYCXX) TYPE_TRAIT_2(__reference_converts_from_temporary, ReferenceConvertsFromTemporary, KEYCXX) -TYPE_TRAIT_2(__builtin_lt_synthesises_from_spaceship, LtSynthesisesFromSpaceship, KEYCXX) -TYPE_TRAIT_2(__builtin_le_synthesises_from_spaceship, LeSynthesisesFromSpaceship, KEYCXX) -TYPE_TRAIT_2(__builtin_gt_synthesises_from_spaceship, GtSynthesisesFromSpaceship, KEYCXX) -TYPE_TRAIT_2(__builtin_ge_synthesises_from_spaceship, GeSynthesisesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_lt_synthesizes_from_spaceship, LtSynthesizesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_le_synthesizes_from_spaceship, LeSynthesizesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_gt_synthesizes_from_spaceship, GtSynthesizesFromSpaceship, KEYCXX) +TYPE_TRAIT_2(__builtin_ge_synthesizes_from_spaceship, GeSynthesizesFromSpaceship, KEYCXX) // IsDeducible is only used internally by clang for CTAD implementation and // is not exposed to users. 
TYPE_TRAIT_2(/*EmptySpellingName*/, IsDeducible, KEYCXX) diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index 6c798d6acb0a0..3e34675cbf064 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -1830,10 +1830,10 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, return Self.HLSL().IsScalarizedLayoutCompatible(LhsT, RhsT); } - case BTT_LtSynthesisesFromSpaceship: - case BTT_LeSynthesisesFromSpaceship: - case BTT_GtSynthesisesFromSpaceship: - case BTT_GeSynthesisesFromSpaceship: { + case BTT_LtSynthesizesFromSpaceship: + case BTT_LeSynthesizesFromSpaceship: + case BTT_GtSynthesizesFromSpaceship: + case BTT_GeSynthesizesFromSpaceship: { EnterExpressionEvaluationContext UnevaluatedContext( Self, Sema::ExpressionEvaluationContext::Unevaluated); Sema::SFINAETrap SFINAE(Self, /*ForValidityCheck=*/true); @@ -1852,13 +1852,13 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT, auto OpKind = [&] { switch (BTT) { - case BTT_LtSynthesisesFromSpaceship: + case BTT_LtSynthesizesFromSpaceship: return BinaryOperatorKind::BO_LT; - case BTT_LeSynthesisesFromSpaceship: + case BTT_LeSynthesizesFromSpaceship: return BinaryOperatorKind::BO_LE; - case BTT_GtSynthesisesFromSpaceship: + case BTT_GtSynthesizesFromSpaceship: return BinaryOperatorKind::BO_GT; - case BTT_GeSynthesisesFromSpaceship: + case BTT_GeSynthesizesFromSpaceship: return BinaryOperatorKind::BO_GE; default: llvm_unreachable("Trying to Synthesize non-comparison operator?"); diff --git a/clang/test/SemaCXX/type-trait-synthesises-from-spaceship.cpp b/clang/test/SemaCXX/type-trait-synthesizes-from-spaceship.cpp similarity index 57% rename from clang/test/SemaCXX/type-trait-synthesises-from-spaceship.cpp rename to clang/test/SemaCXX/type-trait-synthesizes-from-spaceship.cpp index ba581475bb4c7..be312f453f4be 100644 --- a/clang/test/SemaCXX/type-trait-synthesises-from-spaceship.cpp +++ b/clang/test/SemaCXX/type-trait-synthesizes-from-spaceship.cpp @@ -1,24 +1,24 @@ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s -static_assert(!__builtin_lt_synthesises_from_spaceship()); // expected-error {{expected a type}} -static_assert(!__builtin_lt_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_lt_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_lt_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} - -static_assert(!__builtin_le_synthesises_from_spaceship()); // expected-error {{expected a type}} -static_assert(!__builtin_le_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_le_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_le_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} - -static_assert(!__builtin_gt_synthesises_from_spaceship()); // expected-error {{expected a type}} -static_assert(!__builtin_gt_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_gt_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_gt_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} - 
-static_assert(!__builtin_ge_synthesises_from_spaceship()); // expected-error {{expected a type}} -static_assert(!__builtin_ge_synthesises_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} -static_assert(!__builtin_ge_synthesises_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} -static_assert(!__builtin_ge_synthesises_from_spaceship(int, 0)); // expected-error {{expected a type}} +static_assert(!__builtin_lt_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_lt_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_lt_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_lt_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} + +static_assert(!__builtin_le_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_le_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_le_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_le_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} + +static_assert(!__builtin_gt_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_gt_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_gt_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_gt_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} + +static_assert(!__builtin_ge_synthesizes_from_spaceship()); // expected-error {{expected a type}} +static_assert(!__builtin_ge_synthesizes_from_spaceship(int)); // expected-error {{type trait requires 2 arguments; have 1 argument}} +static_assert(!__builtin_ge_synthesizes_from_spaceship(int, int, int)); // expected-error {{type trait requires 2 arguments; have 3 argument}} +static_assert(!__builtin_ge_synthesizes_from_spaceship(int, 0)); // expected-error {{expected a type}} namespace std { struct strong_ordering { @@ -35,10 +35,10 @@ struct DefaultSpaceship { friend auto operator<=>(DefaultSpaceship, DefaultSpaceship) = default; }; -static_assert(__builtin_lt_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const DefaultSpaceship&, const DefaultSpaceship&)); struct CustomSpaceship { int i; @@ -48,10 +48,10 @@ struct CustomSpaceship { } }; 
-static_assert(__builtin_lt_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const CustomSpaceship&, const CustomSpaceship&)); struct CustomLT { int i; @@ -61,10 +61,10 @@ struct CustomLT { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomLT&, const CustomLT&)); struct CustomLE { int i; @@ -74,10 +74,10 @@ struct CustomLE { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomLE&, const CustomLE&)); struct CustomGT { int i; @@ -87,10 +87,10 @@ struct CustomGT { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomGT&, const CustomGT&)); struct CustomGE { int i; @@ -100,10 +100,10 @@ struct CustomGE { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomGE&, const CustomGE&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomGE&, const CustomGE&)); 
-static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomGE&, const CustomGE&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomGE&, const CustomGE&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomGE&, const CustomGE&)); struct CustomLTAndSpaceship { int i; @@ -117,10 +117,10 @@ struct CustomLTAndSpaceship { } }; -static_assert(!__builtin_lt_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const CustomLTAndSpaceship&, const CustomLTAndSpaceship&)); struct CustomLEAndSpaceship { int i; @@ -134,10 +134,10 @@ struct CustomLEAndSpaceship { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const CustomLEAndSpaceship&, const CustomLEAndSpaceship&)); struct CustomGTAndSpaceship { int i; @@ -151,10 +151,10 @@ struct CustomGTAndSpaceship { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const 
CustomGTAndSpaceship&, const CustomGTAndSpaceship&)); struct CustomGEAndSpaceship { int i; @@ -168,10 +168,10 @@ struct CustomGEAndSpaceship { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); -static_assert(__builtin_le_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const CustomGEAndSpaceship&, const CustomGEAndSpaceship&)); struct DefaultedCmpAndSpaceship { int i; @@ -187,10 +187,10 @@ struct DefaultedCmpAndSpaceship { }; // TODO: This should probably return true -static_assert(!__builtin_lt_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); -static_assert(!__builtin_le_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); -static_assert(!__builtin_gt_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); -static_assert(!__builtin_ge_synthesises_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_le_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(const DefaultedCmpAndSpaceship&, const DefaultedCmpAndSpaceship&)); struct DifferentTypes { int i; @@ -200,13 +200,13 @@ struct DifferentTypes { } }; -static_assert(__builtin_lt_synthesises_from_spaceship(const DifferentTypes&, const int&)); -static_assert(__builtin_le_synthesises_from_spaceship(const DifferentTypes&, const int&)); -static_assert(__builtin_gt_synthesises_from_spaceship(const DifferentTypes&, const int&)); -static_assert(__builtin_ge_synthesises_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_lt_synthesizes_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_le_synthesizes_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_gt_synthesizes_from_spaceship(const DifferentTypes&, const int&)); +static_assert(__builtin_ge_synthesizes_from_spaceship(const DifferentTypes&, const int&)); // TODO: Should this return true? 
It's technically not synthesized from spaceship, but it behaves exactly as-if it was -static_assert(!__builtin_lt_synthesises_from_spaceship(int, int)); -static_assert(!__builtin_le_synthesises_from_spaceship(int, int)); -static_assert(!__builtin_gt_synthesises_from_spaceship(int, int)); -static_assert(!__builtin_ge_synthesises_from_spaceship(int, int)); +static_assert(!__builtin_lt_synthesizes_from_spaceship(int, int)); +static_assert(!__builtin_le_synthesizes_from_spaceship(int, int)); +static_assert(!__builtin_gt_synthesizes_from_spaceship(int, int)); +static_assert(!__builtin_ge_synthesizes_from_spaceship(int, int)); diff --git a/libcxx/include/__utility/default_three_way_comparator.h b/libcxx/include/__utility/default_three_way_comparator.h index 438ab55b43230..92cdce6aae117 100644 --- a/libcxx/include/__utility/default_three_way_comparator.h +++ b/libcxx/include/__utility/default_three_way_comparator.h @@ -40,13 +40,13 @@ struct __default_three_way_comparator<_LHS, } }; -#if _LIBCPP_STD_VER >= 20 && __has_builtin(__builtin_lt_synthesises_from_spaceship) +#if _LIBCPP_STD_VER >= 20 && __has_builtin(__builtin_lt_synthesizes_from_spaceship) template struct __default_three_way_comparator< _LHS, _RHS, __enable_if_t::value && is_arithmetic<_RHS>::value) && - __builtin_lt_synthesises_from_spaceship(const _LHS&, const _RHS&)>> { + __builtin_lt_synthesizes_from_spaceship(const _LHS&, const _RHS&)>> { _LIBCPP_HIDE_FROM_ABI static int operator()(const _LHS& __lhs, const _RHS& __rhs) { auto __res = __lhs <=> __rhs; if (__res < 0) diff --git a/libcxx/include/string b/libcxx/include/string index 729a420d47598..cfd6861e5c9c2 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2552,7 +2552,7 @@ _LIBCPP_STRING_V1_EXTERN_TEMPLATE_LIST(_LIBCPP_DECLARE, wchar_t) # endif # undef _LIBCPP_DECLARE -# if _LIBCPP_STD_VER <= 17 || !__has_builtin(__builtin_lt_synthesises_from_spaceship) +# if _LIBCPP_STD_VER <= 17 || !__has_builtin(__builtin_lt_synthesizes_from_spaceship) template struct __default_three_way_comparator, basic_string<_CharT, _Traits, _Alloc> > { using __string_t _LIBCPP_NODEBUG = basic_string<_CharT, _Traits, _Alloc>; diff --git a/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp b/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp index 42b4855a9fddd..625b194b1eb1a 100644 --- a/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp +++ b/libcxx/test/libcxx/utilities/utility/has_default_three_way.compile.pass.cpp @@ -18,7 +18,7 @@ static_assert(std::__has_default_three_way_comparator::value); static_assert(std::__has_default_three_way_comparator::value); static_assert(std::__has_default_three_way_comparator::value); -#if __has_builtin(__builtin_lt_synthesises_from_spaceship) +#if __has_builtin(__builtin_lt_synthesizes_from_spaceship) static_assert(std::__has_default_three_way_comparator::value); static_assert(std::__has_default_three_way_comparator::value); static_assert(std::__has_default_three_way_comparator::value); From ea452c0a020a7ad35bde86c0accabe1c994941aa Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Tue, 30 Sep 2025 02:49:04 -0400 Subject: [PATCH 218/878] [clangd] Fix off-by-one error in CommandMangler (#160029) SawInput() is intended to be called for every argument after a `--`, but it was mistakenly being called for the `--` itself. 
Partially fixes https://github.com/clangd/clangd/issues/1850
---
 clang-tools-extra/clangd/CompileCommands.cpp  |  3 ++-
 .../clangd/unittests/CompileCommandsTests.cpp | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp
index 80391fe8cce25..c9da98e96ccfb 100644
--- a/clang-tools-extra/clangd/CompileCommands.cpp
+++ b/clang-tools-extra/clangd/CompileCommands.cpp
@@ -270,7 +270,8 @@ void CommandMangler::operator()(tooling::CompileCommand &Command,
   if (auto *DashDash =
           ArgList.getLastArgNoClaim(driver::options::OPT__DASH_DASH)) {
     auto DashDashIndex = DashDash->getIndex() + 1; // +1 accounts for Cmd[0]
-    for (unsigned I = DashDashIndex; I < Cmd.size(); ++I)
+    // Another +1 so we don't treat the `--` itself as an input.
+    for (unsigned I = DashDashIndex + 1; I < Cmd.size(); ++I)
       SawInput(Cmd[I]);
     Cmd.resize(DashDashIndex);
   }
diff --git a/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp b/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp
index 2ce2975bd962b..e324404e627c2 100644
--- a/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp
@@ -526,6 +526,16 @@ TEST(CommandMangler, RespectsOriginalSysroot) {
                 Not(HasSubstr(testPath("fake/sysroot"))));
   }
 }
+
+TEST(CommandMangler, StdLatestFlag) {
+  const auto Mangler = CommandMangler::forTests();
+  tooling::CompileCommand Cmd;
+  Cmd.CommandLine = {"clang-cl", "/std:c++latest", "--", "/Users/foo.cc"};
+  Mangler(Cmd, "/Users/foo.cc");
+  // Check that the /std:c++latest flag is not dropped
+  EXPECT_THAT(llvm::join(Cmd.CommandLine, " "), HasSubstr("/std:c++latest"));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang

From 156e9b4b6989043024ea8c5d15360a716af51631 Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Tue, 30 Sep 2025 08:28:56 +0100
Subject: [PATCH 219/878] [WebAssembly] Use partial_reduce_mla ISD nodes
 (#161184)

Addressing issue #160847. Move away from combining the intrinsic call and
instead lower the ISD nodes, using tablegen for pattern matching.
---
 .../WebAssembly/WebAssemblyISelLowering.cpp   | 149 +-----------------
 .../WebAssembly/WebAssemblyISelLowering.h     |   5 +-
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  45 ++++++
 .../WebAssembly/partial-reduce-accumulate.ll  |  15 +-
 4 files changed, 61 insertions(+), 153 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 64b9dc31f75b7..163bf9ba5b089 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -186,7 +186,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
   // SIMD-specific configuration
   if (Subtarget->hasSIMD128()) {
-    // Combine partial.reduce.add before legalization gets confused.
     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

     // Combine wide-vector muls, with extend inputs, to extmul_half.
@@ -317,6 +316,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Custom);
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom);
     }
+
+    // Partial MLA reductions.
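+    // Marking these Legal keeps the PARTIAL_REDUCE_*MLA nodes intact through
+    // legalization, so the tablegen patterns in WebAssemblyInstrSIMD.td can
+    // select dot / extmul / extadd_pairwise sequences for them.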
+    for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) {
+      setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal);
+      setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v8i16, Legal);
+    }
   }

   // As a special case, these operators use the type to mean the type to
@@ -416,41 +421,6 @@ MVT WebAssemblyTargetLowering::getPointerMemTy(const DataLayout &DL,
   return TargetLowering::getPointerMemTy(DL, AS);
 }

-bool WebAssemblyTargetLowering::shouldExpandPartialReductionIntrinsic(
-    const IntrinsicInst *I) const {
-  if (I->getIntrinsicID() != Intrinsic::vector_partial_reduce_add)
-    return true;
-
-  EVT VT = EVT::getEVT(I->getType());
-  if (VT.getSizeInBits() > 128)
-    return true;
-
-  auto Op1 = I->getOperand(1);
-
-  if (auto *InputInst = dyn_cast<Instruction>(Op1)) {
-    unsigned Opcode = InstructionOpcodeToISD(InputInst->getOpcode());
-    if (Opcode == ISD::MUL) {
-      if (isa<Instruction>(InputInst->getOperand(0)) &&
-          isa<Instruction>(InputInst->getOperand(1))) {
-        // dot only supports signed inputs but also support lowering unsigned.
-        if (cast<Instruction>(InputInst->getOperand(0))->getOpcode() !=
-            cast<Instruction>(InputInst->getOperand(1))->getOpcode())
-          return true;
-
-        EVT Op1VT = EVT::getEVT(Op1->getType());
-        if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
-            ((VT.getVectorElementCount() * 2 ==
-              Op1VT.getVectorElementCount()) ||
-             (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount())))
-          return false;
-      }
-    } else if (ISD::isExtOpcode(Opcode)) {
-      return false;
-    }
-  }
-  return true;
-}
-
 TargetLowering::AtomicExpansionKind
 WebAssemblyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   // We have wasm instructions for these
@@ -2113,106 +2083,6 @@ SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op,
                       MachinePointerInfo(SV));
 }

-// Try to lower partial.reduce.add to a dot or fallback to a sequence with
-// extmul and adds.
-SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) {
-  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN);
-  if (N->getConstantOperandVal(0) != Intrinsic::vector_partial_reduce_add)
-    return SDValue();
-
-  assert(N->getValueType(0) == MVT::v4i32 && "can only support v4i32");
-  SDLoc DL(N);
-
-  SDValue Input = N->getOperand(2);
-  if (Input->getOpcode() == ISD::MUL) {
-    SDValue ExtendLHS = Input->getOperand(0);
-    SDValue ExtendRHS = Input->getOperand(1);
-    assert((ISD::isExtOpcode(ExtendLHS.getOpcode()) &&
-            ISD::isExtOpcode(ExtendRHS.getOpcode())) &&
-           "expected widening mul or add");
-    assert(ExtendLHS.getOpcode() == ExtendRHS.getOpcode() &&
-           "expected binop to use the same extend for both operands");
-
-    SDValue ExtendInLHS = ExtendLHS->getOperand(0);
-    SDValue ExtendInRHS = ExtendRHS->getOperand(0);
-    bool IsSigned = ExtendLHS->getOpcode() == ISD::SIGN_EXTEND;
-    unsigned LowOpc =
-        IsSigned ? WebAssemblyISD::EXTEND_LOW_S : WebAssemblyISD::EXTEND_LOW_U;
-    unsigned HighOpc = IsSigned ?
WebAssemblyISD::EXTEND_HIGH_S - : WebAssemblyISD::EXTEND_HIGH_U; - SDValue LowLHS; - SDValue LowRHS; - SDValue HighLHS; - SDValue HighRHS; - - auto AssignInputs = [&](MVT VT) { - LowLHS = DAG.getNode(LowOpc, DL, VT, ExtendInLHS); - LowRHS = DAG.getNode(LowOpc, DL, VT, ExtendInRHS); - HighLHS = DAG.getNode(HighOpc, DL, VT, ExtendInLHS); - HighRHS = DAG.getNode(HighOpc, DL, VT, ExtendInRHS); - }; - - if (ExtendInLHS->getValueType(0) == MVT::v8i16) { - if (IsSigned) { - // i32x4.dot_i16x8_s - SDValue Dot = DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, - ExtendInLHS, ExtendInRHS); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Dot); - } - - // (add (add (extmul_low_sx lhs, rhs), (extmul_high_sx lhs, rhs))) - MVT VT = MVT::v4i32; - AssignInputs(VT); - SDValue MulLow = DAG.getNode(ISD::MUL, DL, VT, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, VT, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, MulLow, MulHigh); - return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Add); - } else { - assert(ExtendInLHS->getValueType(0) == MVT::v16i8 && - "expected v16i8 input types"); - AssignInputs(MVT::v8i16); - // Lower to a wider tree, using twice the operations compared to above. - if (IsSigned) { - // Use two dots - SDValue DotLHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, LowLHS, LowRHS); - SDValue DotRHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, DotLHS, DotRHS); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } - - SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); - - SDValue AddLow = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, - MVT::v4i32, MulLow); - SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, - MVT::v4i32, MulHigh); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } - } else { - // Accumulate the input using extadd_pairwise. - assert(ISD::isExtOpcode(Input.getOpcode()) && "expected extend"); - bool IsSigned = Input->getOpcode() == ISD::SIGN_EXTEND; - unsigned PairwiseOpc = IsSigned ? 
WebAssemblyISD::EXT_ADD_PAIRWISE_S - : WebAssemblyISD::EXT_ADD_PAIRWISE_U; - SDValue ExtendIn = Input->getOperand(0); - if (ExtendIn->getValueType(0) == MVT::v8i16) { - SDValue Add = DAG.getNode(PairwiseOpc, DL, MVT::v4i32, ExtendIn); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } - - assert(ExtendIn->getValueType(0) == MVT::v16i8 && - "expected v16i8 input types"); - SDValue Add = - DAG.getNode(PairwiseOpc, DL, MVT::v4i32, - DAG.getNode(PairwiseOpc, DL, MVT::v8i16, ExtendIn)); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); - } -} - SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -3683,11 +3553,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performVectorTruncZeroCombine(N, DCI); case ISD::TRUNCATE: return performTruncateCombine(N, DCI); - case ISD::INTRINSIC_WO_CHAIN: { - if (auto AnyAllCombine = performAnyAllCombine(N, DCI.DAG)) - return AnyAllCombine; - return performLowerPartialReduction(N, DCI.DAG); - } + case ISD::INTRINSIC_WO_CHAIN: + return performAnyAllCombine(N, DCI.DAG); case ISD::MUL: return performMulCombine(N, DCI); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 72401a7a259c0..b33a8530310be 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -45,8 +45,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { /// right decision when generating code for different targets. const WebAssemblySubtarget *Subtarget; - bool - shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override; AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; bool shouldScalarizeBinop(SDValue VecOp) const override; FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, @@ -89,8 +87,7 @@ class WebAssemblyTargetLowering final : public TargetLowering { bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, - LLVMContext &Context, - const Type *RetTy) const override; + LLVMContext &Context, const Type *RetTy) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d8948ad2df037..130602650d34e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1504,6 +1504,51 @@ def : Pat<(v2f64 (extloadv2f32 (i64 I64:$addr))), defm Q15MULR_SAT_S : SIMDBinary; +//===----------------------------------------------------------------------===// +// Partial reductions, using: dot, extmul and extadd_pairwise +//===----------------------------------------------------------------------===// +// MLA: v8i16 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v8i16 V128:$lhs), + (v8i16 V128:$rhs))), + (ADD_I32x4 (DOT $lhs, $rhs), $acc)>; +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v8i16 V128:$lhs), + (v8i16 V128:$rhs))), + (ADD_I32x4 (ADD_I32x4 (EXTMUL_LOW_U_I32x4 $lhs, $rhs), + (EXTMUL_HIGH_U_I32x4 $lhs, $rhs)), + $acc)>; +// MLA: v16i8 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$lhs), + (v16i8 V128:$rhs))), + (ADD_I32x4 (ADD_I32x4 
(DOT (extend_low_s_I16x8 $lhs), + (extend_low_s_I16x8 $rhs)), + (DOT (extend_high_s_I16x8 $lhs), + (extend_high_s_I16x8 $rhs))), + $acc)>; +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v16i8 V128:$lhs), + (v16i8 V128:$rhs))), + (ADD_I32x4 (ADD_I32x4 (extadd_pairwise_u_I32x4 (EXTMUL_LOW_U_I16x8 $lhs, $rhs)), + (extadd_pairwise_u_I32x4 (EXTMUL_HIGH_U_I16x8 $lhs, $rhs))), + $acc)>; + +// Accumulate: v8i16 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v8i16 V128:$in), + (I16x8.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_s_I32x4 $in), $acc)>; + +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v8i16 V128:$in), + (I16x8.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_u_I32x4 $in), $acc)>; + +// Accumulate: v16i8 -> v4i32 +def : Pat<(v4i32 (partial_reduce_smla (v4i32 V128:$acc), (v16i8 V128:$in), + (I8x16.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_s_I32x4 (extadd_pairwise_s_I16x8 $in)), + $acc)>; +def : Pat<(v4i32 (partial_reduce_umla (v4i32 V128:$acc), (v16i8 V128:$in), + (I8x16.splat (i32 1)))), + (ADD_I32x4 (extadd_pairwise_u_I32x4 (extadd_pairwise_u_I16x8 $in)), + $acc)>; + //===----------------------------------------------------------------------===// // Relaxed swizzle //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll index 47ea762864cc2..a599f4653f323 100644 --- a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll +++ b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll @@ -19,11 +19,11 @@ define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef re ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 @@ -65,11 +65,11 @@ define hidden i32 @accumulate_add_s8_s8(ptr noundef readonly %a, ptr noundef re ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body @@ -108,12 +108,11 @@ define hidden i32 @accumulate_add_s8_u8(ptr noundef readonly %a, ptr noundef re ; MAX-BANDWIDTH: loop ; MAX-BANDWIDTH: v128.load -; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s -; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s -; MAX-BANDWIDTH: i32x4.add -; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s ; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 @@ -363,10 +362,10 @@ define hidden i32 @accumulate_add_u16_u16(ptr noundef readonly %a, ptr noundef ; MAX-BANDWIDTH: loop ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: 
%cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body @@ -402,10 +401,10 @@ define hidden i32 @accumulate_add_s16_s16(ptr noundef readonly %a, ptr noundef ; MAX-BANDWIDTH: loop ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s -; MAX-BANDWIDTH: i32x4.add ; MAX-BANDWIDTH: v128.load ; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s ; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.add entry: %cmp8.not = icmp eq i32 %N, 0 br i1 %cmp8.not, label %for.cond.cleanup, label %for.body From 635910d14e3f4be1921c5c130cfe7aed7237c619 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Tue, 30 Sep 2025 03:32:37 -0400 Subject: [PATCH 220/878] [clang][Tooling] Support 'c++latest' in InterpolatingCompilationDatabase (#160030) Fixes https://github.com/clangd/clangd/issues/527 Fixes https://github.com/clangd/clangd/issues/1850 --- .../clangd/unittests/CompileCommandsTests.cpp | 9 ++++++ .../InterpolatingCompilationDatabase.cpp | 30 ++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp b/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp index e324404e627c2..660540afd2320 100644 --- a/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp +++ b/clang-tools-extra/clangd/unittests/CompileCommandsTests.cpp @@ -536,6 +536,15 @@ TEST(CommandMangler, StdLatestFlag) { EXPECT_THAT(llvm::join(Cmd.CommandLine, " "), HasSubstr("/std:c++latest")); } +TEST(CommandMangler, StdLatestFlag_Inference) { + const auto Mangler = CommandMangler::forTests(); + tooling::CompileCommand Cmd; + Cmd.CommandLine = {"clang-cl", "/std:c++latest", "--", "/Users/foo.cc"}; + Mangler(Cmd, "/Users/foo.hpp"); + // Check that the /std:c++latest flag is not dropped during inference + EXPECT_THAT(llvm::join(Cmd.CommandLine, " "), HasSubstr("/std:c++latest")); +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp index 995019ca5a4d4..28568426a6c48 100644 --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp @@ -123,6 +123,15 @@ static types::ID foldType(types::ID Lang) { } } +// Return the language standard that's activated by the /std:c++latest +// flag in clang-CL mode. +static LangStandard::Kind latestLangStandard() { + // FIXME: Have a single source of truth for the mapping from + // c++latest --> c++26 that's shared by the driver code + // (clang/lib/Driver/ToolChains/Clang.cpp) and this file. + return LangStandard::lang_cxx26; +} + // A CompileCommand that can be applied to another file. struct TransferableCommand { // Flags that should not apply to all files are stripped from CommandLine. @@ -237,9 +246,16 @@ struct TransferableCommand { // --std flag may only be transferred if the language is the same. // We may consider "translating" these, e.g. c++11 -> c11. if (Std != LangStandard::lang_unspecified && foldType(TargetType) == Type) { - Result.CommandLine.emplace_back(( - llvm::Twine(ClangCLMode ? "/std:" : "-std=") + - LangStandard::getLangStandardForKind(Std).getName()).str()); + const char *Spelling = + LangStandard::getLangStandardForKind(Std).getName(); + // In clang-cl mode, the latest standard is spelled 'c++latest' rather + // than e.g. 'c++26', and the driver does not accept the latter, so emit + // the spelling that the driver does accept. 
+      if (ClangCLMode && Std == latestLangStandard()) {
+        Spelling = "c++latest";
+      }
+      Result.CommandLine.emplace_back(
+          (llvm::Twine(ClangCLMode ? "/std:" : "-std=") + Spelling).str());
     }
     Result.CommandLine.push_back("--");
     Result.CommandLine.push_back(std::string(Filename));
@@ -296,8 +312,14 @@ struct TransferableCommand {
   // Try to interpret the argument as '-std='.
   std::optional<LangStandard::Kind> tryParseStdArg(const llvm::opt::Arg &Arg) {
     using namespace driver::options;
-    if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ))
+    if (Arg.getOption().matches(ClangCLMode ? OPT__SLASH_std : OPT_std_EQ)) {
+      // "c++latest" is not a recognized LangStandard, but it's accepted by
+      // the clang driver in CL mode.
+      if (ClangCLMode && StringRef(Arg.getValue()) == "c++latest") {
+        return latestLangStandard();
+      }
       return LangStandard::getLangKind(Arg.getValue());
+    }
     return std::nullopt;
   }
 };

From 170b5fde8fc23525049ebbde8377f59971ee9b3f Mon Sep 17 00:00:00 2001
From: Hans Wennborg
Date: Tue, 30 Sep 2025 10:23:04 +0200
Subject: [PATCH 221/878] [Modules] Make -module-file-info print macro names
 in deterministic order (#161332)

Developers reported non-deterministic output from `-module-file-info`,
thinking this reflected non-determinism in the .pcm files themselves.
However, it turned out it was the printing that was non-deterministic:

```
$ cat /tmp/a.h
#define FOO 1
#define BAR 2
$ build/bin/clang -cc1 -std=c++20 -x c++ -emit-header-unit /tmp/a.h -o /tmp/a.pcm
$ build/bin/clang -cc1 -module-file-info /tmp/a.pcm | grep -A2 Definitions
   Macro Definitions:
     FOO
     BAR
$ build/bin/clang -cc1 -module-file-info /tmp/a.pcm | grep -A2 Definitions
   Macro Definitions:
     BAR
     FOO
```

Making the output deterministic also simplifies the test.

This is a follow-up to 360c5fe54c0758c73bf85453fd2913f371adc7d5
---
 clang/lib/Frontend/FrontendActions.cpp        | 19 ++++++++++--------
 .../Modules/cxx20-module-file-info-macros.cpp | 20 +++++++++----------
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
index 7424958d46612..d7d56b8166350 100644
--- a/clang/lib/Frontend/FrontendActions.cpp
+++ b/clang/lib/Frontend/FrontendActions.cpp
@@ -971,14 +971,17 @@ void DumpModuleInfoAction::ExecuteAction() {
   // Emit the macro definitions in the module file so that we can know how
   // much definitions in the module file quickly.
   // TODO: Emit the macro definition bodies completely.
-  if (auto FilteredMacros = llvm::make_filter_range(
-          R->getPreprocessor().macros(),
-          [](const auto &Macro) { return Macro.first->isFromAST(); });
-      !FilteredMacros.empty()) {
-    Out << "   Macro Definitions:\n";
-    for (/* pair<IdentifierInfo *, MacroState> */ const auto &Macro :
-         FilteredMacros)
-      Out << "     " << Macro.first->getName() << "\n";
+  {
+    std::vector<StringRef> MacroNames;
+    for (const auto &M : R->getPreprocessor().macros()) {
+      if (M.first->isFromAST())
+        MacroNames.push_back(M.first->getName());
+    }
+    llvm::sort(MacroNames);
+    if (!MacroNames.empty())
+      Out << "   Macro Definitions:\n";
+    for (StringRef Name : MacroNames)
+      Out << "     " << Name << "\n";
   }

   // Now let's print out any modules we did not see as part of the Primary.
diff --git a/clang/test/Modules/cxx20-module-file-info-macros.cpp b/clang/test/Modules/cxx20-module-file-info-macros.cpp index 3b67e9b9acd41..431c967fbbccd 100644 --- a/clang/test/Modules/cxx20-module-file-info-macros.cpp +++ b/clang/test/Modules/cxx20-module-file-info-macros.cpp @@ -36,28 +36,28 @@ #define REDEFINE // CHECK: Macro Definitions: -// CHECK-DAG: REDEFINE -// CHECK-DAG: FUNC_Macro -// CHECK-DAG: CONSTANT -// CHECK-DAG: FOO +// CHECK: CONSTANT +// CHECK: FOO +// CHECK: FUNC_Macro +// CHECK: REDEFINE // CHECK-NEXT: === //--- include_foo.h #include "foo.h" #undef REDEFINE // CHECK: Macro Definitions: -// CHECK-DAG: CONSTANT -// CHECK-DAG: FUNC_Macro -// CHECK-DAG: FOO +// CHECK: CONSTANT +// CHECK: FOO +// CHECK: FUNC_Macro // CHECK-NEXT: === //--- import_foo.h import "foo.h"; #undef REDEFINE // CHECK: Macro Definitions: -// CHECK-DAG: CONSTANT -// CHECK-DAG: FUNC_Macro -// CHECK-DAG: FOO +// CHECK: CONSTANT +// CHECK: FOO +// CHECK: FUNC_Macro // CHECK-NEXT: === //--- named_module.cppm From ff130f293dd7f9e280698b02066e8bf7116bcc3a Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 10:22:56 -0700 Subject: [PATCH 222/878] [MLIR] Apply clang-tidy fixes for misc-use-internal-linkage in LevelZeroRuntimeWrappers.cpp (NFC) --- mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp index 21eaf28c9f214..d0728274b94c8 100644 --- a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp @@ -328,12 +328,12 @@ struct DynamicEventPool { } }; -L0RTContextWrapper &getRtContext() { +static L0RTContextWrapper &getRtContext() { thread_local static L0RTContextWrapper rtContext(0); return rtContext; } -DynamicEventPool &getDynamicEventPool() { +static DynamicEventPool &getDynamicEventPool() { thread_local static DynamicEventPool dynEventPool{&getRtContext()}; return dynEventPool; } @@ -492,8 +492,8 @@ extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes, } template -void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count, - StreamWrapper *stream) { +static void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count, + StreamWrapper *stream) { L0RTContextWrapper &rtContext = getRtContext(); auto listType = rtContext.copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE) From f2c9abc2be79f4dab0103c79e5582cb6936e253f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 07:33:15 -0700 Subject: [PATCH 223/878] [MLIR] Apply clang-tidy fixes for readability-container-size-empty in Vectorization.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 15c467b21c81e..4919d9a26b8cb 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -287,7 +287,7 @@ struct VectorizationState { /// moment we only make sure that there are no broadcast dimensions, but this /// might change if indexing maps evolve. bool isValidMaskingMap(AffineMap maskingMap) { - return maskingMap.getBroadcastDims().size() == 0; + return maskingMap.getBroadcastDims().empty(); } /// Turn the input indexing map into a valid masking map. 
@@ -923,7 +923,7 @@ static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) { llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == 1) && "For statically shaped Linalg Ops, only one " "non-unit loop dim is expected"); - assert(loopRanges.size() != 0 && "Empty loops, nothing to analyse."); + assert(!loopRanges.empty() && "Empty loops, nothing to analyse."); size_t idx = loopRanges.size() - 1; for (; idx != 0; idx--) From 87bc0f7431891b3f5d83205f856be7362911790e Mon Sep 17 00:00:00 2001 From: Hongyu Chen Date: Tue, 30 Sep 2025 16:38:03 +0800 Subject: [PATCH 224/878] [VectorCombine] Preserve cast flags in foldBitOpOfCastConstant (#161237) Follow-up of #157822. --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 ++++++++++ .../Transforms/VectorCombine/X86/bitop-of-castops.ll | 8 ++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 32704bdb54f4f..d6eb00da11dc8 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1031,6 +1031,16 @@ bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) { // Create the cast operation directly to ensure we get a new instruction Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType()); + // Preserve cast instruction flags + if (RHSFlags.NNeg) + NewCast->setNonNeg(); + if (RHSFlags.NUW) + NewCast->setHasNoUnsignedWrap(); + if (RHSFlags.NSW) + NewCast->setHasNoSignedWrap(); + + NewCast->andIRFlags(LHSCast); + // Insert the new instruction Value *Result = Builder.Insert(NewCast); diff --git a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll index 79e72aaed6082..38c624e942343 100644 --- a/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll +++ b/llvm/test/Transforms/VectorCombine/X86/bitop-of-castops.ll @@ -357,7 +357,7 @@ define <4 x i32> @or_sext_v4i8_to_v4i32_constant_with_loss(<4 x i8> %a) { define <4 x i16> @and_trunc_nuw_nsw_constant(<4 x i32> %a) { ; CHECK-LABEL: @and_trunc_nuw_nsw_constant( ; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], -; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i16> +; CHECK-NEXT: [[AND:%.*]] = trunc nuw nsw <4 x i32> [[AND_INNER]] to <4 x i16> ; CHECK-NEXT: ret <4 x i16> [[AND]] ; %t1 = trunc nuw nsw <4 x i32> %a to <4 x i16> @@ -368,7 +368,7 @@ define <4 x i16> @and_trunc_nuw_nsw_constant(<4 x i32> %a) { define <4 x i8> @and_trunc_nuw_nsw_minus_constant(<4 x i32> %a) { ; CHECK-LABEL: @and_trunc_nuw_nsw_minus_constant( ; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], -; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i8> +; CHECK-NEXT: [[AND:%.*]] = trunc nuw <4 x i32> [[AND_INNER]] to <4 x i8> ; CHECK-NEXT: ret <4 x i8> [[AND]] ; %t1 = trunc nuw nsw <4 x i32> %a to <4 x i8> @@ -379,7 +379,7 @@ define <4 x i8> @and_trunc_nuw_nsw_minus_constant(<4 x i32> %a) { define <4 x i8> @and_trunc_nuw_nsw_multiconstant(<4 x i32> %a) { ; CHECK-LABEL: @and_trunc_nuw_nsw_multiconstant( ; CHECK-NEXT: [[AND_INNER:%.*]] = and <4 x i32> [[A:%.*]], -; CHECK-NEXT: [[AND:%.*]] = trunc <4 x i32> [[AND_INNER]] to <4 x i8> +; CHECK-NEXT: [[AND:%.*]] = trunc nuw <4 x i32> [[AND_INNER]] to <4 x i8> ; CHECK-NEXT: ret <4 x i8> [[AND]] ; %t1 = trunc nuw nsw <4 x i32> %a to <4 x i8> @@ -391,7 +391,7 @@ define <4 x i8> @and_trunc_nuw_nsw_multiconstant(<4 x i32> %a) { define <4 x i32> 
@or_zext_nneg_constant(<4 x i16> %a) {
; CHECK-LABEL: @or_zext_nneg_constant(
; CHECK-NEXT:    [[OR_INNER:%.*]] = or <4 x i16> [[A:%.*]],
; CHECK-NEXT:    [[OR:%.*]] = zext nneg <4 x i16> [[OR_INNER]] to <4 x i32>
; CHECK-NEXT:    ret <4 x i32> [[OR]]
;
  %z1 = zext nneg <4 x i16> %a to <4 x i32>

From 55054db762af3a177b89487a0c6aebf3efec5786 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Tue, 30 Sep 2025 17:42:50 +0900
Subject: [PATCH 225/878] [Uniformity] Remove unused PhiInput definition (NFC)
 (#161116)

This appears to have no users.
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 141816c304397..7fb0dbe22f12f 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -408,15 +408,6 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
                           const CycleT *);

 protected:
-  /// \brief Value/block pair representing a single phi input.
-  struct PhiInput {
-    ConstValueRefT value;
-    BlockT *predBlock;
-
-    PhiInput(ConstValueRefT value, BlockT *predBlock)
-        : value(value), predBlock(predBlock) {}
-  };
-
   const ContextT &Context;
   const FunctionT &F;
   const CycleInfoT &CI;

From 048922edbb9d5d319622956d91ef062a08039949 Mon Sep 17 00:00:00 2001
From: Jack Frankland
Date: Tue, 30 Sep 2025 09:44:06 +0100
Subject: [PATCH 226/878] [mlir][memref-to-spirv]: Remap Image Load
 Coordinates (#160495)

When converting a `memref.load` from the image address space to a
`spirv.ImageFetch`, ensure that we correctly map the load indices to width,
height and depth.

The lowering currently assumes linear image tiling, that is, a row-major
memory layout. This allows us to support any memref layout that is a
permutation of the dimensions; more complex layouts are not currently
supported.

Because the ordering of the dimensions in the vector passed to the image
fetch is the opposite of the ordering of the memref dimensions, a final
reversal of the mapped dimensions is always required.

---------

Signed-off-by: Jack Frankland
---
 .../MemRefToSPIRV/MemRefToSPIRV.cpp           |  41 +++-
 .../MemRefToSPIRV/memref-to-spirv.mlir        | 218 +++++++++++++-----
 2 files changed, 197 insertions(+), 62 deletions(-)

diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
index f44552c4556c2..a90dcc8cc3ef1 100644
--- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
+++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
@@ -699,6 +699,35 @@ LoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor,
   return success();
 }

+template <typename OpAdaptor>
+static FailureOr<SmallVector<Value>>
+extractLoadCoordsForComposite(memref::LoadOp loadOp, OpAdaptor adaptor,
+                              ConversionPatternRewriter &rewriter) {
+  // At present we only support linear "tiling" as specified in Vulkan; this
+  // means that texels are assumed to be laid out in memory in a row-major
+  // order. This allows us to support any memref layout that is a permutation
+  // of the dimensions. Future work will pass an optional image layout to the
+  // rewrite pattern so that we can support optimized target-specific tilings.
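+  // For example, given the layout map (d0, d1, d2) -> (d2, d1, d0) and load
+  // indices [x, y, z], the mapping loop below produces coords [z, y, x], and
+  // the final reversal yields the image coordinate vector [x, y, z].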
+  SmallVector<Value> indices = adaptor.getIndices();
+  AffineMap map = loadOp.getMemRefType().getLayout().getAffineMap();
+  if (!map.isPermutation())
+    return rewriter.notifyMatchFailure(
+        loadOp,
+        "Cannot lower memrefs with memory layout which is not a permutation");
+
+  // The memref's layout determines the dimension ordering, so we need to
+  // follow the map to get the ordering of the dimensions/indices.
+  const unsigned dimCount = map.getNumDims();
+  SmallVector<Value> coords(dimCount);
+  for (unsigned dim = 0; dim < dimCount; ++dim)
+    coords[map.getDimPosition(dim)] = indices[dim];
+
+  // We need to reverse the coordinates because the memref layout is slowest
+  // to fastest moving and the vector coordinates for the image op are fastest
+  // to slowest moving.
+  return llvm::to_vector(llvm::reverse(coords));
+}
+
 LogicalResult
 ImageLoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor,
                                     ConversionPatternRewriter &rewriter) const {
@@ -755,13 +784,17 @@ ImageLoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor,
   // Build a vector of coordinates or just a scalar index if we have a 1D image.
   Value coords;
-  if (memrefType.getRank() != 1) {
+  if (memrefType.getRank() == 1) {
+    coords = adaptor.getIndices()[0];
+  } else {
+    FailureOr<SmallVector<Value>> maybeCoords =
+        extractLoadCoordsForComposite(loadOp, adaptor, rewriter);
+    if (failed(maybeCoords))
+      return failure();
     auto coordVectorType = VectorType::get({loadOp.getMemRefType().getRank()},
                                            adaptor.getIndices().getType()[0]);
     coords = spirv::CompositeConstructOp::create(rewriter, loc, coordVectorType,
-                                                 adaptor.getIndices());
-  } else {
-    coords = adaptor.getIndices()[0];
+                                                 maybeCoords.value());
   }

   // Fetch the value out of the image.
diff --git a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
index e6321e99693ac..ab3c8b7397e1a 100644
--- a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
+++ b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
@@ -515,6 +515,12 @@ module attributes {
 // Check Image Support.
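+// The affine maps below exercise permuted memref layouts (column-major and a
+// fully reversed 3-D layout), which are supported, and a non-permutation
+// layout, which cannot be lowered to an image fetch.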
+// CHECK: #[[$COLMAJMAP:.*]] = affine_map<(d0, d1) -> (d1, d0)> +#col_major = affine_map<(d0, d1) -> (d1, d0)> +// CHECK: #[[$CUSTOMLAYOUTMAP:.*]] = affine_map<(d0, d1, d2) -> (d2, d1, d0)> +#custom = affine_map<(d0, d1, d2) -> (d2, d1, d0)> +// CHECK: #[[$NONPERMMAP:.*]] = affine_map<(d0, d1) -> (d0, d1 mod 2)> +#non_permutation = affine_map<(d0, d1) -> (d0, d1 mod 2)> module attributes { spirv.target_env = #spirv.target_env<#spirv.vce>, %[[ARG1:.*]]: memref<1xf32, #spirv.storage_class> func.func @load_from_image_1D(%arg0: memref<1xf32, #spirv.storage_class>, %arg1: memref<1xf32, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1xf32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<1xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<1xf32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> %cst = arith.constant 0 : index // CHECK: %[[COORDS:.*]] = builtin.unrealized_conversion_cast %{{.*}} : index to i32 // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> @@ -550,121 +556,206 @@ module attributes { } // CHECK-LABEL: @load_from_image_2D( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xf32, #spirv.storage_class>, %[[ARG1:.*]]: memref<1x1xf32, #spirv.storage_class> - func.func @load_from_image_2D(%arg0: memref<1x1xf32, #spirv.storage_class>, %arg1: memref<1x1xf32, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xf32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x4xf32, #spirv.storage_class>, %[[ARG1:.*]]: memref<2x4xf32, #spirv.storage_class> + func.func @load_from_image_2D(%arg0: memref<2x4xf32, #spirv.storage_class>, %arg1: memref<2x4xf32, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x4xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x4xf32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index + // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> + // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> + // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<2xi32> -> vector<4xf32> + // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> + %0 = memref.load %arg0[%y, %x] : memref<2x4xf32, #spirv.storage_class> + // CHECK: spirv.Store "StorageBuffer" 
%{{.*}}, %[[RESULT]] : f32 + memref.store %0, %arg1[%y, %x] : memref<2x4xf32, #spirv.storage_class> + return + } + + // CHECK-LABEL: @load_from_col_major_image_2D( + // CHECK-SAME: %[[ARG0:.*]]: memref<2x4xf32, #[[$COLMAJMAP]], #spirv.storage_class>, %[[ARG1:.*]]: memref<2x4xf32, #spirv.storage_class> + func.func @load_from_col_major_image_2D(%arg0: memref<2x4xf32, #col_major, #spirv.storage_class>, %arg1: memref<2x4xf32, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x4xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x4xf32, #[[$COLMAJMAP]], #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<2xi32> -> vector<4xf32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xf32, #spirv.storage_class> + %0 = memref.load %arg0[%x, %y] : memref<2x4xf32, #col_major, #spirv.storage_class> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f32 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xf32, #spirv.storage_class> + memref.store %0, %arg1[%y, %x] : memref<2x4xf32, #spirv.storage_class> return } // CHECK-LABEL: @load_from_image_3D( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1x1xf32, #spirv.storage_class>, %[[ARG1:.*]]: memref<1x1x1xf32, #spirv.storage_class> - func.func @load_from_image_3D(%arg0: memref<1x1x1xf32, #spirv.storage_class>, %arg1: memref<1x1x1xf32, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1x1xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1x1xf32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3x4xf32, #spirv.storage_class>, %[[ARG1:.*]]: memref<2x3x4xf32, #spirv.storage_class> + func.func @load_from_image_3D(%arg0: memref<2x3x4xf32, #spirv.storage_class>, %arg1: memref<2x3x4xf32, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3x4xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3x4xf32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 2 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = 
arith.constant 2 : index + // CHECK: %[[Z:.*]] = arith.constant 1 : index + // CHECK: %[[Z32:.*]] = builtin.unrealized_conversion_cast %[[Z]] : index to i32 + %z = arith.constant 1 : index + // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> + // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]], %[[Z32]] : (i32, i32, i32) -> vector<3xi32> + // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<3xi32> -> vector<4xf32> + // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> + %0 = memref.load %arg0[%z, %y, %x] : memref<2x3x4xf32, #spirv.storage_class> + // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f32 + memref.store %0, %arg1[%z, %y, %x] : memref<2x3x4xf32, #spirv.storage_class> + return + } + + // CHECK-LABEL: @load_from_custom_layout_image_3D( + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3x4xf32, #[[$CUSTOMLAYOUTMAP]], #spirv.storage_class>, %[[ARG1:.*]]: memref<2x3x4xf32, #spirv.storage_class> + func.func @load_from_custom_layout_image_3D(%arg0: memref<2x3x4xf32, #custom, #spirv.storage_class>, %arg1: memref<2x3x4xf32, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3x4xf32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3x4xf32, #[[$CUSTOMLAYOUTMAP]], #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 3 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 3 : index + // CHECK: %[[Y:.*]] = arith.constant 2 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 2 : index + // CHECK: %[[Z:.*]] = arith.constant 1 : index + // CHECK: %[[Z32:.*]] = builtin.unrealized_conversion_cast %[[Z]] : index to i32 + %z = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}}, %{{.*}} : (i32, i32, i32) -> vector<3xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]], %[[Z32]] : (i32, i32, i32) -> vector<3xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<3xi32> -> vector<4xf32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf32> - %0 = memref.load %arg0[%cst, %cst, %cst] : memref<1x1x1xf32, #spirv.storage_class> + %0 = memref.load %arg0[%x, %y, %z] : memref<2x3x4xf32, #custom, #spirv.storage_class> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f32 - memref.store %0, %arg1[%cst, %cst, %cst] : memref<1x1x1xf32, #spirv.storage_class> + memref.store %0, %arg1[%z, %y, %x] : memref<2x3x4xf32, #spirv.storage_class> return } // CHECK-LABEL: @load_from_image_2D_f16( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xf16, #spirv.storage_class>, %[[ARG1:.*]]: memref<1x1xf16, #spirv.storage_class> - func.func @load_from_image_2D_f16(%arg0: memref<1x1xf16, #spirv.storage_class>, %arg1: memref<1x1xf16, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xf16, #spirv.storage_class> to !spirv.ptr [0])>, 
StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xf16, #spirv.storage_class> to !spirv.ptr>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xf16, #spirv.storage_class>, %[[ARG1:.*]]: memref<2x3xf16, #spirv.storage_class> + func.func @load_from_image_2D_f16(%arg0: memref<2x3xf16, #spirv.storage_class>, %arg1: memref<2x3xf16, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xf16, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xf16, #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<2xi32> -> vector<4xf16> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xf16> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xf16, #spirv.storage_class> + %0 = memref.load %arg0[%y, %x] : memref<2x3xf16, #spirv.storage_class> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : f16 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xf16, #spirv.storage_class> + memref.store %0, %arg1[%y, %x] : memref<2x3xf16, #spirv.storage_class> return } // CHECK-LABEL: @load_from_image_2D_i32( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xi32, #spirv.storage_class>, %[[ARG1:.*]]: memref<1x1xi32, #spirv.storage_class> - func.func @load_from_image_2D_i32(%arg0: memref<1x1xi32, #spirv.storage_class>, %arg1: memref<1x1xi32, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xi32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xi32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xi32, #spirv.storage_class>, %[[ARG1:.*]]: memref<2x3xi32, #spirv.storage_class> + func.func @load_from_image_2D_i32(%arg0: memref<2x3xi32, #spirv.storage_class>, %arg1: memref<2x3xi32, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xi32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xi32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: 
%[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<2xi32> -> vector<4xi32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xi32> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xi32, #spirv.storage_class> + %0 = memref.load %arg0[%y, %x] : memref<2x3xi32, #spirv.storage_class> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : i32 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xi32, #spirv.storage_class> + memref.store %0, %arg1[%y, %x] : memref<2x3xi32, #spirv.storage_class> return } // CHECK-LABEL: @load_from_image_2D_ui32( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xui32, #spirv.storage_class>, %[[ARG1:.*]]: memref<1x1xui32, #spirv.storage_class> - func.func @load_from_image_2D_ui32(%arg0: memref<1x1xui32, #spirv.storage_class>, %arg1: memref<1x1xui32, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xui32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xui32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xui32, #spirv.storage_class>, %[[ARG1:.*]]: memref<2x3xui32, #spirv.storage_class> + func.func @load_from_image_2D_ui32(%arg0: memref<2x3xui32, #spirv.storage_class>, %arg1: memref<2x3xui32, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xui32, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xui32, #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<2xi32> -> vector<4xui32> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xui32> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xui32, #spirv.storage_class> + %0 = memref.load %arg0[%y, %x] : memref<2x3xui32, #spirv.storage_class> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : ui32 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xui32, #spirv.storage_class> + memref.store %0, %arg1[%y, %x] : memref<2x3xui32, #spirv.storage_class> return } // CHECK-LABEL: @load_from_image_2D_i16( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xi16, #spirv.storage_class>, %[[ARG1:.*]]: 
memref<1x1xi16, #spirv.storage_class> - func.func @load_from_image_2D_i16(%arg0: memref<1x1xi16, #spirv.storage_class>, %arg1: memref<1x1xi16, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xi16, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xi16, #spirv.storage_class> to !spirv.ptr>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xi16, #spirv.storage_class>, %[[ARG1:.*]]: memref<2x3xi16, #spirv.storage_class> + func.func @load_from_image_2D_i16(%arg0: memref<2x3xi16, #spirv.storage_class>, %arg1: memref<2x3xi16, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xi16, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xi16, #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<2xi32> -> vector<4xi16> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xi16> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xi16, #spirv.storage_class> + %0 = memref.load %arg0[%y, %x] : memref<2x3xi16, #spirv.storage_class> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : i16 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xi16, #spirv.storage_class> + memref.store %0, %arg1[%y, %x] : memref<2x3xi16, #spirv.storage_class> return } // CHECK-LABEL: @load_from_image_2D_ui16( - // CHECK-SAME: %[[ARG0:.*]]: memref<1x1xui16, #spirv.storage_class>, %[[ARG1:.*]]: memref<1x1xui16, #spirv.storage_class> - func.func @load_from_image_2D_ui16(%arg0: memref<1x1xui16, #spirv.storage_class>, %arg1: memref<1x1xui16, #spirv.storage_class>) { -// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %arg1 : memref<1x1xui16, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> -// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %arg0 : memref<1x1xui16, #spirv.storage_class> to !spirv.ptr>, UniformConstant> - %cst = arith.constant 0 : index + // CHECK-SAME: %[[ARG0:.*]]: memref<2x3xui16, #spirv.storage_class>, %[[ARG1:.*]]: memref<2x3xui16, #spirv.storage_class> + func.func @load_from_image_2D_ui16(%arg0: memref<2x3xui16, #spirv.storage_class>, %arg1: memref<2x3xui16, #spirv.storage_class>) { +// CHECK-DAG: %[[SB:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref<2x3xui16, #spirv.storage_class> to !spirv.ptr [0])>, StorageBuffer> +// CHECK-DAG: %[[IMAGE_PTR:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref<2x3xui16, #spirv.storage_class> to !spirv.ptr>, UniformConstant> + // CHECK: %[[X:.*]] = 
arith.constant 2 : index + // CHECK: %[[X32:.*]] = builtin.unrealized_conversion_cast %[[X]] : index to i32 + %x = arith.constant 2 : index + // CHECK: %[[Y:.*]] = arith.constant 1 : index + // CHECK: %[[Y32:.*]] = builtin.unrealized_conversion_cast %[[Y]] : index to i32 + %y = arith.constant 1 : index // CHECK: %[[SIMAGE:.*]] = spirv.Load "UniformConstant" %[[IMAGE_PTR]] : !spirv.sampled_image> // CHECK: %[[IMAGE:.*]] = spirv.Image %[[SIMAGE]] : !spirv.sampled_image> - // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %{{.*}}, %{{.*}} : (i32, i32) -> vector<2xi32> + // CHECK: %[[COORDS:.*]] = spirv.CompositeConstruct %[[X32]], %[[Y32]] : (i32, i32) -> vector<2xi32> // CHECK: %[[RES_VEC:.*]] = spirv.ImageFetch %[[IMAGE]], %[[COORDS]] : !spirv.image, vector<2xi32> -> vector<4xui16> // CHECK: %[[RESULT:.*]] = spirv.CompositeExtract %[[RES_VEC]][0 : i32] : vector<4xui16> - %0 = memref.load %arg0[%cst, %cst] : memref<1x1xui16, #spirv.storage_class> + %0 = memref.load %arg0[%y, %x] : memref<2x3xui16, #spirv.storage_class> // CHECK: spirv.Store "StorageBuffer" %{{.*}}, %[[RESULT]] : ui16 - memref.store %0, %arg1[%cst, %cst] : memref<1x1xui16, #spirv.storage_class> + memref.store %0, %arg1[%y, %x] : memref<2x3xui16, #spirv.storage_class> return } @@ -697,4 +788,15 @@ module attributes { memref.store %0, %arg1[%cst] : memref<1xvector<1xf32>, #spirv.storage_class> return } + + // CHECK-LABEL: @load_non_perm_layout( + func.func @load_non_perm_layout(%arg0: memref<2x4xf32, #non_permutation, #spirv.storage_class>, %arg1: memref<2x4xf32, #spirv.storage_class>) { + %x = arith.constant 3 : index + %y = arith.constant 1 : index + // CHECK-NOT: spirv.Image + // CHECK-NOT: spirv.ImageFetch + %0 = memref.load %arg0[%y, %x] : memref<2x4xf32, #non_permutation, #spirv.storage_class> + memref.store %0, %arg1[%y, %x] : memref<2x4xf32, #spirv.storage_class> + return + } } From 5123dfb51722a5075b3515c49930216869f595d6 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 06:23:46 -0700 Subject: [PATCH 227/878] [MLIR] Apply clang-tidy fixes for performance-unnecessary-value-param in ReifyValueBounds.cpp (NFC) --- .../include/mlir/Dialect/Arith/Transforms/Transforms.h | 4 ++-- mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h index 8d6c68cef680d..ffd367ef11abc 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h @@ -53,7 +53,7 @@ reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, /// ValueBoundsOpInterface, no bound can be computed. 
 FailureOr<OpFoldResult> reifyIndexValueBound(
     OpBuilder &b, Location loc, presburger::BoundType type, Value value,
-    ValueBoundsConstraintSet::StopConditionFn stopCondition = nullptr,
+    const ValueBoundsConstraintSet::StopConditionFn &stopCondition = nullptr,
     bool closedUB = false);
 
 /// Reify a bound for the specified dimension of the given shaped value in terms
 FailureOr<OpFoldResult> reifyShapedValueDimBound(
     OpBuilder &b, Location loc, presburger::BoundType type, Value value,
     int64_t dim,
-    ValueBoundsConstraintSet::StopConditionFn stopCondition = nullptr,
+    const ValueBoundsConstraintSet::StopConditionFn &stopCondition = nullptr,
     bool closedUB = false);
 
 } // namespace arith
diff --git a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp
index 4bdd1e6a54d69..127563c8f4967 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <utility>
+
 #include "mlir/Dialect/Arith/Transforms/Transforms.h"
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -69,7 +71,8 @@ FailureOr<OpFoldResult> mlir::arith::reifyValueBound(
   AffineMap boundMap;
   ValueDimList mapOperands;
   if (failed(ValueBoundsConstraintSet::computeBound(
-          boundMap, mapOperands, type, var, stopCondition, closedUB)))
+          boundMap, mapOperands, type, var, std::move(stopCondition),
+          closedUB)))
     return failure();
 
   // Materialize tensor.dim/memref.dim ops.
@@ -116,7 +119,7 @@ FailureOr<OpFoldResult> mlir::arith::reifyValueBound(
 
 FailureOr<OpFoldResult> mlir::arith::reifyShapedValueDimBound(
     OpBuilder &b, Location loc, presburger::BoundType type, Value value,
-    int64_t dim, ValueBoundsConstraintSet::StopConditionFn stopCondition,
+    int64_t dim, const ValueBoundsConstraintSet::StopConditionFn &stopCondition,
     bool closedUB) {
   auto reifyToOperands = [&](Value v, std::optional<int64_t> d,
                              ValueBoundsConstraintSet &cstr) {
@@ -134,7 +137,8 @@ FailureOr<OpFoldResult> mlir::arith::reifyShapedValueDimBound(
 
 FailureOr<OpFoldResult> mlir::arith::reifyIndexValueBound(
     OpBuilder &b, Location loc, presburger::BoundType type, Value value,
-    ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) {
+    const ValueBoundsConstraintSet::StopConditionFn &stopCondition,
+    bool closedUB) {
   auto reifyToOperands = [&](Value v, std::optional<int64_t> d,
                              ValueBoundsConstraintSet &cstr) {
     return v != value;

From 88658dbbc510c78bce62005bf7cf35d5b869113c Mon Sep 17 00:00:00 2001
From: Sam Tebbs
Date: Tue, 30 Sep 2025 10:10:37 +0100
Subject: [PATCH 228/878] [LV] Add ExtNegatedMulAccReduction expression type
 (#160154)

This PR adds the ExtNegatedMulAccReduction expression type for
VPExpressionRecipe so that extend-multiply-accumulate reductions with a
negated multiply can be bundled.

Stacked PRs:
1. https://github.com/llvm/llvm-project/pull/156976
2. -> https://github.com/llvm/llvm-project/pull/160154
3. https://github.com/llvm/llvm-project/pull/147302
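In IR terms, the new expression type bundles the negated
multiply-accumulate shape sketched below (a minimal illustration; the
vplan-printing test added by this patch exercises exactly this pattern
with zero-extended i8 loads):

```
%mul = mul i32 %ext.b, %ext.a   ; multiply of the extended operands
%sub = sub i32 0, %mul          ; negation of the multiply
%add = add i32 %accum, %sub     ; feeds the add-reduction chain
```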
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |  17 +++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  31 ++++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  19 ++-
 .../vplan-printing-reductions.ll              | 121 ++++++++++++++++++
 4 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 0822511150e9e..4c7a083e0d9b7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2997,6 +2997,10 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
     /// vector operands, performing a reduction.add on the result, and adding
     /// the scalar result to a chain.
     MulAccReduction,
+    /// Represent an inloop multiply-accumulate reduction, multiplying the
+    /// extended vector operands, negating the multiplication, performing a
+    /// reduction.add on the result, and adding the scalar result to a chain.
+    ExtNegatedMulAccReduction,
   };
 
   /// Type of the expression.
@@ -3020,6 +3024,19 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
                      VPWidenRecipe *Mul, VPReductionRecipe *Red)
       : VPExpressionRecipe(ExpressionTypes::ExtMulAccReduction,
                            {Ext0, Ext1, Mul, Red}) {}
+  VPExpressionRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+                     VPWidenRecipe *Mul, VPWidenRecipe *Sub,
+                     VPReductionRecipe *Red)
+      : VPExpressionRecipe(ExpressionTypes::ExtNegatedMulAccReduction,
+                           {Ext0, Ext1, Mul, Sub, Red}) {
+    assert(Mul->getOpcode() == Instruction::Mul && "Expected a mul");
+    assert(Red->getRecurrenceKind() == RecurKind::Add &&
+           "Expected an add reduction");
+    assert(getNumOperands() >= 3 && "Expected at least three operands");
+    auto *SubConst = dyn_cast<ConstantInt>(getOperand(2)->getLiveInIRValue());
+    assert(SubConst && SubConst->getValue() == 0 &&
+           Sub->getOpcode() == Instruction::Sub && "Expected a negating sub");
+  }
 
   ~VPExpressionRecipe() override {
     for (auto *R : reverse(ExpressionRecipes))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b5e30cb1fa655..ee03729f150b2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2839,12 +2839,17 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
     return Ctx.TTI.getMulAccReductionCost(false, Opcode, RedTy, SrcVecTy,
                                           Ctx.CostKind);
 
-  case ExpressionTypes::ExtMulAccReduction:
+  case ExpressionTypes::ExtNegatedMulAccReduction:
+    assert(Opcode == Instruction::Add && "Unexpected opcode");
+    Opcode = Instruction::Sub;
+    LLVM_FALLTHROUGH;
+  case ExpressionTypes::ExtMulAccReduction: {
     return Ctx.TTI.getMulAccReductionCost(
         cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
             Instruction::ZExt,
         Opcode, RedTy, SrcVecTy, Ctx.CostKind);
   }
+  }
   llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
 }
 
@@ -2890,6 +2895,30 @@ void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
     O << ")";
     break;
   }
+  case ExpressionTypes::ExtNegatedMulAccReduction: {
+    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
+    O << " + reduce."
+      << Instruction::getOpcodeName(
+             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
+      << " (sub (0, mul";
+    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
+    Mul->printFlags(O);
+    O << "(";
+    getOperand(0)->printAsOperand(O, SlotTracker);
+    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
+    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
+      << *Ext0->getResultType() << "), (";
+    getOperand(1)->printAsOperand(O, SlotTracker);
+    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
+    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
+      << *Ext1->getResultType() << ")";
+    if (Red->isConditional()) {
+      O << ", ";
+      Red->getCondOp()->printAsOperand(O, SlotTracker);
+    }
+    O << "))";
+    break;
+  }
   case ExpressionTypes::MulAccReduction:
   case ExpressionTypes::ExtMulAccReduction: {
     getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5252e1f928294..969dce4bc98ae 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -3543,7 +3543,15 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
   };
 
   VPValue *VecOp = Red->getVecOp();
+  VPRecipeBase *Sub = nullptr;
   VPValue *A, *B;
+  VPValue *Tmp = nullptr;
+  // Sub reductions could have a sub between the add reduction and vec op.
+  if (match(VecOp,
+            m_Binary<Instruction::Sub>(m_SpecificInt(0), m_VPValue(Tmp)))) {
+    Sub = VecOp->getDefiningRecipe();
+    VecOp = Tmp;
+  }
   // Try to match reduce.add(mul(...)).
   if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
     auto *RecipeA =
@@ -3560,12 +3568,21 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
         IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
                                        Instruction::CastOps::ZExt,
                                    Mul, RecipeA, RecipeB, nullptr)) {
+      if (Sub)
+        return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
+                                      cast<VPWidenRecipe>(Sub), Red);
       return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
     }
     // Match reduce.add(mul).
-    if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
+    // TODO: Add an expression type for this variant with a negated mul
+    if (!Sub &&
+        IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))
      return new VPExpressionRecipe(Mul, Red);
   }
+  // TODO: Add an expression type for negated versions of other expression
+  // variants.
+  if (Sub)
+    return nullptr;
   // Match reduce.add(ext(mul(ext(A), ext(B)))).
   // All extend recipes must have same opcode or A == B
   // which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 4e6ef0de6a9ed..5a0c69bf5db1b 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -580,6 +580,127 @@ exit: ret i32 %add } +define i32 @print_mulacc_negated(ptr %a, ptr %b) { +; CHECK-LABEL: 'print_mulacc_negated' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%3> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%3>, vp<%8> +; CHECK-NEXT: vp<%5> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%5> +; CHECK-NEXT: vp<%6> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<%6> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%5> +; CHECK-NEXT: vp<%7> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<%7> +; CHECK-NEXT: EXPRESSION vp<%8> = ir<%accum> + reduce.add (sub (0, mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%10> = compute-reduction-result ir<%accum>, vp<%8> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<1024>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<%10> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%2>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%10>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv +; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 +; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32 +; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv +; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 +; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32 +; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a +; CHECK-NEXT: IR %sub = sub i32 0, %mul +; CHECK-NEXT: IR %add = add i32 %accum, %sub +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024 +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<1024> = 
vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi ir<0>, ir<%add> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index> +; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index> +; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b> +; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32 +; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> +; CHECK-NEXT: WIDEN ir<%sub> = sub ir<0>, ir<%mul> +; CHECK-NEXT: REDUCE ir<%add> = ir<%accum> + reduce.add (ir<%sub>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add> +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[RED_RESULT]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %accum = phi i32 [ 0, %entry ], [ %add, %loop ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %sub = sub i32 0, %mul + %add = add i32 %accum, %sub + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret i32 %add +} + define i64 @print_mulacc_sub_extended(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { ; CHECK-LABEL: 'print_mulacc_sub_extended' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { From 6b19ccdf64a0022b9665b61e4003b4e87643829b Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Tue, 30 Sep 2025 10:17:07 +0100 Subject: [PATCH 229/878] [AArch64] Simplify some masked integer comparisons. (#153783) Specifically, `X & M ?= C --> (C << clz(M)) ?= (X << clz(M))` where M is a non-empty sequence of ones starting at the least significant bit with the remainder zero and C is a constant subset of M that cannot be materialised into a SUBS (immediate). Proof: https://alive2.llvm.org/ce/z/haqdJ4. 
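As a worked instance of the rule (using the isinf lowering shown next):
for `X & 0x7fffffff == 0x7f800000`, M = 0x7fffffff is a mask of 31 ones,
so clz(M) = 1 and the compare becomes `0xff000000 == (X << 1)`. The
shifted constant needs only a single MOV, and the shift folds into the
SUBS (shifted register), so the AND disappears.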
This improves the comparison in isinf, for example:
```cpp
int isinf(float x) { return __builtin_isinf(x); }
```

Before:
```
isinf:
        fmov    w9, s0
        mov     w8, #2139095040
        and     w9, w9, #0x7fffffff
        cmp     w9, w8
        cset    w0, eq
        ret
```

After:
```
isinf:
        fmov    w9, s0
        mov     w8, #-16777216
        cmp     w8, w9, lsl #1
        cset    w0, eq
        ret
```
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  26 +++
 llvm/test/CodeGen/AArch64/isinf.ll            |  10 +-
 .../CodeGen/AArch64/masked-integer-compare.ll | 178 ++++++++++++++++++
 3 files changed, 208 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/masked-integer-compare.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 899baa9c998ec..9078675da0e95 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25512,6 +25512,32 @@ SDValue performCONDCombine(SDNode *N,
                                        CmpIndex, CC))
     return Val;
 
+  // X & M ?= C --> (C << clz(M)) ?= (X << clz(M)) where M is a non-empty
+  // sequence of ones starting at the least significant bit with the remainder
+  // zero and C is a constant s.t. (C & ~M) == 0 that cannot be materialised
+  // into a SUBS (immediate). The transformed form can be matched into a SUBS
+  // (shifted register).
+  if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && AndNode->hasOneUse() &&
+      isa<ConstantSDNode>(AndNode->getOperand(1)) &&
+      isa<ConstantSDNode>(SubsNode->getOperand(1))) {
+    SDValue X = AndNode->getOperand(0);
+    APInt M = AndNode->getConstantOperandAPInt(1);
+    APInt C = SubsNode->getConstantOperandAPInt(1);
+
+    if (M.isMask() && C.isSubsetOf(M) && !isLegalArithImmed(C.getZExtValue())) {
+      SDLoc DL(SubsNode);
+      EVT VT = SubsNode->getValueType(0);
+      unsigned ShiftAmt = M.countl_zero();
+      SDValue ShiftedX = DAG.getNode(
+          ISD::SHL, DL, VT, X, DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
+      SDValue ShiftedC = DAG.getConstant(C << ShiftAmt, DL, VT);
+      SDValue NewSubs = DAG.getNode(AArch64ISD::SUBS, DL, SubsNode->getVTList(),
+                                    ShiftedC, ShiftedX);
+      DCI.CombineTo(SubsNode, NewSubs, NewSubs.getValue(1));
+      return SDValue(N, 0);
+    }
+  }
+
   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
     uint32_t CNV = CN->getZExtValue();
     if (CNV == 255)
diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
index e68539bcf07d9..e8bbaf96395f0 100644
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -27,9 +27,8 @@ define i32 @replace_isinf_call_f32(float %x) {
 ; CHECK-LABEL: replace_isinf_call_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    mov w8, #2139095040 // =0x7f800000
-; CHECK-NEXT:    and w9, w9, #0x7fffffff
-; CHECK-NEXT:    cmp w9, w8
+; CHECK-NEXT:    mov w8, #-16777216 // =0xff000000
+; CHECK-NEXT:    cmp w8, w9, lsl #1
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %abs = tail call float @llvm.fabs.f32(float %x)
@@ -43,9 +42,8 @@ define i32 @replace_isinf_call_f64(double %x) {
 ; CHECK-LABEL: replace_isinf_call_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    mov x8, #9218868437227405312 // =0x7ff0000000000000
-; CHECK-NEXT:    and x9, x9, #0x7fffffffffffffff
-; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    mov x8, #-9007199254740992 // =0xffe0000000000000
+; CHECK-NEXT:    cmp x8, x9, lsl #1
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %abs = tail call double @llvm.fabs.f64(double %x)
diff --git a/llvm/test/CodeGen/AArch64/masked-integer-compare.ll b/llvm/test/CodeGen/AArch64/masked-integer-compare.ll
new file mode 100644
index 0000000000000..363cd10c78a94
--- /dev/null
+++ 
b/llvm/test/CodeGen/AArch64/masked-integer-compare.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s + +; Test code generation support for SUBS (shifted register) from masked integer +; compare sequences. These sequences appear in isinf tests, for example. + +define i1 @combine_masked_i32(i32 %x) { +; CHECK-LABEL: combine_masked_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-16777216 // =0xff000000 +; CHECK-NEXT: cmp w8, w0, lsl #1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %sub = sub i32 %and, u0x7f800000 + %cmp = icmp eq i32 %sub, 0 + ret i1 %cmp +} + +define i1 @combine_masked_i64(i64 %x) { +; CHECK-LABEL: combine_masked_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #-9007199254740992 // =0xffe0000000000000 +; CHECK-NEXT: cmp x8, x0, lsl #1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i64 %x, u0x7fffffffffffffff + %sub = sub i64 %and, u0x7ff0000000000000 + %cmp = icmp eq i64 %sub, 0 + ret i1 %cmp +} + +define i1 @combine_masked_ne(i32 %x) { +; CHECK-LABEL: combine_masked_ne: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-16777216 // =0xff000000 +; CHECK-NEXT: cmp w8, w0, lsl #1 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp ne i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @combine_masked_lsl4(i32 %x) { +; CHECK-LABEL: combine_masked_lsl4: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-134217728 // =0xf8000000 +; CHECK-NEXT: cmp w8, w0, lsl #4 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x0fffffff + %cmp = icmp eq i32 %and, u0x0f800000 + ret i1 %cmp +} + +define i1 @dont_combine_not_mask(i32 %x) { +; CHECK-LABEL: dont_combine_not_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w9, w0, #0x7ffffffe +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7ffffffe + %cmp = icmp eq i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @dont_combine_cmp_not_masked(i32 %x) { +; CHECK-LABEL: dont_combine_cmp_not_masked: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w9, w0, #0x3fffffff +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x3fffffff + %cmp = icmp eq i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @dont_combine_not_constant_mask(i32 %x, i32 %m) { +; CHECK-LABEL: dont_combine_not_constant_mask: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w9, w0, w1 +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, %m + %cmp = icmp eq i32 %and, u0x7f800000 + ret i1 %cmp +} + +define i1 @dont_combine_not_constant_cmp(i32 %x, i32 %c) { +; CHECK-LABEL: dont_combine_not_constant_cmp: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xfffffff +; CHECK-NEXT: cmp w8, w1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x0fffffff + %cmp = icmp eq i32 %and, %c + ret i1 %cmp +} + +define i1 @dont_combine_subs_imm(i32 %x) { +; CHECK-LABEL: dont_combine_subs_imm: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0x7fffffff +; CHECK-NEXT: cmp w8, #291 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp eq i32 %and, u0x123 + ret i1 %cmp +} + +define i1 @dont_combine_subs_imm_lsl12(i32 %x) { +; CHECK-LABEL: dont_combine_subs_imm_lsl12: +; 
CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0x7fffffff +; CHECK-NEXT: cmp w8, #291, lsl #12 // =1191936 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp eq i32 %and, u0x123000 + ret i1 %cmp +} + +define { i1, i1 } @dont_combine_multi_use_cmp(i32 %x) { +; CHECK-LABEL: dont_combine_multi_use_cmp: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w9, w0, #0x7fffffff +; CHECK-NEXT: cmp w9, w8 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: cset w1, lt +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %eq = icmp eq i32 %and, u0x7f800000 + %lt = icmp slt i32 %and, u0x7f800000 + %r1 = insertvalue { i1, i1 } poison, i1 %eq, 0 + %r2 = insertvalue { i1, i1 } %r1, i1 %lt, 1 + ret { i1, i1 } %r2 +} + +define { i32, i1 } @dont_combine_multi_use_sub(i32 %x) { +; CHECK-LABEL: dont_combine_multi_use_sub: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2139095040 // =0x80800000 +; CHECK-NEXT: and w9, w0, #0x7fffffff +; CHECK-NEXT: adds w0, w9, w8 +; CHECK-NEXT: cset w1, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %sub = sub i32 %and, u0x7f800000 + %cmp = icmp eq i32 %sub, 0 + %r1 = insertvalue { i32, i1 } poison, i32 %sub, 0 + %r2 = insertvalue { i32, i1 } %r1, i1 %cmp, 1 + ret { i32, i1 } %r2 +} + +define { i32, i1 } @dont_combine_multi_use_and(i32 %x) { +; CHECK-LABEL: dont_combine_multi_use_and: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2139095040 // =0x7f800000 +; CHECK-NEXT: and w0, w0, #0x7fffffff +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w1, eq +; CHECK-NEXT: ret + %and = and i32 %x, u0x7fffffff + %cmp = icmp eq i32 %and, u0x7f800000 + %r1 = insertvalue { i32, i1 } poison, i32 %and, 0 + %r2 = insertvalue { i32, i1 } %r1, i1 %cmp, 1 + ret { i32, i1 } %r2 +} From 0d9dd60815e9b802cd2a02f0eec13941660e8fb7 Mon Sep 17 00:00:00 2001 From: BaiXilin Date: Tue, 30 Sep 2025 05:41:12 -0400 Subject: [PATCH 230/878] [x86][AVX-VNNI] Fix VPDPBXXD Argument Type (#159222) Fixed intrinsic VPDP[SS,SU,UU]D[,S]_128/256/512's argument types to match with the ISA. Fixes part of #97271. 
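For example, the 128-bit signed-signed builtin now takes byte vectors
for both multiplicands while the accumulator stays a vector of 32-bit
ints (excerpt from the BuiltinsX86.td change below; the other widths and
sign combinations follow the same pattern):

```
-  def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">;
```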
--- clang/include/clang/Basic/BuiltinsX86.td | 36 +- clang/lib/Headers/avx10_2_512niintrin.h | 24 +- clang/lib/Headers/avxvnniint8intrin.h | 88 ++- clang/test/CodeGen/X86/avxvnniint8-builtins.c | 24 +- llvm/include/llvm/IR/IntrinsicsX86.td | 36 +- llvm/lib/IR/AutoUpgrade.cpp | 56 +- .../Instrumentation/MemorySanitizer.cpp | 14 +- .../CodeGen/X86/avx10.2-intrinsic-upgrade.ll | 99 +++ .../CodeGen/X86/avx10_2_512ni-intrinsics.ll | 54 +- llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll | 72 +- .../X86/avxvnniint8-intrinsics-upgrade.ll | 318 +++++++++ .../CodeGen/X86/avxvnniint8-intrinsics.ll | 120 ++-- .../X86/stack-folding-int-avxvnniint8.ll | 120 ++-- .../X86/avx10_2_512ni-intrinsics.ll | 296 ++++---- .../X86/avx10_2ni-intrinsics.ll | 352 ++++++---- .../X86/avxvnniint8-intrinsics.ll | 656 +++++++++++------- .../mlir/Dialect/X86Vector/X86Vector.td | 5 - .../Dialect/X86Vector/IR/X86VectorDialect.cpp | 23 - 18 files changed, 1586 insertions(+), 807 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll create mode 100644 llvm/test/CodeGen/X86/avxvnniint8-intrinsics-upgrade.ll diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 77e599587edc3..e98bee28c15be 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1109,51 +1109,51 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5 } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbsuds128 : 
X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpbuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, unsigned char>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpbuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, unsigned char>)">; } let Features = "movrs", Attributes = [NoThrow, Const] in { @@ -4282,12 +4282,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<256> let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def vdpphps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<32, _Float16>, _Vector<32, _Float16>)">; - def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpbssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; + def vpdpbssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, char>)">; + def vpdpbsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">; + def vpdpbsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, char>, _Vector<64, unsigned char>)">; + def vpdpbuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">; + def vpdpbuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, unsigned char>)">; } let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in { diff --git 
a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index 67679fce82296..fdb57c7c9e27b 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -64,8 +64,8 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_dpph_ps(__mmask16 __U, static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssd_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbssd512((__v16si)__W, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -84,8 +84,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbssds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbssds512((__v16si)__W, (__v64qi)__A, + (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbssds_epi32( @@ -104,8 +104,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbssds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsud_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbsud512((__v16si)__W, (__v64qi)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -124,8 +124,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbsuds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbsuds512((__v16si)__W, (__v64qi)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbsuds_epi32( @@ -144,8 +144,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuud_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbuud512((__v16si)__W, (__v64qu)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -164,8 +164,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpbuuds_epi32(__m512i __W, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpbuuds512((__v16si)__W, (__v64qu)__A, + (__v64qu)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbuuds_epi32( diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h index c211620c68f07..858b66b138f31 100644 --- a/clang/lib/Headers/avxvnniint8intrin.h +++ b/clang/lib/Headers/avxvnniint8intrin.h @@ -14,6 +14,7 @@ #ifndef __AVXVNNIINT8INTRIN_H #define __AVXVNNIINT8INTRIN_H +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. 
Sum these 4 results with the corresponding @@ -44,10 +45,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbssd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -78,10 +81,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbssd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -94,7 +99,7 @@ /// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -113,10 +118,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbssds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -129,7 +136,7 @@ /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSSDS instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -148,10 +155,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbssds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qi)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -163,7 +172,7 @@ /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUD instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. 
@@ -182,10 +191,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -197,7 +208,7 @@ /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUD instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -216,10 +227,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -232,7 +245,7 @@ /// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. /// /// \param __A /// A 128-bit vector of [16 x char]. @@ -251,10 +264,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v16qi)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -267,7 +282,7 @@ /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBSUDS instruction. /// /// \param __A /// A 256-bit vector of [32 x char]. @@ -286,10 +301,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v32qi)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -301,7 +318,7 @@ /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBUUD instruction. /// /// \param __A /// A 128-bit vector of [16 x unsigned char]. 
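None of this changes the user-facing surface: every macro still takes and returns __m128i or __m256i, so the move to byte-element casts is source-compatible for callers. A hedged usage sketch (hypothetical code, not from this patch; assumes compilation with -mavxvnniint8):

#include <immintrin.h>

/* Illustrative caller: both inputs are interpreted as unsigned bytes by
   _mm_dpbuud_epi32; the __v16qu casts happen inside the macro below. */
__m128i accumulate_uu(__m128i acc, __m128i a, __m128i b) {
  return _mm_dpbuud_epi32(acc, a, b);
}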
@@ -320,10 +337,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbuud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v16qu)(__A), \ + (__v16qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -335,7 +354,7 @@ /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B); /// \endcode /// -/// This intrinsic corresponds to the \c VPDPBSSD instruction. +/// This intrinsic corresponds to the \c VPDPBUUD instruction. /// /// \param __A /// A 256-bit vector of [32 x unsigned char]. @@ -354,10 +373,12 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbuud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v32qu)(__A), \ + (__v32qu)(__B))) +// clang-format off /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. Sum these 4 results with the corresponding @@ -389,10 +410,12 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +// clang-format on #define _mm_dpbuuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v16qu)(__A), \ + (__v16qu)(__B))) +// clang-format off /// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate /// signed 16-bit results. 
Sum these 4 results with the corresponding /// 32-bit integer in \a __W with signed saturation, and store the packed @@ -423,8 +446,9 @@ /// ENDFOR /// dst[MAX:256] := 0 /// \endcode +// clang-format on #define _mm256_dpbuuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v32qu)(__A), \ + (__v32qu)(__B))) #endif // __AVXVNNIINT8INTRIN_H diff --git a/clang/test/CodeGen/X86/avxvnniint8-builtins.c b/clang/test/CodeGen/X86/avxvnniint8-builtins.c index dd4a4483abaab..021e658cc9d2c 100644 --- a/clang/test/CodeGen/X86/avxvnniint8-builtins.c +++ b/clang/test/CodeGen/X86/avxvnniint8-builtins.c @@ -10,73 +10,73 @@ #include <immintrin.h> // CHECK-LABEL: test_mm_dpbssd_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbssd_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbssds_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbssds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbsud_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbsud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbsuds_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbsuds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbuud_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbuud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm_dpbuuds_epi32 -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) __m128i test_mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B) { return _mm_dpbuuds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbssd_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbssd_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbssds_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32>
@llvm.x86.avx2.vpdpbssds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbssds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbsud_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbsud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbsuds_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbsuds_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbuud_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbuud_epi32(__W, __A, __B); } // CHECK-LABEL: test_mm256_dpbuuds_epi32 -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) __m256i test_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) { return _mm256_dpbuuds_epi32(__W, __A, __B); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 4af9ffc52ba6b..81fbfbf0bb1b4 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1919,62 +1919,62 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_avx2_vpdpbssd_128 : ClangBuiltin<"__builtin_ia32_vpdpbssd128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssd_256 : ClangBuiltin<"__builtin_ia32_vpdpbssd256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssds_128 : ClangBuiltin<"__builtin_ia32_vpdpbssds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssds_256 : ClangBuiltin<"__builtin_ia32_vpdpbssds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsud_128 : ClangBuiltin<"__builtin_ia32_vpdpbsud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsud_256 : ClangBuiltin<"__builtin_ia32_vpdpbsud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsuds_128 : ClangBuiltin<"__builtin_ia32_vpdpbsuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbsuds_256 : ClangBuiltin<"__builtin_ia32_vpdpbsuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuud_128 : ClangBuiltin<"__builtin_ia32_vpdpbuud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuud_256 : ClangBuiltin<"__builtin_ia32_vpdpbuud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuuds_128 : ClangBuiltin<"__builtin_ia32_vpdpbuuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbuuds_256 : ClangBuiltin<"__builtin_ia32_vpdpbuuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsud_128 @@ -5000,32 +5000,32 @@ let TargetPrefix = "x86" in { def int_x86_avx10_vpdpbssd_512 : ClangBuiltin<"__builtin_ia32_vpdpbssd512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbssds_512 : ClangBuiltin<"__builtin_ia32_vpdpbssds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbsud_512 : ClangBuiltin<"__builtin_ia32_vpdpbsud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbsuds_512 : 
ClangBuiltin<"__builtin_ia32_vpdpbsuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbuud_512 : ClangBuiltin<"__builtin_ia32_vpdpbuud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx10_vpdpbuuds_512 : ClangBuiltin<"__builtin_ia32_vpdpbuuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; // VNNI INT16 def int_x86_avx10_vpdpwsud_512 : diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 5385b1f8cac0b..f28b98957cae4 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -594,6 +594,42 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, return false; // No other 'x86.avx512.*'. } + if (Name.consume_front("avx2.vpdpb")) { + // Added in 21.1 + ID = StringSwitch<Intrinsic::ID>(Name) + .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128) + .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256) + .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128) + .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256) + .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128) + .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256) + .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128) + .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256) + .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128) + .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256) + .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128) + .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + return false; // No other 'x86.avx2.*' + } + + if (Name.consume_front("avx10.vpdpb")) { + // Added in 21.1 + ID = StringSwitch<Intrinsic::ID>(Name) + .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512) + .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512) + .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512) + .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512) + .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512) + .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + return false; // No other 'x86.avx10.*' + } + if (Name.consume_front("avx512bf16.")) { // Added in 9.0 ID = StringSwitch<Intrinsic::ID>(Name) @@ -5224,7 +5260,25 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::x86_avx512_vpdpbusd_512: case Intrinsic::x86_avx512_vpdpbusds_128: case Intrinsic::x86_avx512_vpdpbusds_256: - case Intrinsic::x86_avx512_vpdpbusds_512: { + case Intrinsic::x86_avx512_vpdpbusds_512: + case Intrinsic::x86_avx2_vpdpbssd_128: + case Intrinsic::x86_avx2_vpdpbssd_256: + case Intrinsic::x86_avx10_vpdpbssd_512: + case Intrinsic::x86_avx2_vpdpbssds_128: + case Intrinsic::x86_avx2_vpdpbssds_256: + case Intrinsic::x86_avx10_vpdpbssds_512: + case Intrinsic::x86_avx2_vpdpbsud_128: + case Intrinsic::x86_avx2_vpdpbsud_256: + case Intrinsic::x86_avx10_vpdpbsud_512: + case Intrinsic::x86_avx2_vpdpbsuds_128: + case Intrinsic::x86_avx2_vpdpbsuds_256: + case Intrinsic::x86_avx10_vpdpbsuds_512: + case Intrinsic::x86_avx2_vpdpbuud_128: + case Intrinsic::x86_avx2_vpdpbuud_256: + case Intrinsic::x86_avx10_vpdpbuud_512: + case
Intrinsic::x86_avx2_vpdpbuuds_128: + case Intrinsic::x86_avx2_vpdpbuuds_256: + case Intrinsic::x86_avx10_vpdpbuuds_512: { unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 8; Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b988957dfbc08..cf076b9ad70ee 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -5810,10 +5810,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpbusds_512: case Intrinsic::x86_avx2_vpdpbssd_128: case Intrinsic::x86_avx2_vpdpbssd_256: + case Intrinsic::x86_avx10_vpdpbssd_512: case Intrinsic::x86_avx2_vpdpbssds_128: case Intrinsic::x86_avx2_vpdpbssds_256: - case Intrinsic::x86_avx10_vpdpbssd_512: case Intrinsic::x86_avx10_vpdpbssds_512: + case Intrinsic::x86_avx2_vpdpbsud_128: + case Intrinsic::x86_avx2_vpdpbsud_256: + case Intrinsic::x86_avx10_vpdpbsud_512: + case Intrinsic::x86_avx2_vpdpbsuds_128: + case Intrinsic::x86_avx2_vpdpbsuds_256: + case Intrinsic::x86_avx10_vpdpbsuds_512: + case Intrinsic::x86_avx2_vpdpbuud_128: + case Intrinsic::x86_avx2_vpdpbuud_256: + case Intrinsic::x86_avx10_vpdpbuud_512: + case Intrinsic::x86_avx2_vpdpbuuds_128: + case Intrinsic::x86_avx2_vpdpbuuds_256: + case Intrinsic::x86_avx10_vpdpbuuds_512: handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); break; diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll new file mode 100644 index 0000000000000..76d84c1159ee4 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=X64 + +declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbssd_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbssd_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbssds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbssds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x
i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbsud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbsud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbsuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbsuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbuud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbuud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpbuuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpbuuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll index 09eb53faaaada..a2aad604f19bc 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll @@ -53,7 +53,7 @@ declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 ; VNNI INT8 -define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -64,12 +64,12 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, 
ptr ; X64: # %bb.0: ; X64-NEXT: vpdpbssd (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x77,0x48,0x50,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_mask_dpbssds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -81,13 +81,13 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_maskz_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -99,16 +99,16 @@ define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssd %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xc9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -119,12 +119,12 @@ define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; X64: # %bb.0: ; X64-NEXT: vpdpbsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0x50,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr 
%pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_mask_dpbsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -136,13 +136,13 @@ define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_maskz_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -154,16 +154,16 @@ define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -174,12 +174,12 @@ define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; X64: # %bb.0: ; X64-NEXT: vpdpbuud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0x50,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x 
i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_mask_dpbuuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -191,13 +191,13 @@ define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x49,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) { ; X86-LABEL: test_mm512_maskz_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -209,14 +209,14 @@ define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xc9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8>) +declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>) ; VNNI INT16 diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll index 0c5fd3bf9d241..1f270d539cdb4 100644 --- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll @@ -101,7 +101,7 @@ declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x ; VNNI INT8 -define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_mask_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -113,13 +113,13 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret 
<4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_maskz_dpbssds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -131,13 +131,13 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0x89,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_maskz_dpbssds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -149,13 +149,13 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_mask_dpbssd_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -167,18 +167,18 @@ define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x77,0xa9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> 
@llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_mask_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -190,13 +190,13 @@ define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_maskz_dpbsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -208,13 +208,13 @@ define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_maskz_dpbsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -226,13 +226,13 @@ define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_mask_dpbsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -244,18 +244,18 @@ define <8 x i32> 
@test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_mask_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -267,13 +267,13 @@ define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x74,0x09,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) { ; X86-LABEL: test_mm_maskz_dpbuuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -285,13 +285,13 @@ define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0x89,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_maskz_dpbuuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), 
%k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -303,13 +303,13 @@ define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x74,0x29,0x51,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) { ; X86-LABEL: test_mm256_mask_dpbuud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -321,16 +321,16 @@ define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x74,0xa9,0x50,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) ; VNNI INT16 diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics-upgrade.ll new file mode 100644 index 0000000000000..ce9a0fb0d5336 --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics-upgrade.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 --show-mc-encoding | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64 + +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X86-NEXT: retl # 
encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x73,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssd_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssd_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssd_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbssds_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbssds_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x77,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbssds_256: +; 
AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbssds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsud_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsud_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsud_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsud_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsud_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsud 
%ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbsuds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuud(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuud: +; X86: # %bb.0: +; X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuud: +; X64: # %bb.0: +; X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; X86: # %bb.0: +; X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; X64: # %bb.0: +; X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_128: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> 
@llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuud_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuud_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x50,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuud_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuud_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; X86-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; X86: # %bb.0: +; X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; X64: # %bb.0: +; X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0x51,0xc2] +; X64-NEXT: retq # encoding: [0xc3] +; +; AVX10-X86-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; AVX10-X86: # %bb.0: +; AVX10-X86-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2] +; AVX10-X86-NEXT: retl # encoding: [0xc3] +; +; AVX10-X64-LABEL: test_int_x86_avx2_vpdpbuuds_256: +; AVX10-X64: # %bb.0: +; AVX10-X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2] +; AVX10-X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + diff --git a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll index 0ddd0171a58a0..6c3d90aab77e8 100644 --- a/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxvnniint8-intrinsics.ll @@ -5,9 +5,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 --show-mc-encoding | FileCheck %s --check-prefixes=AVX10-X64 -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssd_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -41,16 +41,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 
= call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -84,16 +84,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbssds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x77,0x08,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssd_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -127,16 +127,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbssds_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: 
[0x8b,0x44,0x24,0x04] @@ -170,16 +170,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x77,0x28,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsud_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -213,16 +213,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsuds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -256,16 +256,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } 
-declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsud_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -299,16 +299,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbsuds_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -342,16 +342,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbuud_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -385,16 +385,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 
x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) { +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbuuds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -428,16 +428,16 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; X86-LABEL: test_int_x86_avx2_vpdpbuud_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -471,16 +471,16 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; AVX10-X64-NEXT: vpdpbuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x50,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) { +define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) { ; 
X86-LABEL: test_int_x86_avx2_vpdpbuuds_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -514,9 +514,9 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; AVX10-X64-NEXT: vpdpbuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0x51,0xc2] ; AVX10-X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; AVX10-X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll index fd988f7d318fe..a49d3a552f556 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint8.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnniint8 < %s | FileCheck %s -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <16 x i8> %a1, <16 x 
i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -24,11 +24,11 @@ define <4 x i32> @stack_fold_vpdpbssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -38,11 +38,11 @@ define <4 x i32> @stack_fold_vpdpbssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -52,11 +52,11 @@ define <8 x i32> @stack_fold_vpdpbssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssd_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -66,11 +66,11 @@ define <8 x i32> @stack_fold_vpdpbssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: vpdpbssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x 
i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -80,11 +80,11 @@ define <4 x i32> @stack_fold_vpdpbssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -94,11 +94,11 @@ define <4 x i32> @stack_fold_vpdpbssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -108,11 +108,11 @@ define <8 x i32> @stack_fold_vpdpbssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbssds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -122,11 +122,11 @@ define <8 x i32> @stack_fold_vpdpbssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: vpdpbssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x 
i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -136,11 +136,11 @@ define <4 x i32> @stack_fold_vpdpbsud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -151,11 +151,11 @@ define <4 x i32> @stack_fold_vpdpbsud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbsud %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -165,11 +165,11 @@ define <8 x i32> @stack_fold_vpdpbsud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: vpdpbsud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsud_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -180,11 +180,11 @@ define <8 x i32> @stack_fold_vpdpbsud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: vpdpbsud %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> 
@stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -194,11 +194,11 @@ define <4 x i32> @stack_fold_vpdpbsuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -209,11 +209,11 @@ define <4 x i32> @stack_fold_vpdpbsuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbsuds %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -223,11 +223,11 @@ define <8 x i32> @stack_fold_vpdpbsuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: vpdpbsuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbsuds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -238,11 +238,11 @@ define <8 x i32> @stack_fold_vpdpbsuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: vpdpbsuds %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x 
i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -252,11 +252,11 @@ define <4 x i32> @stack_fold_vpdpbuud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -266,11 +266,11 @@ define <4 x i32> @stack_fold_vpdpbuud_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -280,11 +280,11 @@ define <8 x i32> @stack_fold_vpdpbuud_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuud_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -294,11 +294,11 @@ define <8 x i32> @stack_fold_vpdpbuud_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: vpdpbuud {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> 
%a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -308,11 +308,11 @@ define <4 x i32> @stack_fold_vpdpbuuds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <16 x i8> %a1, <16 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -322,11 +322,11 @@ define <4 x i32> @stack_fold_vpdpbuuds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %a0, <16 x i8> %a2, <16 x i8> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -336,11 +336,11 @@ define <8 x i32> @stack_fold_vpdpbuuds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <32 x i8> %a1, <32 x i8> %a2) { ; CHECK-LABEL: stack_fold_vpdpbuuds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -350,6 +350,6 @@ define <8 x i32> @stack_fold_vpdpbuuds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: vpdpbuuds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, 
<8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %a0, <32 x i8> %a2, <32 x i8> %a1) ret <8 x i32> %2 } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll index 93006ae30f926..991467e1f98b2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll @@ -124,11 +124,11 @@ define <16 x float> @test_mm512_maskz_dpph_ps(i16 zeroext %__U, <16 x float> %__ declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32 x half>) -define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbssd_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -137,22 +137,18 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = load <64 x i8>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <64 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <64 x i8> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = load <64 x i8>, ptr [[TMP8]], align 64 ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <64 x i8> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <64 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <64 x i8> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = and <64 x i1> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP13]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = and <64 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <64 x i1> [[TMP11]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = and <64 x i1> [[TMP14]], [[TMP16]] ; CHECK-NEXT: [[TMP20:%.*]] = or <64 x i1> [[TMP17]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = or 
<64 x i1> [[TMP20]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = sext <64 x i1> [[TMP21]] to <64 x i8> @@ -160,34 +156,30 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP27:%.*]] = sext <16 x i1> [[TMP24]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP27]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[TMP10]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <64 x i8>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbssds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8> -; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP28]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP28]], [[TMP12]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <64 x 
i1> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP11]], [[TMP21]]
 ; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
@@ -195,7 +187,7 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer
 ; CHECK-NEXT: [[TMP23:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP1]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
@@ -207,31 +199,27 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
 ret <16 x i32> %res
 }

-define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbssd_epi32(
-; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP26:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i32> [[__A]] to <64 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i32> [[__B]] to <64 x i8>
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8>
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i32> [[TMP3]] to <64 x i8>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <64 x i8> [[TMP27]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP28]], zeroinitializer
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP29]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP29]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP11]], [[TMP21]]
 ; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
@@ -239,7 +227,7 @@ define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %_
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer
 ; CHECK-NEXT: [[TMP23:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP23]], [[TMP24]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -251,21 +239,21 @@ define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %_
 ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
 ret <16 x i32> %res
 }

-declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbssd.512(<16 x i32>, <64 x i8>, <64 x i8>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <64 x i8>, <64 x i8>)

-define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
+define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbsud_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -273,87 +261,123 @@ define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
 ; CHECK-NEXT: unreachable
 ; CHECK: [[BB5]]:
-; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64
+; CHECK-NEXT: [[__B:%.*]] = load <64 x i8>, ptr [[PB]], align 64
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[_MSLD:%.*]] = load <64 x i8>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[_MSLD]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP9]], [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
+; CHECK-NEXT: store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %__B = load <16 x i32>, ptr %pB
- %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %__B = load <64 x i8>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 ret <16 x i32> %res
 }

-define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbsuds_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = and <64 x i1> [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP19]], [[TMP22]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> [[TMP23]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]]
 ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]]
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
 ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]]
 ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
 ret <16 x i32> %res
 }

-define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbsud_epi32(
-; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = and <64 x i1> [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> [[TMP24]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
 ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer
 ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
 ret <16 x i32> %res
 }

-declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbsud.512(<16 x i32>, <64 x i8>, <64 x i8>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <64 x i8>, <64 x i8>)

-define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
+define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbuud_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -361,80 +385,116 @@ define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
 ; CHECK-NEXT: unreachable
 ; CHECK: [[BB5]]:
-; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64
+; CHECK-NEXT: [[__B:%.*]] = load <64 x i8>, ptr [[PB]], align 64
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: [[_MSLD:%.*]] = load <64 x i8>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[_MSLD]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <64 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <64 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <64 x i1> [[TMP9]], [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <64 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <64 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = sext <64 x i1> [[TMP17]] to <64 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <64 x i8> [[TMP18]] to <16 x i32>
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]]
+; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
+; CHECK-NEXT: store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %__B = load <16 x i32>, ptr %pB
- %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %__B = load <64 x i8>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 ret <16 x i32> %res
 }

-define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbuuds_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = and <64 x i1> [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP19]], [[TMP22]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> [[TMP23]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]]
 ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]]
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
 ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]]
 ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
 ret <16 x i32> %res
 }

-define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbuud_epi32(
-; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <64 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <64 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = and <64 x i1> [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <64 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <64 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <64 x i1> [[TMP24]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <64 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <64 x i1> [[TMP13]] to <64 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <64 x i8> [[TMP14]] to <16 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> [[__W]], <64 x i8> [[__A]], <64 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
 ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer
 ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x i32> [[RES]]
 ;
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B)
 %bst = bitcast i16 %__U to <16 x i1>
 %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
 ret <16 x i32> %res
 }

-declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8>)
+declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>)

 define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
index e121c3b6ea177..373eff6a1af60 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
@@ -243,25 +243,21 @@ declare <4 x float> @llvm.x86.avx10.vdpphps.128(<4 x float>, <8 x half>, <8 x ha
 declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x half>)

-define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbssd_epi32(
-; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i32> [[__A]] to <16 x i8>
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <4 x i32> [[__B]] to <16 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP27]], zeroinitializer
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP28]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP28]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP11]], [[TMP21]]
 ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT: [[TMP18:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i8>
@@ -269,7 +265,7 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32> [[TMP19]], zeroinitializer
 ; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP20]] to <4 x i32>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP23]], [[TMP1]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]]
@@ -281,31 +277,27 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x i32> [[RES]]
 ;
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
 ret <4 x i32> %res
 }

-define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbssds_epi32(
-; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <4 x i32> [[__A]] to <16 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[__B]] to <16 x i8>
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <16 x i8> [[TMP27]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP28]], zeroinitializer
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP29]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP29]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP11]], [[TMP21]]
 ; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT: [[TMP18:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i8>
@@ -313,7 +305,7 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32> [[TMP19]], zeroinitializer
 ; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP20]] to <4 x i32>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP23]], [[TMP24]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
@@ -325,31 +317,27 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x i32> [[RES]]
 ;
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
 ret <4 x i32> %res
 }

-define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbssds_epi32(
-; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i32> [[__A]] to <32 x i8>
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i32> [[__B]] to <32 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP27]], zeroinitializer
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP28]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP28]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP11]], [[TMP21]]
 ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
@@ -357,7 +345,7 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32> [[TMP19]], zeroinitializer
 ; CHECK-NEXT: [[TMP23:%.*]] = sext <8 x i1> [[TMP20]] to <8 x i32>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP23]], [[TMP1]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]]
@@ -369,31 +357,27 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x i32> [[RES]]
 ;
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
 ret <8 x i32> %res
 }

-define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbssd_epi32(
-; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP26:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP24:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i32> [[__A]] to <32 x i8>
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i32> [[__B]] to <32 x i8>
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8>
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <32 x i8> [[TMP27]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP28]], zeroinitializer
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP29]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP29]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP10]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP11]], [[TMP21]]
 ; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
 ; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8>
@@ -401,7 +385,7 @@ define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32> [[TMP19]], zeroinitializer
 ; CHECK-NEXT: [[TMP23:%.*]] = sext <8 x i1> [[TMP20]] to <8 x i32>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP23]], [[TMP24]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
@@ -413,28 +397,40 @@ define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x i32> [[RES]]
 ;
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
 ret <8 x i32> %res
 }

-declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>)

-define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbsud_epi32(
-; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]]
@@ -446,23 +442,35 @@ define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x i32> [[RES]]
 ;
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
 ret <4 x i32> %res
 }

-define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbsuds_epi32(
-; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
@@ -474,23 +482,35 @@ define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x i32> [[RES]]
 ;
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
 ret <4 x i32> %res
 }

-define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(
-; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]]
@@ -502,23 +522,35 @@ define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x i32> [[RES]]
 ;
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
 ret <8 x i32> %res
 }

-define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbsud_epi32(
-; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
@@ -530,28 +562,40 @@ define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x i32> [[RES]]
 ;
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
 ret <8 x i32> %res
 }

-declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>)
+declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>)

-define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbuud_epi32(
-; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]]
@@ -563,23 +607,35 @@ define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x i32> [[RES]]
 ;
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
 ret <4 x i32> %res
 }

-define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbuuds_epi32(
-; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP14]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[__W]], <16 x i8> [[__A]], <16 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
@@ -591,23 +647,35 @@ define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <4 x i32> [[RES]]
 ;
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B)
 %bst = bitcast i4 %__U to <4 x i1>
 %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
 ret <4 x i32> %res
 }

-define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(
-; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]]
@@ -619,23 +687,35 @@ define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <8 x i32> [[RES]]
 ;
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
 ret <8 x i32> %res
 }

-define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbuud_epi32(
-; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to
ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i8> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i8> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i8> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i8> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[__W]], <32 x i8> [[__A]], <32 x i8> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -647,16 +727,16 @@ define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll index 3df0f1df153c5..d91abeac6a816 100644 --- 
a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll @@ -10,15 +10,15 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssd_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1:![0-9]+]] @@ -26,22 +26,18 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i8> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]] 
+; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = sext <16 x i1> [[TMP22]] to <16 x i8> @@ -49,18 +45,14 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <16 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <16 x i1> [[TMP43]] to <16 x i8> @@ -68,28 +60,28 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <4 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <4 x i1> [[TMP46]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) 
%res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -97,22 +89,18 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[TMP30:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <16 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD]] to <16 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i8> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <16 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: 
[[TMP23:%.*]] = sext <16 x i1> [[TMP22]] to <16 x i8> @@ -120,18 +108,14 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[X1]] to <16 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[X4]] to <16 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <16 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <16 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <16 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <16 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <16 x i1> [[TMP43]] to <16 x i8> @@ -139,28 +123,28 @@ define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <4 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <4 x i1> [[TMP46]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x 
i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssd_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -168,22 +152,18 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <32 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <32 x i8> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <32 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8> @@ -191,18 +171,14 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <8 x 
i1> [[TMP25]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i32> [[TMP4]] to <32 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <32 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <32 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <32 x i1> [[TMP43]] to <32 x i8> @@ -210,28 +186,28 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <8 x i1> [[TMP46]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssds_256( -; CHECK-SAME: <8 x 
i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -239,22 +215,18 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[TMP30:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <32 x i8> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <32 x i8> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i32> [[_MSLD]] to <32 x i8> -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <32 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <32 x i8> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = and <32 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP12]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP15]], [[TMP17]] ; CHECK-NEXT: [[TMP21:%.*]] = or <32 x i1> [[TMP18]], [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8> @@ -262,18 +234,14 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP28:%.*]] = sext <8 x i1> [[TMP25]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP28]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[X1]] 
to <32 x i8> -; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[X4]] to <32 x i8> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i32> [[TMP4]] to <32 x i8> -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i8> [[TMP33]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <32 x i8> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <32 x i8> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[TMP30]]) +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <32 x i8> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP35]], [[TMP36]] -; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <32 x i1> [[TMP26]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = and <32 x i1> [[TMP27]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = and <32 x i1> [[TMP26]], [[TMP31]] ; CHECK-NEXT: [[TMP42:%.*]] = or <32 x i1> [[TMP39]], [[TMP40]] ; CHECK-NEXT: [[TMP43:%.*]] = or <32 x i1> [[TMP42]], [[TMP41]] ; CHECK-NEXT: [[TMP44:%.*]] = sext <32 x i1> [[TMP43]] to <32 x i8> @@ -281,28 +249,28 @@ define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i32> [[TMP45]], zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = sext <8 x i1> [[TMP46]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP49]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsud_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -310,38 +278,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: 
[[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsuds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -349,38 +341,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 
[[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, 
<16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsud_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -388,38 +404,62 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; 
CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsuds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -427,38 +467,62 @@ define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: 
[[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuud_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -466,38 +530,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <4 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> 
[[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>) -define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x 
i32> %x4) sanitize_memory { +define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuuds_128( -; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -505,38 +593,62 @@ define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <4 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <4 x i32>, ptr [[X2P]], align 16 +; CHECK-NEXT: [[X2:%.*]] = load <16 x i8>, ptr [[X2P]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <16 x i1> [[TMP18]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP22]], [[TMP2]] +; 
CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <16 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <16 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <16 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <16 x i1> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i8> [[TMP34]] to <4 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <4 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <4 x i1> [[TMP36]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <4 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> [[X0]], <16 x i8> [[X1]], <16 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %x2 = load <16 x i8>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = call <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x4) %res = add <4 x i32> %1, %2 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuud_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 
[[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -544,38 +656,62 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <8 x i32> %x1, pt ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: 
[[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4) sanitize_memory { +define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuuds_256( -; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]]) #[[ATTR1]] { +; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -583,25 +719,49 @@ define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <8 x i32> %x1, p ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable ; CHECK: [[BB6]]: -; CHECK-NEXT: [[X2:%.*]] = load <8 x i32>, ptr [[X2P]], align 32 +; CHECK-NEXT: [[X2:%.*]] = load <32 x i8>, ptr [[X2P]], align 32 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP9]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i8> [[TMP3]], 
zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[X2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP12]], [[TMP24]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP23]], [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = sext <32 x i1> [[TMP18]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP22]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X2]]) +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <32 x i8> [[X1]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <32 x i8> [[X4]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = and <32 x i1> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP30:%.*]] = and <32 x i1> [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = and <32 x i1> [[TMP25]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = or <32 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = or <32 x i1> [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = sext <32 x i1> [[TMP33]] to <32 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <32 x i8> [[TMP34]] to <8 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i32> [[TMP35]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = sext <8 x i1> [[TMP36]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP37]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> [[X0]], <32 x i8> [[X1]], <32 x i8> [[X4]]) ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP3]] ; CHECK-NEXT: [[RES:%.*]] = add <8 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: store <8 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %x2 = load <32 x i8>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = call <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x4) %res = add <8 x i32> %1, %2 ret <8 x i32> %res } diff --git a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td index 38c217fc68507..468242d1c2780 100644 --- a/mlir/include/mlir/Dialect/X86Vector/X86Vector.td +++ b/mlir/include/mlir/Dialect/X86Vector/X86Vector.td @@ -468,11 +468,6 @@ def DotInt8Op : AVX_Op<"dot.i8", [Pure, intr += "." 
+ std::to_string(opBitWidth); return intr; } - - SmallVector getIntrinsicOperands( - ::mlir::ArrayRef operands, - const ::mlir::LLVMTypeConverter &typeConverter, - ::mlir::RewriterBase &rewriter); }]; } diff --git a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp index 0fa353abc4972..ef35c39316555 100644 --- a/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp +++ b/mlir/lib/Dialect/X86Vector/IR/X86VectorDialect.cpp @@ -83,29 +83,6 @@ x86vector::DotOp::getIntrinsicOperands(ArrayRef operands, return intrinsicOperands; } -SmallVector x86vector::DotInt8Op::getIntrinsicOperands( - ArrayRef operands, const LLVMTypeConverter &typeConverter, - RewriterBase &rewriter) { - SmallVector intrinsicOprnds; - Adaptor adaptor(operands, *this); - intrinsicOprnds.push_back(adaptor.getW()); - // Bitcast `a` and `b` to i32 - Value bitcast_a = LLVM::BitcastOp::create( - rewriter, getLoc(), - VectorType::get((getA().getType().getShape()[0] / 4), - rewriter.getIntegerType(32)), - adaptor.getA()); - intrinsicOprnds.push_back(bitcast_a); - Value bitcast_b = LLVM::BitcastOp::create( - rewriter, getLoc(), - VectorType::get((getB().getType().getShape()[0] / 4), - rewriter.getIntegerType(32)), - adaptor.getB()); - intrinsicOprnds.push_back(bitcast_b); - - return intrinsicOprnds; -} - SmallVector x86vector::BcstToPackedF32Op::getIntrinsicOperands( ArrayRef operands, const LLVMTypeConverter &typeConverter, RewriterBase &rewriter) { From da315a3528807574d1c9ba44758d35bfc515e709 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 30 Sep 2025 12:46:04 +0300 Subject: [PATCH 231/878] [BOLT] Refactor MCInstReference and move it to Core (NFC) (#155846) Refactor MCInstReference class and move it from PAuth gadget scanner to Core. MCInstReference is a class representing a reference to a constant instruction inside a parent entity - either inside a basic block (which has a reference to its parent function) or directly inside a function (when CFG information is not available). This patch reapplies #138655 with a fix for iterator usage and multiple minor issues fixed during the second round of review. --- bolt/include/bolt/Core/MCInstUtils.h | 181 ++++++++++++++++++ bolt/include/bolt/Passes/PAuthGadgetScanner.h | 176 +---------------- bolt/lib/Core/CMakeLists.txt | 1 + bolt/lib/Core/MCInstUtils.cpp | 86 +++++++++ bolt/lib/Passes/PAuthGadgetScanner.cpp | 108 ++++------- 5 files changed, 302 insertions(+), 250 deletions(-) create mode 100644 bolt/include/bolt/Core/MCInstUtils.h create mode 100644 bolt/lib/Core/MCInstUtils.cpp diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h new file mode 100644 index 0000000000000..eb56629c61c7d --- /dev/null +++ b/bolt/include/bolt/Core/MCInstUtils.h @@ -0,0 +1,181 @@ +//===- bolt/Core/MCInstUtils.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_CORE_MCINSTUTILS_H +#define BOLT_CORE_MCINSTUTILS_H + +#include "bolt/Core/BinaryBasicBlock.h" +#include +#include + +namespace llvm { +class MCCodeEmitter; +} + +namespace llvm { +namespace bolt { + +class BinaryFunction; + +/// MCInstReference represents a reference to a constant MCInst as stored either +/// in a BinaryFunction (i.e. 
before a CFG is created), or in a BinaryBasicBlock +/// (after a CFG is created). +/// +/// The reference may be invalidated when the function containing the referenced +/// instruction is modified. +class MCInstReference { +public: + using nocfg_const_iterator = std::map::const_iterator; + + /// Constructs an empty reference. + MCInstReference() : Reference(RefInBB(nullptr, /*Index=*/0)) {} + + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock &BB, const MCInst &Inst) + : Reference(RefInBB(&BB, getInstIndexInBB(BB, Inst))) {} + /// Constructs a reference to the instruction inside the basic block. + MCInstReference(const BinaryBasicBlock &BB, unsigned Index) + : Reference(RefInBB(&BB, Index)) {} + + /// Constructs a reference to the instruction inside the function without + /// CFG information. + MCInstReference(const BinaryFunction &BF, nocfg_const_iterator It) + : Reference(RefInBF(&BF, It)) {} + + /// Locates an instruction inside a function and returns a reference. + static MCInstReference get(const MCInst &Inst, const BinaryFunction &BF); + + bool operator==(const MCInstReference &Other) const { + return Reference == Other.Reference; + } + + const MCInst &getMCInst() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) { + [[maybe_unused]] unsigned NumInstructions = Ref->BB->size(); + assert(Ref->Index < NumInstructions && "Invalid reference"); + return Ref->BB->getInstructionAtIndex(Ref->Index); + } + return getRefInBF().It->second; + } + + operator const MCInst &() const { return getMCInst(); } + + bool empty() const { + if (auto *Ref = tryGetRefInBB()) + return Ref->BB == nullptr; + return getRefInBF().BF == nullptr; + } + + bool hasCFG() const { return !empty() && tryGetRefInBB() != nullptr; } + + const BinaryFunction *getFunction() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) + return Ref->BB->getFunction(); + return getRefInBF().BF; + } + + const BinaryBasicBlock *getBasicBlock() const { + assert(!empty() && "Empty reference"); + if (auto *Ref = tryGetRefInBB()) + return Ref->BB; + return nullptr; + } + + /// Computes the original address of the instruction (or offset from base + /// for PIC), assuming the containing function was not modified. + /// + /// This function is intended for the use cases like debug printing, as it + /// is only as precise as BinaryContext::computeCodeSize() is and requires + /// iterating over the prefix of the basic block (when CFG is available). + /// + /// MCCodeEmitter is not thread safe and the default instance from + /// BinaryContext is used by default, thus pass an instance explicitly if + /// this function may be called from multithreaded code. + uint64_t computeAddress(const MCCodeEmitter *Emitter = nullptr) const; + + raw_ostream &print(raw_ostream &OS) const; + +private: + static unsigned getInstIndexInBB(const BinaryBasicBlock &BB, + const MCInst &Inst) { + // Usage of pointer arithmetic assumes the instructions are stored in a + // vector, see BasicBlockStorageIsVector in MCInstUtils.cpp. 
+ const MCInst *FirstInstInBB = &*BB.begin(); + return &Inst - FirstInstInBB; + } + + // Two cases are possible: + // * functions with CFG reconstructed - a function stores a collection of + // basic blocks, each basic block stores a contiguous vector of MCInst + // * functions without CFG - there are no basic blocks created, + // the instructions are directly stored in std::map in BinaryFunction + // + // In both cases, the direct parent of MCInst is stored together with an + // index or iterator pointing to the instruction. + + // Helper struct: CFG is available, the direct parent is a basic block. + struct RefInBB { + RefInBB(const BinaryBasicBlock *BB, unsigned Index) + : BB(BB), Index(Index) {} + RefInBB(const RefInBB &Other) = default; + RefInBB &operator=(const RefInBB &Other) = default; + + const BinaryBasicBlock *BB; + unsigned Index; + + bool operator==(const RefInBB &Other) const { + return BB == Other.BB && Index == Other.Index; + } + }; + + // Helper struct: CFG is *not* available, the direct parent is a function, + // iterator's type is std::map::iterator (the mapped value + // is an instruction's offset). + struct RefInBF { + RefInBF(const BinaryFunction *BF, nocfg_const_iterator It) + : BF(BF), It(It) {} + RefInBF(const RefInBF &Other) = default; + RefInBF &operator=(const RefInBF &Other) = default; + + const BinaryFunction *BF; + nocfg_const_iterator It; + + bool operator==(const RefInBF &Other) const { + return BF == Other.BF && It->first == Other.It->first; + } + }; + + std::variant Reference; + + // Utility methods to be used like this: + // + // if (auto *Ref = tryGetRefInBB()) + // return Ref->doSomething(...); + // return getRefInBF().doSomethingElse(...); + const RefInBB *tryGetRefInBB() const { + assert(std::get_if(&Reference) || + std::get_if(&Reference)); + return std::get_if(&Reference); + } + const RefInBF &getRefInBF() const { + assert(std::get_if(&Reference)); + return *std::get_if(&Reference); + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, + const MCInstReference &Ref) { + return Ref.print(OS); +} + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h index 721fd664a3253..cb865a725d72a 100644 --- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h +++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h @@ -11,187 +11,13 @@ #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryFunction.h" +#include "bolt/Core/MCInstUtils.h" #include "bolt/Passes/BinaryPasses.h" #include "llvm/Support/raw_ostream.h" #include namespace llvm { namespace bolt { - -/// @brief MCInstReference represents a reference to an MCInst as stored either -/// in a BinaryFunction (i.e. before a CFG is created), or in a BinaryBasicBlock -/// (after a CFG is created). It aims to store the necessary information to be -/// able to find the specific MCInst in either the BinaryFunction or -/// BinaryBasicBlock data structures later, so that e.g. the InputAddress of -/// the corresponding instruction can be computed. 
- -struct MCInstInBBReference { - BinaryBasicBlock *BB; - int64_t BBIndex; - MCInstInBBReference(BinaryBasicBlock *BB, int64_t BBIndex) - : BB(BB), BBIndex(BBIndex) {} - MCInstInBBReference() : BB(nullptr), BBIndex(0) {} - static MCInstInBBReference get(const MCInst *Inst, BinaryFunction &BF) { - for (BinaryBasicBlock &BB : BF) - for (size_t I = 0; I < BB.size(); ++I) - if (Inst == &BB.getInstructionAtIndex(I)) - return MCInstInBBReference(&BB, I); - return {}; - } - bool operator==(const MCInstInBBReference &RHS) const { - return BB == RHS.BB && BBIndex == RHS.BBIndex; - } - bool operator<(const MCInstInBBReference &RHS) const { - return std::tie(BB, BBIndex) < std::tie(RHS.BB, RHS.BBIndex); - } - operator MCInst &() const { - assert(BB != nullptr); - return BB->getInstructionAtIndex(BBIndex); - } - uint64_t getAddress() const { - // 4 bytes per instruction on AArch64. - // FIXME: the assumption of 4 byte per instruction needs to be fixed before - // this method gets used on any non-AArch64 binaries (but should be fine for - // pac-ret analysis, as that is an AArch64-specific feature). - return BB->getFunction()->getAddress() + BB->getOffset() + BBIndex * 4; - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &); - -struct MCInstInBFReference { - BinaryFunction *BF; - uint64_t Offset; - MCInstInBFReference(BinaryFunction *BF, uint64_t Offset) - : BF(BF), Offset(Offset) {} - - static MCInstInBFReference get(const MCInst *Inst, BinaryFunction &BF) { - for (auto &I : BF.instrs()) - if (Inst == &I.second) - return MCInstInBFReference(&BF, I.first); - return {}; - } - - MCInstInBFReference() : BF(nullptr), Offset(0) {} - bool operator==(const MCInstInBFReference &RHS) const { - return BF == RHS.BF && Offset == RHS.Offset; - } - bool operator<(const MCInstInBFReference &RHS) const { - return std::tie(BF, Offset) < std::tie(RHS.BF, RHS.Offset); - } - operator MCInst &() const { - assert(BF != nullptr); - return *BF->getInstructionAtOffset(Offset); - } - - uint64_t getOffset() const { return Offset; } - - uint64_t getAddress() const { return BF->getAddress() + getOffset(); } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &); - -struct MCInstReference { - enum Kind { FunctionParent, BasicBlockParent }; - Kind ParentKind; - union U { - MCInstInBBReference BBRef; - MCInstInBFReference BFRef; - U(MCInstInBBReference BBRef) : BBRef(BBRef) {} - U(MCInstInBFReference BFRef) : BFRef(BFRef) {} - } U; - MCInstReference(MCInstInBBReference BBRef) - : ParentKind(BasicBlockParent), U(BBRef) {} - MCInstReference(MCInstInBFReference BFRef) - : ParentKind(FunctionParent), U(BFRef) {} - MCInstReference(BinaryBasicBlock *BB, int64_t BBIndex) - : MCInstReference(MCInstInBBReference(BB, BBIndex)) {} - MCInstReference(BinaryFunction *BF, uint32_t Offset) - : MCInstReference(MCInstInBFReference(BF, Offset)) {} - - static MCInstReference get(const MCInst *Inst, BinaryFunction &BF) { - if (BF.hasCFG()) - return MCInstInBBReference::get(Inst, BF); - return MCInstInBFReference::get(Inst, BF); - } - - bool operator<(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return ParentKind < RHS.ParentKind; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef < RHS.U.BBRef; - case FunctionParent: - return U.BFRef < RHS.U.BFRef; - } - llvm_unreachable(""); - } - - bool operator==(const MCInstReference &RHS) const { - if (ParentKind != RHS.ParentKind) - return false; - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef == 
RHS.U.BBRef; - case FunctionParent: - return U.BFRef == RHS.U.BFRef; - } - llvm_unreachable(""); - } - - operator MCInst &() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef; - case FunctionParent: - return U.BFRef; - } - llvm_unreachable(""); - } - - operator bool() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.BB != nullptr; - case FunctionParent: - return U.BFRef.BF != nullptr; - } - llvm_unreachable(""); - } - - uint64_t getAddress() const { - switch (ParentKind) { - case BasicBlockParent: - return U.BBRef.getAddress(); - case FunctionParent: - return U.BFRef.getAddress(); - } - llvm_unreachable(""); - } - - BinaryFunction *getFunction() const { - switch (ParentKind) { - case FunctionParent: - return U.BFRef.BF; - case BasicBlockParent: - return U.BBRef.BB->getFunction(); - } - llvm_unreachable(""); - } - - BinaryBasicBlock *getBasicBlock() const { - switch (ParentKind) { - case FunctionParent: - return nullptr; - case BasicBlockParent: - return U.BBRef.BB; - } - llvm_unreachable(""); - } -}; - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &); - namespace PAuthGadgetScanner { // The report classes are designed to be used in an immutable manner. diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index fc72dc023c590..58cfcab370f16 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_library(LLVMBOLTCore GDBIndex.cpp HashUtilities.cpp JumpTable.cpp + MCInstUtils.cpp MCPlusBuilder.cpp ParallelUtilities.cpp Relocation.cpp diff --git a/bolt/lib/Core/MCInstUtils.cpp b/bolt/lib/Core/MCInstUtils.cpp new file mode 100644 index 0000000000000..f505bf73c64eb --- /dev/null +++ b/bolt/lib/Core/MCInstUtils.cpp @@ -0,0 +1,86 @@ +//===- bolt/Core/MCInstUtils.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Core/MCInstUtils.h" +#include "bolt/Core/BinaryBasicBlock.h" +#include "bolt/Core/BinaryFunction.h" + +#include + +using namespace llvm; +using namespace llvm::bolt; + +// It is assumed in a few places that BinaryBasicBlock stores its instructions +// in a contiguous vector. 
+using BasicBlockStorageIsVector = + std::is_same::const_iterator>; +static_assert(BasicBlockStorageIsVector::value); + +MCInstReference MCInstReference::get(const MCInst &Inst, + const BinaryFunction &BF) { + if (BF.hasCFG()) { + for (BinaryBasicBlock &BB : BF) { + for (MCInst &MI : BB) + if (&MI == &Inst) + return MCInstReference(BB, Inst); + } + llvm_unreachable("Inst is not contained in BF"); + } + + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) { + if (&I->second == &Inst) + return MCInstReference(BF, I); + } + llvm_unreachable("Inst is not contained in BF"); +} + +uint64_t MCInstReference::computeAddress(const MCCodeEmitter *Emitter) const { + assert(!empty() && "Taking instruction address by empty reference"); + + const BinaryContext &BC = getFunction()->getBinaryContext(); + if (auto *Ref = tryGetRefInBB()) { + const uint64_t AddressOfBB = + getFunction()->getAddress() + Ref->BB->getOffset(); + const MCInst *FirstInstInBB = &*Ref->BB->begin(); + const MCInst *ThisInst = &getMCInst(); + + // Usage of plain 'const MCInst *' as iterators assumes the instructions + // are stored in a vector, see BasicBlockStorageIsVector. + const uint64_t OffsetInBB = + BC.computeCodeSize(FirstInstInBB, ThisInst, Emitter); + + return AddressOfBB + OffsetInBB; + } + + auto &Ref = getRefInBF(); + const uint64_t OffsetInBF = Ref.It->first; + + return getFunction()->getAddress() + OffsetInBF; +} + +raw_ostream &MCInstReference::print(raw_ostream &OS) const { + if (const RefInBB *Ref = tryGetRefInBB()) { + OS << "MCInstBBRef<"; + if (Ref->BB == nullptr) + OS << "BB:(null)"; + else + OS << "BB:" << Ref->BB->getName() << ":" << Ref->Index; + OS << ">"; + return OS; + } + + const RefInBF &Ref = getRefInBF(); + OS << "MCInstBFRef<"; + if (Ref.BF == nullptr) + OS << "BF:(null)"; + else + OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.It->first; + OS << ">"; + return OS; +} diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 65c84ebc8c4f4..cfe4b6ba785e4 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -24,39 +24,6 @@ namespace llvm { namespace bolt { - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBBReference &Ref) { - OS << "MCInstBBRef<"; - if (Ref.BB == nullptr) - OS << "BB:(null)"; - else - OS << "BB:" << Ref.BB->getName() << ":" << Ref.BBIndex; - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstInBFReference &Ref) { - OS << "MCInstBFRef<"; - if (Ref.BF == nullptr) - OS << "BF:(null)"; - else - OS << "BF:" << Ref.BF->getPrintName() << ":" << Ref.getOffset(); - OS << ">"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const MCInstReference &Ref) { - switch (Ref.ParentKind) { - case MCInstReference::BasicBlockParent: - OS << Ref.U.BBRef; - return OS; - case MCInstReference::FunctionParent: - OS << Ref.U.BFRef; - return OS; - } - llvm_unreachable(""); -} - namespace PAuthGadgetScanner { [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label, @@ -91,10 +58,10 @@ template static void iterateOverInstrs(BinaryFunction &BF, T Fn) { if (BF.hasCFG()) { for (BinaryBasicBlock &BB : BF) for (int64_t I = 0, E = BB.size(); I < E; ++I) - Fn(MCInstInBBReference(&BB, I)); + Fn(MCInstReference(BB, I)); } else { - for (auto I : BF.instrs()) - Fn(MCInstInBFReference(&BF, I.first)); + for (auto I = BF.instrs().begin(), E = BF.instrs().end(); I != E; ++I) + Fn(MCInstReference(BF, I)); } } @@ -564,11 +531,8 @@ class 
SrcSafetyAnalysis { const SrcState &S = getStateBefore(Inst); std::vector Result; - for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) { - MCInstReference Ref = MCInstReference::get(Inst, BF); - assert(Ref && "Expected Inst to be found"); - Result.push_back(Ref); - } + for (const MCInst *Inst : lastWritingInsts(S, ClobberedReg)) + Result.push_back(MCInstReference::get(*Inst, BF)); return Result; } }; @@ -1136,11 +1100,8 @@ class DstSafetyAnalysis { const DstState &S = getStateAfter(Inst); std::vector Result; - for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) { - MCInstReference Ref = MCInstReference::get(Inst, BF); - assert(Ref && "Expected Inst to be found"); - Result.push_back(Ref); - } + for (const MCInst *Inst : firstLeakingInsts(S, LeakedReg)) + Result.push_back(MCInstReference::get(*Inst, BF)); return Result; } }; @@ -1345,8 +1306,7 @@ static bool shouldAnalyzeTailCallInst(const BinaryContext &BC, // (such as isBranch at the time of writing this comment), some don't (such // as isCall). For that reason, call MCInstrDesc's methods explicitly when // it is important. - const MCInstrDesc &Desc = - BC.MII->get(static_cast(Inst).getOpcode()); + const MCInstrDesc &Desc = BC.MII->get(Inst.getMCInst().getOpcode()); // Tail call should be a branch (but not necessarily an indirect one). if (!Desc.isBranch()) return false; @@ -1541,7 +1501,7 @@ void FunctionAnalysisContext::findUnsafeUses( // This is printed as "[message] in function [name], basic block ..., // at address ..." when the issue is reported to the user. Reports.push_back(make_generic_report( - MCInstReference::get(FirstInst, BF), + MCInstReference(BB, *FirstInst), "Warning: possibly imprecise CFG, the analysis quality may be " "degraded in this function. According to BOLT, unreachable code is " "found" /* in function [name]... */)); @@ -1705,48 +1665,44 @@ void Analysis::runOnFunction(BinaryFunction &BF, } } -static void printBB(const BinaryContext &BC, const BinaryBasicBlock *BB, +static void printBB(const BinaryContext &BC, const BinaryBasicBlock &BB, size_t StartIndex = 0, size_t EndIndex = -1) { if (EndIndex == (size_t)-1) - EndIndex = BB->size() - 1; - const BinaryFunction *BF = BB->getFunction(); + EndIndex = BB.size() - 1; + const BinaryFunction *BF = BB.getFunction(); for (unsigned I = StartIndex; I <= EndIndex; ++I) { - // FIXME: this assumes all instructions are 4 bytes in size. This is true - // for AArch64, but it might be good to extract this function so it can be - // used elsewhere and for other targets too. 
- uint64_t Address = BB->getOffset() + BF->getAddress() + 4 * I; - const MCInst &Inst = BB->getInstructionAtIndex(I); + MCInstReference Inst(BB, I); if (BC.MIB->isCFI(Inst)) continue; - BC.printInstruction(outs(), Inst, Address, BF); + BC.printInstruction(outs(), Inst, Inst.computeAddress(), BF); } } static void reportFoundGadgetInSingleBBSingleRelatedInst( raw_ostream &OS, const BinaryContext &BC, const MCInstReference RelatedInst, const MCInstReference Location) { - BinaryBasicBlock *BB = Location.getBasicBlock(); - assert(RelatedInst.ParentKind == MCInstReference::BasicBlockParent); - assert(Location.ParentKind == MCInstReference::BasicBlockParent); - MCInstInBBReference RelatedInstBB = RelatedInst.U.BBRef; - if (BB == RelatedInstBB.BB) { + const BinaryBasicBlock *BB = Location.getBasicBlock(); + assert(RelatedInst.hasCFG()); + assert(Location.hasCFG()); + if (BB == RelatedInst.getBasicBlock()) { OS << " This happens in the following basic block:\n"; - printBB(BC, BB); + printBB(BC, *BB); } } void Diagnostic::printBasicInfo(raw_ostream &OS, const BinaryContext &BC, StringRef IssueKind) const { - BinaryFunction *BF = Location.getFunction(); - BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryBasicBlock *BB = Location.getBasicBlock(); + const BinaryFunction *BF = Location.getFunction(); + const uint64_t Address = Location.computeAddress(); OS << "\nGS-PAUTH: " << IssueKind; OS << " in function " << BF->getPrintName(); if (BB) OS << ", basic block " << BB->getName(); - OS << ", at address " << llvm::format("%x", Location.getAddress()) << "\n"; + OS << ", at address " << llvm::format("%x", Address) << "\n"; OS << " The instruction is "; - BC.printInstruction(OS, Location, Location.getAddress(), BF); + BC.printInstruction(OS, Location, Address, BF); } void GadgetDiagnostic::generateReport(raw_ostream &OS, @@ -1760,21 +1716,23 @@ static void printRelatedInstrs(raw_ostream &OS, const MCInstReference Location, const BinaryContext &BC = BF.getBinaryContext(); // Sort by address to ensure output is deterministic. - SmallVector RI(RelatedInstrs); - llvm::sort(RI, [](const MCInstReference &A, const MCInstReference &B) { - return A.getAddress() < B.getAddress(); - }); + SmallVector> RI; + for (auto &InstRef : RelatedInstrs) + RI.push_back(std::make_pair(InstRef.computeAddress(), InstRef)); + llvm::sort(RI, [](auto A, auto B) { return A.first < B.first; }); + for (unsigned I = 0; I < RI.size(); ++I) { - MCInstReference InstRef = RI[I]; + auto [Address, InstRef] = RI[I]; OS << " " << (I + 1) << ". "; - BC.printInstruction(OS, InstRef, InstRef.getAddress(), &BF); + BC.printInstruction(OS, InstRef, Address, &BF); }; + if (RelatedInstrs.size() == 1) { const MCInstReference RelatedInst = RelatedInstrs[0]; // Printing the details for the MCInstReference::FunctionParent case // is not implemented not to overcomplicate the code, as most functions // are expected to have CFG information. - if (RelatedInst.ParentKind == MCInstReference::BasicBlockParent) + if (RelatedInst.hasCFG()) reportFoundGadgetInSingleBBSingleRelatedInst(OS, BC, RelatedInst, Location); } From 45ce88758d24df7c2c322b152cf4894f3e8ee45a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 Sep 2025 11:03:55 +0100 Subject: [PATCH 232/878] [LV] Don't preserve LCSSA in SCEVExpander for runtime checks. (#159556) LV does not preserve LCSSA, it constructs it just before processing a loop to vectorize. 
Runtime check expressions are invariant to that loop, so expanding them should not break LCSSA form for the loop we are about to vectorize. This fixes a crash when discarding instructions generated when expanding runtime checks, if the expansion introduces LCSSA phis for values from other loops which are not in LCSSA form: we would introduce new LCSSA phis and update all outside users, some of which are not created by the expander and cannot be cleaned up. Fixes https://github.com/llvm/llvm-project/issues/158259. PR: https://github.com/llvm/llvm-project/pull/159556 --- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../X86/cleanup-runtime-checks.ll | 79 +++++++++++++++++++ llvm/test/Transforms/LoopVectorize/pr45259.ll | 5 +- ...pr47343-expander-lcssa-after-cfg-update.ll | 9 +-- .../reuse-lcssa-phi-scev-expansion.ll | 11 ++- .../LoopVectorize/skeleton-lcssa-crash.ll | 18 ++--- 6 files changed, 100 insertions(+), 29 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/cleanup-runtime-checks.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ab5c9c99b9448..a0043bed2e0c8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1762,9 +1762,10 @@ class GeneratedRTChecks { GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, LoopInfo *LI, TargetTransformInfo *TTI, const DataLayout &DL, TTI::TargetCostKind CostKind) - : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), - MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE), - CostKind(CostKind) {} + : DT(DT), LI(LI), TTI(TTI), + SCEVExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false), + MemCheckExp(*PSE.getSE(), DL, "scev.check", /*PreserveLCSSA=*/false), + PSE(PSE), CostKind(CostKind) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are diff --git a/llvm/test/Transforms/LoopVectorize/X86/cleanup-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/X86/cleanup-runtime-checks.ll new file mode 100644 index 0000000000000..41753f7e4f27a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/cleanup-runtime-checks.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +declare ptr @get() +declare i1 @cond() + +; Make sure we can clean up the created runtime checks, if vectorization isn't +; profitable. 
+define void @widget(i32 %arg, i64 %arg1, ptr %src) #0 { +; CHECK-LABEL: define void @widget( +; CHECK-SAME: i32 [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]] +; CHECK: [[LOOP_1_HEADER]]: +; CHECK-NEXT: br label %[[INNER_1:.*]] +; CHECK: [[INNER_1]]: +; CHECK-NEXT: [[C_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_1]], label %[[INNER_2:.*]], label %[[INNER_1]] +; CHECK: [[INNER_2]]: +; CHECK-NEXT: [[LOAD:%.*]] = call ptr @get() +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1_LATCH:.*]] +; CHECK: [[LOOP_2_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_1_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_1_HEADER]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP_2]] ], [ [[ARG]], %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[PHI8:%.*]] = phi i32 [ [[OR:%.*]], %[[LOOP_2]] ], [ 99, %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[OR]] = or i32 [[PHI8]], [[L]] +; CHECK-NEXT: store i32 [[OR]], ptr [[LOAD]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 100 +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_2]], !prof [[PROF0:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.1.header + +loop.1.header: + br label %inner.1 + +inner.1: + %c.1 = call i1 @cond() + br i1 %c.1, label %inner.2, label %inner.1 + +inner.2: + %load = call ptr @get() + %c.2 = call i1 @cond() + br i1 %c.2, label %loop.2, label %loop.1.latch + +loop.1.latch: + br label %loop.1.header + +loop.2: + %iv = phi i32 [ %arg, %inner.2 ], [ %iv.next, %loop.2 ] + %phi8 = phi i32 [ 99, %inner.2 ], [ %or, %loop.2 ] + %gep.src = getelementptr i32, ptr %src, i32 %iv + %l = load i32, ptr %gep.src, align 4 + %or = or i32 %phi8, %l + store i32 %or, ptr %load, align 4 + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv, 100 + br i1 %ec, label %exit, label %loop.2, !prof !0 + +exit: + ret void +} + +attributes #0 = { "target-features"="+avx2" } +!0 = !{!"branch_weights", i32 89478484, i32 1879048192} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 89478484, i32 1879048192} +;. 
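The GeneratedRTChecks change above amounts to opting out of LCSSA preservation
when the two SCEV expanders are constructed. A minimal sketch of that pattern
follows; it assumes SCEVExpander's four-argument constructor with a
PreserveLCSSA flag that defaults to true, and RTCheckExpanders is a
hypothetical stand-in for the real GeneratedRTChecks class in
LoopVectorize.cpp, not a definitive excerpt of it:

// Sketch only: building runtime-check expanders without LCSSA preservation.
// RTCheckExpanders is a made-up name used for illustration.
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"

namespace {
struct RTCheckExpanders {
  llvm::SCEVExpander SCEVExp;
  llvm::SCEVExpander MemCheckExp;

  RTCheckExpanders(llvm::ScalarEvolution &SE, const llvm::DataLayout &DL)
      // Runtime-check expressions are invariant in the loop being vectorized,
      // so expanding them cannot break the LCSSA form LV constructs for that
      // loop. Passing false avoids introducing LCSSA phis for values from
      // other loops that are not in LCSSA form, which could not be cleaned up
      // if the expanded checks are later discarded as unprofitable.
      : SCEVExp(SE, DL, "scev.check", /*PreserveLCSSA=*/false),
        MemCheckExp(SE, DL, "scev.check", /*PreserveLCSSA=*/false) {}
};
} // namespace
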
diff --git a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll index fade7264f6494..f33437fd8ebde 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45259.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll @@ -10,16 +10,15 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: br label [[BB6:%.*]] ; CHECK: bb6: ; CHECK-NEXT: [[T1_0:%.*]] = phi ptr [ [[ARR]], [[BB:%.*]] ], [ null, [[BB6]] ] +; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0]] to i64 ; CHECK-NEXT: [[C:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]] ; CHECK: for.preheader: -; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[T1_0_LCSSA4:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[T1_0_LCSSA1:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] -; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 -; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]] +; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll index 047d36bafbf88..b9cb1cb5abae8 100644 --- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -28,18 +28,15 @@ define void @f() { ; CHECK: outer.latch: ; CHECK-NEXT: br label [[OUTER_HEADER]] ; CHECK: outer.exit.0: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP0]], [[OUTER_HEADER]] ] ; CHECK-NEXT: br label [[LOOP_PREHEADER:%.*]] ; CHECK: outer.exit.1: -; CHECK-NEXT: [[DOTLCSSA1:%.*]] = phi ptr [ [[TMP0]], [[INNER_1_LATCH]] ] ; CHECK-NEXT: br label [[LOOP_PREHEADER]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP1:%.*]] = phi ptr [ [[DOTLCSSA]], [[OUTER_EXIT_0]] ], [ [[DOTLCSSA1]], [[OUTER_EXIT_1]] ] ; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr @f.e, [[SCEVGEP]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TMP1]], getelementptr inbounds nuw (i8, ptr @f.e, i64 4) +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TMP0]], getelementptr inbounds nuw (i8, ptr @f.e, i64 4) ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -59,7 +56,7 @@ define void @f() { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH]] ] ; CHECK-NEXT: [[CONV6_US_US_US:%.*]] = zext i1 false to i32 ; CHECK-NEXT: store i32 [[CONV6_US_US_US]], ptr @f.e, align 1 -; CHECK-NEXT: store i8 10, ptr [[TMP1]], align 1 +; CHECK-NEXT: store i8 10, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 500 ; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll 
b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index 73d5e26ef82a2..5894c3af1d637 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -109,14 +109,13 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) { ; CHECK-NEXT: [[PTR_IV_1:%.*]] = phi ptr [ [[DST_1]], %[[ENTRY]] ], [ [[PTR_IV_1_NEXT:%.*]], %[[LOOP_1]] ] ; CHECK-NEXT: [[CALL:%.*]] = call i32 @val() ; CHECK-NEXT: [[SEL_DST:%.*]] = select i1 [[C]], ptr [[DST_1]], ptr [[DST_2]] +; CHECK-NEXT: [[SEL_DST_LCSSA12:%.*]] = ptrtoint ptr [[SEL_DST]] to i64 ; CHECK-NEXT: [[PTR_IV_1_NEXT]] = getelementptr i8, ptr [[PTR_IV_1]], i64 1 ; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: br i1 [[EC_1]], label %[[LOOP_2_HEADER_PREHEADER:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_HEADER_PREHEADER]]: -; CHECK-NEXT: [[SEL_DST_LCSSA1:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ] ; CHECK-NEXT: [[PTR_IV_1_LCSSA:%.*]] = phi ptr [ [[PTR_IV_1]], %[[LOOP_1]] ] ; CHECK-NEXT: [[SEL_DST_LCSSA:%.*]] = phi ptr [ [[SEL_DST]], %[[LOOP_1]] ] -; CHECK-NEXT: [[SEL_DST_LCSSA12:%.*]] = ptrtoint ptr [[SEL_DST_LCSSA1]] to i64 ; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[PTR_IV_1_LCSSA]] to i64 @@ -140,13 +139,13 @@ define void @runtime_checks_ptr_inductions(ptr %dst.1, ptr %dst.2, i1 %c) { ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1023, %[[MIDDLE_BLOCK]] ], [ 1, %[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[PTR_IV_1_LCSSA]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[SEL_DST_LCSSA]], %[[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] ; CHECK: [[LOOP_2_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[DEC7:%.*]], %[[LOOP_2_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR_IV_2:%.*]] = phi ptr [ [[PTR_IV_2_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR_IV_3:%.*]] = phi ptr [ [[PTR_IV_3_NEXT:%.*]], %[[LOOP_2_LATCH]] ], [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i32 [[IV]], 1024 ; CHECK-NEXT: br i1 [[EC_2]], label %[[EXIT:.*]], label %[[LOOP_2_LATCH]] ; CHECK: [[LOOP_2_LATCH]]: diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll index 9c14a8c08618f..1e4598e756645 100644 --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -23,18 +23,16 @@ define i16 @test(ptr %arg, i64 %N) { ; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_3]], label [[LOOP_3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]] ; CHECK: loop.3.preheader: -; CHECK-NEXT: [[L_1_LCSSA:%.*]] = phi ptr [ 
[[L_1]], [[INNER_BB]] ]
-; CHECK-NEXT: [[L_2_LCSSA:%.*]] = phi ptr [ [[L_2]], [[INNER_BB]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[L_2_LCSSA]], i64 2
-; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 2
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[L_2]], i64 2
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[L_1]], i64 2
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 1
 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
-; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 [[TMP2]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[L_2_LCSSA]], [[SCEVGEP6]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[L_1]], i64 [[TMP2]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[L_2]], [[SCEVGEP3]]
 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP5]], [[SCEVGEP]]
 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
@@ -67,19 +65,17 @@ define i16 @test(ptr %arg, i64 %N) {
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[C_5:%.*]] = icmp ult i64 [[IV]], [[N]]
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr [[L_1_LCSSA]], i64 [[IV_NEXT]]
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, ptr [[L_1]], i64 [[IV_NEXT]]
 ; CHECK-NEXT: [[LOOP_L_1:%.*]] = load i16, ptr [[GEP_1]], align 2
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, ptr [[L_2_LCSSA]], i64 0
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, ptr [[L_2]], i64 0
 ; CHECK-NEXT: store i16 [[LOOP_L_1]], ptr [[GEP_2]], align 2
 ; CHECK-NEXT: br i1 [[C_5]], label [[LOOP_3]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: exit.loopexit:
 ; CHECK-NEXT: br label [[EXIT:%.*]]
 ; CHECK: exit.loopexit1:
-; CHECK-NEXT: [[L_1_LCSSA3:%.*]] = phi ptr [ [[L_1]], [[INNER_LATCH]] ]
 ; CHECK-NEXT: br label [[EXIT]]
 ; CHECK: exit:
-; CHECK-NEXT: [[L_14:%.*]] = phi ptr [ [[L_1_LCSSA3]], [[EXIT_LOOPEXIT1]] ], [ [[L_1_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT: [[L_3:%.*]] = load i16, ptr [[L_14]], align 2
+; CHECK-NEXT: [[L_3:%.*]] = load i16, ptr [[L_1]], align 2
 ; CHECK-NEXT: ret i16 [[L_3]]
 ;
 entry:

From ec91d6ba6abdb678730fa1007230d05dc4bc09e8 Mon Sep 17 00:00:00 2001
From: Michael Kruse
Date: Tue, 30 Sep 2025 12:32:52 +0200
Subject: [PATCH 233/878] [Flang] Add perfect-nest and rectangular-loop
 semantic tests (#160283)

Add semantic tests of currently unsupported OpenMP canonical loops:

 * non-perfectly nested canonical loop nests
 * non-rectangular canonical loop nests

Both were introduced in OpenMP 5.0 and are not yet supported by Flang.

The message "Trip count must be computable and invariant" is the same
message that OpenACC emits for non-rectangular loops in
`AccAttributeVisitor::CheckAssociatedLoop`. I considered reusing that
code, but it calls OpenACC-only methods and has different behavior
(e.g. in symbol resolution), and it does not check the step operand.
---
 flang/lib/Semantics/resolve-directives.cpp | 157 ++++++++++++++++++++-
 flang/test/Semantics/OpenMP/do08.f90       |   1 +
 flang/test/Semantics/OpenMP/do13.f90       |   1 +
 flang/test/Semantics/OpenMP/do22.f90       |  73 ++++++++++
 4 files changed, 226 insertions(+), 6 deletions(-)
 create mode 100644 flang/test/Semantics/OpenMP/do22.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 270642acb3e9b..6132193332b4b 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -149,7 +149,24 @@ template <typename T> class DirectiveAttributeVisitor {
     dataSharingAttributeObjects_.clear();
   }
   bool HasDataSharingAttributeObject(const Symbol &);
+
+  /// Extract the iv and bounds of a DO loop:
+  /// 1. The loop index/induction variable
+  /// 2. The lower bound
+  /// 3. The upper bound
+  /// 4. The step/increment (or nullptr if not present)
+  ///
+  /// Each returned tuple value can be nullptr if not present. Diagnoses an
+  /// error if the DO loop is a DO WHILE or DO CONCURRENT loop.
+  std::tuple<const parser::Name *, const parser::ScalarExpr *,
+      const parser::ScalarExpr *, const parser::ScalarExpr *>
+  GetLoopBounds(const parser::DoConstruct &);
+
+  /// Extract the loop index/induction variable from a DO loop. Diagnoses an
+  /// error if the DO loop is a DO WHILE or DO CONCURRENT loop and returns
+  /// nullptr.
   const parser::Name *GetLoopIndex(const parser::DoConstruct &);
+
   const parser::DoConstruct *GetDoConstructIf(
       const parser::ExecutionPartConstruct &);
   Symbol *DeclareNewAccessEntity(const Symbol &, Symbol::Flag, Scope &);
@@ -953,6 +970,13 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
     privateDataSharingAttributeObjects_.clear();
   }
 
+  /// Check that the loops in the loop nest are perfectly nested, as well as
+  /// that the lower bound, upper bound, and step expressions do not use the
+  /// iv of a surrounding loop of the associated loop nest.
+  /// We do not support non-perfectly nested loops nor non-rectangular loops
+  /// yet (both were introduced in OpenMP 5.0).
+  void CheckPerfectNestAndRectangularLoop(const parser::OpenMPLoopConstruct &x);
+
   // Predetermined DSA rules
   void PrivatizeAssociatedLoopIndexAndCheckLoopLevel(
       const parser::OpenMPLoopConstruct &);
@@ -1028,14 +1052,15 @@ bool DirectiveAttributeVisitor<T>::HasDataSharingAttributeObject(
 }
 
 template <typename T>
-const parser::Name *DirectiveAttributeVisitor<T>::GetLoopIndex(
-    const parser::DoConstruct &x) {
+std::tuple<const parser::Name *, const parser::ScalarExpr *,
+    const parser::ScalarExpr *, const parser::ScalarExpr *>
+DirectiveAttributeVisitor<T>::GetLoopBounds(const parser::DoConstruct &x) {
   using Bounds = parser::LoopControl::Bounds;
   if (x.GetLoopControl()) {
     if (const Bounds * b{std::get_if<Bounds>(&x.GetLoopControl()->u)}) {
-      return &b->name.thing;
-    } else {
-      return nullptr;
+      auto &step = b->step;
+      return {&b->name.thing, &b->lower, &b->upper,
+          step.has_value() ? &step.value() : nullptr};
     }
   } else {
    context_
@@ -1043,8 +1068,14 @@ const parser::Name *DirectiveAttributeVisitor<T>::GetLoopIndex(
         "Loop control is not present in the DO LOOP"_err_en_US)
         .Attach(GetContext().directiveSource,
             "associated with the enclosing LOOP construct"_en_US);
-    return nullptr;
   }
+  return {nullptr, nullptr, nullptr, nullptr};
+}
+
+template <typename T>
+const parser::Name *DirectiveAttributeVisitor<T>::GetLoopIndex(
+    const parser::DoConstruct &x) {
+  return std::get<0>(GetLoopBounds(x));
 }
 
 template <typename T>
@@ -1990,6 +2021,10 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) {
       }
     }
   }
+
+  // Must be done before iv privatization
+  CheckPerfectNestAndRectangularLoop(x);
+
   PrivatizeAssociatedLoopIndexAndCheckLoopLevel(x);
   ordCollapseLevel = GetNumAffectedLoopsFromLoopConstruct(x) + 1;
   return true;
@@ -2185,6 +2220,116 @@ void OmpAttributeVisitor::CollectNumAffectedLoopsFromClauses(
   }
 }
 
+void OmpAttributeVisitor::CheckPerfectNestAndRectangularLoop(
+    const parser::OpenMPLoopConstruct &x) {
+  auto &dirContext{GetContext()};
+  std::int64_t dirDepth{dirContext.associatedLoopLevel};
+  if (dirDepth <= 0)
+    return;
+
+  auto checkExprHasSymbols = [&](llvm::SmallVector<Symbol *> &ivs,
+                                 const parser::ScalarExpr *bound) {
+    if (ivs.empty())
+      return;
+    auto boundExpr{semantics::AnalyzeExpr(context_, *bound)};
+    if (!boundExpr)
+      return;
+    semantics::UnorderedSymbolSet boundSyms{
+        evaluate::CollectSymbols(*boundExpr)};
+    if (boundSyms.empty())
+      return;
+    for (Symbol *iv : ivs) {
+      if (boundSyms.count(*iv) != 0) {
+        // TODO: Point to occurrence of iv in boundExpr, directiveSource as a
+        // note
+        context_.Say(dirContext.directiveSource,
+            "Trip count must be computable and invariant"_err_en_US);
+      }
+    }
+  };
+
+  // Find the associated region by skipping nested loop-associated constructs
+  // such as loop transformations
+  const parser::NestedConstruct *innermostAssocRegion{nullptr};
+  const parser::OpenMPLoopConstruct *innermostConstruct{&x};
+  while (const auto &innerAssocStmt{
+      std::get<std::optional<parser::NestedConstruct>>(
+          innermostConstruct->t)}) {
+    innermostAssocRegion = &(innerAssocStmt.value());
+    if (const auto *innerConstruct{
+            std::get_if<common::Indirection<parser::OpenMPLoopConstruct>>(
+                innermostAssocRegion)}) {
+      innermostConstruct = &innerConstruct->value();
+    } else {
+      break;
+    }
+  }
+
+  if (!innermostAssocRegion)
+    return;
+  const auto &outer{std::get_if<parser::DoConstruct>(innermostAssocRegion)};
+  if (!outer)
+    return;
+
+  llvm::SmallVector<Symbol *> ivs;
+  int curLevel{0};
+  const parser::DoConstruct *loop{outer};
+  while (true) {
+    auto [iv, lb, ub, step] = GetLoopBounds(*loop);
+
+    if (lb)
+      checkExprHasSymbols(ivs, lb);
+    if (ub)
+      checkExprHasSymbols(ivs, ub);
+    if (step)
+      checkExprHasSymbols(ivs, step);
+    if (iv) {
+      if (auto *symbol{currScope().FindSymbol(iv->source)})
+        ivs.push_back(symbol);
+    }
+
+    // Stop after processing all affected loops
+    if (curLevel + 1 >= dirDepth)
+      break;
+
+    // Recurse into nested loop
+    const auto &block{std::get<parser::Block>(loop->t)};
+    if (block.empty()) {
+      // Insufficient number of nested loops already reported by
+      // CheckAssocLoopLevel()
+      break;
+    }
+
+    loop = GetDoConstructIf(block.front());
+    if (!loop) {
+      // Insufficient number of nested loops already reported by
+      // CheckAssocLoopLevel()
+      break;
+    }
+
+    auto checkPerfectNest = [&, this]() {
+      auto blockSize = block.size();
+      if (blockSize <= 1)
+        return;
+
+      if (parser::Unwrap(x))
+        blockSize -= 1;
+
+      if (blockSize <= 1)
+        return;
+
+      // Non-perfectly nested loop
+      // TODO: Point to non-DO statement, directiveSource as a note
      context_.Say(dirContext.directiveSource,
+          "Canonical loop nest must be perfectly nested."_err_en_US);
+    };
+
+    checkPerfectNest();
+
+    ++curLevel;
+  }
+}
+
 // 2.15.1.1 Data-sharing Attribute Rules - Predetermined
 // - The loop iteration variable(s) in the associated do-loop(s) of a do,
 //   parallel do, taskloop, or distribute construct is (are) private.
diff --git a/flang/test/Semantics/OpenMP/do08.f90 b/flang/test/Semantics/OpenMP/do08.f90
index 5143dff0dd315..bb3c1d0cd3855 100644
--- a/flang/test/Semantics/OpenMP/do08.f90
+++ b/flang/test/Semantics/OpenMP/do08.f90
@@ -61,6 +61,7 @@ program omp
 
   !$omp end do
 
+  !ERROR: Canonical loop nest must be perfectly nested.
   !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
   !$omp do collapse(3)
   do 60 i=2,200,2
diff --git a/flang/test/Semantics/OpenMP/do13.f90 b/flang/test/Semantics/OpenMP/do13.f90
index 6e9d1dddade4c..8f7844f4136f9 100644
--- a/flang/test/Semantics/OpenMP/do13.f90
+++ b/flang/test/Semantics/OpenMP/do13.f90
@@ -59,6 +59,7 @@ program omp
 
   !$omp end do
 
+  !ERROR: Canonical loop nest must be perfectly nested.
   !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
   !$omp do collapse(3)
   do 60 i=1,10
diff --git a/flang/test/Semantics/OpenMP/do22.f90 b/flang/test/Semantics/OpenMP/do22.f90
new file mode 100644
index 0000000000000..9d96d3af54e5c
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/do22.f90
@@ -0,0 +1,73 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! Check that canonical loop nests following a DO directive are perfectly nested and rectangular
+
+subroutine do_imperfectly_nested_before
+  integer i, j
+
+  !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+  !$omp do collapse(2)
+  do i = 1, 10
+    print *, i
+    do j = 1, 10
+      print *, i, j
+    end do
+  end do
+  !$omp end do
+end subroutine
+
+
+subroutine do_imperfectly_nested_behind
+  integer i, j
+
+  !ERROR: Canonical loop nest must be perfectly nested.
+ !$omp do collapse(2) + do i = 1, 10 + do j = 1, 10 + print *, i, j + end do + print *, i + end do + !$omp end do +end subroutine + + +subroutine do_nonrectangular_lb + integer i, j + + !ERROR: Trip count must be computable and invariant + !$omp do collapse(2) + do i = 1, 10 + do j = i, 10 + print *, i, j + end do + end do + !$omp end do +end subroutine + + +subroutine do_nonrectangular_ub + integer i, j + + !ERROR: Trip count must be computable and invariant + !$omp do collapse(2) + do i = 1, 10 + do j = 0, i + print *, i, j + end do + end do + !$omp end do +end subroutine + + +subroutine do_nonrectangular_step + integer i, j + + !ERROR: Trip count must be computable and invariant + !$omp do collapse(2) + do i = 1, 10 + do j = 1, 10, i + print *, i, j + end do + end do + !$omp end do +end subroutine From 2f7252a8412fed7be8df218a8c5ecad1fd18fe43 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 30 Sep 2025 11:42:45 +0100 Subject: [PATCH 234/878] [LV] Preserve GEP nusw when widening memory (#160885) --- .../Transforms/Vectorize/LoopVectorize.cpp | 9 +- .../reverse-induction-gep-nowrap-flags.ll | 182 ++++++++++++++++++ 2 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a0043bed2e0c8..5a08e4d25cfa5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7487,12 +7487,13 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VPSingleDefRecipe *VectorPtr; if (Reverse) { // When folding the tail, we may compute an address that we don't in the - // original scalar loop and it may not be inbounds. Drop Inbounds in that - // case. + // original scalar loop: drop the GEP no-wrap flags in this case. + // Otherwise preserve existing flags without no-unsigned-wrap, as we will + // emit negative indices. GEPNoWrapFlags Flags = - (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) + CM.foldTailByMasking() || !GEP ? 
GEPNoWrapFlags::none()
-            : GEPNoWrapFlags::inBounds();
+            : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
     VectorPtr = new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(),
                                              getLoadStoreType(I),
                                              /*Stride*/ -1, Flags, I->getDebugLoc());
diff --git a/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll b/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll
new file mode 100644
index 0000000000000..826696fcdc452
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll
@@ -0,0 +1,182 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define i32 @preserve_inbounds(i64 %start, ptr %ptr) {
+; CHECK-LABEL: define i32 @preserve_inbounds(
+; CHECK-SAME: i64 [[START:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[START]], [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: br label %[[END:.*]]
+; CHECK: [[SCALAR_PH:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1
+; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[REV_IND_NEXT]]
+; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4
+; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %rev.ind = phi i64 [ %start, %entry ], [ %rev.ind.next, %loop ]
+  %redux = phi i32 [ 0, %entry ], [ %redux.next, %loop ]
+  %rev.ind.next = add i64 %rev.ind, -1
+  %gep.ptr.ind = getelementptr inbounds i32, ptr %ptr, i64 %rev.ind.next
+  %ld.ptr = load i32, ptr %gep.ptr.ind, align 4
+  %redux.next = add i32 %ld.ptr, %redux
+  %iv.next = add i32 %iv, 1
+  %exit.cond = icmp ne i32 %iv.next, 1024
+  br i1 %exit.cond, label %loop, label %end
+
+end:
+  ret i32 %redux.next
+}
+
+define i32 @preserve_nusw(i64 %start, ptr %ptr) {
+; CHECK-LABEL: define i32 @preserve_nusw(
+; CHECK-SAME: i64 [[START:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[START]], [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr nusw i32, ptr [[PTR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr nusw i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr nusw i32, ptr [[TMP2]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: br label %[[END:.*]]
+; CHECK: [[SCALAR_PH:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1
+; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr nusw i32, ptr [[PTR]], i64 [[REV_IND_NEXT]]
+; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4
+; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %rev.ind = phi i64 [ %start, %entry ], [ %rev.ind.next, %loop ]
+  %redux = phi i32 [ 0, %entry ], [ %redux.next, %loop ]
+  %rev.ind.next = add i64 %rev.ind, -1
+  %gep.ptr.ind = getelementptr nusw i32, ptr %ptr, i64 %rev.ind.next
+  %ld.ptr = load i32, ptr %gep.ptr.ind, align 4
+  %redux.next = add i32 %ld.ptr, %redux
+  %iv.next = add i32 %iv, 1
+  %exit.cond = icmp ne i32 %iv.next, 1024
+  br i1 %exit.cond, label %loop, label %end
+
+end:
+  ret i32 %redux.next
+}
+
+define i32 @drop_nuw(i64 %start, ptr %ptr) {
+; CHECK-LABEL: define i32 @drop_nuw(
+; CHECK-SAME: i64 [[START:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[START]], [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr nuw i32, ptr [[PTR]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 -3
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[REVERSE]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: br label %[[END:.*]]
+; CHECK: [[SCALAR_PH:.*]]:
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1
+; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr nuw i32, ptr [[PTR]], i64 [[REV_IND_NEXT]]
+; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4
+; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]]
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]]
+; CHECK: [[END]]:
+; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %rev.ind = phi i64 [ %start, %entry ], [ %rev.ind.next, %loop ]
+  %redux = phi i32 [ 0, %entry ], [ %redux.next, %loop ]
+  %rev.ind.next = add i64 %rev.ind, -1
+  %gep.ptr.ind = getelementptr nuw i32, ptr %ptr, i64 %rev.ind.next
+  %ld.ptr = load i32, ptr %gep.ptr.ind, align 4
+  %redux.next = add i32 %ld.ptr, %redux
+  %iv.next = add i32 %iv, 1
+  %exit.cond = icmp ne i32 %iv.next, 1024
+  br i1 %exit.cond, label %loop, label %end
+
+end:
+  ret i32 %redux.next
+}

From 709a74dfb3b5e965479760af8bd29a84c89e1d2e Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Tue, 30 Sep 2025 12:55:35 +0200
Subject: [PATCH 235/878] AMDGPU: Fix s_barrier_leave to write to scc (#161221)

s_barrier_leave implicitly defines $scc and does not take an immediate
operand representing the type of barrier, so the isel pattern ignores
the immediate operand of the LLVM intrinsic. Add a test that
SIInsertWaitcnts tracks this scc write.
---
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td         |  4 +++
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  2 ++
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  2 ++
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  5 ++++
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |  3 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  5 ++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  2 ++
 llvm/lib/Target/AMDGPU/SOPInstructions.td     |  6 ++--
 llvm/test/CodeGen/AMDGPU/s-barrier.ll         | 29 ++++++++++++++++---
 9 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index bb4bf742fb861..a0cd1785c0130 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -13,6 +13,10 @@
 include "AMDGPU.td"
 include "AMDGPUCombine.td"
 
+def gi_ignore :
+    GIComplexOperandMatcher<untyped, "selectIgnore">,
+    GIComplexPatternEquiv<Ignore>;
+
 def sd_vsrc0 : ComplexPattern<i32, 1, "SelectVSRC0">;
 def gi_vsrc0 : GIComplexOperandMatcher<s32, "selectVSRC0">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2192a72bb27b7..bdf4cd3693b2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4312,6 +4312,8 @@ bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1,
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectIgnore(SDValue In) const { return true; }
+
 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
   if (In.isUndef())
     return CurDAG->getUNDEF(MVT::i32);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 4fa0d3f72e1c7..906548742f77f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -305,6 +305,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   void SelectWAVE_ADDRESS(SDNode *N);
   void SelectSTACKRESTORE(SDNode *N);
 
+  bool SelectIgnore(SDValue In) const;
+
 protected:
   // Include the pieces autogenerated from the target description.
 #include "AMDGPUGenDAGISel.inc"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 12915c7344426..7f08a4eef97c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4266,6 +4266,11 @@ Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
   return Src;
 }
 
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectIgnore(MachineOperand &Root) const {
+  return {{}};
+}
+
 ///
 /// This will select either an SGPR or VGPR operand and will save us from
 /// having to write an extra tablegen pattern.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c760fe7ef99dd..5a575a9c66a8a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -166,6 +166,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
       MachineOperand Root, MachineInstr *InsertPt,
       bool ForceVGPR = false) const;
 
+  InstructionSelector::ComplexRendererFns
+  selectIgnore(MachineOperand &Root) const;
+
   InstructionSelector::ComplexRendererFns
   selectVCSRC(MachineOperand &Root) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 31a2d55e1baad..c2252afdbb064 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1006,9 +1006,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
            Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
            Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
            Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
-           Opcode == AMDGPU::S_BARRIER_LEAVE ||
-           Opcode == AMDGPU::S_BARRIER_LEAVE_IMM ||
-           Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER;
+           Opcode == AMDGPU::S_BARRIER_LEAVE || Opcode == AMDGPU::DS_GWS_INIT ||
+           Opcode == AMDGPU::DS_GWS_BARRIER;
   }
 
   static bool isF16PseudoScalarTrans(unsigned Opcode) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 18a53931a6390..e46bd45aed506 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1710,6 +1710,8 @@ def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">
 
 def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
 
+def Ignore : ComplexPattern<untyped, 0, "SelectIgnore">;
+
 //===----------------------------------------------------------------------===//
 // SI assembler operands
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 296ce5a46287c..6a39187e48cc8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1616,7 +1616,8 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm
   let isConvergent = 1;
 }
 
-def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> {
+ def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave",
+    (ins), "", [(int_amdgcn_s_barrier_leave (Ignore))] > {
   let SchedRW = [WriteBarrier];
   let simm16 = 0;
   let fixed_imm = 1;
@@ -1624,9 +1625,6 @@ def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> {
   let Defs = [SCC];
 }
 
-def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave",
-  (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>;
-
 def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
   let SubtargetPredicate = isGFX8Plus;
   let simm16 = 0;
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
index 8a9beb73a6baa..4c7cef9cc1a0f 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
 
 @bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
 @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
@@ -102,6 +102,7 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
 ; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
 ; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT: s_barrier_leave
 ; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0
 ; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
@@ -155,10 +156,11 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
 ; GFX12-GISEL-NEXT: s_barrier_signal -1
 ; GFX12-GISEL-NEXT: s_barrier_join m0
 ; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
 ; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT: s_barrier_leave
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
 ; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2
 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0
@@ -256,6 +258,25 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
   ret void
 }
 
+define amdgpu_ps void @test_barrier_leave_write_to_scc(i32 inreg %val, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_barrier_leave_write_to_scc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_barrier_leave
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_cmp_lg_u32 s0, 0
+; GFX12-NEXT: s_movk_i32 s0, 0x7b
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_cselect_b32 s0, s0, 0x1c8
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_endpgm
+  call void @llvm.amdgcn.s.barrier.leave(i16 1)
+  %cmp = icmp ne i32 %val, 0
+  %ret = select i1 %cmp, i32 123, i32 456
+  store i32 %ret, ptr addrspace(1) %out
+  ret void
+}
+
 declare void @llvm.amdgcn.s.barrier() #1
 declare void @llvm.amdgcn.s.barrier.wait(i16) #1
 declare void @llvm.amdgcn.s.barrier.signal(i32) #1

From 065cd64af3166da430bd03067514f9c9ac09e8e1 Mon Sep 17 00:00:00 2001
From: Jeaye Wilkerson
Date: Tue, 30 Sep 2025 03:58:57 -0700
Subject: [PATCH 236/878] [clang-repl] Teach clang-repl how to load PCHs
 (reprise) (#157359)

This is an updated version of @vgvassilev's PR from last year here:
https://github.com/llvm/llvm-project/pull/94166

In short, it includes:

1. The fix for a blocking issue where `clang::Interpreter` (and thus
   `clang-repl`) cannot resolve symbols defined in a PCH
2. A test to prove this is working
3. A new hidden flag for `clang-repl` so that `llvm-lit` can match the
   host JIT triple between the PCH and `clang-repl`; previously, they
   may differ in some cases
4. Everything based on the latest LLVM main

Shout out to @kylc for finding a logic issue which had us stumped for a
while (and securing the
[bounty](https://github.com/jank-lang/jank/issues/446)).
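As a rough illustration of what the fix enables through the C++ API, here is a
minimal sketch (the PCH path "prelude.pch" and the function `f_pch` are
invented for this example, and error handling is elided; it assumes the PCH was
built with a target triple matching the host JIT, which is exactly what the new
`--host-jit-triple` flag lets lit guarantee):

```cpp
#include "clang/Interpreter/Interpreter.h"
#include "llvm/Support/Error.h"

int run() {
  clang::IncrementalCompilerBuilder CB;
  // Load a PCH into the incremental compiler, as -include-pch does.
  CB.SetCompilerArgs({"-include-pch", "prelude.pch"});
  auto CI = llvm::cantFail(CB.CreateCpp());
  auto Interp = llvm::cantFail(clang::Interpreter::create(std::move(CI)));
  // f_pch() is defined in the PCH; resolving its symbol at JIT time is
  // precisely what used to fail before this patch.
  llvm::cantFail(Interp->ParseAndExecute("f_pch();"));
  return 0;
}
```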
--------- Co-authored-by: Vassil Vassilev Co-authored-by: Kyle Cesare --- clang/include/clang/CodeGen/ModuleBuilder.h | 6 ++++++ clang/lib/CodeGen/BackendConsumer.h | 5 ----- clang/lib/CodeGen/CodeGenAction.cpp | 6 +----- clang/lib/CodeGen/ModuleBuilder.cpp | 9 +++++++- clang/lib/Interpreter/IncrementalAction.cpp | 3 ++- clang/lib/Interpreter/IncrementalParser.cpp | 4 ++++ clang/lib/Interpreter/Interpreter.cpp | 7 ++++--- clang/test/Interpreter/execute-pch.cpp | 23 +++++++++++++++++++++ clang/test/lit.cfg.py | 13 +++++++++--- clang/tools/clang-repl/ClangRepl.cpp | 7 +++++++ 10 files changed, 65 insertions(+), 18 deletions(-) create mode 100644 clang/test/Interpreter/execute-pch.cpp diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h index 59b9840d02e08..f1b8229edd362 100644 --- a/clang/include/clang/CodeGen/ModuleBuilder.h +++ b/clang/include/clang/CodeGen/ModuleBuilder.h @@ -52,6 +52,12 @@ namespace CodeGen { class CodeGenerator : public ASTConsumer { virtual void anchor(); +protected: + /// True if we've finished generating IR. This prevents us from generating + /// additional LLVM IR after emitting output in HandleTranslationUnit. This + /// can happen when Clang plugins trigger additional AST deserialization. + bool IRGenFinished = false; + public: /// Return an opaque reference to the CodeGenModule object, which can /// be used in various secondary APIs. It is valid as long as the diff --git a/clang/lib/CodeGen/BackendConsumer.h b/clang/lib/CodeGen/BackendConsumer.h index ad3adfca36785..b7bbb81074836 100644 --- a/clang/lib/CodeGen/BackendConsumer.h +++ b/clang/lib/CodeGen/BackendConsumer.h @@ -40,11 +40,6 @@ class BackendConsumer : public ASTConsumer { llvm::Timer LLVMIRGeneration; unsigned LLVMIRGenerationRefCount = 0; - /// True if we've finished generating IR. This prevents us from generating - /// additional LLVM IR after emitting output in HandleTranslationUnit. This - /// can happen when Clang plugins trigger additional AST deserialization. - bool IRGenFinished = false; - bool TimerIsEnabled = false; BackendAction Action; diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 9286f1f25c6cc..60d6b7fa009e7 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -190,9 +190,7 @@ void BackendConsumer::HandleInlineFunctionDefinition(FunctionDecl *D) { } void BackendConsumer::HandleInterestingDecl(DeclGroupRef D) { - // Ignore interesting decls from the AST reader after IRGen is finished. - if (!IRGenFinished) - HandleTopLevelDecl(D); + HandleTopLevelDecl(D); } // Links each entry in LinkModules into our module. Returns true on error. @@ -243,8 +241,6 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { if (TimerIsEnabled && !--LLVMIRGenerationRefCount) LLVMIRGeneration.yieldTo(CI.getFrontendTimer()); - - IRGenFinished = true; } // Silently ignore if we weren't initialized for some reason. 
diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp
index 8c1fee8c974f1..96f3f6221e20f 100644
--- a/clang/lib/CodeGen/ModuleBuilder.cpp
+++ b/clang/lib/CodeGen/ModuleBuilder.cpp
@@ -138,6 +138,8 @@ namespace {
       assert(!M && "Replacing existing Module?");
       M.reset(new llvm::Module(ExpandModuleName(ModuleName, CodeGenOpts), C));
 
+      IRGenFinished = false;
+
       std::unique_ptr<CodeGen::CodeGenModule> OldBuilder = std::move(Builder);
 
       Initialize(*Ctx);
@@ -179,6 +181,10 @@ namespace {
     }
 
     bool HandleTopLevelDecl(DeclGroupRef DG) override {
+      // Ignore interesting decls from the AST reader after IRGen is finished.
+      if (IRGenFinished)
+        return true; // We can't CodeGen more but pass to other consumers.
+
       // FIXME: Why not return false and abort parsing?
       if (Diags.hasUnrecoverableErrorOccurred())
         return true;
@@ -292,8 +298,9 @@ namespace {
         if (Builder)
           Builder->clear();
         M.reset();
-        return;
       }
+
+      IRGenFinished = true;
     }
 
     void AssignInheritanceModel(CXXRecordDecl *RD) override {
diff --git a/clang/lib/Interpreter/IncrementalAction.cpp b/clang/lib/Interpreter/IncrementalAction.cpp
index 4d1bc4c59e851..3d489fce54bc6 100644
--- a/clang/lib/Interpreter/IncrementalAction.cpp
+++ b/clang/lib/Interpreter/IncrementalAction.cpp
@@ -106,7 +106,8 @@ std::unique_ptr<llvm::Module> IncrementalAction::GenModule() {
   // around we created an empty module to make CodeGen happy. We should make
   // sure it always stays empty.
   assert(((!CachedInCodeGenModule ||
-           !CI.getPreprocessorOpts().Includes.empty()) ||
+           !CI.getPreprocessorOpts().Includes.empty() ||
+           !CI.getPreprocessorOpts().ImplicitPCHInclude.empty()) ||
           (CachedInCodeGenModule->empty() &&
            CachedInCodeGenModule->global_empty() &&
            CachedInCodeGenModule->alias_empty() &&
diff --git a/clang/lib/Interpreter/IncrementalParser.cpp b/clang/lib/Interpreter/IncrementalParser.cpp
index 32d1663fbe1a9..bf08911e23533 100644
--- a/clang/lib/Interpreter/IncrementalParser.cpp
+++ b/clang/lib/Interpreter/IncrementalParser.cpp
@@ -37,6 +37,10 @@ IncrementalParser::IncrementalParser(CompilerInstance &Instance,
   llvm::ErrorAsOutParameter EAO(&Err);
   Consumer = &S.getASTConsumer();
   P.reset(new Parser(S.getPreprocessor(), S, /*SkipBodies=*/false));
+
+  if (ExternalASTSource *External = S.getASTContext().getExternalSource())
+    External->StartTranslationUnit(Consumer);
+
   P->Initialize();
 }
 
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index 9cc1c450b7650..b05cb5a0f1dbe 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -278,9 +278,10 @@ Interpreter::Interpreter(std::unique_ptr<CompilerInstance> Instance,
 
   if (Act->getCodeGen()) {
     Act->CacheCodeGenModule();
-    // The initial PTU is filled by `-include` or by CUDA includes
-    // automatically.
-    if (!CI->getPreprocessorOpts().Includes.empty()) {
+    // The initial PTU is filled by `-include`/`-include-pch` or by CUDA
+    // includes automatically.
+        !CI->getPreprocessorOpts().ImplicitPCHInclude.empty()) {
       // We can't really directly pass the CachedInCodeGenModule to the Jit
       // because it will steal it, causing dangling references as explained in
       // Interpreter::Execute
diff --git a/clang/test/Interpreter/execute-pch.cpp b/clang/test/Interpreter/execute-pch.cpp
new file mode 100644
index 0000000000000..8041ee6ac966d
--- /dev/null
+++ b/clang/test/Interpreter/execute-pch.cpp
@@ -0,0 +1,23 @@
+// REQUIRES: host-supports-jit
+// UNSUPPORTED: system-aix
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -fmax-type-align=16 -Xclang -fdeprecated-macro -fno-stack-protector -Xclang -fwrapv -Xclang -fblocks -Xclang -fskip-odr-check-in-gmf -fexceptions -fcxx-exceptions -fgnuc-version=0 -target %host-jit-triple -Xclang -fblocks -Xclang -fmax-type-align=8 -Xclang -fincremental-extensions -Xclang -emit-pch -x c++-header -o %t/include.pch %t/include.hpp
+//
+// RUN: cat %t/main.cpp \
+// RUN:   | clang-repl -Xcc -fgnuc-version=0 -Xcc -fno-stack-protector -Xcc -fwrapv -Xcc -fblocks -Xcc -fskip-odr-check-in-gmf -Xcc -fmax-type-align=8 -Xcc -include-pch -Xcc %t/include.pch \
+// RUN:   | FileCheck %s
+
+//--- include.hpp
+
+int f_pch() { return 5; }
+
+//--- main.cpp
+
+extern "C" int printf(const char *, ...);
+printf("f_pch = %d\n", f_pch());
+
+// CHECK: f_pch = 5
diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py
index 4a5d9e582b54c..64e2bbad8f3b2 100644
--- a/clang/test/lit.cfg.py
+++ b/clang/test/lit.cfg.py
@@ -140,7 +140,8 @@ def have_host_out_of_process_jit_feature_support():
     return False
 
 
-def have_host_jit_feature_support(feature_name):
+
+def run_clang_repl(args):
     clang_repl_exe = lit.util.which("clang-repl", config.clang_tools_dir)
 
     if not clang_repl_exe:
@@ -148,7 +149,7 @@ def have_host_jit_feature_support(feature_name):
 
     try:
         clang_repl_cmd = subprocess.Popen(
-            [clang_repl_exe, "--host-supports-" + feature_name], stdout=subprocess.PIPE
+            [clang_repl_exe, args], stdout=subprocess.PIPE
         )
     except OSError:
         print("could not exec clang-repl")
@@ -157,7 +158,11 @@ def have_host_jit_feature_support(feature_name):
     clang_repl_out = clang_repl_cmd.stdout.read().decode("ascii")
     clang_repl_cmd.wait()
 
-    return "true" in clang_repl_out
+    return clang_repl_out
+
+
+def have_host_jit_feature_support(feature_name):
+    return "true" in run_clang_repl("--host-supports-" + feature_name)
 
 def have_host_clang_repl_cuda():
     clang_repl_exe = lit.util.which('clang-repl', config.clang_tools_dir)
@@ -191,6 +196,8 @@ def have_host_clang_repl_cuda():
 
     if have_host_clang_repl_cuda():
         config.available_features.add('host-supports-cuda')
+    hosttriple = run_clang_repl("--host-jit-triple")
+    config.substitutions.append(("%host-jit-triple", hosttriple.strip()))
 
     if have_host_out_of_process_jit_feature_support():
         config.available_features.add("host-supports-out-of-process-jit")
diff --git a/clang/tools/clang-repl/ClangRepl.cpp b/clang/tools/clang-repl/ClangRepl.cpp
index 1d508816d7047..c7879422cd7df 100644
--- a/clang/tools/clang-repl/ClangRepl.cpp
+++ b/clang/tools/clang-repl/ClangRepl.cpp
@@ -85,6 +85,8 @@ static llvm::cl::list<std::string>
                                llvm::cl::CommaSeparated);
 static llvm::cl::opt<bool> OptHostSupportsJit("host-supports-jit",
                                               llvm::cl::Hidden);
+static llvm::cl::opt<bool> OptHostJitTriple("host-jit-triple",
+                                            llvm::cl::Hidden);
 static llvm::cl::list<std::string> OptInputs(llvm::cl::Positional,
                                              llvm::cl::desc("[code to run]"));
@@ -279,6 +281,11 @@ int main(int argc, const char **argv) {
       llvm::outs() << "false\n";
     }
return 0; + } else if (OptHostJitTriple) { + auto J = ExitOnErr(llvm::orc::LLJITBuilder().create()); + auto T = J->getTargetTriple(); + llvm::outs() << T.normalize() << '\n'; + return 0; } clang::IncrementalCompilerBuilder CB; From c389f5008bb836655a2367fe5ca906b34b65dc18 Mon Sep 17 00:00:00 2001 From: Liao Chunyu Date: Tue, 30 Sep 2025 19:51:56 +0800 Subject: [PATCH 237/878] [Docs][RISCV]Remove experimental from Smctr, Ssctr,Sdext and Sdtrig (#161058) --- llvm/docs/RISCVUsage.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 7b1a6ce834919..f9e2e4a5f02c3 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -120,6 +120,8 @@ on support follow. ``H`` Assembly Support ``M`` Supported ``Q`` Assembly Support + ``Sdext`` Assembly Support (`See note <#riscv-debug-specification-note>`__) + ``Sdtrig`` Assembly Support (`See note <#riscv-debug-specification-note>`__) ``Sha`` Supported ``Shcounterenw`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Shgatpa`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) @@ -132,6 +134,7 @@ on support follow. ``Smcdeleg`` Supported ``Smcntrpmf`` Supported ``Smcsrind`` Supported + ``Smctr`` Assembly Support ``Smdbltrp`` Supported ``Smepmp`` Supported ``Smmpm`` Supported @@ -144,6 +147,7 @@ on support follow. ``Sscofpmf`` Assembly Support ``Sscounterenw`` Assembly Support (`See note <#riscv-profiles-extensions-note>`__) ``Sscsrind`` Supported + ``Ssctr`` Assembly Support ``Ssdbltrp`` Supported ``Ssnpm`` Supported ``Sspm`` Supported @@ -306,6 +310,10 @@ Supported ``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccamoc``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification `__. They do not introduce any new features themselves, but instead describe existing hardware features. +.. _riscv-debug-specification-note: + +``Sdext``, ``Sdtrig`` `The RISC-V Debug Specification `__. + .. _riscv-zacas-note: ``Zacas`` @@ -337,12 +345,6 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-zvbc32e``, ``experimental-zvkgs`` LLVM implements the `0.7 release specification `__. -``experimental-sdext``, ``experimental-sdtrig`` - LLVM implements the `1.0-rc4 specification `__. - -``experimental-smctr``, ``experimental-ssctr`` - LLVM implements the `1.0-rc3 specification `__. - ``experimental-svukte`` LLVM implements the `0.3 draft specification `__. 
From 1553b3de71112f7faf2c5d25227b322978bab9c0 Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Tue, 30 Sep 2025 14:01:08 +0200
Subject: [PATCH 238/878] AMDGPU: Fix gcc build break (#161354)

---
 llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7f08a4eef97c9..1535f2681480e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4268,7 +4268,10 @@ Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectIgnore(MachineOperand &Root) const {
-  return {{}};
+  // Don't render anything.
+  ComplexRendererFns Renderers;
+  Renderers.emplace();
+  return Renderers;
 }
 
 ///

From 0d2b404a352e1e1abe0c51e636755f1fc8352107 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Tue, 30 Sep 2025 05:29:34 -0700
Subject: [PATCH 239/878] [LLVM] Fix a bug in `Intrinsic::getFnAttributes`
 (#161248)

---
 llvm/unittests/IR/IntrinsicsTest.cpp           | 8 ++++++++
 llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp | 5 ++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp
index 49af83609d98c..cfd99ed542162 100644
--- a/llvm/unittests/IR/IntrinsicsTest.cpp
+++ b/llvm/unittests/IR/IntrinsicsTest.cpp
@@ -189,4 +189,12 @@ TEST_F(IntrinsicsTest, InstrProfInheritance) {
   }
 }
 
+// Check that getFnAttributes for intrinsics that do not have any function
+// attributes correctly returns an empty set.
+TEST(IntrinsicAttributes, TestGetFnAttributesBug) {
+  using namespace Intrinsic;
+  LLVMContext Context;
+  AttributeSet AS = getFnAttributes(Context, experimental_guard);
+  EXPECT_FALSE(AS.hasAttributes());
+}
 } // end namespace
diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
index 559868dd54efe..75dffb18fca5a 100644
--- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp
@@ -794,12 +794,15 @@ AttributeSet Intrinsic::getFnAttributes(LLVMContext &C, ID id) {{
   if (id == 0)
     return AttributeSet();
   auto [FnAttrID, _] = unpackID(IntrinsicsToAttributesMap[id - 1]);
+  if (FnAttrID == {3})
+    return AttributeSet();
   return getIntrinsicFnAttributeSet(C, FnAttrID);
 }
 #endif // GET_INTRINSIC_ATTRIBUTES
 
 )",
-      UniqAttributesBitSize, MaxNumAttrs, NoFunctionAttrsID);
+      UniqAttributesBitSize, MaxNumAttrs, NoFunctionAttrsID,
+      NoFunctionAttrsID);
 }
 
 void IntrinsicEmitter::EmitIntrinsicToBuiltinMap(

From 0d91e6daa1a3361138a964733c6ec1610760da71 Mon Sep 17 00:00:00 2001
From: Erich Keane
Date: Tue, 30 Sep 2025 06:09:12 -0700
Subject: [PATCH 240/878] [OpenACC][CIR] Generate private recipe pointer/array
 'alloca's (#160911)

As a next step to generating pointer/array recipes, this patch generates
just the 'alloca' lines that are necessary. Copying pointers over to
restore the structure is held off to the next patch.

In the case of a pointer, we need to allocate the level 'below' it (if
we index into it), then copy the values into the pointers. In the case
of an array, we skip the alloca (since the array's alloca contains the
value).

After this, we'll need a patch that copies the pointers into place, and
finally one that does the initialization of these values.
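To make the two cases concrete, here is a rough sketch of OpenACC source that
exercises both recipe shapes (the function and variable names are invented for
this illustration; it mirrors the private-clause pointer/array tests added
below):

```cpp
// 'ptr' case: privatizing ptr[0:5] requires allocating the int elements one
// level 'below' the pointer; later patches wire those addresses back into the
// privatized pointer and initialize them.
// 'arr' case: the array's own alloca already contains the elements, so only
// the top-level alloca is emitted.
void example(int *ptr) {
  int arr[5];
#pragma acc parallel private(ptr[0:5], arr[0:5])
  ;
}
```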
--- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 7 + clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp | 112 ++++++++++++- clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h | 32 ++-- .../combined-private-clause.cpp | 6 + .../CodeGenOpenACC/compute-private-clause.c | 5 +- .../CodeGenOpenACC/compute-private-clause.cpp | 6 + .../CodeGenOpenACC/loop-private-clause.cpp | 6 + .../private-clause-array-recipes-CtorDtor.cpp | 4 + .../private-clause-array-recipes-NoOps.cpp | 4 + .../private-clause-array-recipes-int.cpp | 6 +- ...-clause-pointer-array-recipes-CtorDtor.cpp | 155 +++++++++++++++++ ...ate-clause-pointer-array-recipes-NoOps.cpp | 156 ++++++++++++++++++ ...ivate-clause-pointer-array-recipes-int.cpp | 155 +++++++++++++++++ ...rivate-clause-pointer-recipes-CtorDtor.cpp | 70 ++++++++ .../private-clause-pointer-recipes-NoOps.cpp | 70 ++++++++ .../private-clause-pointer-recipes-int.cpp | 71 ++++++++ 16 files changed, 840 insertions(+), 25 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index a3f167e3cde2c..3f83c302176c0 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -243,6 +243,13 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return cir::AllocaOp::create(*this, loc, addrType, type, name, alignment); } + mlir::Value createAlloca(mlir::Location loc, cir::PointerType addrType, + mlir::Type type, llvm::StringRef name, + clang::CharUnits alignment) { + mlir::IntegerAttr alignmentAttr = getAlignmentAttr(alignment); + return createAlloca(loc, addrType, type, name, alignmentAttr); + } + /// Get constant address of a global variable as an MLIR attribute. /// This wrapper infers the attribute type through the global op. cir::GlobalViewAttr getGlobalViewAttr(cir::GlobalOp globalOp, diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index a4c2641fe631c..e41c2d85fbd5d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -10,6 +10,8 @@ // //===----------------------------------------------------------------------===// +#include + #include "CIRGenOpenACCRecipe.h" namespace clang::CIRGen { @@ -35,6 +37,110 @@ mlir::Block *OpenACCRecipeBuilderBase::createRecipeBlock(mlir::Region ®ion, return builder.createBlock(®ion, region.end(), types, locs); } +mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca( + mlir::Block *block, SourceRange exprRange, mlir::Location loc, + std::string_view allocaName, size_t numBounds, + llvm::ArrayRef boundTypes) { + mlir::OpBuilder::InsertionGuard guardCase(builder); + + // Get the range of bounds arguments, which are all but the 1st arg. + llvm::ArrayRef boundsRange = + block->getArguments().drop_front(1); + + // boundTypes contains the before and after of each bounds, so it ends up + // having 1 extra. Assert this is the case to ensure we don't call this in the + // wrong 'block'. 
+ assert(boundsRange.size() + 1 == boundTypes.size()); + + mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy); + auto idxType = mlir::IndexType::get(&cgf.getMLIRContext()); + + auto getUpperBound = [&](mlir::Value bound) { + auto upperBoundVal = + mlir::acc::GetUpperboundOp::create(builder, loc, idxType, bound); + return mlir::UnrealizedConversionCastOp::create(builder, loc, itrTy, + upperBoundVal.getResult()) + .getResult(0); + }; + + auto isArrayTy = [&](QualType ty) { + if (ty->isArrayType() && !ty->isConstantArrayType()) + cgf.cgm.errorNYI(exprRange, "OpenACC recipe init for VLAs"); + return ty->isConstantArrayType(); + }; + + mlir::Type topLevelTy = cgf.convertType(boundTypes.back()); + cir::PointerType topLevelTyPtr = builder.getPointerTo(topLevelTy); + // Do an alloca for the 'top' level type without bounds. + mlir::Value initialAlloca = builder.createAlloca( + loc, topLevelTyPtr, topLevelTy, allocaName, + cgf.getContext().getTypeAlignInChars(boundTypes.back())); + + bool lastBoundWasArray = isArrayTy(boundTypes.back()); + + // Since we're iterating the types in reverse, this sets up for each index + // corresponding to the boundsRange to be the 'after application of the + // bounds. + llvm::ArrayRef boundResults = boundTypes.drop_back(1); + + // Collect the 'do we have any allocas needed after this type' list. + llvm::SmallVector allocasLeftArr; + llvm::ArrayRef resultTypes = boundTypes.drop_front(); + std::transform_inclusive_scan( + resultTypes.begin(), resultTypes.end(), + std::back_inserter(allocasLeftArr), std::plus{}, + [](QualType ty) { return !ty->isConstantArrayType(); }); + + // Keep track of the number of 'elements' that we're allocating. Individual + // allocas should multiply this by the size of its current allocation. + mlir::Value cumulativeElts; + for (auto [bound, resultType, allocasLeft] : llvm::reverse( + llvm::zip_equal(boundsRange, boundResults, allocasLeftArr))) { + + // if there is no further 'alloca' operation we need to do, we can skip + // creating the UB/multiplications/etc. + if (!allocasLeft) + break; + + // First: figure out the number of elements in the current 'bound' list. + mlir::Value eltsPerSubArray = getUpperBound(bound); + mlir::Value eltsToAlloca; + + // IF we are in a sub-bounds, the total number of elements to alloca is + // the product of that one and the current 'bounds' size. That is, + // arr[5][5], we would need 25 elements, not just 5. Else it is just the + // current number of elements. + if (cumulativeElts) + eltsToAlloca = builder.createMul(loc, eltsPerSubArray, cumulativeElts); + else + eltsToAlloca = eltsPerSubArray; + + if (!lastBoundWasArray) { + // If we have to do an allocation, figure out the size of the + // allocation. alloca takes the number of bytes, not elements. + TypeInfoChars eltInfo = cgf.getContext().getTypeInfoInChars(resultType); + cir::ConstantOp eltSize = builder.getConstInt( + loc, itrTy, eltInfo.Width.alignTo(eltInfo.Align).getQuantity()); + mlir::Value curSize = builder.createMul(loc, eltsToAlloca, eltSize); + + mlir::Type eltTy = cgf.convertType(resultType); + cir::PointerType ptrTy = builder.getPointerTo(eltTy); + builder.createAlloca(loc, ptrTy, eltTy, "openacc.init.bounds", + cgf.getContext().getTypeAlignInChars(resultType), + curSize); + + // TODO: OpenACC : At this point we should be copying the addresses of + // each element of this to the last allocation. At the moment, that is + // not yet implemented. 
+ cgf.cgm.errorNYI(exprRange, "OpenACC recipe alloca copying"); + } + + cumulativeElts = eltsToAlloca; + lastBoundWasArray = isArrayTy(resultType); + } + return initialAlloca; +} + mlir::Value OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, mlir::Value bound, @@ -258,7 +364,11 @@ void OpenACCRecipeBuilderBase::createPrivateInitRecipe( cgf.emitAutoVarAlloca(*allocaDecl, builder.saveInsertionPoint()); cgf.emitAutoVarInit(tempDeclEmission); } else { - cgf.cgm.errorNYI(exprRange, "private-init with bounds"); + makeBoundsAlloca(block, exprRange, loc, "openacc.private.init", numBounds, + boundTypes); + + if (initExpr) + cgf.cgm.errorNYI(exprRange, "private-init with bounds initialization"); } mlir::acc::YieldOp::create(builder, locEnd); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h index 978c671f9a170..acd187bbaee0d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h @@ -24,6 +24,13 @@ namespace clang::CIRGen { class OpenACCRecipeBuilderBase { + // This function generates the required alloca, similar to + // 'emitAutoVarAlloca', except for the OpenACC array/pointer types. + mlir::Value makeBoundsAlloca(mlir::Block *block, SourceRange exprRange, + mlir::Location loc, std::string_view allocaName, + size_t numBounds, + llvm::ArrayRef<QualType> boundTypes); + protected: CIRGen::CIRGenFunction &cgf; CIRGen::CIRGenBuilderTy &builder; @@ -165,28 +172,9 @@ class OpenACCRecipeBuilder : OpenACCRecipeBuilderBase { cgf.emitAutoVarAlloca(*varRecipe, builder.saveInsertionPoint()); // 'firstprivate' doesn't do its initialization in the 'init' section, - // instead does it in the 'copy' section. SO only do init here. - // 'reduction' appears to use it too (rather than a 'copy' section), so - // we probably have to do it here too, but we can do that when we get to - // reduction implementation. - if constexpr (std::is_same_v<RecipeTy, mlir::acc::PrivateRecipeOp>) { - // We are OK with no init for builtins, arrays of builtins, or pointers, - // else we should NYI so we know to go look for these. - if (cgf.getContext().getLangOpts().CPlusPlus && - !varRecipe->getType() - ->getPointeeOrArrayElementType() - ->isBuiltinType() && - !varRecipe->getType()->isPointerType() && !varRecipe->getInit()) { - // If we don't have any initialization recipe, we failed during Sema to - // initialize this correctly. If we disable the - // Sema::TentativeAnalysisScopes in SemaOpenACC::CreateInitRecipe, it'll - // emit an error to tell us. However, emitting those errors during - // production is a violation of the standard, so we cannot do them. - cgf.cgm.errorNYI(exprRange, "private default-init recipe"); - } - cgf.emitAutoVarInit(tempDeclEmission); - } else if constexpr (std::is_same_v<RecipeTy, mlir::acc::ReductionRecipeOp>) { + // instead it does it in the 'copy' section. So, only do 'init' here for + // reduction. + if constexpr (std::is_same_v<RecipeTy, mlir::acc::ReductionRecipeOp>) { // Unlike Private, the recipe here is always required as it has to do // init, not just 'default' init.
if (!varRecipe->getInit()) diff --git a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp index f3ec9e1847b00..639320275ab0f 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp @@ -65,6 +65,7 @@ struct HasDtor { // int[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -72,6 +73,7 @@ struct HasDtor { // float[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -79,6 +81,7 @@ struct HasDtor { // NoCopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -86,6 +89,7 @@ struct HasDtor { // CopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_13CopyConstruct : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -93,6 +97,7 @@ struct HasDtor { // NonDefaultCtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_14NonDefaultCtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -100,6 +105,7 @@ struct HasDtor { // HasDtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_7HasDtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. 
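// Note: for these constant-size array cases the recipe only needs the single top-level alloca checked above; the extent (5) is static, so no bounds-derived size operand is expected on the 'openacc.private.init' alloca.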
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c index 5235aeec60713..097005e75dcbe 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct NoCopyConstruct {}; @@ -26,6 +26,7 @@ struct NoCopyConstruct {}; // int[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -33,6 +34,7 @@ struct NoCopyConstruct {}; // float[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -40,6 +42,7 @@ struct NoCopyConstruct {}; // NoCopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp index 12e14fa0f73b0..97399d9d4620e 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp @@ -58,36 +58,42 @@ struct HasDtor { // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_13CopyConstruct : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_14NonDefaultCtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } // // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_7HasDtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp index 0a0552e888721..d4fd4ccc68f7a 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp @@ -65,6 +65,7 @@ struct HasDtor { // int[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -72,6 +73,7 @@ struct HasDtor { // float[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_f : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -79,6 +81,7 @@ struct HasDtor { // NoCopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_15NoCopyConstruct : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -86,6 +89,7 @@ struct HasDtor { // CopyConstruct[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_13CopyConstruct : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -93,6 +97,7 @@ struct HasDtor { // NonDefaultCtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_14NonDefaultCtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -100,6 +105,7 @@ struct HasDtor { // HasDtor[5] with 1 'bound' // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSA5_7HasDtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp index 561bf700f9f95..c62ebe26584b8 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp @@ -13,6 +13,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OneArr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_8CtorDtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -98,6 +99,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_8CtorDtor : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { @@ -214,6 +216,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B][B]) // CHECK-NEXT:acc.private.recipe @privatization__Bcnt3__ZTSA5_A5_A5_8CtorDtor : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { @@ -306,6 +309,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_A5_8CtorDtor : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp index ad33ffd092a22..38df8133a38c0 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp @@ -8,6 +8,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OneArr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_5NoOps : !cir.ptr> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -43,6 +44,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_5NoOps : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -81,6 +83,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B][B]) // CHECK-NEXT:acc.private.recipe @privatization__Bcnt3__ZTSA5_A5_A5_5NoOps : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -94,6 +97,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_A5_5NoOps : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT:} diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp index b3eafc0691790..3d4aaa063a19f 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void do_things(unsigned A, unsigned B) { @@ -6,6 +6,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OneArr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSA5_i : !cir.ptr> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -24,6 +25,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_i : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -44,6 +46,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B][B]) // CHECK-NEXT:acc.private.recipe @privatization__Bcnt3__ZTSA5_A5_A5_i : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT:} @@ -57,6 +60,7 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreeArr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_A5_A5_i : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT:} diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp index be999395385f8..52bcd7cd539f2 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp @@ -13,6 +13,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OnePtr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSP8CtorDtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -62,6 +69,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP8CtorDtor : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
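// Note (worked example): with 8-byte pointers and a 4-byte CtorDtor, the first sized allocation above is UB2 * 8 bytes of 'CtorDtor *' slots and the second is (UB1 * UB2) * 4 bytes of CtorDtor elements, which is where the #cir.int<8> and #cir.int<4> constants come from.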
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -136,6 +157,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP8CtorDtor : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -229,6 +271,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP8CtorDtor : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -250,6 +306,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtr[B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_P8CtorDtor : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
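// Note: the top level here is a constant array of pointers, so it gets no separate bounds-sized allocation; the first upper bound computed above (UB2) only feeds the cumulative element count used for the CtorDtor allocation.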
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -325,6 +392,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrToArrays[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_8CtorDtor : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -399,6 +473,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSA5_PP8CtorDtor : !cir.ptr> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array> x 5>, !cir.ptr> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -492,6 +584,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_PP8CtorDtor : !cir.ptr> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array> x 5>, !cir.ptr> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -512,6 +615,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPA5_8CtorDtor : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// // TODO: Add Init here. 
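// Note: #cir.int<20> above is the byte size of one pointed-to CtorDtor[5] array (5 * 4 bytes); the allocation's element type is the whole array rather than a single CtorDtor.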
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -605,6 +722,19 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPA5_8CtorDtor : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -700,6 +830,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrArrayPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPA5_P8CtorDtor : !cir.ptr x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr x 5>>, !cir.ptr x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] 
{alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -793,6 +941,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrArrayPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_P8CtorDtor : !cir.ptr x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr x 5>>, !cir.ptr x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp index fa00e6aa1fd56..4398216132616 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp @@ -8,6 +8,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OnePtr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSP5NoOps : !cir.ptr> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
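// Note: the {alignment = 8 : i64} vs. {alignment = 4 : i64} attributes fall out of getTypeAlignInChars on the allocated type (a pointer vs. a NoOps element) in makeBoundsAlloca.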
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -26,6 +33,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP5NoOps : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -46,6 +67,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP5NoOps : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i 
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -59,6 +101,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP5NoOps : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -80,6 +136,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtr[B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_P5NoOps : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -102,6 +169,13 @@ void do_things(unsigned A, unsigned B) { ; // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_5NoOps : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -121,6 +195,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSA5_PP5NoOps : !cir.ptr> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array> x 5>, !cir.ptr> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -134,6 +226,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_PP5NoOps : !cir.ptr> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array> x 5>, !cir.ptr> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -154,6 +257,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPA5_5NoOps : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -167,6 +284,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPA5_5NoOps : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -189,6 +320,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrArrayPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPA5_P5NoOps : !cir.ptr x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr x 5>>, !cir.ptr x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: 
Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -202,6 +351,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrArrayPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_P5NoOps : !cir.ptr x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr x 5>>, !cir.ptr x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp index 867aaa61d2991..79692d3468295 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp @@ -6,6 +6,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(OnePtr[A:B]) // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSPi : !cir.ptr> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -24,6 +31,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPi : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -44,6 +65,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPPi : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// 
CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -57,6 +99,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPPi : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -78,6 +134,16 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtr[B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_Pi : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -99,6 +165,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrToArrays[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_i : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -119,6 +192,24 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSA5_PPi : !cir.ptr> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array> x 5>, !cir.ptr> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -132,6 +223,17 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ArrayOfPtrPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSA5_PPi : !cir.ptr> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> x 5>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array> x 5>, !cir.ptr> x 5>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -152,6 +254,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(PtrPtrToArray[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPA5_i : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT:}
@@ -165,6 +281,20 @@ void do_things(unsigned A, unsigned B) {
 #pragma acc parallel private(PtrPtrToArray[B][B])
 // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPA5_i : !cir.ptr>>> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}):
+// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
+// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
+// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
+// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
+// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
+// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
 // TODO: Add Init here.
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: }
@@ -187,6 +317,24 @@ void do_things(unsigned A, unsigned B) {
 #pragma acc parallel private(PtrArrayPtr[B][B][B])
 // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPA5_Pi : !cir.ptr x 5>>> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}):
+// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr x 5>>, !cir.ptr x 5>>>, ["openacc.private.init"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i
+// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i
+// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
+// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
+//
+// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
+// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i
+// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i
+// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i
+// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
 // TODO: Add Init here.
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -201,6 +349,13 @@ void do_things(unsigned A, unsigned B) { // #pragma acc parallel private(PtrArrayPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPA5_Pi : !cir.ptr x 5>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.ptr x 5>>, !cir.ptr x 5>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i +// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i +// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp index cd8e47661cfd6..77ff3571a98b4 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp @@ -22,6 +22,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPP8CtorDtor : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -29,6 +36,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP8CtorDtor : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUNDS1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -42,6 +63,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][A:B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP8CtorDtor : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUNDS1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, 
["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -144,6 +186,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPP8CtorDtor : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -151,6 +200,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP8CtorDtor : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUNDS1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUNDS2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -229,6 +292,13 @@ void do_things(unsigned A, unsigned B) { // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSP8CtorDtor : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_CTORDTOR:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_CTORDTOR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp index 4d91d86c837f4..4822dd70227f8 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp @@ -16,6 +16,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[A]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPP5NoOps : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -23,6 +30,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPP5NoOps : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -34,6 +55,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPP5NoOps : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] 
{alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -57,6 +99,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPP5NoOps : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -64,6 +113,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPP5NoOps : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -85,6 +148,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSP5NoOps : !cir.ptr> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_NOOPS:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_NOOPS]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp index 5c9c17b2f9a2b..ddf25de34f74e 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp @@ -14,6 +14,13 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[A]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPPi : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -21,6 +28,20 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPPi : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} + +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -32,6 +53,27 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(ThreePtr[B][B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt3__ZTSPPPi : !cir.ptr>>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>>, !cir.ptr>>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_PTR_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : 
i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -55,6 +97,13 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__Bcnt1__ZTSPPi : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}): // 'init' section: +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -62,6 +111,21 @@ void do_things(unsigned A, unsigned B) { #pragma acc parallel private(TwoPtr[B][B]) // CHECK-NEXT: acc.private.recipe @privatization__Bcnt2__ZTSPPi : !cir.ptr>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["openacc.private.init"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i +// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i +// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i +// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i +// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// // TODO: Add Init here. 
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: }
@@ -83,6 +147,13 @@ void do_things(unsigned A, unsigned B) {
 // CHECK: acc.private.recipe @privatization__Bcnt1__ZTSPi : !cir.ptr> init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}):
 // 'init' section:
+// CHECK-NEXT: %[[TOP_LEVEL_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["openacc.private.init"]
+//
+// CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
+// CHECK-NEXT: %[[UPPER_BOUND_CAST:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i
+// CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i
+// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT]]) : !u64i
+// CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
 // TODO: Add Init here.
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: }

From ff149531281270d217b80a099232f947be9ac622 Mon Sep 17 00:00:00 2001
From: Maryam Moghadas
Date: Tue, 30 Sep 2025 09:16:43 -0400
Subject: [PATCH 241/878] [Clang][PowerPC] Add __dmr2048 type and DMF crypto builtins (#157152)

Define the __dmr2048 type to represent the DMR pair introduced by the
Dense Math Facility on PowerPC, and add three Clang builtins
corresponding to the DMF cryptography instructions:
__builtin_mma_dmsha2hash
__builtin_mma_dmsha3hash
__builtin_mma_dmxxshapad

The __dmr2048 type is required for the dmsha3hash crypto builtin, and,
as with other PPC MMA and DMR types, its use is strongly restricted.
---
 clang/include/clang/Basic/BuiltinsPPC.def     |   7 +
 clang/include/clang/Basic/PPCTypes.def        |   1 +
 .../include/clang/Serialization/ASTBitCodes.h |   2 +-
 clang/lib/AST/ASTContext.cpp                  |   1 +
 clang/lib/CodeGen/TargetBuiltins/PPC.cpp      |   3 +-
 clang/test/AST/ast-dump-ppc-types.c           |   2 +
 clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c |  73 ++++++++
 .../CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c |  10 +-
 clang/test/CodeGen/PowerPC/ppc-dmf-types.c    | 156 ++++++++++++++++++
 .../test/CodeGenCXX/ppc-mangle-mma-types.cpp  |   3 +
 clang/test/Sema/ppc-dmf-types.c               | 114 +++++++++++--
 .../TypeSystem/Clang/TypeSystemClang.cpp      |   1 +
 12 files changed, 356 insertions(+), 17 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
index db71efc238386..cf8bdd2a429df 100644
--- a/clang/include/clang/Basic/BuiltinsPPC.def
+++ b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -1105,6 +1105,13 @@ UNALIASED_CUSTOM_BUILTIN(mma_disassemble_dmr, "vv*W1024*", false,
 UNALIASED_CUSTOM_BUILTIN(mma_build_dmr, "vW1024*VVVVVVVV", false,
                          "mma,isa-future-instructions")
 
+UNALIASED_CUSTOM_BUILTIN(mma_dmsha2hash, "vW1024*W1024*Ii", true,
+                         "mma,isa-future-instructions")
+UNALIASED_CUSTOM_BUILTIN(mma_dmsha3hash, "vW2048*Ii", true,
+                         "mma,isa-future-instructions")
+UNALIASED_CUSTOM_BUILTIN(mma_dmxxshapad, "vW1024*VIiIiIi", true,
+                         "mma,isa-future-instructions")
+
 // MMA builtins with positive/negative multiply/accumulate.
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV", "mma,paired-vector-memops") diff --git a/clang/include/clang/Basic/PPCTypes.def b/clang/include/clang/Basic/PPCTypes.def index fc4155ca98b2d..9c0fa9198d5b1 100644 --- a/clang/include/clang/Basic/PPCTypes.def +++ b/clang/include/clang/Basic/PPCTypes.def @@ -30,6 +30,7 @@ #endif +PPC_VECTOR_MMA_TYPE(__dmr2048, DMR2048, 2048) PPC_VECTOR_MMA_TYPE(__dmr1024, DMR1024, 1024) PPC_VECTOR_MMA_TYPE(__vector_quad, VectorQuad, 512) PPC_VECTOR_VSX_TYPE(__vector_pair, VectorPair, 256) diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 99864c7373908..5d09d5536e5ab 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1160,7 +1160,7 @@ enum PredefinedTypeIDs { /// /// Type IDs for non-predefined types will start at /// NUM_PREDEF_TYPE_IDs. -const unsigned NUM_PREDEF_TYPE_IDS = 513; +const unsigned NUM_PREDEF_TYPE_IDS = 514; // Ensure we do not overrun the predefined types we reserved // in the enum PredefinedTypeIDs above. diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0fd0e7eb360dd..056bfe36b2a0a 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -3501,6 +3501,7 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, case BuiltinType::VectorQuad: case BuiltinType::VectorPair: case BuiltinType::DMR1024: + case BuiltinType::DMR2048: OS << "?"; return; diff --git a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp index ba65cf1ce9b90..e71dc9ea523a2 100644 --- a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp @@ -1153,7 +1153,8 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, } if (BuiltinID == PPC::BI__builtin_mma_dmmr || BuiltinID == PPC::BI__builtin_mma_dmxor || - BuiltinID == PPC::BI__builtin_mma_disassemble_dmr) { + BuiltinID == PPC::BI__builtin_mma_disassemble_dmr || + BuiltinID == PPC::BI__builtin_mma_dmsha2hash) { Address Addr = EmitPointerWithAlignment(E->getArg(1)); Ops[1] = Builder.CreateLoad(Addr); } diff --git a/clang/test/AST/ast-dump-ppc-types.c b/clang/test/AST/ast-dump-ppc-types.c index 1c860c268e0ec..6112af5ebf92c 100644 --- a/clang/test/AST/ast-dump-ppc-types.c +++ b/clang/test/AST/ast-dump-ppc-types.c @@ -17,6 +17,8 @@ // are correctly defined. We also added checks on a couple of other targets to // ensure the types are target-dependent. 
+// CHECK: TypedefDecl {{.*}} implicit __dmr2048 '__dmr2048' +// CHECK: `-BuiltinType {{.*}} '__dmr2048' // CHECK: TypedefDecl {{.*}} implicit __dmr1024 '__dmr1024' // CHECK: `-BuiltinType {{.*}} '__dmr1024' // CHECK: TypedefDecl {{.*}} implicit __vector_quad '__vector_quad' diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c index f62656757c8c5..d8306a74ad2e9 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c @@ -208,6 +208,75 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, __builtin_mma_build_dmr((__dmr1024*)res2, vv, vv, vv, vv, vv, vv, vv, vv); __builtin_mma_disassemble_dmr(res1, (__dmr1024*)p1); } + +// CHECK-LABEL: define dso_local void @test_dmsha2hash( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP1:%.*]], ptr noundef readonly captures(none) [[VDMRP2:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: ret void +// +// AIX-LABEL: define void @test_dmsha2hash( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP1:%.*]], ptr noundef readonly captures(none) [[VDMRP2:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// +void test_dmsha2hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned char *resp) { + __dmr1024 vdmr1 = *((__dmr1024 *)vdmrp1); + __dmr1024 vdmr2 = *((__dmr1024 *)vdmrp2); + __builtin_mma_dmsha2hash(&vdmr1, &vdmr2, 1); + *((__dmr1024 *)resp) = vdmr1; +} + +// CHECK-LABEL: define dso_local void @test_dmsha3hash( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 4) +// CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA9]] +// CHECK-NEXT: ret void +// +// AIX-LABEL: define void @test_dmsha3hash( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA9:![0-9]+]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> 
[[TMP0]], i32 4) +// AIX-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA9]] +// AIX-NEXT: ret void +// +void test_dmsha3hash(unsigned char *vdmrpp, unsigned char *resp) { + __dmr2048 vdmrp = *((__dmr2048 *)vdmrpp); + __builtin_mma_dmsha3hash(&vdmrp, 4); + *((__dmr2048 *)resp) = vdmrp; +} + +// CHECK-LABEL: define dso_local void @test_dmxxshapad( +// CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// CHECK-NEXT: ret void +// +// AIX-LABEL: define void @test_dmxxshapad( +// AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { +// AIX-NEXT: [[ENTRY:.*:]] +// AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5) +// AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA6]] +// AIX-NEXT: ret void +// +void test_dmxxshapad(unsigned char *vdmrp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __builtin_mma_dmxxshapad(&vdmr, vc, 2, 1, 5); + *((__dmr1024 *)resp) = vdmr; +} //. // CHECK: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} @@ -216,6 +285,8 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, // CHECK: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} // CHECK: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} // CHECK: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0} +// CHECK: [[__DMR2048_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK: [[META10]] = !{!"__dmr2048", [[META4]], i64 0} //. // AIX: [[__VECTOR_PAIR_TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // AIX: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} @@ -224,4 +295,6 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, // AIX: [[__DMR1024_TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} // AIX: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} // AIX: [[CHAR_TBAA8]] = !{[[META4]], [[META4]], i64 0} +// AIX: [[__DMR2048_TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// AIX: [[META10]] = !{!"__dmr2048", [[META4]], i64 0} //. 
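
Taken together, the prototype strings in BuiltinsPPC.def ("vW1024*W1024*Ii", "vW2048*Ii", "vW1024*VIiIiIi") and the CodeGen tests above pin down the C-level shape of the three builtins: dmsha2hash updates one __dmr1024 from a second, dmsha3hash operates in place on a __dmr2048 pair, and dmxxshapad pads a __dmr1024 from a vector and three immediates. The sketch below is illustrative only: the function and variable names (sha_step, accp, msgp, pairp) are invented here, not part of the patch, and it assumes a clang carrying these patches targeting -mcpu=future.

// Minimal usage sketch, mirroring the patterns in builtins-ppc-dmf.c above.
// Assumed build: clang -c sketch.c --target=powerpc64le-linux-gnu -mcpu=future
void sha_step(unsigned char *accp, unsigned char *msgp,
              unsigned char *pairp, vector unsigned char vc) {
  __dmr1024 acc  = *(__dmr1024 *)accp;   // 1024-bit dense-math register
  __dmr1024 msg  = *(__dmr1024 *)msgp;
  __dmr2048 pair = *(__dmr2048 *)pairp;  // DMR pair, required for SHA3
  __builtin_mma_dmsha2hash(&acc, &msg, 1);     // last operand: constant int
  __builtin_mma_dmsha3hash(&pair, 4);          // in place on the 2048-bit pair
  __builtin_mma_dmxxshapad(&acc, vc, 2, 1, 5); // three constant-int operands
  *(__dmr1024 *)accp  = acc;                   // DMR values move via pointers
  *(__dmr2048 *)pairp = pair;
}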
diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
index 06497555b840f..66b9d797c65d3 100644
--- a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
+++ b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c
@@ -9,7 +9,9 @@
 // RUN: FileCheck --check-prefix=ISA_FUTURE %s
 
 //__attribute__((target("no-mma")))
-void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) {
+__attribute__((target("no-mma")))
+void test_mma(unsigned char *vdmrpp, unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) {
+ __dmr2048 vdmrpair = *((__dmr2048 *)vdmrpp);
 __dmr1024 vdmr = *((__dmr1024 *)vdmrp);
 __vector_pair vp = *((__vector_pair *)vpp);
 __builtin_mma_dmxvi8gerx4(&vdmr, vp, vc);
@@ -23,6 +25,9 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
 __builtin_mma_dmxor(&vdmr, (__dmr1024*)vpp);
 __builtin_mma_build_dmr(&vdmr, vc, vc, vc, vc, vc, vc, vc, vc);
 __builtin_mma_disassemble_dmr(vdmrp, &vdmr);
+ __builtin_mma_dmsha2hash(&vdmr, &vdmr, 0);
+ __builtin_mma_dmsha3hash(&vdmrpair, 0);
+ __builtin_mma_dmxxshapad(&vdmr, vc, 0, 0, 0);
 
 // CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops
 // CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops
@@ -35,6 +40,9 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
 // ISA_FUTURE: error: '__builtin_mma_dmxor' needs target feature mma,isa-future-instructions
 // ISA_FUTURE: error: '__builtin_mma_build_dmr' needs target feature mma,isa-future-instructions
 // ISA_FUTURE: error: '__builtin_mma_disassemble_dmr' needs target feature mma,isa-future-instructions
+// CHECK: error: '__builtin_mma_dmsha2hash' needs target feature mma,isa-future-instructions
+// CHECK: error: '__builtin_mma_dmsha3hash' needs target feature mma,isa-future-instructions
+// CHECK: error: '__builtin_mma_dmxxshapad' needs target feature mma,isa-future-instructions
 
 // DMF VSX Vector bfloat16 GER 2x builtins.
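The error test above also documents the feature model: each new DMR builtin is gated on the mma,isa-future-instructions feature pair, and the test forces a failure by attaching target("no-mma") to the calling function. A plausible way for user code to opt a single function in, rather than compiling the whole translation unit for a future CPU, is a per-function target attribute; the feature string below just mirrors the diagnostics above, and whether a given Clang accepts it through the attribute is an assumption of this sketch.

__attribute__((target("mma,isa-future-instructions")))
void guarded_dmr_xor(__dmr1024 *acc, unsigned char *src) {
  /* Compiles only when the required features are in effect for this
     function; elsewhere the call would produce the errors shown above. */
  __builtin_mma_dmxor(acc, (__dmr1024 *)src);
}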
diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-types.c b/clang/test/CodeGen/PowerPC/ppc-dmf-types.c
index 9dff289370eb5..fbbe62133763e 100644
--- a/clang/test/CodeGen/PowerPC/ppc-dmf-types.c
+++ b/clang/test/CodeGen/PowerPC/ppc-dmf-types.c
@@ -2,6 +2,162 @@
 // RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu future \
 // RUN: -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_dmrp_copy(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[PTR1_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[PTR2_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[PTR1:%.*]], ptr [[PTR1_ADDR]], align 8
+// CHECK-NEXT: store ptr [[PTR2:%.*]], ptr [[PTR2_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR1_ADDR]], align 8
+// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP0]], i64 2
+// CHECK-NEXT: [[TMP1:%.*]] = load <2048 x i1>, ptr [[ADD_PTR]], align 256
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PTR2_ADDR]], align 8
+// CHECK-NEXT: [[ADD_PTR1:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP2]], i64 1
+// CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[ADD_PTR1]], align 256
+// CHECK-NEXT: ret void
+//
+void test_dmrp_copy(__dmr2048 *ptr1, __dmr2048 *ptr2) {
+ *(ptr2 + 1) = *(ptr1 + 2);
+}
+
+// CHECK-LABEL: @test_dmrp_typedef(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[INP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[OUTP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRPIN:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRPOUT:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[INP:%.*]], ptr [[INP_ADDR]], align 8
+// CHECK-NEXT: store ptr [[OUTP:%.*]], ptr [[OUTP_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[INP_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPIN]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUTP_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[VDMRPOUT]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VDMRPIN]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load <2048 x i1>, ptr [[TMP2]], align 256
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VDMRPOUT]], align 8
+// CHECK-NEXT: store <2048 x i1> [[TMP3]], ptr [[TMP4]], align 256
+// CHECK-NEXT: ret void
+//
+void test_dmrp_typedef(int *inp, int *outp) {
+ __dmr2048 *vdmrpin = (__dmr2048 *)inp;
+ __dmr2048 *vdmrpout = (__dmr2048 *)outp;
+ *vdmrpout = *vdmrpin;
+}
+
+// CHECK-LABEL: @test_dmrp_arg(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VDMRP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[VDMRP:%.*]], ptr [[VDMRP_ADDR]], align 8
+// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRP_ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[TMP1]], align 256
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VDMRPP]], align 8
+// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[TMP3]], align 256
+// CHECK-NEXT: ret void
+//
+void test_dmrp_arg(__dmr2048 *vdmrp, int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ *vdmrpp = *vdmrp;
+}
+
+// CHECK-LABEL: @test_dmrp_const_arg(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VDMRP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[VDMRP:%.*]], ptr [[VDMRP_ADDR]], align 8
+// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRP_ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[TMP1]], align 256
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VDMRPP]], align 8
+// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[TMP3]], align 256
+// CHECK-NEXT: ret void
+//
+void test_dmrp_const_arg(const __dmr2048 *const vdmrp, int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ *vdmrpp = *vdmrp;
+}
+
+// CHECK-LABEL: @test_dmrp_array_arg(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VDMRPA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[VDMRPA:%.*]], ptr [[VDMRPA_ADDR]], align 8
+// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRPA_ADDR]], align 8
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP1]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[ARRAYIDX]], align 256
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VDMRPP]], align 8
+// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[TMP3]], align 256
+// CHECK-NEXT: ret void
+//
+void test_dmrp_array_arg(__dmr2048 vdmrpa[], int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ *vdmrpp = vdmrpa[0];
+}
+
+// CHECK-LABEL: @test_dmrp_ret_const(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRPP]], align 8
+// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds <2048 x i1>, ptr [[TMP1]], i64 2
+// CHECK-NEXT: ret ptr [[ADD_PTR]]
+//
+const __dmr2048 *test_dmrp_ret_const(int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ return vdmrpp + 2;
+}
+
+// CHECK-LABEL: @test_dmrp_sizeof_alignof(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[PTR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRPP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[VDMRP:%.*]] = alloca <2048 x i1>, align 256
+// CHECK-NEXT: [[SIZET:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ALIGNT:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SIZEV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[ALIGNV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[PTR:%.*]], ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[VDMRPP]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VDMRPP]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load <2048 x i1>, ptr [[TMP1]], align 256
+// CHECK-NEXT: store <2048 x i1> [[TMP2]], ptr [[VDMRP]], align 256
+// CHECK-NEXT: store i32 256, ptr [[SIZET]], align 4
+// CHECK-NEXT: store i32 256, ptr [[ALIGNT]], align 4
+// CHECK-NEXT: store i32 256, ptr [[SIZEV]], align 4
+// CHECK-NEXT: store i32 256, ptr [[ALIGNV]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SIZET]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ALIGNT]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP3]], [[TMP4]]
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SIZEV]], align 4
+// CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[TMP5]]
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ALIGNV]], align 4
+// CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP6]]
+// CHECK-NEXT: ret i32 [[ADD2]]
+//
+int test_dmrp_sizeof_alignof(int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ __dmr2048 vdmrp = *vdmrpp;
+ unsigned sizet = sizeof(__dmr2048);
+ unsigned alignt = __alignof__(__dmr2048);
+ unsigned sizev = sizeof(vdmrp);
+ unsigned alignv = __alignof__(vdmrp);
+ return sizet + alignt + sizev + alignv;
+}
 
 // CHECK-LABEL: @test_dmr_copy(
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp b/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp
index 1e213e7f75127..6b792dceba2c6 100644
--- a/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp
+++ b/clang/test/CodeGenCXX/ppc-mangle-mma-types.cpp
@@ -7,6 +7,9 @@
 // RUN: %clang_cc1 -triple powerpc64le-linux-unknown -target-cpu pwr8 %s \
 // RUN: -emit-llvm -o - | FileCheck %s
 
+// CHECK: _Z1fPu9__dmr2048
+void f(__dmr2048 *vdmrp) {}
+
 // CHECK: _Z2f0Pu9__dmr1024
 void f0(__dmr1024 *vdmr) {}
 
diff --git a/clang/test/Sema/ppc-dmf-types.c b/clang/test/Sema/ppc-dmf-types.c
index b3da72df25081..88926acf2d3fb 100644
--- a/clang/test/Sema/ppc-dmf-types.c
+++ b/clang/test/Sema/ppc-dmf-types.c
@@ -12,47 +12,86 @@
 
 // typedef
 typedef __dmr1024 dmr_t;
+typedef __dmr2048 dmrp_t;
 
 // function argument
-void testDmrArg1(__dmr1024 vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
- __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+void testDmrArg1(dmr_t vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmr_t *vdmrp = (dmr_t *)ptr;
 *vdmrp = vdmr;
 }
 
-void testDmrArg2(const __dmr1024 vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
- __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+void testDmrArg2(const dmr_t vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmr_t *vdmrp = (dmr_t *)ptr;
 *vdmrp = vdmr;
 }
 
 void testDmrArg3(const dmr_t vdmr, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
- __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+ dmr_t *vdmrp = (dmr_t *)ptr;
 *vdmrp = vdmr;
 }
 
+void testDmrPArg1(const dmrp_t vdmrp, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmrp_t *vdmrpp = (dmrp_t *)ptr;
+ *vdmrpp = vdmrp;
+}
+
+void testDmrPArg2(const dmrp_t vdmrp, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmrp_t *vdmrpp = (dmrp_t *)ptr;
+ *vdmrpp = vdmrp;
+}
+
+void testDmrPArg3(const dmrp_t vdmrp, int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmrp_t *vdmrpp = (dmrp_t *)ptr;
+ *vdmrpp = vdmrp;
+}
+
 // function return
-__dmr1024 testDmrRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
- __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+dmr_t testDmrRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmr_t *vdmrp = (dmr_t *)ptr;
 return *vdmrp; // expected-error {{invalid use of PPC MMA type}}
 }
 
 const dmr_t testDmrRet4(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
- __dmr1024 *vdmrp = (__dmr1024 *)ptr;
+ dmr_t *vdmrp = (dmr_t *)ptr;
 return *vdmrp; // expected-error {{invalid use of PPC MMA type}}
 }
 
+dmrp_t testDmrPRet1(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmrp_t *vdmrpp = (dmrp_t *)ptr;
+ return *vdmrpp; // expected-error {{invalid use of PPC MMA type}}
+}
+
+const dmrp_t testDmrPRet4(int *ptr) { // expected-error {{invalid use of PPC MMA type}}
+ dmrp_t *vdmrpp = (dmrp_t *)ptr;
+ return *vdmrpp; // expected-error {{invalid use of PPC MMA type}}
+}
+
 // global
-__dmr1024 globalvdmr; // expected-error {{invalid use of PPC MMA type}}
-const __dmr1024 globalvdmr2; // expected-error {{invalid use of PPC MMA type}}
-__dmr1024 *globalvdmrp;
-const __dmr1024 *const globalvdmrp2;
+dmr_t globalvdmr; // expected-error {{invalid use of PPC MMA type}}
+const dmr_t globalvdmr2; // expected-error {{invalid use of PPC MMA type}}
+dmr_t *globalvdmrp;
+const dmr_t *const globalvdmrp2;
 dmr_t globalvdmr_t; // expected-error {{invalid use of PPC MMA type}}
+dmrp_t globalvdmrp; // expected-error {{invalid use of PPC MMA type}}
+const dmrp_t globalvdmrp2; // expected-error {{invalid use of PPC MMA type}}
+dmrp_t *globalvdmrpp;
+const dmrp_t *const globalvdmrpp2;
+dmrp_t globalvdmrp_t; // expected-error {{invalid use of PPC MMA type}}
+
 // struct field
 struct TestDmrStruct {
 int a;
 float b;
- __dmr1024 c; // expected-error {{invalid use of PPC MMA type}}
- __dmr1024 *vq;
+ dmr_t c; // expected-error {{invalid use of PPC MMA type}}
+ dmr_t *vq;
 };
 
+struct TestDmrPStruct {
+ int a;
+ float b;
+ dmrp_t c; // expected-error {{invalid use of PPC MMA type}}
+ dmrp_t *vq;
+};
+
 // operators
@@ -101,3 +140,50 @@ void testDmrOperators4(int v, void *ptr) {
 __dmr1024 vdmr1 = (__dmr1024)v; // expected-error {{used type '__dmr1024' where arithmetic or pointer type is required}}
 __dmr1024 vdmr2 = (__dmr1024)vdmrp; // expected-error {{used type '__dmr1024' where arithmetic or pointer type is required}}
 }
+
+int testDmrPOperators1(int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ __dmr2048 vdmrp1 = *(vdmrpp + 0);
+ __dmr2048 vdmrp2 = *(vdmrpp + 1);
+ __dmr2048 vdmrp3 = *(vdmrpp + 2);
+ if (vdmrp1) // expected-error {{statement requires expression of scalar type ('__dmr2048' invalid)}}
+ *(vdmrpp + 10) = vdmrp1;
+ if (!vdmrp2) // expected-error {{invalid argument type '__dmr2048' to unary expression}}
+ *(vdmrpp + 11) = vdmrp3;
+ int c1 = vdmrp1 && vdmrp2; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}}
+ int c2 = vdmrp2 == vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}}
+ int c3 = vdmrp2 < vdmrp1; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}}
+ return c1 || c2 || c3;
+}
+
+void testDmrPOperators2(int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ __dmr2048 vdmrp1 = *(vdmrpp + 0);
+ __dmr2048 vdmrp2 = *(vdmrpp + 1);
+ __dmr2048 vdmrp3 = *(vdmrpp + 2);
+ vdmrp1 = -vdmrp1; // expected-error {{invalid argument type '__dmr2048' to unary expression}}
+ vdmrp2 = vdmrp1 + vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}}
+ vdmrp2 = vdmrp2 * vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}}
+ vdmrp3 = vdmrp3 | vdmrp3; // expected-error {{invalid operands to binary expression ('__dmr2048' and '__dmr2048')}}
+ vdmrp3 = vdmrp3 << 2; // expected-error {{invalid operands to binary expression ('__dmr2048' and 'int')}}
+ *(vdmrpp + 10) = vdmrp1;
+ *(vdmrpp + 11) = vdmrp2;
+ *(vdmrpp + 12) = vdmrp3;
+}
+
+
+vector unsigned char testDmrPOperators3(int *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ __dmr2048 vdmrp1 = *(vdmrpp + 0);
+ __dmr2048 vdmrp2 = *(vdmrpp + 1);
+ __dmr2048 vdmrp3 = *(vdmrpp + 2);
+ vdmrp1 ? *(vdmrpp + 10) = vdmrp2 : *(vdmrpp + 11) = vdmrp3; // expected-error {{used type '__dmr2048' where arithmetic or pointer type is required}}
+ vdmrp2 = vdmrp3;
+ return vdmrp2[1]; // expected-error {{subscripted value is not an array, pointer, or vector}}
+}
+
+void testDmrPOperators4(int v, void *ptr) {
+ __dmr2048 *vdmrpp = (__dmr2048 *)ptr;
+ __dmr2048 vdmrp1 = (__dmr2048)v; // expected-error {{used type '__dmr2048' where arithmetic or pointer type is required}}
+ __dmr2048 vdmrp2 = (__dmr2048)vdmrpp; // expected-error {{used type '__dmr2048' where arithmetic or pointer type is required}}
+}
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 1948f51c3f2e1..a5aaf1f9cb5af 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5033,6 +5033,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type,
 case clang::BuiltinType::VectorPair:
 case clang::BuiltinType::VectorQuad:
 case clang::BuiltinType::DMR1024:
+ case clang::BuiltinType::DMR2048:
 break;
 
 // ARM -- Scalable Vector Extension

From 8ae0a20f533cc5fe8a48798159d0a38b722082b5 Mon Sep 17 00:00:00 2001
From: Liao Chunyu
Date: Tue, 30 Sep 2025 08:04:10 -0400
Subject: [PATCH 242/878] [RISCV][NFC] Update ratified extensions list in riscv-target-features.c

---
 .../test/Preprocessor/riscv-target-features.c | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index 4090f3de3075d..71d8453cdd655 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -21,6 +21,8 @@
 // CHECK-NOT: __riscv_mul {{.*$}}
 // CHECK-NOT: __riscv_muldiv {{.*$}}
 // CHECK-NOT: __riscv_q {{.*$}}
+// CHECK-NOT: __riscv_sdext{{.*$}}
+// CHECK-NOT: __riscv_sdtrig{{.*$}}
 // CHECK-NOT: __riscv_sha {{.*$}}
 // CHECK-NOT: __riscv_shcounterenw {{.*$}}
 // CHECK-NOT: __riscv_shgatpa {{.*$}}
@@ -33,8 +35,11 @@
 // CHECK-NOT: __riscv_smcdeleg {{.*$}}
 // CHECK-NOT: __riscv_smcntrpmf {{.*$}}
 // CHECK-NOT: __riscv_smcsrind {{.*$}}
+// CHECK-NOT: __riscv_smctr{{.*$}}
 // CHECK-NOT: __riscv_smdbltrp {{.*$}}
 // CHECK-NOT: __riscv_smepmp {{.*$}}
+// CHECK-NOT: __riscv_smmpm{{.*$}}
+// CHECK-NOT: __riscv_smnpm{{.*$}}
 // CHECK-NOT: __riscv_smrnmi {{.*$}}
 // CHECK-NOT: __riscv_smstateen {{.*$}}
 // CHECK-NOT: __riscv_ssaia {{.*$}}
@@ -43,7 +48,10 @@
 // CHECK-NOT: __riscv_sscofpmf {{.*$}}
 // CHECK-NOT: __riscv_sscounterenw {{.*$}}
 // CHECK-NOT: __riscv_sscsrind {{.*$}}
+// CHECK-NOT: __riscv_ssctr{{.*$}}
 // CHECK-NOT: __riscv_ssdbltrp {{.*$}}
+// CHECK-NOT: __riscv_ssnpm{{.*$}}
+// CHECK-NOT: __riscv_sspm{{.*$}}
 // CHECK-NOT: __riscv_ssqosid{{.*$}}
 // CHECK-NOT: __riscv_ssstateen {{.*$}}
 // CHECK-NOT: __riscv_ssstrict {{.*$}}
@@ -51,6 +59,7 @@
 // CHECK-NOT: __riscv_sstvala {{.*$}}
 // CHECK-NOT: __riscv_sstvecd {{.*$}}
 // CHECK-NOT: __riscv_ssu64xl {{.*$}}
+// CHECK-NOT: __riscv_supm{{.*$}}
 // CHECK-NOT: __riscv_svade {{.*$}}
 // CHECK-NOT: __riscv_svadu {{.*$}}
 // CHECK-NOT: __riscv_svbare {{.*$}}
@@ -91,6 +100,7 @@
 // CHECK-NOT: __riscv_zcmt {{.*$}}
 // CHECK-NOT: __riscv_zdinx {{.*$}}
 // CHECK-NOT: __riscv_zfa {{.*$}}
+// CHECK-NOT: __riscv_zfbfmin {{.*$}}
 // CHECK-NOT: __riscv_zfh {{.*$}}
 // CHECK-NOT: __riscv_zfhmin {{.*$}}
 // CHECK-NOT: __riscv_zfinx {{.*$}}
@@ -126,6 +136,7 @@
 // CHECK-NOT: __riscv_zksh {{.*$}}
 // CHECK-NOT: __riscv_zkt {{.*$}}
 // CHECK-NOT: __riscv_zmmul {{.*$}}
+// CHECK-NOT: __riscv_ztso {{.*$}}
 // CHECK-NOT: __riscv_zvbb {{.*$}}
 // CHECK-NOT: __riscv_zvbc {{.*$}}
 // CHECK-NOT: __riscv_zve32f {{.*$}}
@@ -133,6 +144,8 @@
 // CHECK-NOT: __riscv_zve64d {{.*$}}
 // CHECK-NOT: __riscv_zve64f {{.*$}}
 // CHECK-NOT: __riscv_zve64x {{.*$}}
+// CHECK-NOT: __riscv_zvfbfmin {{.*$}}
+// CHECK-NOT: __riscv_zvfbfwma {{.*$}}
 // CHECK-NOT: __riscv_zvfh {{.*$}}
 // CHECK-NOT: __riscv_zvkb {{.*$}}
 // CHECK-NOT: __riscv_zvkg {{.*$}}
@@ -163,25 +176,12 @@
 
 // Experimental extensions
-// CHECK-NOT: __riscv_sdext{{.*$}}
-// CHECK-NOT: __riscv_sdtrig{{.*$}}
-// CHECK-NOT: __riscv_smctr{{.*$}}
-// CHECK-NOT: __riscv_smmpm{{.*$}}
-// CHECK-NOT: __riscv_smnpm{{.*$}}
-// CHECK-NOT: __riscv_ssctr{{.*$}}
-// CHECK-NOT: __riscv_ssnpm{{.*$}}
-// CHECK-NOT: __riscv_sspm{{.*$}}
-// CHECK-NOT: __riscv_supm{{.*$}}
 // CHECK-NOT: __riscv_zalasr {{.*$}}
-// CHECK-NOT: __riscv_zfbfmin {{.*$}}
 // CHECK-NOT: __riscv_zicfilp {{.*$}}
 // CHECK-NOT: __riscv_zicfiss {{.*$}}
-// CHECK-NOT: __riscv_ztso {{.*$}}
 // CHECK-NOT: __riscv_zvbc32e {{.*$}}
 // CHECK-NOT: __riscv_zvfbfa {{.*$}}
 // CHECK-NOT: __riscv_zvfofp8min {{.*$}}
-// CHECK-NOT: __riscv_zvfbfmin {{.*$}}
-// CHECK-NOT: __riscv_zvfbfwma {{.*$}}
 // CHECK-NOT: __riscv_zvkgs {{.*$}}
 // CHECK-NOT: __riscv_zvqdotq {{.*$}}

From 5e4eb334afd2e3dc12f9bd53dd9d92c12b40b164 Mon Sep 17 00:00:00 2001
From: Mikhail Gudim
Date: Tue, 30 Sep 2025 09:23:16 -0400
Subject: [PATCH 243/878] [SLPVectorizer] Remove `align 16` in a test. (#161251)

It is not necessary.
---
 .../RISCV/basic-strided-loads.ll | 558 +++++++++---------
 1 file changed, 279 insertions(+), 279 deletions(-)

diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
index 645dbc49269f0..4f52227c6511e 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll
@@ -7,8 +7,8 @@ define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) {
 ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 16
-; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 1
+; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
@@ -28,22 +28,22 @@ define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) {
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -62,22 +62,22 @@ define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) {
 %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
- store i8 %load0, ptr %gep_s0, align 16
- store i8 %load1, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load0, ptr %gep_s0, align 1
+ store i8 %load1, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }
@@ -87,9 +87,9 @@ define void @const_stride_1_with_reordering(ptr %pl, ptr %ps) {
 ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 1
 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
@@ -109,22 +109,22 @@ define void @const_stride_1_with_reordering(ptr %pl, ptr %ps) {
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -144,22 +144,22 @@ define void @const_stride_1_with_reordering(ptr %pl, ptr %ps) {
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
 
 ; NOTE: value from %load1 in stored in %gep_s0
- store i8 %load1, ptr %gep_s0, align 16
- store i8 %load0, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load1, ptr %gep_s0, align 1
+ store i8 %load0, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }
@@ -170,9 +170,9 @@ define void @const_stride_2_no_reordering(ptr %pl, ptr %ps) {
 ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 16, <31 x i1> , <31 x i8> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 1, <31 x i1> , <31 x i8> poison)
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <31 x i8> [[TMP2]], <31 x i8> poison, <16 x i32>
-; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
@@ -192,22 +192,22 @@ define void @const_stride_2_no_reordering(ptr %pl, ptr %ps) {
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 28
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 30
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -226,22 +226,22 @@ define void @const_stride_2_no_reordering(ptr %pl, ptr %ps) {
 %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
- store i8 %load0, ptr %gep_s0, align 16
- store i8 %load1, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load0, ptr %gep_s0, align 1
+ store i8 %load1, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }
@@ -251,10 +251,10 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) {
 ; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 16, <31 x i1> , <31 x i8> poison)
+; CHECK-NEXT: [[TMP1:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 1, <31 x i1> , <31 x i8> poison)
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32>
-; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
@@ -274,22 +274,22 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) {
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 28
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 30
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -308,22 +308,22 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) {
 %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
- store i8 %load1, ptr %gep_s0, align 16
- store i8 %load0, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load1, ptr %gep_s0, align 1
+ store i8 %load0, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }
@@ -335,8 +335,8 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]]
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16)
-; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16)
+; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %stride0 = mul nsw i64 %stride, 0
@@ -373,22 +373,22 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 %stride14
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 %stride15
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -407,22 +407,22 @@ define void @rt_stride_1_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
- store i8 %load0, ptr %gep_s0, align 16
- store i8 %load1, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load0, ptr %gep_s0, align 1
+ store i8 %load1, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }
@@ -434,9 +434,9 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[STRIDE0]]
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
 ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16)
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 1 [[GEP_L0]], i64 [[TMP1]], <16 x i1> splat (i1 true), i32 16)
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %stride0 = mul nsw i64 %stride, 0
@@ -473,22 +473,22 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) {
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 %stride14
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 %stride15
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -507,22 +507,22 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) {
 %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
- store i8 %load1, ptr %gep_s0, align 16
- store i8 %load0, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load1, ptr %gep_s0, align 1
+ store i8 %load0, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }
@@ -531,9 +531,9 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0
 ; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
-; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 %gep_l0, i64 8, <4 x i1> splat (i1 true), i32 4)
+; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 8, <4 x i1> splat (i1 true), i32 4)
 ; %bitcast_ = bitcast <4 x i32> %strided_load to <16 x i8>
-; store <16 x i8> %bitcast_, ptr %gep_s0, align 16
+; store <16 x i8> %bitcast_, ptr %gep_s0, align 1
 ; ret void
 ; }
 define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
@@ -541,9 +541,9 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
 ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = call <28 x i8> @llvm.masked.load.v28i8.p0(ptr [[GEP_L0]], i32 16, <28 x i1> , <28 x i8> poison)
+; CHECK-NEXT: [[TMP1:%.*]] = call <28 x i8> @llvm.masked.load.v28i8.p0(ptr [[GEP_L0]], i32 1, <28 x i1> , <28 x i8> poison)
 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <28 x i8> [[TMP1]], <28 x i8> poison, <16 x i32>
-; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
@@ -563,22 +563,22 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 26
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 27
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -597,22 +597,22 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
 %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
- store i8 %load0, ptr %gep_s0, align 16
- store i8 %load1, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load0, ptr %gep_s0, align 1
+ store i8 %load1, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }
@@ -621,9 +621,9 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps)
 ; define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0
 ; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
-; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 %gep_l0, i64 %stride, <4 x i1> splat (i1 true), i32 4)
+; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 %stride, <4 x i1> splat (i1 true), i32 4)
 ; %bitcast_ = bitcast <4 x i32> %strided_load to <16 x i8>
-; store <16 x i8> %bitcast_, ptr %gep_s0, align 16
+; store <16 x i8> %bitcast_, ptr %gep_s0, align 1
 ; ret void
 ; }
 define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
@@ -638,10 +638,10 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]]
 ; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]]
 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32>
 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32>
@@ -649,7 +649,7 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32>
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32>
 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32>
-; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1
 ; CHECK-NEXT: ret void
 ;
 %offset0 = mul nsw i64 %stride, 0
@@ -686,22 +686,22 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 %offset14
 %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 %offset15
- %load0 = load i8, ptr %gep_l0 , align 16
- %load1 = load i8, ptr %gep_l1 , align 16
- %load2 = load i8, ptr %gep_l2 , align 16
- %load3 = load i8, ptr %gep_l3 , align 16
- %load4 = load i8, ptr %gep_l4 , align 16
- %load5 = load i8, ptr %gep_l5 , align 16
- %load6 = load i8, ptr %gep_l6 , align 16
- %load7 = load i8, ptr %gep_l7 , align 16
- %load8 = load i8, ptr %gep_l8 , align 16
- %load9 = load i8, ptr %gep_l9 , align 16
- %load10 = load i8, ptr %gep_l10, align 16
- %load11 = load i8, ptr %gep_l11, align 16
- %load12 = load i8, ptr %gep_l12, align 16
- %load13 = load i8, ptr %gep_l13, align 16
- %load14 = load i8, ptr %gep_l14, align 16
- %load15 = load i8, ptr %gep_l15, align 16
+ %load0 = load i8, ptr %gep_l0 , align 1
+ %load1 = load i8, ptr %gep_l1 , align 1
+ %load2 = load i8, ptr %gep_l2 , align 1
+ %load3 = load i8, ptr %gep_l3 , align 1
+ %load4 = load i8, ptr %gep_l4 , align 1
+ %load5 = load i8, ptr %gep_l5 , align 1
+ %load6 = load i8, ptr %gep_l6 , align 1
+ %load7 = load i8, ptr %gep_l7 , align 1
+ %load8 = load i8, ptr %gep_l8 , align 1
+ %load9 = load i8, ptr %gep_l9 , align 1
+ %load10 = load i8, ptr %gep_l10, align 1
+ %load11 = load i8, ptr %gep_l11, align 1
+ %load12 = load i8, ptr %gep_l12, align 1
+ %load13 = load i8, ptr %gep_l13, align 1
+ %load14 = load i8, ptr %gep_l14, align 1
+ %load15 = load i8, ptr %gep_l15, align 1
 %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
 %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
@@ -720,22 +720,22 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) {
 %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
 %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
- store i8 %load0, ptr %gep_s0, align 16
- store i8 %load1, ptr %gep_s1, align 16
- store i8 %load2, ptr %gep_s2, align 16
- store i8 %load3, ptr %gep_s3, align 16
- store i8 %load4, ptr %gep_s4, align 16
- store i8 %load5, ptr %gep_s5, align 16
- store i8 %load6, ptr %gep_s6, align 16
- store i8 %load7, ptr %gep_s7, align 16
- store i8 %load8, ptr %gep_s8, align 16
- store i8 %load9, ptr %gep_s9, align 16
- store i8 %load10, ptr %gep_s10, align 16
- store i8 %load11, ptr %gep_s11, align 16
- store i8 %load12, ptr %gep_s12, align 16
- store i8 %load13, ptr %gep_s13, align 16
- store i8 %load14, ptr %gep_s14, align 16
- store i8 %load15, ptr %gep_s15, align 16
+ store i8 %load0, ptr %gep_s0, align 1
+ store i8 %load1, ptr %gep_s1, align 1
+ store i8 %load2, ptr %gep_s2, align 1
+ store i8 %load3, ptr %gep_s3, align 1
+ store i8 %load4, ptr %gep_s4, align 1
+ store i8 %load5, ptr %gep_s5, align 1
+ store i8 %load6, ptr %gep_s6, align 1
+ store i8 %load7, ptr %gep_s7, align 1
+ store i8 %load8, ptr %gep_s8, align 1
+ store i8 %load9, ptr %gep_s9, align 1
+ store i8 %load10, ptr %gep_s10, align 1
+ store i8 %load11, ptr %gep_s11, align 1
+ store i8 %load12, ptr %gep_s12, align 1
+ store i8 %load13, ptr %gep_s13, align 1
+ store i8 %load14, ptr %gep_s14, align 1
+ store i8 %load15, ptr %gep_s15, align 1
 ret void
 }

From 6f1f00c24368512cf39db68d585f38662c6f65ea Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Tue, 30 Sep 2025 08:23:54 -0500
Subject: [PATCH 244/878] [flang][OpenMP] Move semantic checks for ALLOCATE to check-omp-structure (#161249)

The checks were previously in resolve-directives, which is mostly
intended for determining symbol properties, not performing semantic
checks.
---
 flang/lib/Semantics/check-omp-structure.cpp | 118 +++++++++++++++++-
 flang/lib/Semantics/check-omp-structure.h | 6 +
 flang/lib/Semantics/resolve-directives.cpp | 118 ------------------
 .../Semantics/OpenMP/allocate-align01.f90 | 2 +-
 flang/test/Semantics/OpenMP/allocate01.f90 | 2 +-
 5 files changed, 123 insertions(+), 123 deletions(-)

diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index db030bbe1f023..e224e069abcef 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -351,6 +351,17 @@ bool OmpStructureChecker::IsCloselyNestedRegion(const OmpDirectiveSet &set) {
 return false;
 }
 
+bool OmpStructureChecker::IsNestedInDirective(llvm::omp::Directive directive) {
+ if (dirContext_.size() >= 1) {
+ for (size_t i = dirContext_.size() - 1; i > 0; --i) {
+ if (dirContext_[i - 1].directive == directive) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
 void OmpStructureChecker::CheckVariableListItem(
 const SymbolSourceMap &symbols) {
 for (auto &[symbol, source] : symbols) {
@@ -1880,12 +1891,89 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) {
 }
 }
 
+// Goes through the names in an OmpObjectList and checks if each name appears
+// in the given allocate statement
+void OmpStructureChecker::CheckAllNamesInAllocateStmt(
+ const parser::CharBlock &source, const parser::OmpObjectList &ompObjectList,
+ const parser::AllocateStmt &allocate) {
+ for (const auto &obj : ompObjectList.v) {
+ if (const auto *d{std::get_if(&obj.u)}) {
+ if (const auto *ref{std::get_if(&d->u)}) {
+ if (const auto *n{std::get_if(&ref->u)}) {
+ CheckNameInAllocateStmt(source, *n, allocate);
+ }
+ }
+ }
+ }
+}
+
+void OmpStructureChecker::CheckNameInAllocateStmt(
OmpStructureChecker::CheckNameInAllocateStmt( + const parser::CharBlock &source, const parser::Name &name, + const parser::AllocateStmt &allocate) { + for (const auto &allocation : + std::get>(allocate.t)) { + const auto &allocObj = std::get(allocation.t); + if (const auto *n{std::get_if(&allocObj.u)}) { + if (n->source == name.source) { + return; + } + } + } + unsigned version{context_.langOptions().OpenMPVersion}; + context_.Say(source, + "Object '%s' in %s directive not " + "found in corresponding ALLOCATE statement"_err_en_US, + name.ToString(), + parser::ToUpperCaseLetters( + llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) + .str())); +} + void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { - isPredefinedAllocator = true; const auto &dir{std::get(x.t)}; - const auto &objectList{std::get>(x.t)}; PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); + + unsigned version{context_.langOptions().OpenMPVersion}; + if (version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, x.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); + } + + bool hasAllocator = false; + // TODO: Investigate whether searching the clause list can be done with + // parser::Unwrap instead of the following loop const auto &clauseList{std::get(x.t)}; + for (const auto &clause : clauseList.v) { + if (std::get_if(&clause.u)) { + hasAllocator = true; + } + } + + if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && !hasAllocator) { + // TODO: expand this check to exclude the case when a requires + // directive with the dynamic_allocators clause is present + // in the same compilation unit (OMP5.0 2.11.3). + context_.Say(x.source, + "ALLOCATE directives that appear in a TARGET region must specify an allocator clause"_err_en_US); + } + + const auto &allocateStmt = + std::get>(x.t).statement; + if (const auto &list{std::get>(x.t)}) { + CheckAllNamesInAllocateStmt( + std::get(x.t).source, *list, allocateStmt); + } + if (const auto &subDirs{ + std::get>>( + x.t)}) { + for (const auto &dalloc : *subDirs) { + CheckAllNamesInAllocateStmt(std::get(dalloc.t).source, + std::get(dalloc.t), allocateStmt); + } + } + + isPredefinedAllocator = true; + const auto &objectList{std::get>(x.t)}; for (const auto &clause : clauseList.v) { CheckAlignValue(clause); } @@ -1920,7 +2008,31 @@ void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { const auto *allocate{ action ? parser::Unwrap(action.stmt) : nullptr}; - if (!allocate) { + if (allocate) { + for (const auto &clause : dirSpec.Clauses().v) { + if (auto *alloc{std::get_if(&clause.u)}) { + CheckAllNamesInAllocateStmt( + x.source, std::get(alloc->v.t), *allocate); + + using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; + using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; + + auto &modifiers{OmpGetModifiers(alloc->v)}; + bool hasAllocator{ + OmpGetUniqueModifier(modifiers) || + OmpGetUniqueModifier(modifiers)}; + + // TODO: As with allocate directive, exclude the case when a requires + // directive with the dynamic_allocators clause is present in + // the same compilation unit (OMP5.0 2.11.3). + if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && + !hasAllocator) { + context_.Say(x.source, + "ALLOCATORS directives that appear in a TARGET region must specify an allocator"_err_en_US); + } + } + } + } else { const parser::CharBlock &source = action ? 
action.source : x.source; context_.Say(source, "The body of the ALLOCATORS construct should be an ALLOCATE statement"_err_en_US); diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 193784555a887..f507278fba5f2 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -177,6 +177,7 @@ class OmpStructureChecker bool HasInvalidWorksharingNesting( const parser::CharBlock &, const OmpDirectiveSet &); bool IsCloselyNestedRegion(const OmpDirectiveSet &set); + bool IsNestedInDirective(llvm::omp::Directive directive); void HasInvalidTeamsNesting( const llvm::omp::Directive &dir, const parser::CharBlock &source); void HasInvalidDistributeNesting(const parser::OpenMPLoopConstruct &x); @@ -309,6 +310,11 @@ class OmpStructureChecker const std::optional &maybeClauses); void CheckCancellationNest( const parser::CharBlock &source, llvm::omp::Directive type); + void CheckAllNamesInAllocateStmt(const parser::CharBlock &source, + const parser::OmpObjectList &ompObjectList, + const parser::AllocateStmt &allocate); + void CheckNameInAllocateStmt(const parser::CharBlock &source, + const parser::Name &ompObject, const parser::AllocateStmt &allocate); std::int64_t GetOrdCollapseLevel(const parser::OpenMPLoopConstruct &x); void CheckReductionObjects( const parser::OmpObjectList &objects, llvm::omp::Clause clauseId); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 6132193332b4b..bd7b8ac552fab 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -1011,11 +1011,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { sourceLabels_.clear(); targetLabels_.clear(); }; - void CheckAllNamesInAllocateStmt(const parser::CharBlock &source, - const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate); - void CheckNameInAllocateStmt(const parser::CharBlock &source, - const parser::Name &ompObject, const parser::AllocateStmt &allocate); std::int64_t ordCollapseLevel{0}; @@ -2550,8 +2545,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) { } bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) { - IssueNonConformanceWarning(llvm::omp::Directive::OMPD_allocate, x.source, 52); - PushContext(x.source, llvm::omp::Directive::OMPD_allocate); const auto &list{std::get>(x.t)}; if (list) { @@ -2632,83 +2625,10 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) { } void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) { - bool hasAllocator = false; - // TODO: Investigate whether searching the clause list can be done with - // parser::Unwrap instead of the following loop - const auto &clauseList{std::get(x.t)}; - for (const auto &clause : clauseList.v) { - if (std::get_if(&clause.u)) { - hasAllocator = true; - } - } - - if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && !hasAllocator) { - // TODO: expand this check to exclude the case when a requires - // directive with the dynamic_allocators clause is present - // in the same compilation unit (OMP5.0 2.11.3). 
- context_.Say(x.source, - "ALLOCATE directives that appear in a TARGET region " - "must specify an allocator clause"_err_en_US); - } - - const auto &allocateStmt = - std::get>(x.t).statement; - if (const auto &list{std::get>(x.t)}) { - CheckAllNamesInAllocateStmt( - std::get(x.t).source, *list, allocateStmt); - } - if (const auto &subDirs{ - std::get>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - CheckAllNamesInAllocateStmt(std::get(dalloc.t).source, - std::get(dalloc.t), allocateStmt); - } - } PopContext(); } void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { - const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; - auto &block{std::get(x.t)}; - - omp::SourcedActionStmt action{omp::GetActionStmt(block)}; - const parser::AllocateStmt *allocate{[&]() { - if (action) { - if (auto *alloc{std::get_if>( - &action.stmt->u)}) { - return &alloc->value(); - } - } - return static_cast(nullptr); - }()}; - - if (allocate) { - for (const auto &clause : dirSpec.Clauses().v) { - if (auto *alloc{std::get_if(&clause.u)}) { - CheckAllNamesInAllocateStmt( - x.source, std::get(alloc->v.t), *allocate); - - using OmpAllocatorSimpleModifier = parser::OmpAllocatorSimpleModifier; - using OmpAllocatorComplexModifier = parser::OmpAllocatorComplexModifier; - - auto &modifiers{OmpGetModifiers(alloc->v)}; - bool hasAllocator{ - OmpGetUniqueModifier(modifiers) || - OmpGetUniqueModifier(modifiers)}; - - // TODO: As with allocate directive, exclude the case when a requires - // directive with the dynamic_allocators clause is present in - // the same compilation unit (OMP5.0 2.11.3). - if (IsNestedInDirective(llvm::omp::Directive::OMPD_target) && - !hasAllocator) { - context_.Say(x.source, - "ALLOCATORS directives that appear in a TARGET region " - "must specify an allocator"_err_en_US); - } - } - } - } PopContext(); } @@ -3628,44 +3548,6 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source, } } -// Goes through the names in an OmpObjectList and checks if each name appears -// in the given allocate statement -void OmpAttributeVisitor::CheckAllNamesInAllocateStmt( - const parser::CharBlock &source, const parser::OmpObjectList &ompObjectList, - const parser::AllocateStmt &allocate) { - for (const auto &obj : ompObjectList.v) { - if (const auto *d{std::get_if(&obj.u)}) { - if (const auto *ref{std::get_if(&d->u)}) { - if (const auto *n{std::get_if(&ref->u)}) { - CheckNameInAllocateStmt(source, *n, allocate); - } - } - } - } -} - -void OmpAttributeVisitor::CheckNameInAllocateStmt( - const parser::CharBlock &source, const parser::Name &name, - const parser::AllocateStmt &allocate) { - for (const auto &allocation : - std::get>(allocate.t)) { - const auto &allocObj = std::get(allocation.t); - if (const auto *n{std::get_if(&allocObj.u)}) { - if (n->source == name.source) { - return; - } - } - } - unsigned version{context_.langOptions().OpenMPVersion}; - context_.Say(source, - "Object '%s' in %s directive not " - "found in corresponding ALLOCATE statement"_err_en_US, - name.ToString(), - parser::ToUpperCaseLetters( - llvm::omp::getOpenMPDirectiveName(GetContext().directive, version) - .str())); -} - void OmpAttributeVisitor::AddOmpRequiresToScope(Scope &scope, WithOmpDeclarative::RequiresFlags flags, std::optional memOrder) { diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 508efa82f12a0..4967330e37b48 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ 
b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -13,7 +13,7 @@ program allocate_align_tree z = 3 !ERROR: The alignment value should be a constant positive integer !$omp allocate(j) align(xx) - !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead. [-Wopen-mp-usage] + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment value should be a constant positive integer !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 5280d1b68a731..1d99811156438 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -19,7 +19,7 @@ subroutine sema() !$omp allocate(y) print *, a - !WARNING: OpenMP directive ALLOCATE has been deprecated, please use ALLOCATORS instead. [-Wopen-mp-usage] + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) allocator(omp_default_mem_alloc) allocate ( x(a), darray(a, b) ) end subroutine sema From e485d5e77a16fe5b775dc57e2ea4df9eade737cd Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Tue, 30 Sep 2025 09:25:32 -0400 Subject: [PATCH 245/878] [SLPVectorizer] Clear `TreeEntryToStridedPtrInfoMap`. (#160544) We need to clear `TreeEntryToStridedPtrInfoMap` in `deleteTree`. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 5 ++ .../SLPVectorizer/RISCV/test-delete-tree.ll | 83 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/test-delete-tree.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c547662c3a77e..f77d587bf5889 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2105,6 +2105,7 @@ class BoUpSLP { UserIgnoreList = nullptr; PostponedGathers.clear(); ValueToGatherNodes.clear(); + TreeEntryToStridedPtrInfoMap.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -8948,6 +8949,8 @@ BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { void BoUpSLP::buildTree(ArrayRef Roots, const SmallDenseSet &UserIgnoreLst) { deleteTree(); + assert(TreeEntryToStridedPtrInfoMap.empty() && + "TreeEntryToStridedPtrInfoMap is not cleared"); UserIgnoreList = &UserIgnoreLst; if (!allSameType(Roots)) return; @@ -8956,6 +8959,8 @@ void BoUpSLP::buildTree(ArrayRef Roots, void BoUpSLP::buildTree(ArrayRef Roots) { deleteTree(); + assert(TreeEntryToStridedPtrInfoMap.empty() && + "TreeEntryToStridedPtrInfoMap is not cleared"); if (!allSameType(Roots)) return; buildTreeRec(Roots, 0, EdgeInfo()); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/test-delete-tree.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/test-delete-tree.ll new file mode 100644 index 0000000000000..c4e6c4e5d5db5 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/test-delete-tree.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=riscv64 -mattr=+m,+v -passes=slp-vectorizer -S < %s | FileCheck %s + +; CHECK-NOT: TreeEntryToStridedPtrInfoMap is not cleared +define void @const_stride_1_no_reordering(ptr %pl, ptr %ps) { +; CHECK-LABEL: define void @const_stride_1_no_reordering( +; 
CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 +; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[GEP_L0]], align 1 +; CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: ret void +; + %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 + %gep_l1 = getelementptr inbounds i8, ptr %pl, i64 1 + %gep_l2 = getelementptr inbounds i8, ptr %pl, i64 2 + %gep_l3 = getelementptr inbounds i8, ptr %pl, i64 3 + %gep_l4 = getelementptr inbounds i8, ptr %pl, i64 4 + %gep_l5 = getelementptr inbounds i8, ptr %pl, i64 5 + %gep_l6 = getelementptr inbounds i8, ptr %pl, i64 6 + %gep_l7 = getelementptr inbounds i8, ptr %pl, i64 7 + %gep_l8 = getelementptr inbounds i8, ptr %pl, i64 8 + %gep_l9 = getelementptr inbounds i8, ptr %pl, i64 9 + %gep_l10 = getelementptr inbounds i8, ptr %pl, i64 10 + %gep_l11 = getelementptr inbounds i8, ptr %pl, i64 11 + %gep_l12 = getelementptr inbounds i8, ptr %pl, i64 12 + %gep_l13 = getelementptr inbounds i8, ptr %pl, i64 13 + %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14 + %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15 + + %load0 = load i8, ptr %gep_l0 + %load1 = load i8, ptr %gep_l1 + %load2 = load i8, ptr %gep_l2 + %load3 = load i8, ptr %gep_l3 + %load4 = load i8, ptr %gep_l4 + %load5 = load i8, ptr %gep_l5 + %load6 = load i8, ptr %gep_l6 + %load7 = load i8, ptr %gep_l7 + %load8 = load i8, ptr %gep_l8 + %load9 = load i8, ptr %gep_l9 + %load10 = load i8, ptr %gep_l10 + %load11 = load i8, ptr %gep_l11 + %load12 = load i8, ptr %gep_l12 + %load13 = load i8, ptr %gep_l13 + %load14 = load i8, ptr %gep_l14 + %load15 = load i8, ptr %gep_l15 + + %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 + %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 + %gep_s2 = getelementptr inbounds i8, ptr %ps, i64 2 + %gep_s3 = getelementptr inbounds i8, ptr %ps, i64 3 + %gep_s4 = getelementptr inbounds i8, ptr %ps, i64 4 + %gep_s5 = getelementptr inbounds i8, ptr %ps, i64 5 + %gep_s6 = getelementptr inbounds i8, ptr %ps, i64 6 + %gep_s7 = getelementptr inbounds i8, ptr %ps, i64 7 + %gep_s8 = getelementptr inbounds i8, ptr %ps, i64 8 + %gep_s9 = getelementptr inbounds i8, ptr %ps, i64 9 + %gep_s10 = getelementptr inbounds i8, ptr %ps, i64 10 + %gep_s11 = getelementptr inbounds i8, ptr %ps, i64 11 + %gep_s12 = getelementptr inbounds i8, ptr %ps, i64 12 + %gep_s13 = getelementptr inbounds i8, ptr %ps, i64 13 + %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 + %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 + + store i8 %load0, ptr %gep_s0 + store i8 %load1, ptr %gep_s1 + store i8 %load2, ptr %gep_s2 + store i8 %load3, ptr %gep_s3 + store i8 %load4, ptr %gep_s4 + store i8 %load5, ptr %gep_s5 + store i8 %load6, ptr %gep_s6 + store i8 %load7, ptr %gep_s7 + store i8 %load8, ptr %gep_s8 + store i8 %load9, ptr %gep_s9 + store i8 %load10, ptr %gep_s10 + store i8 %load11, ptr %gep_s11 + store i8 %load12, ptr %gep_s12 + store i8 %load13, ptr %gep_s13 + store i8 %load14, ptr %gep_s14 + store i8 %load15, ptr %gep_s15 + + ret void +} From c2188168cab442192121b34052a4edbf4b439f60 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Tue, 30 Sep 2025 14:35:33 +0100 Subject: [PATCH 246/878] [flang][debug] Generate splitDebugFilename field in DICompileUnitAttr. 
(#161214) This PR builds on https://github.com/llvm/llvm-project/pull/160540 and allows us to set the `splitDebugFilename` field in `DICompileUnitAttr`. The changes are mostly mechanical. I saw some spurious white space in a test that I have cleaned up. --- .../flang/Optimizer/Passes/Pipelines.h | 3 ++- .../flang/Optimizer/Transforms/Passes.td | 4 +++ flang/include/flang/Tools/CrossToolHelpers.h | 2 ++ flang/lib/Optimizer/Passes/Pipelines.cpp | 12 ++++++--- .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 3 ++- flang/test/Integration/debug-split-dwarf.f90 | 26 ++++++++++++------- flang/test/Transforms/debug-split-dwarf.fir | 12 +++++++++ 7 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 flang/test/Transforms/debug-split-dwarf.fir diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index f9c41b382abe5..682dd829239ef 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -158,7 +158,8 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, void createDebugPasses(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel OptLevel, - llvm::StringRef inputFilename, int32_t dwarfVersion); + llvm::StringRef inputFilename, int32_t dwarfVersion, + llvm::StringRef splitDwarfFile); void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, MLIRToLLVMPassPipelineConfig config, diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 88573fa9dff7d..e2ba9c3522837 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -246,6 +246,10 @@ def AddDebugInfo : Pass<"add-debug-info", "mlir::ModuleOp"> { "int32_t", /*default=*/"0", "dwarf version">, + Option<"splitDwarfFile", "split-dwarf-file", + "std::string", /*default=*/"std::string{}", + "Name of the split dwarf file"> + ]; } diff --git a/flang/include/flang/Tools/CrossToolHelpers.h b/flang/include/flang/Tools/CrossToolHelpers.h index 01c34eee014f3..850bd1f0940f7 100644 --- a/flang/include/flang/Tools/CrossToolHelpers.h +++ b/flang/include/flang/Tools/CrossToolHelpers.h @@ -109,6 +109,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { InstrumentFunctionExit = "__cyg_profile_func_exit"; } DwarfVersion = opts.DwarfVersion; + SplitDwarfFile = opts.SplitDwarfFile; } llvm::OptimizationLevel OptLevel; ///< optimisation level @@ -146,6 +147,7 @@ struct MLIRToLLVMPassPipelineConfig : public FlangEPCallBacks { Fortran::frontend::CodeGenOptions::ComplexRangeKind:: CX_Full; ///< Method for calculating complex number division int32_t DwarfVersion = 0; ///< Version of DWARF debug info to generate + std::string SplitDwarfFile = ""; ///< File name for the split debug info }; struct OffloadModuleOpts { diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index c089941688352..a83b0665eaf1f 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -95,12 +95,14 @@ getEmissionKind(llvm::codegenoptions::DebugInfoKind kind) { void addDebugInfoPass(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel optLevel, - llvm::StringRef inputFilename, int32_t dwarfVersion) { + llvm::StringRef inputFilename, int32_t dwarfVersion, + llvm::StringRef splitDwarfFile) { fir::AddDebugInfoOptions options; options.debugLevel = 
getEmissionKind(debugLevel); options.isOptimized = optLevel != llvm::OptimizationLevel::O0; options.inputFilename = inputFilename; options.dwarfVersion = dwarfVersion; + options.splitDwarfFile = splitDwarfFile; addPassConditionally(pm, disableDebugInfo, [&]() { return fir::createAddDebugInfoPass(options); }); } @@ -340,9 +342,11 @@ void createOpenMPFIRPassPipeline(mlir::PassManager &pm, void createDebugPasses(mlir::PassManager &pm, llvm::codegenoptions::DebugInfoKind debugLevel, llvm::OptimizationLevel OptLevel, - llvm::StringRef inputFilename, int32_t dwarfVersion) { + llvm::StringRef inputFilename, int32_t dwarfVersion, + llvm::StringRef splitDwarfFile) { if (debugLevel != llvm::codegenoptions::NoDebugInfo) - addDebugInfoPass(pm, debugLevel, OptLevel, inputFilename, dwarfVersion); + addDebugInfoPass(pm, debugLevel, OptLevel, inputFilename, dwarfVersion, + splitDwarfFile); } void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, @@ -360,7 +364,7 @@ void createDefaultFIRCodeGenPassPipeline(mlir::PassManager &pm, pm, (config.DebugInfo != llvm::codegenoptions::NoDebugInfo)); fir::addExternalNameConversionPass(pm, config.Underscoring); fir::createDebugPasses(pm, config.DebugInfo, config.OptLevel, inputFilename, - config.DwarfVersion); + config.DwarfVersion, config.SplitDwarfFile); fir::addTargetRewritePass(pm); fir::addCompilerGeneratedNamesConversionPass(pm); diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp index bc6592dd72078..bdf7e4a366cf1 100644 --- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp +++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp @@ -696,7 +696,8 @@ void AddDebugInfoPass::runOnOperation() { llvm::dwarf::getLanguage("DW_LANG_Fortran95"), fileAttr, producer, isOptimized, debugLevel, /*nameTableKind=*/mlir::LLVM::DINameTableKind::Default, - /*splitDebugFilename=*/mlir::StringAttr()); + splitDwarfFile.empty() ? mlir::StringAttr() + : mlir::StringAttr::get(context, splitDwarfFile)); module.walk([&](mlir::func::FuncOp funcOp) { handleFuncOp(funcOp, fileAttr, cuAttr, typeGen, &symbolTable); diff --git a/flang/test/Integration/debug-split-dwarf.f90 b/flang/test/Integration/debug-split-dwarf.f90 index 60373efddc358..ebfa040a42632 100644 --- a/flang/test/Integration/debug-split-dwarf.f90 +++ b/flang/test/Integration/debug-split-dwarf.f90 @@ -2,20 +2,28 @@ ! Testing to ensure that setting only -split-dwarf-file allows to place ! .dwo sections into regular output object. -! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ -! RUN: -split-dwarf-file %t.o -emit-obj -o %t.o %s -! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=DWO %s +! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ +! RUN: -split-dwarf-file %t.o -emit-obj -o %t.o %s +! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=DWO %s ! Testing to ensure that setting both -split-dwarf-file and -split-dwarf-output ! does not place .dwo sections into regular output object but in a separate ! file. -! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ -! RUN: -split-dwarf-file %t.dwo -split-dwarf-output %t.dwo -emit-obj -o %t.o %s -! RUN: llvm-readobj -S %t.dwo | FileCheck --check-prefix=DWO %s -! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=SPLIT %s +! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ +! RUN: -split-dwarf-file %t.dwo -split-dwarf-output %t.dwo -emit-obj -o %t.o %s +! 
RUN: llvm-readobj -S %t.dwo | FileCheck --check-prefix=DWO %s +! RUN: llvm-readobj -S %t.o | FileCheck --check-prefix=SPLIT %s -! DWO: .dwo -! SPLIT-NOT: .dwo +! Test that splitDebugFilename field of the DICompileUnit get correctly +! generated. +! RUN: %flang_fc1 -debug-info-kind=standalone -triple x86_64-unknown-linux \ +! RUN: -split-dwarf-file %t.test_dwo -split-dwarf-output %t.test_dwo \ +! RUN: -emit-llvm %s -o - | FileCheck --check-prefix=CU %s + +! DWO: .dwo +! SPLIT-NOT: .dwo +! CU: !DICompileUnit +! CU-SAME: splitDebugFilename: "{{.*}}test_dwo" program test end program test diff --git a/flang/test/Transforms/debug-split-dwarf.fir b/flang/test/Transforms/debug-split-dwarf.fir new file mode 100644 index 0000000000000..9c095457fb117 --- /dev/null +++ b/flang/test/Transforms/debug-split-dwarf.fir @@ -0,0 +1,12 @@ +// RUN: fir-opt --add-debug-info="split-dwarf-file=test.dwo" \ +// RUN: --mlir-print-debuginfo %s -o - | FileCheck %s + +module { + func.func @test() { + return + } loc(#loc1) +} +#loc1 = loc("test.f90":15:1) + +// CHECK: llvm.di_compile_unit +// CHECK-SAME: splitDebugFilename = "test.dwo" From 343476e12648c8e4afadff55f74ff95a36cfa8e3 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 07:07:01 -0700 Subject: [PATCH 247/878] [MLIR] Apply clang-tidy fixes for readability-container-size-empty in LinalgTransformOps.cpp (NFC) --- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 6ee2d8653d2dc..3f0b0bacd9756 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -2799,7 +2799,7 @@ SplitOp::apply(transform::TransformRewriter &rewriter, } opList.append(first); - if (second.size()) + if (!second.empty()) opList.append(second); } results.set(cast(getSplitList()), opList); From cf50bbf983c6ff032c7ad0de27ffaff412947ffb Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Tue, 30 Sep 2025 14:53:01 +0100 Subject: [PATCH 248/878] [AArch64][SVE2p1] Allow more uses of mask in performActiveLaneMaskCombine (#159360) The combine replaces a get_active_lane_mask used by two extract subvectors with a single paired whilelo intrinsic. When the instruction is used for control flow in a vector loop, an additional extract of element 0 may introduce other uses of the intrinsic such as ptest and reinterpret cast, which is currently not supported. This patch changes performActiveLaneMaskCombine to count the number of extract subvectors using the mask instead of the total number of uses, and returns the concatenated results of get_active_lane_mask. --- llvm/include/llvm/Support/TypeSize.h | 5 +- .../Target/AArch64/AArch64ISelLowering.cpp | 39 ++-- .../AArch64/get-active-lane-mask-extract.ll | 181 ++++++++++++++++++ llvm/unittests/Support/TypeSizeTest.cpp | 1 + 4 files changed, 210 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index 29d1c6894b4b6..0a7ae15edbb33 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -179,7 +179,7 @@ template class FixedOrScalableQuantity { /// This function tells the caller whether the element count is known at /// compile time to be a multiple of the scalar value RHS. 
constexpr bool isKnownMultipleOf(ScalarTy RHS) const { - return getKnownMinValue() % RHS == 0; + return RHS != 0 && getKnownMinValue() % RHS == 0; } /// Returns whether or not the callee is known to be a multiple of RHS. @@ -191,7 +191,8 @@ template class FixedOrScalableQuantity { // x % y == 0 !=> x % (vscale * y) == 0 if (!isScalable() && RHS.isScalable()) return false; - return getKnownMinValue() % RHS.getKnownMinValue() == 0; + return RHS.getKnownMinValue() != 0 && + getKnownMinValue() % RHS.getKnownMinValue() == 0; } // Return the minimum value with the assumption that the count is exact. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9078675da0e95..45f52352d45fd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18867,21 +18867,25 @@ performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming()))) return SDValue(); - unsigned NumUses = N->use_size(); + // Count the number of users which are extract_vectors. + unsigned NumExts = count_if(N->users(), [](SDNode *Use) { + return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR; + }); + auto MaskEC = N->getValueType(0).getVectorElementCount(); - if (!MaskEC.isKnownMultipleOf(NumUses)) + if (!MaskEC.isKnownMultipleOf(NumExts)) return SDValue(); - ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumUses); + ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts); if (ExtMinEC.getKnownMinValue() < 2) return SDValue(); - SmallVector Extracts(NumUses, nullptr); + SmallVector Extracts(NumExts, nullptr); for (SDNode *Use : N->users()) { if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR) - return SDValue(); + continue; - // Ensure the extract type is correct (e.g. if NumUses is 4 and + // Ensure the extract type is correct (e.g. if NumExts is 4 and // the mask return type is nxv8i1, each extract should be nxv2i1. if (Use->getValueType(0).getVectorElementCount() != ExtMinEC) return SDValue(); @@ -18902,32 +18906,39 @@ performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue Idx = N->getOperand(0); SDValue TC = N->getOperand(1); - EVT OpVT = Idx.getValueType(); - if (OpVT != MVT::i64) { + if (Idx.getValueType() != MVT::i64) { Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx); TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC); } // Create the whilelo_x2 intrinsics from each pair of extracts EVT ExtVT = Extracts[0]->getValueType(0); + EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext()); auto R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC}); DCI.CombineTo(Extracts[0], R.getValue(0)); DCI.CombineTo(Extracts[1], R.getValue(1)); + SmallVector Concats = {DAG.getNode( + ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))}; - if (NumUses == 2) - return SDValue(N, 0); + if (NumExts == 2) { + assert(N->getValueType(0) == DoubleExtVT); + return Concats[0]; + } - auto Elts = DAG.getElementCount(DL, OpVT, ExtVT.getVectorElementCount() * 2); - for (unsigned I = 2; I < NumUses; I += 2) { + auto Elts = + DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2); + for (unsigned I = 2; I < NumExts; I += 2) { // After the first whilelo_x2, we need to increment the starting value. 
- Idx = DAG.getNode(ISD::UADDSAT, DL, OpVT, Idx, Elts); + Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts); R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC}); DCI.CombineTo(Extracts[I], R.getValue(0)); DCI.CombineTo(Extracts[I + 1], R.getValue(1)); + Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT, + R.getValue(0), R.getValue(1))); } - return SDValue(N, 0); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats); } // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll index 5e01612e3881a..b89f55188b0f2 100644 --- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll +++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll @@ -310,6 +310,187 @@ define void @test_2x32bit_mask_with_32bit_index_and_trip_count(i32 %i, i32 %n) # ret void } +; Extra use of the get_active_lane_mask from an extractelement, which is replaced with ptest_first. + +define void @test_2x8bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_2x8bit_mask_with_extracts_and_ptest: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p1.b, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB11_2 +; CHECK-SVE-NEXT: // %bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB11_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_ptest: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.h, p1.h }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: ptrue p2.b +; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.b, p0.b, p1.b +; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB11_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB11_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call @llvm.get.active.lane.mask.nxv16i1.i32(i64 %i, i64 %n) + %v0 = call @llvm.vector.extract.nxv8i1.nxv16i1.i64( %r, i64 0) + %v1 = call @llvm.vector.extract.nxv8i1.nxv16i1.i64( %r, i64 8) + %elt0 = extractelement %r, i32 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use( %v0, %v1) + br label %if.end + +if.end: + ret void +} + +; Extra use of the get_active_lane_mask from an extractelement, which is +; replaced with ptest_first and reinterpret_casts because the extract is not nxv16i1. 
+ +define void @test_2x8bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_2x8bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p1.h, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB12_2 +; CHECK-SVE-NEXT: // %bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB12_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: ptrue p2.h +; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.h, p0.h, p1.h +; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB12_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB12_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 %i, i64 %n) + %v0 = tail call @llvm.vector.extract.nxv4i1.nxv8i1( %r, i64 0) + %v1 = tail call @llvm.vector.extract.nxv4i1.nxv8i1( %r, i64 4) + %elt0 = extractelement %r, i64 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use( %v0, %v1) + br label %if.end + +if.end: + ret void +} + +define void @test_4x4bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_4x4bit_mask_with_extracts_and_ptest: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p0.b, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB13_2 +; CHECK-SVE-NEXT: // %bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p1.h, p0.b +; CHECK-SVE-NEXT: punpkhi p3.h, p0.b +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: punpklo p2.h, p3.b +; CHECK-SVE-NEXT: punpkhi p3.h, p3.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB13_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_4x4bit_mask_with_extracts_and_ptest: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: cnth x8 +; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8 +; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.s, p3.s }, x8, x1 +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p0.h, p1.h +; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.h, p2.h, p3.h +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.b, p4.b, p5.b +; CHECK-SVE2p1-SME2-NEXT: ptrue p5.b +; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB13_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB13_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call @llvm.get.active.lane.mask.nxv16i1.i32(i64 %i, i64 %n) + %v0 = call @llvm.vector.extract.nxv4i1.nxv16i1.i64( %r, i64 0) + %v1 = call @llvm.vector.extract.nxv4i1.nxv16i1.i64( %r, i64 4) + %v2 = call @llvm.vector.extract.nxv4i1.nxv16i1.i64( %r, i64 8) + %v3 = call @llvm.vector.extract.nxv4i1.nxv16i1.i64( %r, i64 12) + %elt0 = extractelement %r, i32 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use( %v0, %v1, %v2, %v3) + br label %if.end + +if.end: + ret void +} + +define void @test_4x2bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n) { +; CHECK-SVE-LABEL: test_4x2bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE: // %bb.0: // %entry +; CHECK-SVE-NEXT: whilelo p0.h, x0, x1 +; CHECK-SVE-NEXT: b.pl .LBB14_2 +; CHECK-SVE-NEXT: // 
%bb.1: // %if.then +; CHECK-SVE-NEXT: punpklo p1.h, p0.b +; CHECK-SVE-NEXT: punpkhi p3.h, p0.b +; CHECK-SVE-NEXT: punpklo p0.h, p1.b +; CHECK-SVE-NEXT: punpkhi p1.h, p1.b +; CHECK-SVE-NEXT: punpklo p2.h, p3.b +; CHECK-SVE-NEXT: punpkhi p3.h, p3.b +; CHECK-SVE-NEXT: b use +; CHECK-SVE-NEXT: .LBB14_2: // %if.end +; CHECK-SVE-NEXT: ret +; +; CHECK-SVE2p1-SME2-LABEL: test_4x2bit_mask_with_extracts_and_reinterpret_casts: +; CHECK-SVE2p1-SME2: // %bb.0: // %entry +; CHECK-SVE2p1-SME2-NEXT: cntw x8 +; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8 +; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo +; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.d, p1.d }, x0, x1 +; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.d, p3.d }, x8, x1 +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.s, p0.s, p1.s +; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.s, p2.s, p3.s +; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p4.h, p5.h +; CHECK-SVE2p1-SME2-NEXT: ptrue p5.h +; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b +; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB14_2 +; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then +; CHECK-SVE2p1-SME2-NEXT: b use +; CHECK-SVE2p1-SME2-NEXT: .LBB14_2: // %if.end +; CHECK-SVE2p1-SME2-NEXT: ret +entry: + %r = call @llvm.get.active.lane.mask.nxv8i1.i32(i64 %i, i64 %n) + %v0 = call @llvm.vector.extract.nxv2i1.nxv8i1.i64( %r, i64 0) + %v1 = call @llvm.vector.extract.nxv2i1.nxv8i1.i64( %r, i64 2) + %v2 = call @llvm.vector.extract.nxv2i1.nxv8i1.i64( %r, i64 4) + %v3 = call @llvm.vector.extract.nxv2i1.nxv8i1.i64( %r, i64 6) + %elt0 = extractelement %r, i32 0 + br i1 %elt0, label %if.then, label %if.end + +if.then: + tail call void @use( %v0, %v1, %v2, %v3) + br label %if.end + +if.end: + ret void +} + declare void @use(...) attributes #0 = { nounwind } diff --git a/llvm/unittests/Support/TypeSizeTest.cpp b/llvm/unittests/Support/TypeSizeTest.cpp index b02b7e6009535..018b2405d4005 100644 --- a/llvm/unittests/Support/TypeSizeTest.cpp +++ b/llvm/unittests/Support/TypeSizeTest.cpp @@ -58,6 +58,7 @@ static_assert(ElementCount::getFixed(8).divideCoefficientBy(2) == static_assert(ElementCount::getFixed(8).multiplyCoefficientBy(3) == ElementCount::getFixed(24)); static_assert(ElementCount::getFixed(8).isKnownMultipleOf(2)); +static_assert(!ElementCount::getFixed(8).isKnownMultipleOf(0)); constexpr TypeSize TSFixed0 = TypeSize::getFixed(0); constexpr TypeSize TSFixed1 = TypeSize::getFixed(1); From 4a873d58df755d220889b994f59cbdea2cdefc49 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 30 Sep 2025 16:16:42 +0200 Subject: [PATCH 249/878] [IR] Don't create ptrtoint expression to determine alignment (NFCI) (#161364) We try to determine the alignment of a constant by creating a ptrtoint expression and seeing if it folds. I believe the only case this can actually handle is where the constant is an inttoptr expression. Handle that directly instead of going through another ptrtoint expression. I ran into this while trying to clean up our isEliminableCastPair() mess, which is going to disable ptrtoint(inttoptr) folding without DataLayout, breaking this code. 
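For illustration, the replacement logic reduces to "alignment = 2 ^ (number of
trailing zero bits of the integer)", clamped to the arbitrary upper limit the
code already documents. A minimal standalone sketch of that arithmetic (the
helper name is invented for this example; the constant mirrors
`Value::MaxAlignmentExponent`):

```cpp
#include <cstdint>

// Hypothetical helper: the alignment implied by inttoptr(IntVal).
// Example: IntVal = 0x1000 has 12 trailing zero bits -> 4096-byte alignment.
static uint64_t alignmentOfIntToPtr(uint64_t IntVal) {
  const unsigned MaxAlignmentExponent = 32; // mirrors Value::MaxAlignmentExponent
  unsigned TrailingZeros =
      IntVal == 0 ? MaxAlignmentExponent : unsigned(__builtin_ctzll(IntVal));
  if (TrailingZeros > MaxAlignmentExponent)
    TrailingZeros = MaxAlignmentExponent; // clamp to the documented upper limit
  return uint64_t(1) << TrailingZeros;    // power-of-two alignment in bytes
}
```

The patch below computes exactly this from the `ConstantInt` operand of the
`inttoptr` expression, instead of asking `ConstantExpr::getPtrToInt` to fold
first.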
--- llvm/lib/IR/Value.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 4e8f359481b81..e5e062d1cf4e2 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -1000,14 +1000,12 @@ Align Value::getPointerAlignment(const DataLayout &DL) const { ConstantInt *CI = mdconst::extract(MD->getOperand(0)); return Align(CI->getLimitedValue()); } - } else if (auto *CstPtr = dyn_cast(this)) { - // Strip pointer casts to avoid creating unnecessary ptrtoint expression - // if the only "reduction" is combining a bitcast + ptrtoint. - CstPtr = CstPtr->stripPointerCasts(); - if (auto *CstInt = dyn_cast_or_null(ConstantExpr::getPtrToInt( - const_cast(CstPtr), DL.getIntPtrType(getType()), - /*OnlyIfReduced=*/true))) { - size_t TrailingZeros = CstInt->getValue().countr_zero(); + } else if (auto *CE = dyn_cast(this)) { + // Determine the alignment of inttoptr(C). + if (CE->getOpcode() == Instruction::IntToPtr && + isa(CE->getOperand(0))) { + ConstantInt *IntPtr = cast(CE->getOperand(0)); + size_t TrailingZeros = IntPtr->getValue().countr_zero(); // While the actual alignment may be large, elsewhere we have // an arbitrary upper alignmet limit, so let's clamp to it. return Align(TrailingZeros < Value::MaxAlignmentExponent From 98d43ef2d842eb8b552b244a4ed6218d757682a1 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Tue, 30 Sep 2025 16:18:51 +0200 Subject: [PATCH 250/878] AMDGPU: Use srcvalue and delete Ignore complex pattern (#161359) --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 ---- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 -- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 -- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 8 -------- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 3 --- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 -- llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +- 7 files changed, 1 insertion(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index a0cd1785c0130..bb4bf742fb861 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -13,10 +13,6 @@ include "AMDGPU.td" include "AMDGPUCombine.td" -def gi_ignore : - GIComplexOperandMatcher, - GIComplexPatternEquiv; - def sd_vsrc0 : ComplexPattern; def gi_vsrc0 : GIComplexOperandMatcher, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index bdf4cd3693b2a..2192a72bb27b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -4312,8 +4312,6 @@ bool AMDGPUDAGToDAGISel::SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, return true; } -bool AMDGPUDAGToDAGISel::SelectIgnore(SDValue In) const { return true; } - SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const { if (In.isUndef()) return CurDAG->getUNDEF(MVT::i32); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 906548742f77f..4fa0d3f72e1c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -305,8 +305,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { void SelectWAVE_ADDRESS(SDNode *N); void SelectSTACKRESTORE(SDNode *N); - bool SelectIgnore(SDValue In) const; - protected: // Include the pieces autogenerated from the target description. 
#include "AMDGPUGenDAGISel.inc" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 1535f2681480e..12915c7344426 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4266,14 +4266,6 @@ Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded( return Src; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectIgnore(MachineOperand &Root) const { - // Don't render anything. - ComplexRendererFns Renderers; - Renderers.emplace(); - return Renderers; -} - /// /// This will select either an SGPR or VGPR operand and will save us from /// having to write an extra tablegen pattern. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 5a575a9c66a8a..c760fe7ef99dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -166,9 +166,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector { MachineOperand Root, MachineInstr *InsertPt, bool ForceVGPR = false) const; - InstructionSelector::ComplexRendererFns - selectIgnore(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns selectVCSRC(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index e46bd45aed506..18a53931a6390 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1710,8 +1710,6 @@ def VOP3PMadMixBF16Mods : ComplexPattern; def VINTERPModsHi : ComplexPattern; -def Ignore : ComplexPattern; - //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 6a39187e48cc8..b3fd8c70dd045 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1617,7 +1617,7 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm } def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", - (ins), "", [(int_amdgcn_s_barrier_leave (Ignore))] > { + (ins), "", [(int_amdgcn_s_barrier_leave (i16 srcvalue))] > { let SchedRW = [WriteBarrier]; let simm16 = 0; let fixed_imm = 1; From 82efd72ed505c6ec183eca700290a29051c2d6e6 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Tue, 30 Sep 2025 07:36:13 -0700 Subject: [PATCH 251/878] [MLIR] Add sincos op to math dialect (#160772) Now that `sincos` is a supported intrinsic in the LLVM dialect (#160561) we are able to add the corresponding operation in the math dialect and add conversion patterns for LLVM and NVVM. We have several benchmarks that use sine and cosine in hot-loops, and saving some calculations by performing them together can benefit performance. We would like to have a way to represent sincos in the math dialect. 
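As a usage sketch (assuming the ODS-generated accessors follow the result
names `$sin`/`$cos` and that the default-valued `fastmath` attribute may be
omitted in the generated builder; the wrapper function is invented for the
example), code that previously built separate `math.sin`/`math.cos` ops over
the same operand can emit the fused op instead:

```cpp
// Sketch: emit one math.sincos instead of a math.sin / math.cos pair.
// Both results share the operand's type per the AllTypesMatch constraint.
std::pair<mlir::Value, mlir::Value> emitSinCos(mlir::OpBuilder &builder,
                                               mlir::Location loc,
                                               mlir::Value x) {
  auto op = math::SincosOp::create(builder, loc, /*sin=*/x.getType(),
                                   /*cos=*/x.getType(), x);
  return {op.getSin(), op.getCos()};
}
```

On the LLVM path this lowers to a single `llvm.intr.sincos` plus two
extractvalues; on NVVM it becomes one `__nv_sincosf`/`__nv_sincos` libdevice
call, as the patterns below show.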
---
 mlir/include/mlir/Dialect/Math/IR/MathOps.td  |  37 +++++++
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  99 ++++++++++++++++++-
 mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp |  33 +++++++
 mlir/lib/Dialect/Math/IR/MathOps.cpp          |  10 ++
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     |  39 ++++++++
 .../Conversion/MathToLLVM/math-to-llvm.mlir   |  10 ++
 mlir/test/Dialect/Math/ops.mlir               |  12 +++
 7 files changed, 239 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
index cfd8c4b8f11f7..af65af6fedec6 100644
--- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td
+++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
@@ -510,6 +510,43 @@ def Math_SinhOp : Math_FloatUnaryOp<"sinh"> {
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// SinCosOp
+//===----------------------------------------------------------------------===//
+
+def Math_SincosOp : Math_Op<"sincos",
+    [SameOperandsAndResultShape,
+     DeclareOpInterfaceMethods<ArithFastMathInterface>,
+     AllTypesMatch<["operand", "sin", "cos"]>]> {
+  let summary = "sine and cosine of the specified value";
+  let description = [{
+    The `sincos` operation computes both the sine and cosine of a given value
+    simultaneously. It takes one operand of floating point type (i.e., scalar,
+    tensor or vector) and returns two results of the same type. This operation
+    can be more efficient than computing sine and cosine separately when both
+    values are needed.
+
+    Example:
+
+    ```mlir
+    // Scalar sine and cosine values.
+    %sin, %cos = math.sincos %input : f64
+    ```
+  }];
+
+  let arguments = (ins FloatLike:$operand,
+      DefaultValuedAttr<Arith_FastMathAttr, "::mlir::arith::FastMathFlags::none">:$fastmath);
+  let results = (outs FloatLike:$sin, FloatLike:$cos);
+
+  let assemblyFormat = [{ $operand (`fastmath` `` $fastmath^)?
+                          attr-dict `:` type($operand) }];
+
+  let extraClassDeclaration = [{
+    std::optional<SmallVector<int64_t, 4>> getShapeForUnroll();
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // CountLeadingZerosOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index a95263bb55f69..852c50c965f11 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -436,7 +436,7 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
                       LLVM::FAbsOp, LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp,
                       LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp,
                       LLVM::RoundEvenOp, LLVM::RoundOp, LLVM::SinOp,
-                      LLVM::SqrtOp>();
+                      LLVM::SincosOp, LLVM::SqrtOp>();
 
   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
@@ -466,6 +466,100 @@ void mlir::configureGpuToNVVMTypeConverter(LLVMTypeConverter &converter) {
   });
 }
 
+struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> {
+  using ConvertOpToLLVMPattern<math::SincosOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(math::SincosOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    Value input = adaptor.getOperand();
+    Type inputType = input.getType();
+    auto convertedInput = maybeExt(input, rewriter);
+    auto computeType = convertedInput.getType();
+
+    StringRef sincosFunc;
+    if (isa<Float32Type>(computeType)) {
+      const arith::FastMathFlags flag = op.getFastmath();
+      const bool useApprox =
+          mlir::arith::bitEnumContainsAny(flag, arith::FastMathFlags::afn);
+      sincosFunc = useApprox ? "__nv_fast_sincosf" : "__nv_sincosf";
+    } else if (isa<Float64Type>(computeType)) {
+      sincosFunc = "__nv_sincos";
+    } else {
+      return rewriter.notifyMatchFailure(op,
+                                         "unsupported operand type for sincos");
+    }
+
+    auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext());
+
+    Value sinPtr, cosPtr;
+    {
+      OpBuilder::InsertionGuard guard(rewriter);
+      auto *scope =
+          op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+      assert(scope && "Expected op to be inside automatic allocation scope");
+      rewriter.setInsertionPointToStart(&scope->getRegion(0).front());
+      auto one = rewriter.create<LLVM::ConstantOp>(
+          loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(1));
+      sinPtr =
+          rewriter.create<LLVM::AllocaOp>(loc, ptrType, computeType, one, 0);
+      cosPtr =
+          rewriter.create<LLVM::AllocaOp>(loc, ptrType, computeType, one, 0);
+    }
+
+    createSincosCall(rewriter, loc, sincosFunc, convertedInput, sinPtr, cosPtr,
+                     op);
+
+    auto sinResult = rewriter.create<LLVM::LoadOp>(loc, computeType, sinPtr);
+    auto cosResult = rewriter.create<LLVM::LoadOp>(loc, computeType, cosPtr);
+
+    rewriter.replaceOp(op, {maybeTrunc(sinResult, inputType, rewriter),
+                            maybeTrunc(cosResult, inputType, rewriter)});
+    return success();
+  }
+
+private:
+  Value maybeExt(Value operand, PatternRewriter &rewriter) const {
+    if (isa<Float16Type, BFloat16Type>(operand.getType()))
+      return rewriter.create<LLVM::FPExtOp>(
+          operand.getLoc(), Float32Type::get(rewriter.getContext()), operand);
+    return operand;
+  }
+
+  Value maybeTrunc(Value operand, Type type, PatternRewriter &rewriter) const {
+    if (operand.getType() != type)
+      return rewriter.create<LLVM::FPTruncOp>(operand.getLoc(), type, operand);
+    return operand;
+  }
+
+  void createSincosCall(ConversionPatternRewriter &rewriter, Location loc,
+                        StringRef funcName, Value input, Value sinPtr,
+                        Value cosPtr, Operation *op) const {
+    auto voidType = LLVM::LLVMVoidType::get(rewriter.getContext());
+    auto ptrType = sinPtr.getType();
+
+    SmallVector<Type> operandTypes = {input.getType(), ptrType, ptrType};
+    auto funcType = LLVM::LLVMFunctionType::get(voidType, operandTypes);
+
+    auto funcAttr = StringAttr::get(op->getContext(), funcName);
+    auto funcOp =
+        SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(op, funcAttr);
+
+    if (!funcOp) {
+      auto parentFunc = op->getParentOfType<FunctionOpInterface>();
+      assert(parentFunc && "expected there to be a parent function");
+      OpBuilder b(parentFunc);
+
+      auto globalloc = loc->findInstanceOfOrUnknown<FileLineColLoc>();
+      funcOp = LLVM::LLVMFuncOp::create(b, globalloc, funcName, funcType);
+    }
+
+    SmallVector<Value> callOperands = {input, sinPtr, cosPtr};
+    rewriter.create<LLVM::CallOp>(loc, funcOp, callOperands);
+  }
+};
+
 template <typename OpTy>
 static void populateOpPatterns(const LLVMTypeConverter &converter,
                                RewritePatternSet &patterns,
@@ -589,6 +683,9 @@ void mlir::populateLibDeviceConversionPatterns(
                                    "__nv_tan", "__nv_fast_tanf");
   populateOpPatterns<math::TanhOp>(converter, patterns, benefit, "__nv_tanhf",
                                    "__nv_tanh");
+
+  // Custom pattern for sincos since it returns two values
+  patterns.add<SincosOpLowering>(converter, benefit);
 }
 
 void mlir::populateGpuToNVVMConversionPatterns(
diff --git a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp
index 853f45498ac52..229e40e2061cb 100644
--- a/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp
+++ b/mlir/lib/Conversion/MathToLLVM/MathToLLVM.cpp
@@ -121,6 +121,38 @@ using CountTrailingZerosOpLowering =
     VectorConvertToLLVMPattern<math::CountTrailingZerosOp,
                                LLVM::CountTrailingZerosOp>;
 using AbsIOpLowering = IntOpWithFlagLowering<math::AbsIOp, LLVM::AbsOp>;
 
+// A `sincos` is converted into `llvm.intr.sincos` followed by extractvalue ops.
+struct SincosOpLowering : public ConvertOpToLLVMPattern<math::SincosOp> {
+  using ConvertOpToLLVMPattern<math::SincosOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(math::SincosOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    const LLVMTypeConverter &typeConverter = *this->getTypeConverter();
+    mlir::Location loc = op.getLoc();
+    mlir::Type operandType = adaptor.getOperand().getType();
+    mlir::Type llvmOperandType = typeConverter.convertType(operandType);
+    mlir::Type sinType = typeConverter.convertType(op.getSin().getType());
+    mlir::Type cosType = typeConverter.convertType(op.getCos().getType());
+    if (!llvmOperandType || !sinType || !cosType)
+      return failure();
+
+    ConvertFastMath<math::SincosOp, LLVM::SincosOp> attrs(op);
+
+    auto structType = LLVM::LLVMStructType::getLiteral(
+        rewriter.getContext(), {llvmOperandType, llvmOperandType});
+
+    auto sincosOp = rewriter.create<LLVM::SincosOp>(
+        loc, structType, adaptor.getOperand(), attrs.getAttrs());
+
+    auto sinValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 0);
+    auto cosValue = LLVM::ExtractValueOp::create(rewriter, loc, sincosOp, 1);
+
+    rewriter.replaceOp(op, {sinValue, cosValue});
+    return success();
+  }
+};
+
 // A `expm1` is converted into `exp - 1`.
 struct ExpM1OpLowering : public ConvertOpToLLVMPattern<math::ExpM1Op> {
   using ConvertOpToLLVMPattern<math::ExpM1Op>::ConvertOpToLLVMPattern;
 
@@ -393,6 +425,7 @@ void mlir::populateMathToLLVMConversionPatterns(
       RoundEvenOpLowering,
       RoundOpLowering,
       RsqrtOpLowering,
+      SincosOpLowering,
       SinOpLowering,
       SinhOpLowering,
       ASinOpLowering,
diff --git a/mlir/lib/Dialect/Math/IR/MathOps.cpp b/mlir/lib/Dialect/Math/IR/MathOps.cpp
index a21631cbf8510..bbeef0f6ee9e5 100644
--- a/mlir/lib/Dialect/Math/IR/MathOps.cpp
+++ b/mlir/lib/Dialect/Math/IR/MathOps.cpp
@@ -284,6 +284,16 @@ OpFoldResult math::SinhOp::fold(FoldAdaptor adaptor) {
   });
 }
 
+//===----------------------------------------------------------------------===//
+// SinCosOp getShapeForUnroll
+//===----------------------------------------------------------------------===//
+
+std::optional<SmallVector<int64_t, 4>> math::SincosOp::getShapeForUnroll() {
+  if (auto vt = mlir::dyn_cast<VectorType>(getOperand().getType()))
+    return llvm::to_vector<4>(vt.getShape());
+  return std::nullopt;
+}
+
 //===----------------------------------------------------------------------===//
 // CountLeadingZerosOp folder
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index ef06af3ad3163..a4b5dde8a2187 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1109,3 +1109,42 @@ gpu.module @test_module_55 {
     func.return %result32, %result64 : f32, f64
   }
 }
+
+gpu.module @test_module_56 {
+  // CHECK: gpu.module @test_module_56
+
+  // CHECK-DAG: llvm.func @__nv_sincosf(f32, !llvm.ptr, !llvm.ptr)
+  // CHECK-DAG: llvm.func @__nv_sincos(f64, !llvm.ptr, !llvm.ptr)
+
+  // CHECK-LABEL: func @gpu_sincos
+  // CHECK-SAME: %[[ARG_f16:.*]]: f16, %[[ARG_f32:.*]]: f32, %[[ARG_f64:.*]]: f64
+  func.func @gpu_sincos(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f16, f32, f32, f64, f64) {
+    // CHECK-COUNT-6: llvm.alloca
+    // CHECK: %[[ARG_f16_ext:.*]] = llvm.fpext %[[ARG_f16]] : f16 to f32
+    // CHECK: llvm.call @__nv_sincosf(%[[ARG_f16_ext]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> ()
+    // CHECK-COUNT-2: llvm.fptrunc
+    // CHECK: llvm.call @__nv_sincosf(%[[ARG_f32]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> ()
+    // CHECK: llvm.call @__nv_sincos(%[[ARG_f64]], %{{.+}}, %{{.+}}) : (f64, !llvm.ptr, !llvm.ptr) -> ()
+    %sin16, %cos16 = math.sincos %arg_f16 : f16
+    %sin32, %cos32 = math.sincos %arg_f32 : f32
+    %sin64, %cos64 = math.sincos %arg_f64 : f64
+    func.return %sin16, %cos16, %sin32, %cos32, %sin64, %cos64 : f16, f16, f32, f32, f64, f64
+  }
+
+  // CHECK: llvm.func @__nv_fast_sincosf(f32, !llvm.ptr, !llvm.ptr)
+
+  // CHECK-LABEL: func @gpu_sincos_fastmath
+  // CHECK-SAME: %[[ARG_f16:.*]]: f16, %[[ARG_f32:.*]]: f32, %[[ARG_f64:.*]]: f64
+  func.func @gpu_sincos_fastmath(%arg_f16 : f16, %arg_f32 : f32, %arg_f64 : f64) -> (f16, f16, f32, f32, f64, f64) {
+    // CHECK-COUNT-6: llvm.alloca
+    // CHECK: %[[ARG_f16_ext:.*]] = llvm.fpext %[[ARG_f16]] : f16 to f32
+    // CHECK: llvm.call @__nv_fast_sincosf(%[[ARG_f16_ext]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> ()
+    // CHECK-COUNT-2: llvm.fptrunc
+    // CHECK: llvm.call @__nv_fast_sincosf(%[[ARG_f32]], %{{.+}}, %{{.+}}) : (f32, !llvm.ptr, !llvm.ptr) -> ()
+    // CHECK: llvm.call @__nv_sincos(%[[ARG_f64]], %{{.+}}, %{{.+}}) : (f64, !llvm.ptr, !llvm.ptr) -> ()
+    %sin16, %cos16 = math.sincos %arg_f16 fastmath<afn> : f16
+    %sin32, %cos32 = math.sincos %arg_f32 fastmath<afn> : f32
+    %sin64, %cos64 = math.sincos %arg_f64 fastmath<afn> : f64
+    func.return %sin16, %cos16, %sin32, %cos32, %sin64, %cos64 : f16, f16, f32, f32, f64, f64
+  }
+}
diff --git a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir
index f4541220fe4d2..f7d27120d4207 100644
--- a/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir
+++ b/mlir/test/Conversion/MathToLLVM/math-to-llvm.mlir
@@ -230,6 +230,16 @@ func.func @trigonometrics(%arg0: f32) {
 
 // -----
 
+// CHECK-LABEL: func @sincos
+// CHECK-SAME: [[ARG0:%.+]]: f32
+func.func @sincos(%arg0: f32) {
+  // CHECK: llvm.intr.sincos([[ARG0]]) : (f32) -> !llvm.struct<(f32, f32)>
+  %0:2 = math.sincos %arg0 : f32
+  func.return
+}
+
+// -----
+
 // CHECK-LABEL: func @inverse_trigonometrics
 // CHECK-SAME: [[ARG0:%.+]]: f32
 func.func @inverse_trigonometrics(%arg0: f32) {
diff --git a/mlir/test/Dialect/Math/ops.mlir b/mlir/test/Dialect/Math/ops.mlir
index cb10fc4397ffc..f085d1c62ea86 100644
--- a/mlir/test/Dialect/Math/ops.mlir
+++ b/mlir/test/Dialect/Math/ops.mlir
@@ -62,6 +62,18 @@ func.func @sin(%f: f32, %v: vector<4xf32>, %t: tensor<4x4x?xf32>) {
   return
 }
 
+// CHECK-LABEL: func @sincos(
+// CHECK-SAME: %[[F:.*]]: f32, %[[V:.*]]: vector<4xf32>, %[[T:.*]]: tensor<4x4x?xf32>)
+func.func @sincos(%f: f32, %v: vector<4xf32>, %t: tensor<4x4x?xf32>) {
+  // CHECK: %{{.*}} = math.sincos %[[F]] : f32
+  %0:2 = math.sincos %f : f32
+  // CHECK: %{{.*}} = math.sincos %[[V]] : vector<4xf32>
+  %1:2 = math.sincos %v : vector<4xf32>
+  // CHECK: %{{.*}} = math.sincos %[[T]] : tensor<4x4x?xf32>
+  %2:2 = math.sincos %t : tensor<4x4x?xf32>
+  return
+}
+
 // CHECK-LABEL: func @erf(
 // CHECK-SAME: %[[F:.*]]: f32, %[[V:.*]]: vector<4xf32>, %[[T:.*]]: tensor<4x4x?xf32>)
 func.func @erf(%f: f32, %v: vector<4xf32>, %t: tensor<4x4x?xf32>) {
From afe73f4db8f6e9d87ed07be64493dfe2fdaa92c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20Benics?= <108414871+balazs-benics-sonarsource@users.noreply.github.com>
Date: Tue, 30 Sep 2025 16:44:52 +0200
Subject: [PATCH 252/878] [analyzer][NFC] Explain why operator new/delete
 should never be eval-called (#161370)

Downstream, some change triggered an investigation into whether we could
move a checker callback from check::PostCall to eval::Call.
After a lengthy investigation that led to ExprEngine::VisitCXXNewExpr, we
realized that CXXNewExprs only trigger a PreCall and a PostCall, but never
an EvalCall. It also had a FIXME suggesting that maybe it should trigger
one.
Remember, it called `defaultEvalCall`, which either inlines or
conservatively evaluates (i.e., invalidates) the call, but it never probes
the checkers' eval-call handlers to see if any would step in.

After implementing the changes to trigger the eval call for the checkers,
I realized that it doesn't really make sense: we would be eval-calling
user-provided functions whose semantics we can't really be sure about, so
there is no generic way to properly implement the eval-call callback.

This touches on an important point: it only ever makes sense to eval-call
functions that have a clear spec, such as standard functions, because
implementing the callback prevents the inlining of that function, risking
a regression in analysis quality if the implemented model is not complete
or correct enough.

In conclusion, I opted not to expose the eval-call event to checkers, in
other words, to keep everything as-is, but to document my journey.
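For contrast, here is a minimal sketch (not part of this change) of the
obligations an eval::Call implementation takes on when the callee does
have a fixed contract; `my_identity`, a function assumed to return its
first argument unchanged, is invented purely for illustration:

  #include "clang/StaticAnalyzer/Core/Checker.h"
  #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
  #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"

  using namespace clang;
  using namespace ento;

  class MyIdentityModel : public Checker<eval::Call> {
  public:
    bool evalCall(const CallEvent &Call, CheckerContext &C) const {
      const auto *CE = dyn_cast_or_null<CallExpr>(Call.getOriginExpr());
      if (!CE || Call.getNumArgs() != 1 ||
          !Call.isGlobalCFunction("my_identity"))
        return false; // Decline: inlining/conservative eval still apply.
      // An eval::Call handler MUST bind the return value of a non-void
      // callee; here the assumed contract is "returns its argument".
      ProgramStateRef State = C.getState();
      State = State->BindExpr(CE, C.getLocationContext(),
                              Call.getArgSVal(0));
      C.addTransition(State);
      return true; // Claim the call: it will no longer be inlined.
    }
  };

Exactly this kind of fixed contract is missing for a user-provided
operator new, which is why the EvalCall event is not exposed there.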
CPP-6585
---
 .../Checkers/AnalysisOrderChecker.cpp           |  3 ++-
 .../Checkers/CheckerDocumentation.cpp           |  9 +++++++++
 clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp | 13 ++++++++++++-
 .../Analysis/cxxctr-evalcall-analysis-order.cpp | 17 +++++++++++++++++
 4 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
index e64153d53bbd6..309e3d250de06 100644
--- a/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/AnalysisOrderChecker.cpp
@@ -129,7 +129,8 @@ class AnalysisOrderChecker
       llvm::errs() << " {argno: " << Call.getNumArgs() << '}';
       llvm::errs() << " [" << Call.getKindAsString() << ']';
       llvm::errs() << '\n';
-      return true;
+      // We can't return `true` from this callback without binding the return
+      // value. Let's just fall through here and return `false`.
     }
     return false;
   }
diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
index 392c7eeea234a..c71623575ae97 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/CheckerDocumentation.cpp
@@ -262,6 +262,15 @@ class CheckerDocumentation
   /// state. This callback allows a checker to provide domain specific knowledge
   /// about the particular functions it knows about.
   ///
+  /// Note that to evaluate a call, the handler MUST bind the return value if
+  /// it's a non-void function, and invalidate the arguments if necessary.
+  ///
+  /// Note that in general, user-provided functions should not be eval-called
+  /// because the checker can't predict the exact semantics/contract of the
+  /// callee, and by having the eval::Call callback, we also prevent it from
+  /// getting inlined, potentially regressing analysis quality.
+  /// Consider using check::PreCall or check::PostCall to allow inlining.
+  ///
   /// \returns true if the call has been successfully evaluated
   /// and false otherwise. Note, that only one checker can evaluate a call. If
   /// more than one checker claims that they can evaluate the same call the
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
index dee34e3e9d6a5..75d7e265af0f3 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
@@ -909,7 +909,14 @@ void ExprEngine::VisitCXXNewAllocatorCall(const CXXNewExpr *CNE,
   ExplodedNodeSet DstPostCall;
   StmtNodeBuilder CallBldr(DstPreCall, DstPostCall, *currBldrCtx);
   for (ExplodedNode *I : DstPreCall) {
-    // FIXME: Provide evalCall for checkers?
+    // Operator new calls (CXXNewExpr) are intentionally not eval-called,
+    // because it does not make sense to eval-call user-provided functions.
+    // 1) If the new operator can be inlined, then don't prevent it from
+    //    inlining by having an eval-call of that operator.
+    // 2) If it can't be inlined, then the default conservative modeling
+    //    is what we want anyway.
+    // So it is best not to allow eval-calling CXXNewExprs from checkers.
+    // Checkers can provide their pre/post-call callbacks if needed.
     defaultEvalCall(CallBldr, I, *Call);
   }
   // If the call is inlined, DstPostCall will be empty and we bail out now.
@@ -1110,6 +1117,10 @@ void ExprEngine::VisitCXXDeleteExpr(const CXXDeleteExpr *CDE,
   if (AMgr.getAnalyzerOptions().MayInlineCXXAllocator) {
     StmtNodeBuilder Bldr(DstPreCall, DstPostCall, *currBldrCtx);
     for (ExplodedNode *I : DstPreCall) {
+      // Intentionally either inline or conservatively evaluate the operator
+      // delete call, but avoid triggering an eval-call event for checkers.
+      // As detailed when handling CXXNewExprs above; in short, it does not
+      // really make sense to eval-call user-provided functions.
       defaultEvalCall(Bldr, I, *Call);
     }
   } else {
diff --git a/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp b/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp
index 0e1ec2f9de566..743c5ad0fa8cd 100644
--- a/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp
+++ b/clang/test/Analysis/cxxctr-evalcall-analysis-order.cpp
@@ -18,16 +18,33 @@ void foo() {
   C C0;
   C C1(42);
   C *C2 = new C{2, 3};
+  delete C2;
 }
 
 // CHECK: PreCall (C::C) [CXXConstructorCall]
 // CHECK-NEXT: EvalCall (C::C) {argno: 0} [CXXConstructorCall]
 // CHECK-NEXT: PostCall (C::C) [CXXConstructorCall]
+
 // CHECK-NEXT: PreCall (C::C) [CXXConstructorCall]
 // CHECK-NEXT: EvalCall (C::C) {argno: 1} [CXXConstructorCall]
 // CHECK-NEXT: PostCall (C::C) [CXXConstructorCall]
+
 // CHECK-NEXT: PreCall (operator new) [CXXAllocatorCall]
+// COMMENT: Operator new calls (CXXNewExpr) are intentionally not eval-called,
+// COMMENT: because it does not make sense to eval-call user-provided functions.
+// COMMENT: 1) If the new operator can be inlined, then don't prevent it from
+// COMMENT: inlining by having an eval-call of that operator.
+// COMMENT: 2) If it can't be inlined, then the default conservative modeling
+// COMMENT: is what we want anyway.
+// COMMENT: So the EvalCall event will not be triggered for operator new calls.
+// CHECK-NOT: EvalCall
 // CHECK-NEXT: PostCall (operator new) [CXXAllocatorCall]
+
 // CHECK-NEXT: PreCall (C::C) [CXXConstructorCall]
 // CHECK-NEXT: EvalCall (C::C) {argno: 2} [CXXConstructorCall]
 // CHECK-NEXT: PostCall (C::C) [CXXConstructorCall]
+
+// CHECK-NEXT: PreCall (operator delete) [CXXDeallocatorCall]
+// COMMENT: Same reasoning as for CXXNewExprs above.
+// CHECK-NOT: EvalCall
+// CHECK-NEXT: PostCall (operator delete) [CXXDeallocatorCall]
From 296af382c814539348b2aac797e73169355c5b8c Mon Sep 17 00:00:00 2001
From: Samuel Tebbs
Date: Tue, 30 Sep 2025 15:49:45 +0100
Subject: [PATCH 253/878] [NFC][LV] Fix warning of unused SubConst variable

https://github.com/llvm/llvm-project/pull/160154 added an assertion using
a new variable, which caused a warning in builds without asserts. This
patch adds [[maybe_unused]] to prevent that warning.
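For readers unfamiliar with the attribute, a reduced stand-alone
illustration of the same pattern (not the actual VPlan.h code):

  #include <cassert>

  int doubled(int v) {
    // Only read inside the assert below: with NDEBUG the assert expands
    // to nothing and, without the attribute, -Wunused-variable fires.
    [[maybe_unused]] int check = v + v;
    assert(check == 2 * v && "doubling mismatch");
    return v + v;
  }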
---
 llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4c7a083e0d9b7..10d704df289c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3033,7 +3033,7 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
     assert(Red->getRecurrenceKind() == RecurKind::Add &&
            "Expected an add reduction");
     assert(getNumOperands() >= 3 && "Expected at least three operands");
-    auto *SubConst = dyn_cast<ConstantInt>(getOperand(2)->getLiveInIRValue());
+    [[maybe_unused]] auto *SubConst = dyn_cast<ConstantInt>(getOperand(2)->getLiveInIRValue());
     assert(SubConst && SubConst->getValue() == 0 &&
            Sub->getOpcode() == Instruction::Sub && "Expected a negating sub");
   }
From e9ffd2ac13ea09a9df50427ec7f28312244fba60 Mon Sep 17 00:00:00 2001
From: Valery Dmitriev
Date: Tue, 30 Sep 2025 07:47:11 -0700
Subject: [PATCH 254/878] Revert "[flang] Simplify hlfir.index in a few
 limited cases. (#157883)" (#161387)

This reverts commit edca510555fd6c2adfe15dba6993f4e64575e647 due to
reported failures.
---
 .../Transforms/SimplifyHLFIRIntrinsics.cpp    | 208 -----------
 .../HLFIR/simplify-hlfir-intrinsics-index.fir | 345 ------------------
 2 files changed, 553 deletions(-)
 delete mode 100644 flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
index 9969ee474ff98..d8e36ea294cdb 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp
@@ -2284,213 +2284,6 @@ class CmpCharOpConversion : public mlir::OpRewritePattern<hlfir::CmpCharOp> {
   }
 };
 
-static std::pair<mlir::Value, hlfir::AssociateOp>
-getVariable(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value val) {
-  // If it is an expression - create a variable from it, or forward
-  // the value otherwise.
-  hlfir::AssociateOp associate;
-  if (!mlir::isa<hlfir::ExprType>(val.getType()))
-    return {val, associate};
-  hlfir::Entity entity{val};
-  mlir::NamedAttribute byRefAttr = fir::getAdaptToByRefAttr(builder);
-  associate = hlfir::genAssociateExpr(loc, builder, entity, entity.getType(),
-                                      "", byRefAttr);
-  return {associate.getBase(), associate};
-}
-
-class IndexOpConversion : public mlir::OpRewritePattern<hlfir::IndexOp> {
-public:
-  using mlir::OpRewritePattern<hlfir::IndexOp>::OpRewritePattern;
-
-  llvm::LogicalResult
-  matchAndRewrite(hlfir::IndexOp op,
-                  mlir::PatternRewriter &rewriter) const override {
-    // We simplify only limited cases:
-    // 1) a substring length shall be known at compile time
-    // 2) if a substring length is 0 then replace with 1 for forward search,
-    //    or otherwise with the string length + 1 (builder shall const-fold if
-    //    lookup direction is known at compile time).
-    // 3) for known string length at compile time, if it is
-    //    shorter than substring => replace with zero.
-    // 4) if a substring length is one => inline as simple search loop
-    // 5) for forward search with input strings of kind=1 runtime is faster.
-    // Do not simplify in all the other cases relying on a runtime call.
- - fir::FirOpBuilder builder{rewriter, op.getOperation()}; - const mlir::Location &loc = op->getLoc(); - - auto resultTy = op.getType(); - mlir::Value back = op.getBack(); - mlir::Value substrLen = - hlfir::genCharLength(loc, builder, hlfir::Entity{op.getSubstr()}); - - auto substrLenCst = fir::getIntIfConstant(substrLen); - if (!substrLenCst) { - return rewriter.notifyMatchFailure( - op, "substring length unknown at compile time"); - } - mlir::Value strLen = - hlfir::genCharLength(loc, builder, hlfir::Entity{op.getStr()}); - auto i1Ty = builder.getI1Type(); - auto idxTy = builder.getIndexType(); - if (*substrLenCst == 0) { - mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1); - // zero length substring. For back search replace with - // strLen+1, or otherwise with 1. - mlir::Value strEnd = mlir::arith::AddIOp::create( - builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx); - if (back) - back = builder.createConvert(loc, i1Ty, back); - else - back = builder.createIntegerConstant(loc, i1Ty, 0); - mlir::Value result = - mlir::arith::SelectOp::create(builder, loc, back, strEnd, oneIdx); - - rewriter.replaceOp(op, builder.createConvert(loc, resultTy, result)); - return mlir::success(); - } - - if (auto strLenCst = fir::getIntIfConstant(strLen)) { - if (*strLenCst < *substrLenCst) { - rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 0)); - return mlir::success(); - } - if (*strLenCst == 0) { - // both strings have zero length - rewriter.replaceOp(op, builder.createIntegerConstant(loc, resultTy, 1)); - return mlir::success(); - } - } - if (*substrLenCst != 1) { - return rewriter.notifyMatchFailure( - op, "rely on runtime implementation if substring length > 1"); - } - // For forward search and character kind=1 the runtime uses memchr - // which well optimized. But it looks like memchr idiom is not recognized - // in LLVM yet. On a micro-kernel test with strings of length 40 runtime - // had ~2x less execution time vs inlined code. For unknown search direction - // at compile time pessimistically assume "forward". - std::optional isBack; - if (back) { - if (auto backCst = fir::getIntIfConstant(back)) - isBack = *backCst != 0; - } else { - isBack = false; - } - auto charTy = mlir::cast( - hlfir::getFortranElementType(op.getSubstr().getType())); - unsigned kind = charTy.getFKind(); - if (kind == 1 && (!isBack || !*isBack)) { - return rewriter.notifyMatchFailure( - op, "rely on runtime implementation for character kind 1"); - } - - // All checks are passed here. Generate single character search loop. 
- auto [strV, strAssociate] = getVariable(builder, loc, op.getStr()); - auto [substrV, substrAssociate] = getVariable(builder, loc, op.getSubstr()); - hlfir::Entity str{strV}; - hlfir::Entity substr{substrV}; - mlir::Value oneIdx = builder.createIntegerConstant(loc, idxTy, 1); - - auto genExtractAndConvertToInt = [&charTy, &idxTy, &oneIdx, - kind](mlir::Location loc, - fir::FirOpBuilder &builder, - hlfir::Entity &charStr, - mlir::Value index) { - auto bits = builder.getKindMap().getCharacterBitsize(kind); - auto intTy = builder.getIntegerType(bits); - auto charLen1Ty = - fir::CharacterType::getSingleton(builder.getContext(), kind); - mlir::Type designatorTy = - fir::ReferenceType::get(charLen1Ty, fir::isa_volatile_type(charTy)); - auto idxAttr = builder.getIntegerAttr(idxTy, 0); - - auto singleChr = hlfir::DesignateOp::create( - builder, loc, designatorTy, charStr, /*component=*/{}, - /*compShape=*/mlir::Value{}, hlfir::DesignateOp::Subscripts{}, - /*substring=*/mlir::ValueRange{index, index}, - /*complexPart=*/std::nullopt, - /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{oneIdx}, - fir::FortranVariableFlagsAttr{}); - auto chrVal = fir::LoadOp::create(builder, loc, singleChr); - mlir::Value intVal = fir::ExtractValueOp::create( - builder, loc, intTy, chrVal, builder.getArrayAttr(idxAttr)); - return intVal; - }; - - auto wantChar = genExtractAndConvertToInt(loc, builder, substr, oneIdx); - - // Generate search loop body with the following C equivalent: - // idx_t result = 0; - // idx_t end = strlen + 1; - // char want = substr[0]; - // for (idx_t idx = 1; idx < end; ++idx) { - // if (result == 0) { - // idx_t at = back ? end - idx: idx; - // result = str[at-1] == want ? at : result; - // } - // } - if (!back) - back = builder.createIntegerConstant(loc, i1Ty, 0); - else - back = builder.createConvert(loc, i1Ty, back); - mlir::Value strEnd = mlir::arith::AddIOp::create( - builder, loc, builder.createConvert(loc, idxTy, strLen), oneIdx); - mlir::Value zeroIdx = builder.createIntegerConstant(loc, idxTy, 0); - auto genSearchBody = [&](mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange index, - mlir::ValueRange reductionArgs) - -> llvm::SmallVector { - assert(index.size() == 1 && "expected single loop"); - assert(reductionArgs.size() == 1 && "expected single reduction value"); - mlir::Value inRes = reductionArgs[0]; - auto resEQzero = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::eq, inRes, zeroIdx); - - mlir::Value res = - builder - .genIfOp(loc, {idxTy}, resEQzero, - /*withElseRegion=*/true) - .genThen([&]() { - mlir::Value idx = builder.createConvert(loc, idxTy, index[0]); - // offset = back ? 
end - idx : idx; - mlir::Value offset = mlir::arith::SelectOp::create( - builder, loc, back, - mlir::arith::SubIOp::create(builder, loc, strEnd, idx), - idx); - - auto haveChar = - genExtractAndConvertToInt(loc, builder, str, offset); - auto charsEQ = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::eq, haveChar, - wantChar); - mlir::Value newVal = mlir::arith::SelectOp::create( - builder, loc, charsEQ, offset, inRes); - - fir::ResultOp::create(builder, loc, newVal); - }) - .genElse([&]() { fir::ResultOp::create(builder, loc, inRes); }) - .getResults()[0]; - return {res}; - }; - - llvm::SmallVector loopOut = - hlfir::genLoopNestWithReductions(loc, builder, {strLen}, - /*reductionInits=*/{zeroIdx}, - genSearchBody, - /*isUnordered=*/false); - mlir::Value result = builder.createConvert(loc, resultTy, loopOut[0]); - - if (strAssociate) - hlfir::EndAssociateOp::create(builder, loc, strAssociate); - if (substrAssociate) - hlfir::EndAssociateOp::create(builder, loc, substrAssociate); - - rewriter.replaceOp(op, result); - return mlir::success(); - } -}; - template class MatmulConversion : public mlir::OpRewritePattern { public: @@ -3162,7 +2955,6 @@ class SimplifyHLFIRIntrinsics patterns.insert>(context); patterns.insert>(context); patterns.insert(context); - patterns.insert(context); patterns.insert>(context); patterns.insert>(context); patterns.insert>(context); diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir deleted file mode 100644 index 258a1d899a40d..0000000000000 --- a/flang/test/HLFIR/simplify-hlfir-intrinsics-index.fir +++ /dev/null @@ -1,345 +0,0 @@ -// RUN: fir-opt %s --simplify-hlfir-intrinsics | FileCheck %s - -// Simplify should reduce hlfir.index to constant (5) -func.func @_QPt1() { -// CHECK-LABEL: func.func @_QPt1() { -// CHECK: %[[VAL_0:.*]] = arith.constant 5 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_2:.*]] = arith.constant 3 : index -// CHECK: %[[VAL_3:.*]] = arith.constant 4 : index -// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_5:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFt1En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.char<1,4> {bindc_name = "s", uniq_name = "_QFt1Es"} -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_3]] {uniq_name = "_QFt1Es"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_9:.*]] = fir.address_of(@_QQclX616263) : !fir.ref> -// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] typeparams %[[VAL_2]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX616263"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: hlfir.assign %[[VAL_10]]#0 to %[[VAL_8]]#0 : !fir.ref>, !fir.ref> -// CHECK: %[[VAL_11:.*]] = fir.address_of(@_QQclX) : !fir.ref> -// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_0]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_6]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt1En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt1En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %c4 = arith.constant 4 : index - %3 = fir.alloca 
!fir.char<1,4> {bindc_name = "s", uniq_name = "_QFt1Es"} - %4:2 = hlfir.declare %3 typeparams %c4 {uniq_name = "_QFt1Es"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %5 = fir.address_of(@_QQclX616263) : !fir.ref> - %c3 = arith.constant 3 : index - %6:2 = hlfir.declare %5 typeparams %c3 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX616263"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - hlfir.assign %6#0 to %4#0 : !fir.ref>, !fir.ref> - %7 = fir.address_of(@_QQclX) : !fir.ref> - %c0 = arith.constant 0 : index - %8:2 = hlfir.declare %7 typeparams %c0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %true = arith.constant true - %9 = hlfir.index %8#0 in %4#0 back %true : (!fir.ref>, !fir.ref>, i1) -> i32 - hlfir.assign %9 to %2#0 : i32, !fir.ref - return -} - -// ! 'back' is unknown at compile time, substring is zero length - generate select (back ? strlen+1 : 1) -func.func @_QPt2(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}, %arg1: !fir.ref> {fir.bindc_name = "b"}) { -// CHECK-LABEL: func.func @_QPt2( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}, -// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} -// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFt2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index) -// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt2Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) -// CHECK: %[[VAL_8:.*]] = fir.address_of(@_QQcl2X) : !fir.ref> -// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_3]]#0 : !fir.ref> -// CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_0]] : index -// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_10]] : (!fir.logical<4>) -> i1 -// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_0]] : index -// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_14]] to %[[VAL_5]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt2Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) - %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt2En"} - %3:2 = hlfir.declare %2 {uniq_name = "_QFt2En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index) - %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt2Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) - %6 = fir.address_of(@_QQcl2X) : !fir.ref> - %c0 = arith.constant 0 : index - %7:2 = hlfir.declare %6 typeparams %c0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %8 = fir.load %1#0 : !fir.ref> - %9 = hlfir.index %7#0 in %5#0 back %8 : 
(!fir.ref>, !fir.boxchar<2>, !fir.logical<4>) -> i32 - hlfir.assign %9 to %3#0 : i32, !fir.ref - return -} - -// inline as search loop (backward) -func.func @_QPt3(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt3( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt3En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt3Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index -// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index -// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) { -// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index -// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref> -// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i16 -// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index -// CHECK: fir.result %[[VAL_23]] : index -// CHECK: } else { -// CHECK: fir.result %[[VAL_15]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_17]] : index -// CHECK: } -// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt3En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt3En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt3Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) - %5 = fir.address_of(@_QQcl2X6500) : !fir.ref> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %true = arith.constant true - %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref>, !fir.boxchar<2>, i1) -> i32 - hlfir.assign %7 
to %2#0 : i32, !fir.ref - return -} - -//inline as search loop (forward) -func.func @_QPt4(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt4( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt4En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt4Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index -// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) { -// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref> -// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16 -// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index -// CHECK: fir.result %[[VAL_21]] : index -// CHECK: } else { -// CHECK: fir.result %[[VAL_14]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_16]] : index -// CHECK: } -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_22]] to %[[VAL_4]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt4En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt4En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt4Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) - %5 = fir.address_of(@_QQcl2X6500) : !fir.ref> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %false = arith.constant false - %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref>, !fir.boxchar<2>, i1) -> i32 - hlfir.assign %7 to %2#0 : i32, !fir.ref - return -} - -// Same as t4 above but result kind=1 -func.func @_QPt5(%arg0: !fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt5( -// CHECK-SAME: %[[ARG0:.*]]: 
!fir.boxchar<2> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt5En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<2>) -> (!fir.ref>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt5Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQcl2X6500) : !fir.ref> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_12:.*]] = fir.do_loop %[[VAL_13:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_14:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_0]] : index -// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (index) { -// CHECK: %[[VAL_17:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_13]], %[[VAL_13]] typeparams %[[VAL_1]] : (!fir.boxchar<2>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref> -// CHECK: %[[VAL_19:.*]] = fir.extract_value %[[VAL_18]], [0 : index] : (!fir.char<2>) -> i16 -// CHECK: %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_19]], %[[VAL_11]] : i16 -// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_13]], %[[VAL_14]] : index -// CHECK: fir.result %[[VAL_21]] : index -// CHECK: } else { -// CHECK: fir.result %[[VAL_14]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_16]] : index -// CHECK: } -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_12]] : (index) -> i8 -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i8) -> i32 -// CHECK: hlfir.assign %[[VAL_23]] to %[[VAL_4]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt5En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt5En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<2>) -> (!fir.ref>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt5Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) - %5 = fir.address_of(@_QQcl2X6500) : !fir.ref> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQcl2X6500"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %false = arith.constant false - %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref>, !fir.boxchar<2>, i1) -> i8 - %8 = fir.convert %7 : (i8) -> i32 - hlfir.assign %8 to %2#0 : i32, !fir.ref - return - } - -// Do no simplify - runtime call for forward search with character kind=1 is faster -func.func @_QPt6(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt6( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK: 
%[[VAL_0:.*]] = arith.constant false -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt6En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt6Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_9:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_0]] : (!fir.ref>, !fir.boxchar<1>, i1) -> i32 -// CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt6En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt6En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt6Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) - %5 = fir.address_of(@_QQclX65) : !fir.ref> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %false = arith.constant false - %7 = hlfir.index %6#0 in %4#0 back %false : (!fir.ref>, !fir.boxchar<1>, i1) -> i32 - hlfir.assign %7 to %2#0 : i32, !fir.ref - return -} - -// Do not simplify - runtime call for forward search with character kind=1 is faster -// Lookup direction is unknown at compile time, hence forward is pessimistically assumed -func.func @_QPt7(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}, %arg1: !fir.ref> {fir.bindc_name = "b"}) { -// CHECK-LABEL: func.func @_QPt7( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}, -// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt7En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt7En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_1]] {uniq_name = "_QFt7Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref> -// CHECK: %[[VAL_10:.*]] = hlfir.index %[[VAL_8]]#0 in %[[VAL_6]]#0 back %[[VAL_9]] : (!fir.ref>, !fir.boxchar<1>, !fir.logical<4>) -> 
i32 -// CHECK: hlfir.assign %[[VAL_10]] to %[[VAL_4]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFt7Eb"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) - %2 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt7En"} - %3:2 = hlfir.declare %2 {uniq_name = "_QFt7En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %4:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, index) - %5:2 = hlfir.declare %4#0 typeparams %4#1 dummy_scope %0 {uniq_name = "_QFt7Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) - %6 = fir.address_of(@_QQclX65) : !fir.ref> - %c1 = arith.constant 1 : index - %7:2 = hlfir.declare %6 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %8 = fir.load %1#0 : !fir.ref> - %9 = hlfir.index %7#0 in %5#0 back %8 : (!fir.ref>, !fir.boxchar<1>, !fir.logical<4>) -> i32 - hlfir.assign %9 to %3#0 : i32, !fir.ref - return -} - -// Inline as backward search loop for character kind=1. -// The case similar to t7 but direction is known, so it is faster than runtime call. -func.func @_QPt8(%arg0: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK-LABEL: func.func @_QPt8( -// CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "s"}) { -// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope -// CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"} -// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFt8En"} : (!fir.ref) -> (!fir.ref, !fir.ref) -// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) -// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_2]] {uniq_name = "_QFt8Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: %[[VAL_7:.*]] = fir.address_of(@_QQclX65) : !fir.ref> -// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) -// CHECK: %[[VAL_9:.*]] = hlfir.designate %[[VAL_8]]#0 substr %[[VAL_1]], %[[VAL_1]] typeparams %[[VAL_1]] : (!fir.ref>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref> -// CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_10]], [0 : index] : (!fir.char<1>) -> i8 -// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_1]] : index -// CHECK: %[[VAL_13:.*]] = fir.do_loop %[[VAL_14:.*]] = %[[VAL_1]] to %[[VAL_5]]#1 step %[[VAL_1]] iter_args(%[[VAL_15:.*]] = %[[VAL_0]]) -> (index) { -// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_0]] : index -// CHECK: %[[VAL_17:.*]] = fir.if %[[VAL_16]] -> (index) { -// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_12]], %[[VAL_14]] : index -// CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_6]]#0 substr %[[VAL_18]], %[[VAL_18]] typeparams %[[VAL_1]] : (!fir.boxchar<1>, index, index, index) -> !fir.ref> -// CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref> -// CHECK: %[[VAL_21:.*]] = fir.extract_value %[[VAL_20]], [0 : index] : (!fir.char<1>) -> i8 -// CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_11]] : i8 -// CHECK: %[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_18]], %[[VAL_15]] : index -// CHECK: fir.result %[[VAL_23]] : index -// CHECK: } else { -// CHECK: fir.result 
%[[VAL_15]] : index -// CHECK: } -// CHECK: fir.result %[[VAL_17]] : index -// CHECK: } -// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_13]] : (index) -> i32 -// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_4]]#0 : i32, !fir.ref -// CHECK: return -// CHECK: } - %0 = fir.dummy_scope : !fir.dscope - %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFt8En"} - %2:2 = hlfir.declare %1 {uniq_name = "_QFt8En"} : (!fir.ref) -> (!fir.ref, !fir.ref) - %3:2 = fir.unboxchar %arg0 : (!fir.boxchar<1>) -> (!fir.ref>, index) - %4:2 = hlfir.declare %3#0 typeparams %3#1 dummy_scope %0 {uniq_name = "_QFt8Es"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) - %5 = fir.address_of(@_QQclX65) : !fir.ref> - %c1 = arith.constant 1 : index - %6:2 = hlfir.declare %5 typeparams %c1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX65"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) - %true = arith.constant true - %7 = hlfir.index %6#0 in %4#0 back %true : (!fir.ref>, !fir.boxchar<1>, i1) -> i32 - hlfir.assign %7 to %2#0 : i32, !fir.ref - return -} - From 7ce73ab169257509336b0a3f6042c3ab7807b760 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 30 Sep 2025 15:56:00 +0100 Subject: [PATCH 255/878] [AArch64][SME] Fix typo in docs "block" -> "bundle" (NFC) (#161383) --- llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index cced0faa28889..474974893d945 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -22,7 +22,7 @@ // To handle ZA state across control flow, we make use of edge bundling. This // assigns each block an "incoming" and "outgoing" edge bundle (representing // incoming and outgoing edges). Initially, these are unique to each block; -// then, in the process of forming bundles, the outgoing block of a block is +// then, in the process of forming bundles, the outgoing bundle of a block is // joined with the incoming bundle of all successors. 
The result is that each // bundle can be assigned a single ZA state, which ensures the state required by // all a blocks' successors is the same, and that each basic block will always From 310db2068259ec79336b5b117ad6c5fdf4f7edf7 Mon Sep 17 00:00:00 2001 From: Chinmay Deshpande Date: Tue, 30 Sep 2025 07:56:23 -0700 Subject: [PATCH 256/878] [AMDGPU][GlobalISel] Add RegBankLegalize support for buffer loads with formats (#161291) --- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp | 4 +++- .../GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll | 4 ++-- .../GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll | 2 +- .../AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll | 4 ++-- .../AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll | 6 +++--- .../GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll | 6 +++--- .../AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll | 8 ++++---- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll | 2 +- .../CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll | 2 +- 9 files changed, 20 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 0776d14a84067..f413bbcecb526 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -840,7 +840,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}}); // clang-format on - addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB) + addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT, + G_AMDGPU_TBUFFER_LOAD_FORMAT}, + StandardB) .Div(B32, {{VgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) .Uni(B32, {{UniInVgprB32}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) .Div(B64, {{VgprB64}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll index feaf7ce42ecae..434f763044e45 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12 %s ; Natural mapping define amdgpu_ps float @raw_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll index 636ba9b320591..41d45530886ee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s ; Natural mapping define amdgpu_ps float @raw_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll index 4d7d3ec5bdcb8..8ad5f50450155 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck %s define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll index 0ae28336dce8d..b7e2074ca9a63 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX10_GFX11 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX12 %s define amdgpu_ps float @raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { ; GFX10_GFX11-LABEL: name: raw_tbuffer_load_f32__sgpr_rsrc__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll index d644ef93d1850..23858b9c92947 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll index 7c811f489463a..dec015df58c1b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK-GFX12 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 -stop-after=instruction-select -o - %s | FileCheck --check-prefixes=CHECK-GFX12 %s define amdgpu_ps float @struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg 
%soffset) { ; CHECK-LABEL: name: struct_tbuffer_load_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll index f01e85a2e4a02..65111f14cab45 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefix=GFX10 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefix=GFX11 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s -;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s +;RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefix=GFX12 %s define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { ; PREGFX10-LABEL: tbuffer_load: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll index b555c37d15703..a6afb757cd6c0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -4,7 +4,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck -check-prefixes=GFX10 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +;RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { ; PREGFX10-LABEL: tbuffer_load: From e6425a764f039f29dfc4096567d64d529930d026 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Tue, 30 Sep 2025 15:59:40 +0100 Subject: [PATCH 257/878] [flang][debug] Improve name generation for basic types. (#161361) For basic types, currently, we use the type name (e.g., `integer`, `real`) as the debug name. This results in types of different sizes having the same name. This patch improves the naming by appending the size in bytes to the type name (e.g., `integer*8`, `real*8`). Fixes https://github.com/llvm/llvm-project/issues/160890 --- .../Transforms/DebugTypeGenerator.cpp | 29 +++++++++++++------ flang/test/Integration/debug-complex-1.f90 | 4 +-- flang/test/Integration/debug-local-var-2.f90 | 6 ++-- flang/test/Transforms/debug-complex-1.fir | 4 +-- .../test/Transforms/debug-derived-type-1.fir | 6 ++-- flang/test/Transforms/debug-fn-info.fir | 6 ++-- flang/test/Transforms/debug-local-var.fir | 6 ++-- flang/test/Transforms/debug-ref-type.fir | 2 +- flang/test/Transforms/debug-tuple-type.fir | 2 +- flang/test/Transforms/debug-vector-type.fir | 12 ++++---- 10 files changed, 44 insertions(+), 33 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index d038c467b166a..a7e47239036ba 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -679,26 +679,37 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( /*optional
=*/std::nullopt, /*extra data=*/nullptr); } +static mlir::StringAttr getBasicTypeName(mlir::MLIRContext *context, + llvm::StringRef baseName, + unsigned bitSize) { + std::string name(baseName.str()); + if (bitSize != 32) + name += "*" + std::to_string(bitSize / 8); + return mlir::StringAttr::get(context, name); +} + mlir::LLVM::DITypeAttr DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr, mlir::LLVM::DIScopeAttr scope, fir::cg::XDeclareOp declOp) { mlir::MLIRContext *context = module.getContext(); if (Ty.isInteger()) { - return genBasicType(context, mlir::StringAttr::get(context, "integer"), - Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_signed); + unsigned bitWidth = Ty.getIntOrFloatBitWidth(); + return genBasicType(context, getBasicTypeName(context, "integer", bitWidth), + bitWidth, llvm::dwarf::DW_ATE_signed); } else if (mlir::isa(Ty)) { - return genBasicType(context, mlir::StringAttr::get(context, "real"), - Ty.getIntOrFloatBitWidth(), llvm::dwarf::DW_ATE_float); + unsigned bitWidth = Ty.getIntOrFloatBitWidth(); + return genBasicType(context, getBasicTypeName(context, "real", bitWidth), + bitWidth, llvm::dwarf::DW_ATE_float); } else if (auto logTy = mlir::dyn_cast_if_present(Ty)) { - return genBasicType(context, - mlir::StringAttr::get(context, logTy.getMnemonic()), - kindMapping.getLogicalBitsize(logTy.getFKind()), - llvm::dwarf::DW_ATE_boolean); + unsigned bitWidth = kindMapping.getLogicalBitsize(logTy.getFKind()); + return genBasicType( + context, getBasicTypeName(context, logTy.getMnemonic(), bitWidth), + bitWidth, llvm::dwarf::DW_ATE_boolean); } else if (auto cplxTy = mlir::dyn_cast_if_present(Ty)) { auto floatTy = mlir::cast(cplxTy.getElementType()); unsigned bitWidth = floatTy.getWidth(); - return genBasicType(context, mlir::StringAttr::get(context, "complex"), + return genBasicType(context, getBasicTypeName(context, "complex", bitWidth), bitWidth * 2, llvm::dwarf::DW_ATE_complex_float); } else if (auto seqTy = mlir::dyn_cast_if_present(Ty)) { return convertSequenceType(seqTy, fileAttr, scope, declOp); diff --git a/flang/test/Integration/debug-complex-1.f90 b/flang/test/Integration/debug-complex-1.f90 index 1ec4b7fe33990..48ea0295eb250 100644 --- a/flang/test/Integration/debug-complex-1.f90 +++ b/flang/test/Integration/debug-complex-1.f90 @@ -17,8 +17,8 @@ function fn1(a, b) result (c) end program ! CHECK-DAG: ![[C4:.*]] = !DIBasicType(name: "complex", size: 64, encoding: DW_ATE_complex_float) -! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex", size: 128, encoding: DW_ATE_complex_float) -! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex", size: 256, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex*8", size: 128, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex*16", size: 256, encoding: DW_ATE_complex_float) ! CHECK-DAG: !DILocalVariable(name: "c4"{{.*}}type: ![[C4]]) ! CHECK-DAG: !DILocalVariable(name: "c8"{{.*}}type: ![[C8]]) ! CHECK-DAG: !DILocalVariable(name: "r"{{.*}}type: ![[C16]]) diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90 index 0ddac633a5b1e..93659a56c7275 100644 --- a/flang/test/Integration/debug-local-var-2.f90 +++ b/flang/test/Integration/debug-local-var-2.f90 @@ -40,11 +40,11 @@ program mn ! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "MN", {{.*}}) ! BOTH-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) -! 
BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed) -! BOTH-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical", size: 8, encoding: DW_ATE_boolean) +! BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer*8", size: 64, encoding: DW_ATE_signed) +! BOTH-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical*1", size: 8, encoding: DW_ATE_boolean) ! BOTH-DAG: ![[TYL32:.*]] = !DIBasicType(name: "logical", size: 32, encoding: DW_ATE_boolean) ! BOTH-DAG: ![[TYR32:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float) -! BOTH-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real", size: 64, encoding: DW_ATE_float) +! BOTH-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real*8", size: 64, encoding: DW_ATE_float) ! BOTH-DAG: ![[I4]] = !DILocalVariable(name: "i4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI32]]) ! BOTH-DAG: ![[I8]] = !DILocalVariable(name: "i8", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI64]]) diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir index f7be6b2d4a931..7a288fec69be3 100644 --- a/flang/test/Transforms/debug-complex-1.fir +++ b/flang/test/Transforms/debug-complex-1.fir @@ -26,9 +26,9 @@ module { #loc3 = loc("./simple.f90":8:1) #loc4 = loc("./simple.f90":11:1) -// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[CMPX4:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX8]], #[[CMPX4]]> // CHECK-DAG: #[[TY2:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX16]], #[[CMPX4]]> diff --git a/flang/test/Transforms/debug-derived-type-1.fir b/flang/test/Transforms/debug-derived-type-1.fir index cfbd361a91e72..672b6cf2819d2 100644 --- a/flang/test/Transforms/debug-derived-type-1.fir +++ b/flang/test/Transforms/debug-derived-type-1.fir @@ -45,12 +45,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d // CHECK-DAG: #[[INT_TY:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[INT8_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[INT8_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[REAL4_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[CMX8_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[CMX_ARR:.*]] = #llvm.di_composite_type -// CHECK-DAG: #[[LOG_TY:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[REAL8_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[LOG_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL8_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[STR_TY:.*]] = #llvm.di_string_type // CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}name = "m_employee"{{.*}}> // CHECK-DAG: #[[MOD1:.*]] = #llvm.di_module<{{.*}}name = "t1"{{.*}}> diff --git a/flang/test/Transforms/debug-fn-info.fir b/flang/test/Transforms/debug-fn-info.fir index c02835be50af5..d82cef1acc423 100644 --- a/flang/test/Transforms/debug-fn-info.fir +++ b/flang/test/Transforms/debug-fn-info.fir @@ -64,10 +64,10 @@ module { #loc4 = loc("test2.f90":53:22) -// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[LOG4:.*]] = 
#llvm.di_basic_type // CHECK: #[[TY0:.*]] = #llvm.di_subroutine_type diff --git a/flang/test/Transforms/debug-local-var.fir b/flang/test/Transforms/debug-local-var.fir index 06c9b01e75a61..466f79c6ed879 100644 --- a/flang/test/Transforms/debug-local-var.fir +++ b/flang/test/Transforms/debug-local-var.fir @@ -71,10 +71,10 @@ module { #loc15 = loc("test.f90":21:24) #loc16 = loc("test.f90":22:5) -// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[MAIN:.*]] = #llvm.di_subprogram<{{.*}}name = "mn"{{.*}}> diff --git a/flang/test/Transforms/debug-ref-type.fir b/flang/test/Transforms/debug-ref-type.fir index 745aebee778be..2164a40c7c111 100644 --- a/flang/test/Transforms/debug-ref-type.fir +++ b/flang/test/Transforms/debug-ref-type.fir @@ -5,6 +5,6 @@ module { } #loc1 = loc("test.f90":5:1) -// CHECK: #[[INT8_TY:.*]] = #llvm.di_basic_type +// CHECK: #[[INT8_TY:.*]] = #llvm.di_basic_type // CHECK: #[[REF_TY:.*]] = #llvm.di_derived_type // CHECK: #llvm.di_subroutine_type<{{.*}}types = #[[REF_TY]], #[[INT8_TY]]> diff --git a/flang/test/Transforms/debug-tuple-type.fir b/flang/test/Transforms/debug-tuple-type.fir index e3b0bafdf3cd4..b865d492b6696 100644 --- a/flang/test/Transforms/debug-tuple-type.fir +++ b/flang/test/Transforms/debug-tuple-type.fir @@ -5,7 +5,7 @@ module { func.func private @_FortranAioOutputDerivedType(!fir.ref>) } -// CHECK: #[[F64:.*]] = #llvm.di_basic_type +// CHECK: #[[F64:.*]] = #llvm.di_basic_type // CHECK: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}> // CHECK: #[[DTY1:.*]] = #llvm.di_derived_type // CHECK: #[[DTY2:.*]] = #llvm.di_derived_type diff --git a/flang/test/Transforms/debug-vector-type.fir b/flang/test/Transforms/debug-vector-type.fir index d3e1f6ec28d0f..cfb97ea46ba61 100644 --- a/flang/test/Transforms/debug-vector-type.fir +++ b/flang/test/Transforms/debug-vector-type.fir @@ -2,22 +2,22 @@ module { func.func private @foo1(%arg0: !fir.vector<20:bf16>) -// CHECK-DAG: #[[F16:.*]] = #llvm.di_basic_type -// CHECK-DAG: #llvm.di_composite_type> +// CHECK-DAG: #[[F16:.*]] = #llvm.di_basic_type +// CHECK-DAG: #llvm.di_composite_type> func.func private @foo2(%arg0: !fir.vector<30:f32>) // CHECK-DAG: #[[F32:.*]] = #llvm.di_basic_type // CHECK-DAG: #llvm.di_composite_type> func.func private @foo3(%arg0: !fir.vector<10:f64>) -// CHECK-DAG: #[[F64:.*]] = #llvm.di_basic_type -// CHECK-DAG: #llvm.di_composite_type> +// CHECK-DAG: #[[F64:.*]] = #llvm.di_basic_type +// CHECK-DAG: #llvm.di_composite_type> func.func private @foo4(%arg0: !fir.vector<5:i32>) // CHECK-DAG: #[[I32:.*]] = #llvm.di_basic_type // CHECK-DAG: #llvm.di_composite_type> func.func private @foo5(%arg0: !fir.vector<2:i64>) -// CHECK-DAG: #[[I64:.*]] = #llvm.di_basic_type -// CHECK-DAG: #llvm.di_composite_type> +// CHECK-DAG: #[[I64:.*]] = #llvm.di_basic_type +// CHECK-DAG: #llvm.di_composite_type> } From ebcf025e2e84da7557350e46891125f6f34fbbfe Mon Sep 17 00:00:00 2001 From: Marcos Maronas Date: Tue, 30 Sep 2025 16:01:30 +0100 Subject: [PATCH 258/878] [SPIR-V] Implement SPV_KHR_float_controls2 (#146941) Implementation of 
[SPV_KHR_float_controls2](https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_float_controls2.html) extension, and corresponding tests. Some of the tests make use of `!spirv.ExecutionMode` LLVM named metadata. This is because some SPIR-V instructions don't have a direct equivalent in LLVM IR, so the SPIR-V Target uses different LLVM named metadata to convey the necessary information. Below, you will find an example from one of the newly added tests: ``` !spirv.ExecutionMode = !{!19, !20, !21, !22, !23, !24, !25, !26, !27} !19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 131079} !20 = !{ptr @k_float_controls_all, i32 6028, float poison, i32 131079} !21 = !{ptr @k_float_controls_float, i32 31} !22 = !{ptr @k_float_controls_all, i32 31} !23 = !{ptr @k_float_controls_float, i32 4461, i32 32} !24 = !{ptr @k_float_controls_all, i32 4461, i32 16} !25 = !{ptr @k_float_controls_all, i32 4461, i32 32} !26 = !{ptr @k_float_controls_all, i32 4461, i32 64} !27 = !{ptr @k_float_controls_all, i32 4461, i32 128} ``` `!spirv.ExecutionMode` contains a list of metadata nodes, and each of them specifies the required operands for expressing a particular `OpExecutionMode` instruction in SPIR-V. For example, `!19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 131079}` will be lowered to `OpExecutionMode [[k_float_controls_float_ID]] FPFastMathDefault [[float_type_ID]] 131079`. --------- Co-authored-by: Dmitry Sidorov --- llvm/docs/SPIRVUsage.rst | 30 +- llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 202 ++++++++++- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 26 +- llvm/lib/Target/SPIRV/SPIRVBuiltins.h | 2 +- llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 6 +- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 214 +++++++++++- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 2 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp | 21 +- llvm/lib/Target/SPIRV/SPIRVInstrInfo.h | 3 +- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 4 +- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 321 ++++++++++++++++-- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 7 + llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 3 +- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 4 + llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 16 +- llvm/lib/Target/SPIRV/SPIRVUtils.h | 51 ++- .../CodeGen/SPIRV/capability-FloatControl2.ll | 2 +- .../SPV_KHR_float_controls2/decoration.ll | 148 ++++++++ .../SPV_KHR_float_controls2/exec_mode.ll | 81 +++++ .../SPV_KHR_float_controls2/exec_mode2.ll | 73 ++++ .../SPV_KHR_float_controls2/exec_mode3.ll | 103 ++++++ .../SPV_KHR_float_controls2/replacements.ll | 61 ++++ 22 files changed, 1324 insertions(+), 56 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/decoration.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode2.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode3.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/replacements.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index fdefc53b32aba..b6cd4b4feb46b 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -232,7 +232,7 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. 
* - ``SPV_KHR_float_controls2`` - - Adds ability to specify the floating-point environment in shaders. It can be used on whole modules and individual instructions. + - Adds execution modes and decorations to control floating-point computations in both kernels and shaders. It can be used on whole modules and individual instructions. SPIR-V representation in LLVM IR ================================ @@ -589,3 +589,31 @@ Group and Subgroup Operations For workgroup and subgroup operations, LLVM uses function calls to represent SPIR-V's group-based instructions. These builtins facilitate group synchronization, data sharing, and collective operations essential for efficient parallel computation. + +SPIR-V Instructions Mapped to LLVM Metadata +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Some SPIR-V instructions don't have a direct equivalent in the LLVM IR language. To +address this, the SPIR-V Target uses dedicated LLVM named metadata to convey +the necessary information. The SPIR-V specification allows multiple module-scope +instructions, whereas LLVM named metadata must be unique. Therefore, the encoding of +such instructions has the following format: + +.. code-block:: llvm + + !spirv.<OpCodeName> = !{!<InstructionMetadata1>, !<InstructionMetadata2>, ..} + !<InstructionMetadata1> = !{<Operand1>, <Operand2>, ..} + !<InstructionMetadata2> = !{<Operand1>, <Operand2>, ..} + +Below, you will find the mappings between SPIR-V instructions and their corresponding +LLVM IR representations. + ++--------------------+---------------------------------------------------------+ +| SPIR-V instruction | LLVM IR | ++====================+=========================================================+ +| OpExecutionMode | .. code-block:: llvm | +| | | +| | !spirv.ExecutionMode = !{!0} | +| | !0 = !{void @worker, i32 30, i32 262149} | +| | ; Set execution mode with id 30 (VecTypeHint) and | +| | ; literal `262149` operand. | ++--------------------+---------------------------------------------------------+ diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index c2a6e51913a0a..b765fecbc8de3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -81,6 +81,7 @@ class SPIRVAsmPrinter : public AsmPrinter { void outputExecutionMode(const Module &M); void outputAnnotations(const Module &M); void outputModuleSections(); + void outputFPFastMathDefaultInfo(); bool isHidden() { return MF->getFunction() .getFnAttribute(SPIRV_BACKEND_SERVICE_FUN_NAME) @@ -498,11 +499,27 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode"); if (Node) { for (unsigned i = 0; i < Node->getNumOperands(); i++) { + // If SPV_KHR_float_controls2 is enabled and we find any of + // FPFastMathDefault, ContractionOff or SignedZeroInfNanPreserve execution + // modes, skip it; it is handled separately by outputFPFastMathDefaultInfo.
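+      // Each operand of the "spirv.ExecutionMode" node is itself an MDNode of
+      // the form {ptr @entry_point, i32 <execution mode>, extra operands...},
+      // e.g. !{ptr @k_float_controls_float, i32 6028, float poison, i32 131079},
+      // so operand 1 inspected below carries the execution mode id.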
+ if (ST->canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) { + const auto EM = + cast( + cast((Node->getOperand(i))->getOperand(1)) + ->getValue()) + ->getZExtValue(); + if (EM == SPIRV::ExecutionMode::FPFastMathDefault || + EM == SPIRV::ExecutionMode::ContractionOff || + EM == SPIRV::ExecutionMode::SignedZeroInfNanPreserve) + continue; + } + MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); addOpsFromMDNode(cast(Node->getOperand(i)), Inst, MAI); outputMCInst(Inst); } + outputFPFastMathDefaultInfo(); } for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) { const Function &F = *FI; @@ -552,12 +569,84 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { } if (ST->isKernel() && !M.getNamedMetadata("spirv.ExecutionMode") && !M.getNamedMetadata("opencl.enable.FP_CONTRACT")) { - MCInst Inst; - Inst.setOpcode(SPIRV::OpExecutionMode); - Inst.addOperand(MCOperand::createReg(FReg)); - unsigned EM = static_cast(SPIRV::ExecutionMode::ContractionOff); - Inst.addOperand(MCOperand::createImm(EM)); - outputMCInst(Inst); + if (ST->canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) { + // When SPV_KHR_float_controls2 is enabled, ContractionOff is + // deprecated. We need to use FPFastMathDefault with the appropriate + // flags instead. Since FPFastMathDefault takes a target type, we need + // to emit it for each floating-point type that exists in the module + // to match the effect of ContractionOff. As of now, there are 3 FP + // types: fp16, fp32 and fp64. + + // We only end up here because there is no "spirv.ExecutionMode" + // metadata, so that means no FPFastMathDefault. Therefore, we only + // need to make sure AllowContract is set to 0, as the rest of flags. + // We still need to emit the OpExecutionMode instruction, otherwise + // it's up to the client API to define the flags. Therefore, we need + // to find the constant with 0 value. + + // Collect the SPIRVTypes for fp16, fp32, and fp64 and the constant of + // type int32 with 0 value to represent the FP Fast Math Mode. + std::vector SPIRVFloatTypes; + const MachineInstr *ConstZero = nullptr; + for (const MachineInstr *MI : + MAI->getMSInstrs(SPIRV::MB_TypeConstVars)) { + // Skip if the instruction is not OpTypeFloat or OpConstant. + unsigned OpCode = MI->getOpcode(); + if (OpCode != SPIRV::OpTypeFloat && OpCode != SPIRV::OpConstantNull) + continue; + + // Collect the SPIRV type if it's a float. + if (OpCode == SPIRV::OpTypeFloat) { + // Skip if the target type is not fp16, fp32, fp64. + const unsigned OpTypeFloatSize = MI->getOperand(1).getImm(); + if (OpTypeFloatSize != 16 && OpTypeFloatSize != 32 && + OpTypeFloatSize != 64) { + continue; + } + SPIRVFloatTypes.push_back(MI); + } else { + // Check if the constant is int32, if not skip it. + const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + MachineInstr *TypeMI = MRI.getVRegDef(MI->getOperand(1).getReg()); + if (!TypeMI || TypeMI->getOperand(1).getImm() != 32) + continue; + + ConstZero = MI; + } + } + + // When SPV_KHR_float_controls2 is enabled, ContractionOff is + // deprecated. We need to use FPFastMathDefault with the appropriate + // flags instead. Since FPFastMathDefault takes a target type, we need + // to emit it for each floating-point type that exists in the module + // to match the effect of ContractionOff. As of now, there are 3 FP + // types: fp16, fp32 and fp64. 
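+        // Each collected float type thus gets one
+        //   OpExecutionModeId <entry point> FPFastMathDefault <type> <const>
+        // where <const> is the int32 zero constant, i.e. an empty FP fast
+        // math bitmask in which AllowContract, like every other relaxation,
+        // stays disabled.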
+ for (const MachineInstr *MI : SPIRVFloatTypes) { + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionModeId); + Inst.addOperand(MCOperand::createReg(FReg)); + unsigned EM = + static_cast(SPIRV::ExecutionMode::FPFastMathDefault); + Inst.addOperand(MCOperand::createImm(EM)); + const MachineFunction *MF = MI->getMF(); + MCRegister TypeReg = + MAI->getRegisterAlias(MF, MI->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(TypeReg)); + assert(ConstZero && "There should be a constant zero."); + MCRegister ConstReg = MAI->getRegisterAlias( + ConstZero->getMF(), ConstZero->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(ConstReg)); + outputMCInst(Inst); + } + } else { + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionMode); + Inst.addOperand(MCOperand::createReg(FReg)); + unsigned EM = + static_cast(SPIRV::ExecutionMode::ContractionOff); + Inst.addOperand(MCOperand::createImm(EM)); + outputMCInst(Inst); + } } } } @@ -606,6 +695,101 @@ void SPIRVAsmPrinter::outputAnnotations(const Module &M) { } } +void SPIRVAsmPrinter::outputFPFastMathDefaultInfo() { + // Collect the SPIRVTypes that are OpTypeFloat and the constants of type + // int32, that might be used as FP Fast Math Mode. + std::vector SPIRVFloatTypes; + // Hashtable to associate immediate values with the constant holding them. + std::unordered_map ConstMap; + for (const MachineInstr *MI : MAI->getMSInstrs(SPIRV::MB_TypeConstVars)) { + // Skip if the instruction is not OpTypeFloat or OpConstant. + unsigned OpCode = MI->getOpcode(); + if (OpCode != SPIRV::OpTypeFloat && OpCode != SPIRV::OpConstantI && + OpCode != SPIRV::OpConstantNull) + continue; + + // Collect the SPIRV type if it's a float. + if (OpCode == SPIRV::OpTypeFloat) { + SPIRVFloatTypes.push_back(MI); + } else { + // Check if the constant is int32, if not skip it. 
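+        // (Both OpConstantI and OpConstantNull are collected here; an
+        // OpConstantNull of int32 type stands for the immediate value 0.)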
+ const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo(); + MachineInstr *TypeMI = MRI.getVRegDef(MI->getOperand(1).getReg()); + if (!TypeMI || TypeMI->getOpcode() != SPIRV::OpTypeInt || + TypeMI->getOperand(1).getImm() != 32) + continue; + + if (OpCode == SPIRV::OpConstantI) + ConstMap[MI->getOperand(2).getImm()] = MI; + else + ConstMap[0] = MI; + } + } + + for (const auto &[Func, FPFastMathDefaultInfoVec] : + MAI->FPFastMathDefaultInfoMap) { + if (FPFastMathDefaultInfoVec.empty()) + continue; + + for (const MachineInstr *MI : SPIRVFloatTypes) { + unsigned OpTypeFloatSize = MI->getOperand(1).getImm(); + unsigned Index = SPIRV::FPFastMathDefaultInfoVector:: + computeFPFastMathDefaultInfoVecIndex(OpTypeFloatSize); + assert(Index < FPFastMathDefaultInfoVec.size() && + "Index out of bounds for FPFastMathDefaultInfoVec"); + const auto &FPFastMathDefaultInfo = FPFastMathDefaultInfoVec[Index]; + assert(FPFastMathDefaultInfo.Ty && + "Expected target type for FPFastMathDefaultInfo"); + assert(FPFastMathDefaultInfo.Ty->getScalarSizeInBits() == + OpTypeFloatSize && + "Mismatched float type size"); + MCInst Inst; + Inst.setOpcode(SPIRV::OpExecutionModeId); + MCRegister FuncReg = MAI->getFuncReg(Func); + assert(FuncReg.isValid()); + Inst.addOperand(MCOperand::createReg(FuncReg)); + Inst.addOperand( + MCOperand::createImm(SPIRV::ExecutionMode::FPFastMathDefault)); + MCRegister TypeReg = + MAI->getRegisterAlias(MI->getMF(), MI->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(TypeReg)); + unsigned Flags = FPFastMathDefaultInfo.FastMathFlags; + if (FPFastMathDefaultInfo.ContractionOff && + (Flags & SPIRV::FPFastMathMode::AllowContract)) + report_fatal_error( + "Conflicting FPFastMathFlags: ContractionOff and AllowContract"); + + if (FPFastMathDefaultInfo.SignedZeroInfNanPreserve && + !(Flags & + (SPIRV::FPFastMathMode::NotNaN | SPIRV::FPFastMathMode::NotInf | + SPIRV::FPFastMathMode::NSZ))) { + if (FPFastMathDefaultInfo.FPFastMathDefault) + report_fatal_error("Conflicting FPFastMathFlags: " + "SignedZeroInfNanPreserve but at least one of " + "NotNaN/NotInf/NSZ is enabled."); + } + + // Don't emit if none of the execution modes was used. + if (Flags == SPIRV::FPFastMathMode::None && + !FPFastMathDefaultInfo.ContractionOff && + !FPFastMathDefaultInfo.SignedZeroInfNanPreserve && + !FPFastMathDefaultInfo.FPFastMathDefault) + continue; + + // Retrieve the constant instruction for the immediate value. + auto It = ConstMap.find(Flags); + if (It == ConstMap.end()) + report_fatal_error("Expected constant instruction for FP Fast Math " + "Mode operand of FPFastMathDefault execution mode."); + const MachineInstr *ConstMI = It->second; + MCRegister ConstReg = MAI->getRegisterAlias( + ConstMI->getMF(), ConstMI->getOperand(0).getReg()); + Inst.addOperand(MCOperand::createReg(ConstReg)); + outputMCInst(Inst); + } + } +} + void SPIRVAsmPrinter::outputModuleSections() { const Module *M = MMI->getModule(); // Get the global subtarget to output module-level info. @@ -614,7 +798,8 @@ void SPIRVAsmPrinter::outputModuleSections() { MAI = &SPIRVModuleAnalysis::MAI; assert(ST && TII && MAI && M && "Module analysis is required"); // Output instructions according to the Logical Layout of a Module: - // 1,2. All OpCapability instructions, then optional OpExtension instructions. + // 1,2. All OpCapability instructions, then optional OpExtension + // instructions. outputGlobalRequirements(); // 3. Optional OpExtInstImport instructions. 
outputOpExtInstImports(*M); // 4. The single required OpMemoryModel instruction. outputOpMemoryModel(); // 5. All entry point declarations, using OpEntryPoint. outputEntryPoints(); - // 6. Execution-mode declarations, using OpExecutionMode or OpExecutionModeId. + // 6. Execution-mode declarations, using OpExecutionMode or + // OpExecutionModeId. outputExecutionMode(*M); // 7a. Debug: all OpString, OpSourceExtension, OpSource, and // OpSourceContinued, without forward references. diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index f704d3afdea78..0e0c4547c751e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1162,11 +1162,24 @@ static unsigned getNumSizeComponents(SPIRVType *imgType) { static bool generateExtInst(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, - SPIRVGlobalRegistry *GR) { + SPIRVGlobalRegistry *GR, const CallBase &CB) { // Lookup the extended instruction number in the TableGen records. const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; uint32_t Number = SPIRV::lookupExtendedBuiltin(Builtin->Name, Builtin->Set)->Number; + // fmin_common and fmax_common are now deprecated, and we should use fmin and + // fmax with NotInf and NotNaN flags instead. Keep the original number so the + // NoNans and NoInfs flags can be added later.
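+  // (fmin_common/fmax_common are only defined for inputs that are neither
+  // NaN nor infinite, so plain fmin/fmax tagged with the NoNans/NoInfs MI
+  // flags, which later become the NotNaN/NotInf bits of FPFastMathMode,
+  // preserve their semantics.)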
switch (Call->Builtin->Group) { case SPIRV::Extended: - return generateExtInst(Call.get(), MIRBuilder, GR); + return generateExtInst(Call.get(), MIRBuilder, GR, CB); case SPIRV::Relational: return generateRelationalInst(Call.get(), MIRBuilder, GR); case SPIRV::Group: diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.h b/llvm/lib/Target/SPIRV/SPIRVBuiltins.h index 1a8641a8328dd..f6a5234cd3c73 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.h +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.h @@ -39,7 +39,7 @@ std::optional lowerBuiltin(const StringRef DemangledCall, MachineIRBuilder &MIRBuilder, const Register OrigRet, const Type *OrigRetTy, const SmallVectorImpl &Args, - SPIRVGlobalRegistry *GR); + SPIRVGlobalRegistry *GR, const CallBase &CB); /// Helper function for finding a builtin function attributes /// by a demangled function name. Defined in SPIRVBuiltins.cpp. diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index a412887e51adb..1a7c02c676465 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -641,9 +641,9 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, GR->getPointerSize())); } } - if (auto Res = - SPIRV::lowerBuiltin(DemangledName, ST->getPreferredInstructionSet(), - MIRBuilder, ResVReg, OrigRetTy, ArgVRegs, GR)) + if (auto Res = SPIRV::lowerBuiltin( + DemangledName, ST->getPreferredInstructionSet(), MIRBuilder, + ResVReg, OrigRetTy, ArgVRegs, GR, *Info.CB)) return *Res; } diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 704edd3139260..9f2e07508a36a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/TypedPointerType.h" #include "llvm/Transforms/Utils/Local.h" +#include #include #include @@ -152,6 +153,7 @@ class SPIRVEmitIntrinsics void insertPtrCastOrAssignTypeInstr(Instruction *I, IRBuilder<> &B); bool shouldTryToAddMemAliasingDecoration(Instruction *Inst); void insertSpirvDecorations(Instruction *I, IRBuilder<> &B); + void insertConstantsForFPFastMathDefault(Module &M); void processGlobalValue(GlobalVariable &GV, IRBuilder<> &B); void processParamTypes(Function *F, IRBuilder<> &B); void processParamTypesByFunHeader(Function *F, IRBuilder<> &B); @@ -2249,6 +2251,198 @@ void SPIRVEmitIntrinsics::insertSpirvDecorations(Instruction *I, } } +static SPIRV::FPFastMathDefaultInfoVector &getOrCreateFPFastMathDefaultInfoVec( + const Module &M, + DenseMap + &FPFastMathDefaultInfoMap, + Function *F) { + auto it = FPFastMathDefaultInfoMap.find(F); + if (it != FPFastMathDefaultInfoMap.end()) + return it->second; + + // If the map does not contain the entry, create a new one. Initialize it to + // contain all 3 elements sorted by bit width of target type: {half, float, + // double}. 
+ SPIRV::FPFastMathDefaultInfoVector FPFastMathDefaultInfoVec; + FPFastMathDefaultInfoVec.emplace_back(Type::getHalfTy(M.getContext()), + SPIRV::FPFastMathMode::None); + FPFastMathDefaultInfoVec.emplace_back(Type::getFloatTy(M.getContext()), + SPIRV::FPFastMathMode::None); + FPFastMathDefaultInfoVec.emplace_back(Type::getDoubleTy(M.getContext()), + SPIRV::FPFastMathMode::None); + return FPFastMathDefaultInfoMap[F] = std::move(FPFastMathDefaultInfoVec); +} + +static SPIRV::FPFastMathDefaultInfo &getFPFastMathDefaultInfo( + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec, + const Type *Ty) { + size_t BitWidth = Ty->getScalarSizeInBits(); + int Index = + SPIRV::FPFastMathDefaultInfoVector::computeFPFastMathDefaultInfoVecIndex( + BitWidth); + assert(Index >= 0 && Index < 3 && + "Expected FPFastMathDefaultInfo for half, float, or double"); + assert(FPFastMathDefaultInfoVec.size() == 3 && + "Expected FPFastMathDefaultInfoVec to have exactly 3 elements"); + return FPFastMathDefaultInfoVec[Index]; +} + +void SPIRVEmitIntrinsics::insertConstantsForFPFastMathDefault(Module &M) { + const SPIRVSubtarget *ST = TM->getSubtargetImpl(); + if (!ST->canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) + return; + + // Store the FPFastMathDefaultInfo in the FPFastMathDefaultInfoMap. + // We need the entry point (function) as the key, and the target + // type and flags as the value. + // We also need to check ContractionOff and SignedZeroInfNanPreserve + // execution modes, as they are now deprecated and must be replaced + // with FPFastMathDefaultInfo. + auto Node = M.getNamedMetadata("spirv.ExecutionMode"); + if (!Node) { + if (!M.getNamedMetadata("opencl.enable.FP_CONTRACT")) { + // This requires emitting ContractionOff. However, because + // ContractionOff is now deprecated, we need to replace it with + // FPFastMathDefaultInfo with FP Fast Math Mode bitmask set to all 0. + // We need to create the constant for that. + + // Create constant instruction with the bitmask flags. + Constant *InitValue = + ConstantInt::get(Type::getInt32Ty(M.getContext()), 0); + // TODO: Reuse constant if there is one already with the required + // value. + [[maybe_unused]] GlobalVariable *GV = + new GlobalVariable(M, // Module + Type::getInt32Ty(M.getContext()), // Type + true, // isConstant + GlobalValue::InternalLinkage, // Linkage + InitValue // Initializer + ); + } + return; + } + + // The table maps function pointers to their default FP fast math info. It + // can be assumed that the SmallVector is sorted by the bit width of the + // type. The first element is the smallest bit width, and the last element + // is the largest bit width, therefore, we will have {half, float, double} + // in the order of their bit widths. 
+ DenseMap + FPFastMathDefaultInfoMap; + + for (unsigned i = 0; i < Node->getNumOperands(); i++) { + MDNode *MDN = cast(Node->getOperand(i)); + assert(MDN->getNumOperands() >= 2 && "Expected at least 2 operands"); + Function *F = cast( + cast(MDN->getOperand(0))->getValue()); + const auto EM = + cast( + cast(MDN->getOperand(1))->getValue()) + ->getZExtValue(); + if (EM == SPIRV::ExecutionMode::FPFastMathDefault) { + assert(MDN->getNumOperands() == 4 && + "Expected 4 operands for FPFastMathDefault"); + const Type *T = cast(MDN->getOperand(2))->getType(); + unsigned Flags = + cast( + cast(MDN->getOperand(3))->getValue()) + ->getZExtValue(); + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, FPFastMathDefaultInfoMap, F); + SPIRV::FPFastMathDefaultInfo &Info = + getFPFastMathDefaultInfo(FPFastMathDefaultInfoVec, T); + Info.FastMathFlags = Flags; + Info.FPFastMathDefault = true; + } else if (EM == SPIRV::ExecutionMode::ContractionOff) { + assert(MDN->getNumOperands() == 2 && + "Expected no operands for ContractionOff"); + + // We need to save this info for every possible FP type, i.e. {half, + // float, double, fp128}. + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, FPFastMathDefaultInfoMap, F); + for (SPIRV::FPFastMathDefaultInfo &Info : FPFastMathDefaultInfoVec) { + Info.ContractionOff = true; + } + } else if (EM == SPIRV::ExecutionMode::SignedZeroInfNanPreserve) { + assert(MDN->getNumOperands() == 3 && + "Expected 1 operand for SignedZeroInfNanPreserve"); + unsigned TargetWidth = + cast( + cast(MDN->getOperand(2))->getValue()) + ->getZExtValue(); + // We need to save this info only for the FP type with TargetWidth. + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec = + getOrCreateFPFastMathDefaultInfoVec(M, FPFastMathDefaultInfoMap, F); + int Index = SPIRV::FPFastMathDefaultInfoVector:: + computeFPFastMathDefaultInfoVecIndex(TargetWidth); + assert(Index >= 0 && Index < 3 && + "Expected FPFastMathDefaultInfo for half, float, or double"); + assert(FPFastMathDefaultInfoVec.size() == 3 && + "Expected FPFastMathDefaultInfoVec to have exactly 3 elements"); + FPFastMathDefaultInfoVec[Index].SignedZeroInfNanPreserve = true; + } + } + + std::unordered_map GlobalVars; + for (auto &[Func, FPFastMathDefaultInfoVec] : FPFastMathDefaultInfoMap) { + if (FPFastMathDefaultInfoVec.empty()) + continue; + + for (const SPIRV::FPFastMathDefaultInfo &Info : FPFastMathDefaultInfoVec) { + assert(Info.Ty && "Expected target type for FPFastMathDefaultInfo"); + // Skip if none of the execution modes was used. + unsigned Flags = Info.FastMathFlags; + if (Flags == SPIRV::FPFastMathMode::None && !Info.ContractionOff && + !Info.SignedZeroInfNanPreserve && !Info.FPFastMathDefault) + continue; + + // Check if flags are compatible. 
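+      // (ContractionOff contradicts AllowContract, and an explicit
+      // FPFastMathDefault that sets none of NotNaN/NotInf/NSZ contradicts
+      // SignedZeroInfNanPreserve; AllowTransform additionally requires both
+      // AllowReassoc and AllowContract, as checked below.)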
+      if (Info.ContractionOff && (Flags & SPIRV::FPFastMathMode::AllowContract)) + report_fatal_error("Conflicting FPFastMathFlags: ContractionOff " + "and AllowContract"); + + if (Info.SignedZeroInfNanPreserve && + !(Flags & + (SPIRV::FPFastMathMode::NotNaN | SPIRV::FPFastMathMode::NotInf | + SPIRV::FPFastMathMode::NSZ))) { + if (Info.FPFastMathDefault) + report_fatal_error("Conflicting FPFastMathFlags: " + "SignedZeroInfNanPreserve but at least one of " + "NotNaN/NotInf/NSZ is enabled."); + } + + if ((Flags & SPIRV::FPFastMathMode::AllowTransform) && + !((Flags & SPIRV::FPFastMathMode::AllowReassoc) && + (Flags & SPIRV::FPFastMathMode::AllowContract))) { + report_fatal_error("Conflicting FPFastMathFlags: " + "AllowTransform requires AllowReassoc and " + "AllowContract to be set."); + } + + auto it = GlobalVars.find(Flags); + GlobalVariable *GV = nullptr; + if (it != GlobalVars.end()) { + // Reuse existing global variable. + GV = it->second; + } else { + // Create constant instruction with the bitmask flags. + Constant *InitValue = + ConstantInt::get(Type::getInt32Ty(M.getContext()), Flags); + // TODO: Reuse constant if there is one already with the required + // value. + GV = new GlobalVariable(M, // Module + Type::getInt32Ty(M.getContext()), // Type + true, // isConstant + GlobalValue::InternalLinkage, // Linkage + InitValue // Initializer + ); + GlobalVars[Flags] = GV; + } + } + } +} + void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I, IRBuilder<> &B) { auto *II = dyn_cast<IntrinsicInst>(I); @@ -2569,9 +2763,9 @@ GetElementPtrInst * SPIRVEmitIntrinsics::simplifyZeroLengthArrayGepInst(GetElementPtrInst *GEP) { // getelementptr [0 x T], P, 0 (zero), I -> getelementptr T, P, I. // If type is 0-length array and first index is 0 (zero), drop both the - // 0-length array type and the first index. This is a common pattern in the - // IR, e.g. when using a zero-length array as a placeholder for a flexible - // array such as unbound arrays. + // 0-length array type and the first index. This is a common pattern in + // the IR, e.g. when using a zero-length array as a placeholder for a + // flexible array such as unbound arrays. assert(GEP && "GEP is null"); Type *SrcTy = GEP->getSourceElementType(); SmallVector<Value *> Indices(GEP->indices()); @@ -2633,8 +2827,9 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { processParamTypesByFunHeader(CurrF, B); - // StoreInst's operand type can be changed during the next transformations, - // so we need to store it in the set. Also store already transformed types. + // StoreInst's operand type can be changed during the next + // transformations, so we need to store it in the set. Also store already + // transformed types. for (auto &I : instructions(Func)) { StoreInst *SI = dyn_cast<StoreInst>(&I); if (!SI) @@ -2681,8 +2876,8 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { for (auto &I : llvm::reverse(instructions(Func))) deduceOperandElementType(&I, &IncompleteRets); - // Pass forward for PHIs only, their operands are not preceed the instruction - // in meaning of `instructions(Func)`. + // Pass forward for PHIs only: their operands do not necessarily precede + // the instruction in the order of `instructions(Func)`.
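+  // (A PHI's incoming values may be defined after the PHI itself, so
+  // pointer-typed PHIs get one extra forward pass once the backward pass
+  // above has run.)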
for (BasicBlock &BB : Func) for (PHINode &Phi : BB.phis()) if (isPointerTy(Phi.getType())) @@ -2692,8 +2887,8 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { TrackConstants = true; if (!I->getType()->isVoidTy() || isa(I)) setInsertPointAfterDef(B, I); - // Visitors return either the original/newly created instruction for further - // processing, nullptr otherwise. + // Visitors return either the original/newly created instruction for + // further processing, nullptr otherwise. I = visit(*I); if (!I) continue; @@ -2816,6 +3011,7 @@ bool SPIRVEmitIntrinsics::runOnModule(Module &M) { bool Changed = false; parseFunDeclarations(M); + insertConstantsForFPFastMathDefault(M); TodoType.clear(); for (auto &F : M) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 115766ce886c7..6fd1c7ed78c06 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -806,7 +806,7 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( // arguments. MDNode *GVarMD = nullptr; if (GVar && (GVarMD = GVar->getMetadata("spirv.Decorations")) != nullptr) - buildOpSpirvDecorations(Reg, MIRBuilder, GVarMD); + buildOpSpirvDecorations(Reg, MIRBuilder, GVarMD, ST); return Reg; } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index 45e88fc94144e..ba95ad822df75 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -132,7 +132,8 @@ bool SPIRVInstrInfo::isHeaderInstr(const MachineInstr &MI) const { } } -bool SPIRVInstrInfo::canUseFastMathFlags(const MachineInstr &MI) const { +bool SPIRVInstrInfo::canUseFastMathFlags(const MachineInstr &MI, + bool KHRFloatControls2) const { switch (MI.getOpcode()) { case SPIRV::OpFAddS: case SPIRV::OpFSubS: @@ -146,6 +147,24 @@ bool SPIRVInstrInfo::canUseFastMathFlags(const MachineInstr &MI) const { case SPIRV::OpFRemV: case SPIRV::OpFMod: return true; + case SPIRV::OpFNegateV: + case SPIRV::OpFNegate: + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: + case SPIRV::OpExtInst: + return KHRFloatControls2 ? 
true : false; default: return false; } } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h index 72d2243fba62a..4de9d6a936abd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h @@ -36,7 +36,8 @@ class SPIRVInstrInfo : public SPIRVGenInstrInfo { bool isTypeDeclInstr(const MachineInstr &MI) const; bool isDecorationInstr(const MachineInstr &MI) const; bool isAliasingInstr(const MachineInstr &MI) const; - bool canUseFastMathFlags(const MachineInstr &MI) const; + bool canUseFastMathFlags(const MachineInstr &MI, + bool KHRFloatControls2) const; bool canUseNSW(const MachineInstr &MI) const; bool canUseNUW(const MachineInstr &MI) const; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 1aadd9df189a8..273edf374bef0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -1073,7 +1073,8 @@ bool SPIRVInstructionSelector::selectExtInst(Register ResVReg, .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) .addImm(static_cast<uint32_t>(Set)) - .addImm(Opcode); + .addImm(Opcode) + .setMIFlags(I.getFlags()); const unsigned NumOps = I.getNumOperands(); unsigned Index = 1; if (Index < NumOps && @@ -2629,6 +2630,7 @@ bool SPIRVInstructionSelector::selectCmp(Register ResVReg, .addUse(GR.getSPIRVTypeID(ResType)) .addUse(Cmp0) .addUse(Cmp1) + .setMIFlags(I.getFlags()) .constrainAllUses(TII, TRI, RBI); } diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index bc159d5c9a113..dc717a6ca5870 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -248,6 +248,22 @@ static InstrSignature instrToSignature(const MachineInstr &MI, Register DefReg; InstrSignature Signature{MI.getOpcode()}; for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + // The only decorations that can be applied more than once to a given <id> + // or structure member are UserSemantic(5635), CacheControlLoadINTEL (6442), + // and CacheControlStoreINTEL (6443). For all other decorations, we + // will only add to the signature the Opcode, the id to which it applies, + // and the decoration id, disregarding any decoration flags. This will + // ensure that any subsequent decoration with the same id will be deemed + // a duplicate. Then, at the call site, we will be able to handle duplicates + // in the best way.
+ unsigned Opcode = MI.getOpcode(); + if ((Opcode == SPIRV::OpDecorate) && i >= 2) { + unsigned DecorationID = MI.getOperand(1).getImm(); + if (DecorationID != SPIRV::Decoration::UserSemantic && + DecorationID != SPIRV::Decoration::CacheControlLoadINTEL && + DecorationID != SPIRV::Decoration::CacheControlStoreINTEL) + continue; + } const MachineOperand &MO = MI.getOperand(i); size_t h; if (MO.isReg()) { @@ -559,8 +575,54 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI, MAI.setSkipEmission(&MI); InstrSignature MISign = instrToSignature(MI, MAI, true); auto FoundMI = IS.insert(std::move(MISign)); - if (!FoundMI.second) + if (!FoundMI.second) { + if (MI.getOpcode() == SPIRV::OpDecorate) { + assert(MI.getNumOperands() >= 2 && + "Decoration instructions must have at least 2 operands"); + assert(MSType == SPIRV::MB_Annotations && + "Only OpDecorate instructions can be duplicates"); + // For FPFastMathMode decoration, we need to merge the flags of the + // duplicate decoration with the original one, so we need to find the + // original instruction that has the same signature. For the rest of + // instructions, we will simply skip the duplicate. + if (MI.getOperand(1).getImm() != SPIRV::Decoration::FPFastMathMode) + return; // Skip duplicates of other decorations. + + const SPIRV::InstrList &Decorations = MAI.MS[MSType]; + for (const MachineInstr *OrigMI : Decorations) { + if (instrToSignature(*OrigMI, MAI, true) == MISign) { + assert(OrigMI->getNumOperands() == MI.getNumOperands() && + "Original instruction must have the same number of operands"); + assert( + OrigMI->getNumOperands() == 3 && + "FPFastMathMode decoration must have 3 operands for OpDecorate"); + unsigned OrigFlags = OrigMI->getOperand(2).getImm(); + unsigned NewFlags = MI.getOperand(2).getImm(); + if (OrigFlags == NewFlags) + return; // No need to merge, the flags are the same. + + // Emit warning about possible conflict between flags. + unsigned FinalFlags = OrigFlags | NewFlags; + llvm::errs() + << "Warning: Conflicting FPFastMathMode decoration flags " + "in instruction: " + << *OrigMI << "Original flags: " << OrigFlags + << ", new flags: " << NewFlags + << ". They will be merged on a best effort basis, but not " + "validated. Final flags: " + << FinalFlags << "\n"; + MachineInstr *OrigMINonConst = const_cast(OrigMI); + MachineOperand &OrigFlagsOp = OrigMINonConst->getOperand(2); + OrigFlagsOp = + MachineOperand::CreateImm(static_cast(FinalFlags)); + return; // Merge done, so we found a duplicate; don't add it to MAI.MS + } + } + assert(false && "No original instruction found for the duplicate " + "OpDecorate, but we found one in IS."); + } return; // insert failed, so we found a duplicate; don't add it to MAI.MS + } // No duplicates, so add it. if (Append) MAI.MS[MSType].push_back(&MI); @@ -934,6 +996,11 @@ static void addOpDecorateReqs(const MachineInstr &MI, unsigned DecIndex, } else if (Dec == SPIRV::Decoration::FPMaxErrorDecorationINTEL) { Reqs.addRequirements(SPIRV::Capability::FPMaxErrorINTEL); Reqs.addExtension(SPIRV::Extension::SPV_INTEL_fp_max_error); + } else if (Dec == SPIRV::Decoration::FPFastMathMode) { + if (ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2)) { + Reqs.addRequirements(SPIRV::Capability::FloatControls2); + Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls2); + } } } @@ -1994,10 +2061,13 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, // Collect requirements for OpExecutionMode instructions. 
auto Node = M.getNamedMetadata("spirv.ExecutionMode"); if (Node) { - bool RequireFloatControls = false, RequireFloatControls2 = false, + bool RequireFloatControls = false, RequireIntelFloatControls2 = false, + RequireKHRFloatControls2 = false, VerLower14 = !ST.isAtLeastSPIRVVer(VersionTuple(1, 4)); - bool HasFloatControls2 = + bool HasIntelFloatControls2 = ST.canUseExtension(SPIRV::Extension::SPV_INTEL_float_controls2); + bool HasKHRFloatControls2 = + ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2); for (unsigned i = 0; i < Node->getNumOperands(); i++) { MDNode *MDN = cast(Node->getOperand(i)); const MDOperand &MDOp = MDN->getOperand(1); @@ -2010,7 +2080,6 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, switch (EM) { case SPIRV::ExecutionMode::DenormPreserve: case SPIRV::ExecutionMode::DenormFlushToZero: - case SPIRV::ExecutionMode::SignedZeroInfNanPreserve: case SPIRV::ExecutionMode::RoundingModeRTE: case SPIRV::ExecutionMode::RoundingModeRTZ: RequireFloatControls = VerLower14; @@ -2021,8 +2090,28 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, case SPIRV::ExecutionMode::RoundingModeRTNINTEL: case SPIRV::ExecutionMode::FloatingPointModeALTINTEL: case SPIRV::ExecutionMode::FloatingPointModeIEEEINTEL: - if (HasFloatControls2) { - RequireFloatControls2 = true; + if (HasIntelFloatControls2) { + RequireIntelFloatControls2 = true; + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, EM, ST); + } + break; + case SPIRV::ExecutionMode::FPFastMathDefault: { + if (HasKHRFloatControls2) { + RequireKHRFloatControls2 = true; + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, EM, ST); + } + break; + } + case SPIRV::ExecutionMode::ContractionOff: + case SPIRV::ExecutionMode::SignedZeroInfNanPreserve: + if (HasKHRFloatControls2) { + RequireKHRFloatControls2 = true; + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::FPFastMathDefault, ST); + } else { MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, EM, ST); } @@ -2037,8 +2126,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, if (RequireFloatControls && ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls)) MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls); - if (RequireFloatControls2) + if (RequireIntelFloatControls2) MAI.Reqs.addExtension(SPIRV::Extension::SPV_INTEL_float_controls2); + if (RequireKHRFloatControls2) + MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_float_controls2); } for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) { const Function &F = *FI; @@ -2078,8 +2169,11 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, } } -static unsigned getFastMathFlags(const MachineInstr &I) { +static unsigned getFastMathFlags(const MachineInstr &I, + const SPIRVSubtarget &ST) { unsigned Flags = SPIRV::FPFastMathMode::None; + bool CanUseKHRFloatControls2 = + ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2); if (I.getFlag(MachineInstr::MIFlag::FmNoNans)) Flags |= SPIRV::FPFastMathMode::NotNaN; if (I.getFlag(MachineInstr::MIFlag::FmNoInfs)) @@ -2088,12 +2182,45 @@ static unsigned getFastMathFlags(const MachineInstr &I) { Flags |= SPIRV::FPFastMathMode::NSZ; if (I.getFlag(MachineInstr::MIFlag::FmArcp)) Flags |= SPIRV::FPFastMathMode::AllowRecip; - if (I.getFlag(MachineInstr::MIFlag::FmReassoc)) - Flags |= SPIRV::FPFastMathMode::Fast; + if 
(I.getFlag(MachineInstr::MIFlag::FmContract) && CanUseKHRFloatControls2) + Flags |= SPIRV::FPFastMathMode::AllowContract; + if (I.getFlag(MachineInstr::MIFlag::FmReassoc)) { + if (CanUseKHRFloatControls2) + // LLVM reassoc maps to SPIRV transform, see + // https://github.com/KhronosGroup/SPIRV-Registry/issues/326 for details. + // Because we are enabling AllowTransform, we must enable AllowReassoc and + // AllowContract too, as required by the SPIR-V spec. Also, we used to map + // MIFlag::FmReassoc to FPFastMathMode::Fast, which should now instead be + // replaced by turning on all the other bits. Therefore, we're + // enabling every bit here except None and Fast. + Flags |= SPIRV::FPFastMathMode::NotNaN | SPIRV::FPFastMathMode::NotInf | + SPIRV::FPFastMathMode::NSZ | SPIRV::FPFastMathMode::AllowRecip | + SPIRV::FPFastMathMode::AllowTransform | + SPIRV::FPFastMathMode::AllowReassoc | + SPIRV::FPFastMathMode::AllowContract; + else + Flags |= SPIRV::FPFastMathMode::Fast; + } + + if (CanUseKHRFloatControls2) { + // Error out if SPIRV::FPFastMathMode::Fast is enabled. + assert(!(Flags & SPIRV::FPFastMathMode::Fast) && + "SPIRV::FPFastMathMode::Fast is deprecated and should not be used " + "anymore."); + + // Error out if AllowTransform is enabled without AllowReassoc and + // AllowContract. + assert((!(Flags & SPIRV::FPFastMathMode::AllowTransform) || + ((Flags & SPIRV::FPFastMathMode::AllowReassoc && + Flags & SPIRV::FPFastMathMode::AllowContract))) && + "SPIRV::FPFastMathMode::AllowTransform requires AllowReassoc and " + "AllowContract flags to be enabled as well."); + } + + return Flags; } -static bool isFastMathMathModeAvailable(const SPIRVSubtarget &ST) { +static bool isFastMathModeAvailable(const SPIRVSubtarget &ST) { if (ST.isKernel()) return true; if (ST.getSPIRVVersion() < VersionTuple(1, 2)) @@ -2101,9 +2228,10 @@ static bool isFastMathMathModeAvailable(const SPIRVSubtarget &ST) { return ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2); } -static void handleMIFlagDecoration(MachineInstr &I, const SPIRVSubtarget &ST, - const SPIRVInstrInfo &TII, - SPIRV::RequirementHandler &Reqs) { +static void handleMIFlagDecoration( + MachineInstr &I, const SPIRVSubtarget &ST, const SPIRVInstrInfo &TII, + SPIRV::RequirementHandler &Reqs, const SPIRVGlobalRegistry *GR, + SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec) { if (I.getFlag(MachineInstr::MIFlag::NoSWrap) && TII.canUseNSW(I) && getSymbolicOperandRequirements(SPIRV::OperandCategory::DecorationOperand, SPIRV::Decoration::NoSignedWrap, ST, Reqs) @@ -2119,13 +2247,53 @@ static void handleMIFlagDecoration(MachineInstr &I, const SPIRVSubtarget &ST, buildOpDecorate(I.getOperand(0).getReg(), I, TII, SPIRV::Decoration::NoUnsignedWrap, {}); } - if (!TII.canUseFastMathFlags(I)) - return; - unsigned FMFlags = getFastMathFlags(I); - if (FMFlags == SPIRV::FPFastMathMode::None) + if (!TII.canUseFastMathFlags( + I, ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2))) return; + unsigned FMFlags = getFastMathFlags(I, ST); + if (FMFlags == SPIRV::FPFastMathMode::None) { + // We also need to check if any FPFastMathDefault info was set for the + // types used in this instruction. + if (FPFastMathDefaultInfoVec.empty()) + return; + + // There are three types of instructions that can use fast math flags: + // 1. Arithmetic instructions (FAdd, FMul, FSub, FDiv, FRem, etc.) + // 2. Relational instructions (FCmp, FOrd, FUnord, etc.) + // 3.
Extended instructions (ExtInst) + // For arithmetic instructions, the floating point type can be in the + // result type or in the operands, but they all must be the same. + // For the relational and logical instructions, the floating point type + // can only be in the operands 1 and 2, not the result type. Also, the + // operands must have the same type. For the extended instructions, the + // floating point type can be in the result type or in the operands. It's + // unclear if the operands and the result type must be the same. Let's + // assume they must be. Therefore, for 1. and 2., we can check the first + // operand type, and for 3. we can check the result type. + assert(I.getNumOperands() >= 3 && "Expected at least 3 operands"); + Register ResReg = I.getOpcode() == SPIRV::OpExtInst + ? I.getOperand(1).getReg() + : I.getOperand(2).getReg(); + SPIRVType *ResType = GR->getSPIRVTypeForVReg(ResReg, I.getMF()); + const Type *Ty = GR->getTypeForSPIRVType(ResType); + Ty = Ty->isVectorTy() ? cast(Ty)->getElementType() : Ty; + + // Match instruction type with the FPFastMathDefaultInfoVec. + bool Emit = false; + for (SPIRV::FPFastMathDefaultInfo &Elem : FPFastMathDefaultInfoVec) { + if (Ty == Elem.Ty) { + FMFlags = Elem.FastMathFlags; + Emit = Elem.ContractionOff || Elem.SignedZeroInfNanPreserve || + Elem.FPFastMathDefault; + break; + } + } + + if (FMFlags == SPIRV::FPFastMathMode::None && !Emit) + return; + } + if (isFastMathModeAvailable(ST)) { Register DstReg = I.getOperand(0).getReg(); buildOpDecorate(DstReg, I, TII, SPIRV::Decoration::FPFastMathMode, {FMFlags}); @@ -2135,14 +2303,17 @@ static void handleMIFlagDecoration(MachineInstr &I, const SPIRVSubtarget &ST, // Walk all functions and add decorations related to MI flags. static void addDecorations(const Module &M, const SPIRVInstrInfo &TII, MachineModuleInfo *MMI, const SPIRVSubtarget &ST, - SPIRV::ModuleAnalysisInfo &MAI) { + SPIRV::ModuleAnalysisInfo &MAI, + const SPIRVGlobalRegistry *GR) { for (auto F = M.begin(), E = M.end(); F != E; ++F) { MachineFunction *MF = MMI->getMachineFunction(*F); if (!MF) continue; + for (auto &MBB : *MF) for (auto &MI : MBB) - handleMIFlagDecoration(MI, ST, TII, MAI.Reqs); + handleMIFlagDecoration(MI, ST, TII, MAI.Reqs, GR, + MAI.FPFastMathDefaultInfoMap[&(*F)]); } } @@ -2188,6 +2359,111 @@ static void patchPhis(const Module &M, SPIRVGlobalRegistry *GR, } } +static SPIRV::FPFastMathDefaultInfoVector &getOrCreateFPFastMathDefaultInfoVec( + const Module &M, SPIRV::ModuleAnalysisInfo &MAI, const Function *F) { + auto it = MAI.FPFastMathDefaultInfoMap.find(F); + if (it != MAI.FPFastMathDefaultInfoMap.end()) + return it->second; + + // If the map does not contain the entry, create a new one. Initialize it to + // contain all 3 elements sorted by bit width of target type: {half, float, + // double}. 
+  SPIRV::FPFastMathDefaultInfoVector FPFastMathDefaultInfoVec;
+  FPFastMathDefaultInfoVec.emplace_back(Type::getHalfTy(M.getContext()),
+                                        SPIRV::FPFastMathMode::None);
+  FPFastMathDefaultInfoVec.emplace_back(Type::getFloatTy(M.getContext()),
+                                        SPIRV::FPFastMathMode::None);
+  FPFastMathDefaultInfoVec.emplace_back(Type::getDoubleTy(M.getContext()),
+                                        SPIRV::FPFastMathMode::None);
+  return MAI.FPFastMathDefaultInfoMap[F] = std::move(FPFastMathDefaultInfoVec);
+}
+
+static SPIRV::FPFastMathDefaultInfo &getFPFastMathDefaultInfo(
+    SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec,
+    const Type *Ty) {
+  size_t BitWidth = Ty->getScalarSizeInBits();
+  int Index =
+      SPIRV::FPFastMathDefaultInfoVector::computeFPFastMathDefaultInfoVecIndex(
+          BitWidth);
+  assert(Index >= 0 && Index < 3 &&
+         "Expected FPFastMathDefaultInfo for half, float, or double");
+  assert(FPFastMathDefaultInfoVec.size() == 3 &&
+         "Expected FPFastMathDefaultInfoVec to have exactly 3 elements");
+  return FPFastMathDefaultInfoVec[Index];
+}
+
+static void collectFPFastMathDefaults(const Module &M,
+                                      SPIRV::ModuleAnalysisInfo &MAI,
+                                      const SPIRVSubtarget &ST) {
+  if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_float_controls2))
+    return;
+
+  // Store the FPFastMathDefaultInfo in the FPFastMathDefaultInfoMap.
+  // We need the entry point (function) as the key, and the target
+  // type and flags as the value.
+  // We also need to check ContractionOff and SignedZeroInfNanPreserve
+  // execution modes, as they are now deprecated and must be replaced
+  // with FPFastMathDefaultInfo.
+  auto Node = M.getNamedMetadata("spirv.ExecutionMode");
+  if (!Node)
+    return;
+
+  for (unsigned i = 0; i < Node->getNumOperands(); i++) {
+    MDNode *MDN = cast<MDNode>(Node->getOperand(i));
+    assert(MDN->getNumOperands() >= 2 && "Expected at least 2 operands");
+    const Function *F = cast<Function>(
+        cast<ConstantAsMetadata>(MDN->getOperand(0))->getValue());
+    const auto EM =
+        cast<ConstantInt>(
+            cast<ConstantAsMetadata>(MDN->getOperand(1))->getValue())
+            ->getZExtValue();
+    if (EM == SPIRV::ExecutionMode::FPFastMathDefault) {
+      assert(MDN->getNumOperands() == 4 &&
+             "Expected 4 operands for FPFastMathDefault");
+
+      const Type *T = cast<ValueAsMetadata>(MDN->getOperand(2))->getType();
+      unsigned Flags =
+          cast<ConstantInt>(
+              cast<ConstantAsMetadata>(MDN->getOperand(3))->getValue())
+              ->getZExtValue();
+      SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec =
+          getOrCreateFPFastMathDefaultInfoVec(M, MAI, F);
+      SPIRV::FPFastMathDefaultInfo &Info =
+          getFPFastMathDefaultInfo(FPFastMathDefaultInfoVec, T);
+      Info.FastMathFlags = Flags;
+      Info.FPFastMathDefault = true;
+    } else if (EM == SPIRV::ExecutionMode::ContractionOff) {
+      assert(MDN->getNumOperands() == 2 &&
+             "Expected no extra operands for ContractionOff");
+
+      // We need to save this info for every FP type we track, i.e. {half,
+      // float, double}.
+      SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec =
+          getOrCreateFPFastMathDefaultInfoVec(M, MAI, F);
+      for (SPIRV::FPFastMathDefaultInfo &Info : FPFastMathDefaultInfoVec) {
+        Info.ContractionOff = true;
+      }
+    } else if (EM == SPIRV::ExecutionMode::SignedZeroInfNanPreserve) {
+      assert(MDN->getNumOperands() == 3 &&
+             "Expected 1 extra operand for SignedZeroInfNanPreserve");
+      unsigned TargetWidth =
+          cast<ConstantInt>(
+              cast<ConstantAsMetadata>(MDN->getOperand(2))->getValue())
+              ->getZExtValue();
+      // We need to save this info only for the FP type with TargetWidth.
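+      // TargetWidth is the bit-width operand of SignedZeroInfNanPreserve and
+      // is expected to be 16, 32, or 64.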
+      SPIRV::FPFastMathDefaultInfoVector &FPFastMathDefaultInfoVec =
+          getOrCreateFPFastMathDefaultInfoVec(M, MAI, F);
+      int Index = SPIRV::FPFastMathDefaultInfoVector::
+          computeFPFastMathDefaultInfoVecIndex(TargetWidth);
+      assert(Index >= 0 && Index < 3 &&
+             "Expected FPFastMathDefaultInfo for half, float, or double");
+      assert(FPFastMathDefaultInfoVec.size() == 3 &&
+             "Expected FPFastMathDefaultInfoVec to have exactly 3 elements");
+      FPFastMathDefaultInfoVec[Index].SignedZeroInfNanPreserve = true;
+    }
+  }
+}
+
 struct SPIRV::ModuleAnalysisInfo SPIRVModuleAnalysis::MAI;
 
 void SPIRVModuleAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -2209,7 +2485,8 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
   patchPhis(M, GR, *TII, MMI);
 
   addMBBNames(M, *TII, MMI, *ST, MAI);
-  addDecorations(M, *TII, MMI, *ST, MAI);
+  collectFPFastMathDefaults(M, MAI, *ST);
+  addDecorations(M, *TII, MMI, *ST, MAI, GR);
 
   collectReqs(M, MAI, MMI, *ST);
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 41c792a98534f..d8376cd1aeb5a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -159,6 +159,13 @@ struct ModuleAnalysisInfo {
   InstrList MS[NUM_MODULE_SECTIONS];
   // The table maps MBB number to SPIR-V unique ID register.
   DenseMap<std::pair<const MachineFunction *, int>, MCRegister> BBNumToRegMap;
+  // The table maps function pointers to their default FP fast math info. It
+  // can be assumed that the SmallVector is sorted by the bit width of the
+  // type. The first element is the smallest bit width, and the last element
+  // is the largest bit width; therefore, we will have {half, float, double}
+  // in the order of their bit widths.
+  DenseMap<const Function *, FPFastMathDefaultInfoVector>
+      FPFastMathDefaultInfoMap;
 
   MCRegister getFuncReg(const Function *F) {
     assert(F && "Function is null");
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 1a08c6ac0dcaf..db6f2d61e8f29 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -839,6 +839,7 @@ static uint32_t convertFloatToSPIRVWord(float F) {
 
 static void insertSpirvDecorations(MachineFunction &MF, SPIRVGlobalRegistry *GR,
                                    MachineIRBuilder MIB) {
+  const SPIRVSubtarget &ST = cast<SPIRVSubtarget>(MIB.getMF().getSubtarget());
   SmallVector<MachineInstr *, 10> ToErase;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
@@ -849,7 +850,7 @@ static void insertSpirvDecorations(MachineFunction &MF, SPIRVGlobalRegistry *GR,
       MIB.setInsertPt(*MI.getParent(), MI.getNextNode());
       if (isSpvIntrinsic(MI, Intrinsic::spv_assign_decoration)) {
         buildOpSpirvDecorations(MI.getOperand(1).getReg(), MIB,
-                                MI.getOperand(2).getMetadata());
+                                MI.getOperand(2).getMetadata(), ST);
       } else if (isSpvIntrinsic(MI,
                                 Intrinsic::spv_assign_fpmaxerror_decoration)) {
         ConstantFP *OpV = mdconst::dyn_extract<ConstantFP>(
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 66ce5a2d67c3e..6a32dbabff3d3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -802,6 +802,7 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>;
 defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>;
 defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>;
 defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>;
+defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>;
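+// 6028 is the value of the FPFastMathDefault execution mode introduced by
+// SPV_KHR_float_controls2; the tests in this patch reference it as `i32 6028`.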
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define StorageClass enum values and at the same time
@@ -1153,6 +1154,9 @@ defm NotInf : FPFastMathModeOperand<0x2, [Kernel]>;
 defm NSZ : FPFastMathModeOperand<0x4, [Kernel]>;
 defm AllowRecip : FPFastMathModeOperand<0x8, [Kernel]>;
 defm Fast : FPFastMathModeOperand<0x10, [Kernel]>;
+defm AllowContract : FPFastMathModeOperand<0x10000, [FloatControls2]>;
+defm AllowReassoc : FPFastMathModeOperand<0x20000, [FloatControls2]>;
+defm AllowTransform : FPFastMathModeOperand<0x40000, [FloatControls2]>;
 
 //===----------------------------------------------------------------------===//
 // Multiclass used to define FPRoundingMode enum values and at the same time
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 820e56b362edc..327c011ea178f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -181,7 +181,7 @@ void buildOpMemberDecorate(Register Reg, MachineInstr &I,
 }
 
 void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder,
-                             const MDNode *GVarMD) {
+                             const MDNode *GVarMD, const SPIRVSubtarget &ST) {
   for (unsigned I = 0, E = GVarMD->getNumOperands(); I != E; ++I) {
     auto *OpMD = dyn_cast<MDNode>(GVarMD->getOperand(I));
     if (!OpMD)
@@ -193,6 +193,20 @@ void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder,
     if (!DecorationId)
       report_fatal_error("Expect SPIR-V operand to be the first "
                          "element of the decoration");
+
+    // The goal of `spirv.Decorations` metadata is to provide a way to
+    // represent SPIR-V entities that do not map to LLVM in an obvious way.
+    // FP flags do have obvious matches between LLVM IR and SPIR-V.
+    // Additionally, we have no guarantee at this point that the flags passed
+    // through the decoration have not already been violated in the optimizer
+    // passes. Therefore, we simply ignore FP flags, including NoContraction
+    // and FPFastMathMode.
+    if (DecorationId->getZExtValue() ==
+            static_cast<uint32_t>(SPIRV::Decoration::NoContraction) ||
+        DecorationId->getZExtValue() ==
+            static_cast<uint32_t>(SPIRV::Decoration::FPFastMathMode)) {
+      continue; // Ignored.
+    }
     auto MIB = MIRBuilder.buildInstr(SPIRV::OpDecorate)
                    .addUse(Reg)
                    .addImm(static_cast<uint32_t>(DecorationId->getZExtValue()));
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index 45c520a922d10..409a0fd758a32 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -113,6 +113,54 @@ class PartialOrderingVisitor {
                          std::function<bool(BasicBlock *)> Op);
 };
 
+namespace SPIRV {
+struct FPFastMathDefaultInfo {
+  const Type *Ty = nullptr;
+  unsigned FastMathFlags = 0;
+  // When SPV_KHR_float_controls2 is enabled, ContractionOff and
+  // SignedZeroInfNanPreserve are deprecated, and we replace them with
+  // FPFastMathDefault and the appropriate flags instead. However, we have no
+  // guarantee about the order in which we will process execution modes.
+  // Therefore it could happen that we first process ContractionOff, setting
+  // the AllowContraction bit to 0, and then we process FPFastMathDefault
+  // enabling the AllowContraction bit, effectively invalidating
+  // ContractionOff. Because of that, it's best to keep separate bits for the
+  // different execution modes, and we will try to combine them later when we
+  // emit OpExecutionMode instructions.
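+  // The three booleans below record which of these execution modes were seen
+  // for this type; collectFPFastMathDefaults() in SPIRVModuleAnalysis.cpp
+  // sets them.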
+  bool ContractionOff = false;
+  bool SignedZeroInfNanPreserve = false;
+  bool FPFastMathDefault = false;
+
+  FPFastMathDefaultInfo() = default;
+  FPFastMathDefaultInfo(const Type *Ty, unsigned FastMathFlags)
+      : Ty(Ty), FastMathFlags(FastMathFlags) {}
+  bool operator==(const FPFastMathDefaultInfo &Other) const {
+    return Ty == Other.Ty && FastMathFlags == Other.FastMathFlags &&
+           ContractionOff == Other.ContractionOff &&
+           SignedZeroInfNanPreserve == Other.SignedZeroInfNanPreserve &&
+           FPFastMathDefault == Other.FPFastMathDefault;
+  }
+};
+
+struct FPFastMathDefaultInfoVector
+    : public SmallVector<FPFastMathDefaultInfo, 3> {
+  static size_t computeFPFastMathDefaultInfoVecIndex(size_t BitWidth) {
+    switch (BitWidth) {
+    case 16: // half
+      return 0;
+    case 32: // float
+      return 1;
+    case 64: // double
+      return 2;
+    default:
+      report_fatal_error("Expected BitWidth to be 16, 32, or 64", false);
+    }
+    llvm_unreachable(
+        "Unreachable code in computeFPFastMathDefaultInfoVecIndex");
+  }
+};
+
+} // namespace SPIRV
+
 // Add the given string as a series of integer operands, inserting null
 // terminators and padding to make sure the operands all have 32-bit
 // little-endian words.
@@ -161,7 +209,7 @@ void buildOpMemberDecorate(Register Reg, MachineInstr &I,
 
 // Add an OpDecorate instruction by "spirv.Decorations" metadata node.
 void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder,
-                             const MDNode *GVarMD);
+                             const MDNode *GVarMD, const SPIRVSubtarget &ST);
 
 // Return a valid position for the OpVariable instruction inside a function,
 // i.e., at the beginning of the first block of the function.
@@ -508,6 +556,5 @@ unsigned getArrayComponentCount(const MachineRegisterInfo *MRI,
                                 const MachineInstr *ResType);
 MachineBasicBlock::iterator
 getFirstValidInstructionInsertPoint(MachineBasicBlock &BB);
-
 } // namespace llvm
 #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll b/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll
index aa60e13232b46..b4e283e746125 100644
--- a/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll
+++ b/llvm/test/CodeGen/SPIRV/capability-FloatControl2.ll
@@ -8,7 +8,7 @@
 ; CHECK-EXT: OpCapability FloatControls2
 ; CHECK-EXT: OpExtension "SPV_KHR_float_controls2"
 
-; CHECK-EXT: OpDecorate {{%[0-9]+}} FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|Fast
+; CHECK-EXT: OpDecorate {{%[0-9]+}} FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform
 
 define hidden spir_func float @foo(float %0) local_unnamed_addr {
   %2 = fmul reassoc nnan ninf nsz arcp afn float %0, 2.000000e+00
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/decoration.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/decoration.ll
new file mode 100644
index 0000000000000..d3fe9e43450cd
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/decoration.ll
@@ -0,0 +1,148 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: Capability FloatControls2
+; CHECK: Extension "SPV_KHR_float_controls2"
+
+; CHECK: OpName %[[#addRes:]] "addRes"
+; CHECK: OpName %[[#subRes:]] "subRes"
+; CHECK: OpName %[[#mulRes:]] "mulRes"
+; CHECK: OpName %[[#divRes:]] "divRes"
+; CHECK: OpName %[[#remRes:]] "remRes"
+; CHECK: OpName %[[#negRes:]] "negRes"
+; CHECK: OpName %[[#oeqRes:]]
"oeqRes" +; CHECK: OpName %[[#oneRes:]] "oneRes" +; CHECK: OpName %[[#oltRes:]] "oltRes" +; CHECK: OpName %[[#ogtRes:]] "ogtRes" +; CHECK: OpName %[[#oleRes:]] "oleRes" +; CHECK: OpName %[[#ogeRes:]] "ogeRes" +; CHECK: OpName %[[#ordRes:]] "ordRes" +; CHECK: OpName %[[#ueqRes:]] "ueqRes" +; CHECK: OpName %[[#uneRes:]] "uneRes" +; CHECK: OpName %[[#ultRes:]] "ultRes" +; CHECK: OpName %[[#ugtRes:]] "ugtRes" +; CHECK: OpName %[[#uleRes:]] "uleRes" +; CHECK: OpName %[[#ugeRes:]] "ugeRes" +; CHECK: OpName %[[#unoRes:]] "unoRes" +; CHECK: OpName %[[#modRes:]] "modRes" +; CHECK: OpName %[[#maxRes:]] "maxRes" +; CHECK: OpName %[[#maxCommonRes:]] "maxCommonRes" +; CHECK: OpName %[[#addResV:]] "addResV" +; CHECK: OpName %[[#subResV:]] "subResV" +; CHECK: OpName %[[#mulResV:]] "mulResV" +; CHECK: OpName %[[#divResV:]] "divResV" +; CHECK: OpName %[[#remResV:]] "remResV" +; CHECK: OpName %[[#negResV:]] "negResV" +; CHECK: OpName %[[#oeqResV:]] "oeqResV" +; CHECK: OpName %[[#oneResV:]] "oneResV" +; CHECK: OpName %[[#oltResV:]] "oltResV" +; CHECK: OpName %[[#ogtResV:]] "ogtResV" +; CHECK: OpName %[[#oleResV:]] "oleResV" +; CHECK: OpName %[[#ogeResV:]] "ogeResV" +; CHECK: OpName %[[#ordResV:]] "ordResV" +; CHECK: OpName %[[#ueqResV:]] "ueqResV" +; CHECK: OpName %[[#uneResV:]] "uneResV" +; CHECK: OpName %[[#ultResV:]] "ultResV" +; CHECK: OpName %[[#ugtResV:]] "ugtResV" +; CHECK: OpName %[[#uleResV:]] "uleResV" +; CHECK: OpName %[[#ugeResV:]] "ugeResV" +; CHECK: OpName %[[#unoResV:]] "unoResV" +; CHECK: OpName %[[#modResV:]] "modResV" +; CHECK: OpName %[[#maxResV:]] "maxResV" +; CHECK: OpName %[[#maxCommonResV:]] "maxCommonResV" +; CHECK: OpDecorate %[[#subRes]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#mulRes]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#divRes]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#remRes]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#negRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#oeqRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#oltRes]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#ogtRes]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#oleRes]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#ogeRes]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#ordRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#ueqRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#maxRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#subResV]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#mulResV]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#divResV]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#remResV]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#negResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#oeqResV]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#oltResV]] FPFastMathMode NotNaN +; CHECK: OpDecorate %[[#ogtResV]] FPFastMathMode NotInf +; CHECK: OpDecorate %[[#oleResV]] FPFastMathMode NSZ +; CHECK: OpDecorate %[[#ogeResV]] FPFastMathMode AllowRecip +; CHECK: OpDecorate %[[#ordResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#ueqResV]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#maxResV]] FPFastMathMode 
NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonResV]] FPFastMathMode NotNaN|NotInf + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare spir_func float @_Z4fmodff(float, float) +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare spir_func <2 x float> @_Z4fmodDv2_fDv2_f(<2 x float>, <2 x float>) +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 + +; Function Attrs: convergent mustprogress norecurse nounwind +define weak_odr dso_local spir_kernel void @foo(float %1, float %2) { +entry: + %addRes = fadd float %1, %2 + %subRes = fsub nnan float %1, %2 + %mulRes = fmul ninf float %1, %2 + %divRes = fdiv nsz float %1, %2 + %remRes = frem arcp float %1, %2 + %negRes = fneg fast float %1 + %oeqRes = fcmp nnan ninf oeq float %1, %2 + %oneRes = fcmp one float %1, %2, !spirv.Decorations !3 + %oltRes = fcmp nnan olt float %1, %2, !spirv.Decorations !3 + %ogtRes = fcmp ninf ogt float %1, %2, !spirv.Decorations !3 + %oleRes = fcmp nsz ole float %1, %2, !spirv.Decorations !3 + %ogeRes = fcmp arcp oge float %1, %2, !spirv.Decorations !3 + %ordRes = fcmp fast ord float %1, %2, !spirv.Decorations !3 + %ueqRes = fcmp nnan ninf ueq float %1, %2, !spirv.Decorations !3 + %uneRes = fcmp une float %1, %2, !spirv.Decorations !3 + %ultRes = fcmp ult float %1, %2, !spirv.Decorations !3 + %ugtRes = fcmp ugt float %1, %2, !spirv.Decorations !3 + %uleRes = fcmp ule float %1, %2, !spirv.Decorations !3 + %ugeRes = fcmp uge float %1, %2, !spirv.Decorations !3 + %unoRes = fcmp uno float %1, %2, !spirv.Decorations !3 + %modRes = call spir_func float @_Z4fmodff(float %1, float %2) + %maxRes = tail call fast spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + %maxCommonRes = tail call spir_func noundef float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2) + ret void +} + +define weak_odr dso_local spir_kernel void @fooV(<2 x float> %v1, <2 x float> %v2) { + %addResV = fadd <2 x float> %v1, %v2 + %subResV = fsub nnan <2 x float> %v1, %v2 + %mulResV = fmul ninf <2 x float> %v1, %v2 + %divResV = fdiv nsz <2 x float> %v1, %v2 + %remResV = frem arcp <2 x float> %v1, %v2 + %negResV = fneg fast <2 x float> %v1 + %oeqResV = fcmp nnan ninf oeq <2 x float> %v1, %v2 + %oneResV = fcmp one <2 x float> %v1, %v2, !spirv.Decorations !3 + %oltResV = fcmp nnan olt <2 x float> %v1, %v2, !spirv.Decorations !3 + %ogtResV = fcmp ninf ogt <2 x float> %v1, %v2, !spirv.Decorations !3 + %oleResV = fcmp nsz ole <2 x float> %v1, %v2, !spirv.Decorations !3 + %ogeResV = fcmp arcp oge <2 x float> %v1, %v2, !spirv.Decorations !3 + %ordResV = fcmp fast ord <2 x float> %v1, %v2, !spirv.Decorations !3 + %ueqResV = fcmp nnan ninf ueq <2 x float> %v1, %v2, 
!spirv.Decorations !3 + %uneResV = fcmp une <2 x float> %v1, %v2, !spirv.Decorations !3 + %ultResV = fcmp ult <2 x float> %v1, %v2, !spirv.Decorations !3 + %ugtResV = fcmp ugt <2 x float> %v1, %v2, !spirv.Decorations !3 + %uleResV = fcmp ule <2 x float> %v1, %v2, !spirv.Decorations !3 + %ugeResV = fcmp uge <2 x float> %v1, %v2, !spirv.Decorations !3 + %unoResV = fcmp uno <2 x float> %v1, %v2, !spirv.Decorations !3 + %modResV = call spir_func <2 x float> @_Z4fmodDv2_fDv2_f(<2 x float> %v1, <2 x float> %v2) + %maxResV = tail call fast spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2) + %maxCommonResV = tail call spir_func noundef <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2) + ret void +} + +!3 = !{!5, !4} +!4 = !{i32 42} ; 42 is NoContraction decoration +!5 = !{i32 40, i32 393216} ; 40 is FPFastMathMode diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode.ll new file mode 100644 index 0000000000000..4b3c13c260c51 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode.ll @@ -0,0 +1,81 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: Capability FloatControls2 +; CHECK: Extension "SPV_KHR_float_controls2" + +define dso_local dllexport spir_kernel void @k_float_controls_half(half %h) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_bfloat(bfloat %b) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_float(float %f) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_double(double %d) { +entry: + ret void +} + +define dso_local dllexport spir_kernel void @k_float_controls_all(half %h, bfloat %b, float %f, double %d) { +entry: + ret void +} + +!spirv.ExecutionMode = !{!17, !18, !19, !20, !22, !23, !24, !25} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_HALF:]] "k_float_controls_half" +!0 = !{ptr @k_float_controls_half, !"k_float_controls_half", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_BFLOAT:]] "k_float_controls_bfloat" +!1 = !{ptr @k_float_controls_bfloat, !"k_float_controls_bfloat", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT:]] "k_float_controls_float" +!2 = !{ptr @k_float_controls_float, !"k_float_controls_float", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_DOUBLE:]] "k_float_controls_double" +!3 = !{ptr @k_float_controls_double, !"k_float_controls_double", !6, i32 0, !6, !7, !8, i32 0, i32 0} + +; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL:]] "k_float_controls_all" +!5 = !{ptr @k_float_controls_all, !"k_float_controls_all", !6, i32 0, !6, !7, !8, i32 0, i32 0} +!6 = !{i32 2, i32 2} +!7 = !{i32 32, i32 36} +!8 = !{i32 0, i32 0} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_HALF]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST1:]] +!17 = !{ptr @k_float_controls_half, i32 6028, half poison, i32 1} + +; CHECK-DAG: OpExecutionModeId %[[#KERNEL_BFLOAT]] 
FPFastMathDefault %[[#BFLOAT_TYPE:]] %[[#CONST2:]]
+!18 = !{ptr @k_float_controls_bfloat, i32 6028, bfloat poison, i32 2}
+
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST4:]]
+!19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 4}
+
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_DOUBLE]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST7:]]
+!20 = !{ptr @k_float_controls_double, i32 6028, double poison, i32 7}
+
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#HALF_TYPE]] %[[#CONST131072:]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#FLOAT_TYPE]] %[[#CONST458752:]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#DOUBLE_TYPE]] %[[#CONST458752:]]
+!22 = !{ptr @k_float_controls_all, i32 6028, half poison, i32 131072}
+!23 = !{ptr @k_float_controls_all, i32 6028, bfloat poison, i32 131072}
+!24 = !{ptr @k_float_controls_all, i32 6028, float poison, i32 458752}
+!25 = !{ptr @k_float_controls_all, i32 6028, double poison, i32 458752}
+
+; CHECK-DAG: %[[#INT32_TYPE:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#HALF_TYPE]] = OpTypeFloat 16
+; CHECK-DAG: %[[#FLOAT_TYPE]] = OpTypeFloat 32
+; CHECK-DAG: %[[#DOUBLE_TYPE]] = OpTypeFloat 64
+; CHECK-DAG: %[[#CONST1]] = OpConstant %[[#INT32_TYPE]] 1
+; CHECK-DAG: %[[#CONST2]] = OpConstant %[[#INT32_TYPE]] 2
+; CHECK-DAG: %[[#CONST4]] = OpConstant %[[#INT32_TYPE]] 4
+; CHECK-DAG: %[[#CONST7]] = OpConstant %[[#INT32_TYPE]] 7
+; CHECK-DAG: %[[#CONST131072]] = OpConstant %[[#INT32_TYPE]] 131072
+; CHECK-DAG: %[[#CONST458752]] = OpConstant %[[#INT32_TYPE]] 458752
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode2.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode2.ll
new file mode 100644
index 0000000000000..c0632725e38d9
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode2.ll
@@ -0,0 +1,73 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: Capability FloatControls2
+; CHECK: Extension "SPV_KHR_float_controls2"
+
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT:]] "k_float_controls_float"
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL:]] "k_float_controls_all"
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT_V:]] "k_float_controls_float_v"
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL_V:]] "k_float_controls_all_v"
+
+define dso_local dllexport spir_kernel void @k_float_controls_float(float %f) {
+entry:
+  ret void
+}
+
+define dso_local dllexport spir_kernel void @k_float_controls_all(half %h, float %f, double %d) {
+entry:
+  ret void
+}
+
+define dso_local dllexport spir_kernel void @k_float_controls_float_v(<2 x float> %f) {
+entry:
+  ret void
+}
+
+define dso_local dllexport spir_kernel void @k_float_controls_all_v(<2 x half> %h, <2 x float> %f, <2 x double> %d) {
+entry:
+  ret void
+}
+
+!spirv.ExecutionMode = !{!19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34}
+
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079:]]
+!19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 131079}
+; We expect 131079 for the float type.
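+; 131079 = 0x20007 = AllowReassoc (0x20000) | NSZ (0x4) | NotInf (0x2) |
+; NotNaN (0x1) in FPFastMathMode bits.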
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]]
+; We expect 0 for the rest of the types because it's SignedZeroInfNanPreserve.
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0:]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]]
+!20 = !{ptr @k_float_controls_all, i32 6028, float poison, i32 131079}
+; ContractionOff is now replaced with FPFastMathDefault with the AllowContract bit set to false.
+!21 = !{ptr @k_float_controls_float, i32 31}
+!22 = !{ptr @k_float_controls_all, i32 31}
+; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0.
+!23 = !{ptr @k_float_controls_float, i32 4461, i32 32}
+!24 = !{ptr @k_float_controls_all, i32 4461, i32 16}
+!25 = !{ptr @k_float_controls_all, i32 4461, i32 32}
+!26 = !{ptr @k_float_controls_all, i32 4461, i32 64}
+
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]]
+!27 = !{ptr @k_float_controls_float_v, i32 6028, float poison, i32 131079}
+; We expect 131079 for the float type.
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]]
+; We expect 0 for the rest of the types because it's SignedZeroInfNanPreserve.
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]]
+!28 = !{ptr @k_float_controls_all_v, i32 6028, float poison, i32 131079}
+; ContractionOff is now replaced with FPFastMathDefault with the AllowContract bit set to false.
+!29 = !{ptr @k_float_controls_float_v, i32 31}
+!30 = !{ptr @k_float_controls_all_v, i32 31}
+; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0.
+!31 = !{ptr @k_float_controls_float_v, i32 4461, i32 32}
+!32 = !{ptr @k_float_controls_all_v, i32 4461, i32 16}
+!33 = !{ptr @k_float_controls_all_v, i32 4461, i32 32}
+!34 = !{ptr @k_float_controls_all_v, i32 4461, i32 64}
+
+; CHECK-DAG: %[[#INT32_TYPE:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#HALF_TYPE]] = OpTypeFloat 16
+; CHECK-DAG: %[[#FLOAT_TYPE]] = OpTypeFloat 32
+; CHECK-DAG: %[[#DOUBLE_TYPE]] = OpTypeFloat 64
+; CHECK-DAG: %[[#CONST0]] = OpConstantNull %[[#INT32_TYPE]]
+; CHECK-DAG: %[[#CONST131079]] = OpConstant %[[#INT32_TYPE]] 131079
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode3.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode3.ll
new file mode 100644
index 0000000000000..1d09187b7f6a1
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/exec_mode3.ll
@@ -0,0 +1,103 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: Capability FloatControls2
+; CHECK: Extension "SPV_KHR_float_controls2"
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT:]] "k_float_controls_float"
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL:]] "k_float_controls_all"
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_FLOAT_V:]] "k_float_controls_float_v"
+; CHECK: OpEntryPoint Kernel %[[#KERNEL_ALL_V:]] "k_float_controls_all_v"
+
+; We expect 131079 for the float type.
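+; As above, 131079 (0x20007) is NotNaN|NotInf|NSZ|AllowReassoc, which matches
+; the FPFastMathMode decorations checked below.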
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079:]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]]
+; We expect 0 for the rest of the types because it's SignedZeroInfNanPreserve.
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0:]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]]
+
+; We expect 131079 for the float type.
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_FLOAT_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#FLOAT_TYPE:]] %[[#CONST131079]]
+; We expect 0 for the rest of the types because it's SignedZeroInfNanPreserve.
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#HALF_TYPE:]] %[[#CONST0]]
+; CHECK-DAG: OpExecutionModeId %[[#KERNEL_ALL_V]] FPFastMathDefault %[[#DOUBLE_TYPE:]] %[[#CONST0]]
+
+; CHECK-DAG: OpDecorate %[[#addRes:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc
+; CHECK-DAG: OpDecorate %[[#addResH:]] FPFastMathMode None
+; CHECK-DAG: OpDecorate %[[#addResF:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc
+; CHECK-DAG: OpDecorate %[[#addResD:]] FPFastMathMode None
+; CHECK-DAG: OpDecorate %[[#addRes_V:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc
+; CHECK-DAG: OpDecorate %[[#addResH_V:]] FPFastMathMode None
+; CHECK-DAG: OpDecorate %[[#addResF_V:]] FPFastMathMode NotNaN|NotInf|NSZ|AllowReassoc
+; CHECK-DAG: OpDecorate %[[#addResD_V:]] FPFastMathMode None
+
+; CHECK-DAG: %[[#INT32_TYPE:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#HALF_TYPE]] = OpTypeFloat 16
+; CHECK-DAG: %[[#FLOAT_TYPE]] = OpTypeFloat 32
+; CHECK-DAG: %[[#DOUBLE_TYPE]] = OpTypeFloat 64
+; CHECK-DAG: %[[#CONST0]] = OpConstantNull %[[#INT32_TYPE]]
+; CHECK-DAG: %[[#CONST131079]] = OpConstant %[[#INT32_TYPE]] 131079
+
+; CHECK-DAG: %[[#HALF_V_TYPE:]] = OpTypeVector %[[#HALF_TYPE]]
+; CHECK-DAG: %[[#FLOAT_V_TYPE:]] = OpTypeVector %[[#FLOAT_TYPE]]
+; CHECK-DAG: %[[#DOUBLE_V_TYPE:]] = OpTypeVector %[[#DOUBLE_TYPE]]
+
+define dso_local dllexport spir_kernel void @k_float_controls_float(float %f) {
+entry:
+; CHECK-DAG: %[[#addRes]] = OpFAdd %[[#FLOAT_TYPE]]
+  %addRes = fadd float %f, %f
+  ret void
+}
+
+define dso_local dllexport spir_kernel void @k_float_controls_all(half %h, float %f, double %d) {
+entry:
+; CHECK-DAG: %[[#addResH]] = OpFAdd %[[#HALF_TYPE]]
+; CHECK-DAG: %[[#addResF]] = OpFAdd %[[#FLOAT_TYPE]]
+; CHECK-DAG: %[[#addResD]] = OpFAdd %[[#DOUBLE_TYPE]]
+  %addResH = fadd half %h, %h
+  %addResF = fadd float %f, %f
+  %addResD = fadd double %d, %d
+  ret void
+}
+
+define dso_local dllexport spir_kernel void @k_float_controls_float_v(<2 x float> %f) {
+entry:
+; CHECK-DAG: %[[#addRes_V]] = OpFAdd %[[#FLOAT_V_TYPE]]
+  %addRes = fadd <2 x float> %f, %f
+  ret void
+}
+
+define dso_local dllexport spir_kernel void @k_float_controls_all_v(<2 x half> %h, <2 x float> %f, <2 x double> %d) {
+entry:
+; CHECK-DAG: %[[#addResH_V]] = OpFAdd %[[#HALF_V_TYPE]]
+; CHECK-DAG: %[[#addResF_V]] = OpFAdd %[[#FLOAT_V_TYPE]]
+; CHECK-DAG: %[[#addResD_V]] = OpFAdd %[[#DOUBLE_V_TYPE]]
+  %addResH = fadd <2 x half> %h, %h
+  %addResF = fadd <2 x float> %f, %f
+  %addResD = fadd <2 x double> %d, %d
+  ret void
+}
+
+!spirv.ExecutionMode = !{!19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34}
+
+!19 = !{ptr @k_float_controls_float, i32 6028, float poison, i32 131079}
+!20 = !{ptr @k_float_controls_all, i32 6028,
float poison, i32 131079}
+; ContractionOff is now replaced with FPFastMathDefault with the AllowContract bit set to false.
+!21 = !{ptr @k_float_controls_float, i32 31}
+!22 = !{ptr @k_float_controls_all, i32 31}
+; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0.
+!23 = !{ptr @k_float_controls_float, i32 4461, i32 32}
+!24 = !{ptr @k_float_controls_all, i32 4461, i32 16}
+!25 = !{ptr @k_float_controls_all, i32 4461, i32 32}
+!26 = !{ptr @k_float_controls_all, i32 4461, i32 64}
+
+!27 = !{ptr @k_float_controls_float_v, i32 6028, float poison, i32 131079}
+!28 = !{ptr @k_float_controls_all_v, i32 6028, float poison, i32 131079}
+; ContractionOff is now replaced with FPFastMathDefault with the AllowContract bit set to false.
+!29 = !{ptr @k_float_controls_float_v, i32 31}
+!30 = !{ptr @k_float_controls_all_v, i32 31}
+; SignedZeroInfNanPreserve is now replaced with FPFastMathDefault with flags 0.
+!31 = !{ptr @k_float_controls_float_v, i32 4461, i32 32}
+!32 = !{ptr @k_float_controls_all_v, i32 4461, i32 16}
+!33 = !{ptr @k_float_controls_all_v, i32 4461, i32 32}
+!34 = !{ptr @k_float_controls_all_v, i32 4461, i32 64}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/replacements.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/replacements.ll
new file mode 100644
index 0000000000000..bba1c93a7e78d
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_float_controls2/replacements.ll
@@ -0,0 +1,61 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_float_controls2 %s -o - -filetype=obj | spirv-val %}
+
+;; This test checks that the OpenCL.std instructions fmin_common, fmax_common are replaced with fmin, fmax carrying the NotNaN and NotInf flags instead.
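+;; fmin_common/fmax_common may assume that neither argument is a NaN or an
+;; Inf, so the replacement fmin/fmax calls are decorated with FPFastMathMode
+;; NotNaN|NotInf, as the checks below verify.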
+ +; CHECK-DAG: Capability FloatControls2 +; CHECK: Extension "SPV_KHR_float_controls2" + +; CHECK: OpName %[[#maxRes:]] "maxRes" +; CHECK: OpName %[[#maxCommonRes:]] "maxCommonRes" +; CHECK: OpName %[[#minRes:]] "minRes" +; CHECK: OpName %[[#minCommonRes:]] "minCommonRes" +; CHECK: OpName %[[#maxResV:]] "maxResV" +; CHECK: OpName %[[#maxCommonResV:]] "maxCommonResV" +; CHECK: OpName %[[#minResV:]] "minResV" +; CHECK: OpName %[[#minCommonResV:]] "minCommonResV" +; CHECK: OpDecorate %[[#maxRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#minRes]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#minCommonRes]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#maxResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#maxCommonResV]] FPFastMathMode NotNaN|NotInf +; CHECK: OpDecorate %[[#minResV]] FPFastMathMode NotNaN|NotInf|NSZ|AllowRecip|AllowContract|AllowReassoc|AllowTransform +; CHECK: OpDecorate %[[#minCommonResV]] FPFastMathMode NotNaN|NotInf +; CHECK: %[[#maxRes]] = OpExtInst {{.*}} fmax +; CHECK: %[[#maxCommonRes]] = OpExtInst {{.*}} fmax +; CHECK: %[[#minRes]] = OpExtInst {{.*}} fmin +; CHECK: %[[#minCommonRes]] = OpExtInst {{.*}} fmin +; CHECK: %[[#maxResV]] = OpExtInst {{.*}} fmax +; CHECK: %[[#maxCommonResV]] = OpExtInst {{.*}} fmax +; CHECK: %[[#minResV]] = OpExtInst {{.*}} fmin +; CHECK: %[[#minCommonResV]] = OpExtInst {{.*}} fmin + +; Function Attrs: convergent mustprogress nofree nounwind willreturn memory(none) +declare spir_func float @_Z4fmodff(float, float) +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fminff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) float @_Z23__spirv_ocl_fmin_commonff(float noundef nofpclass(nan inf), float noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fminDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 +declare dso_local spir_func noundef nofpclass(nan inf) <2 x float> @_Z23__spirv_ocl_fmin_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf), <2 x float> noundef nofpclass(nan inf)) local_unnamed_addr #1 + +; Function Attrs: convergent mustprogress norecurse nounwind +define weak_odr dso_local spir_kernel void @foo(float %1, float %2) { +entry: + %maxRes = tail call fast spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fmaxff(float 
noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2)
+  %maxCommonRes = tail call spir_func noundef float @_Z23__spirv_ocl_fmax_commonff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2)
+  %minRes = tail call fast spir_func noundef nofpclass(nan inf) float @_Z16__spirv_ocl_fminff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2)
+  %minCommonRes = tail call spir_func noundef float @_Z23__spirv_ocl_fmin_commonff(float noundef nofpclass(nan inf) %1, float noundef nofpclass(nan inf) %2)
+  ret void
+}
+
+define weak_odr dso_local spir_kernel void @fooV(<2 x float> %v1, <2 x float> %v2) {
+  %maxResV = tail call fast spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fmaxDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2)
+  %maxCommonResV = tail call spir_func noundef <2 x float> @_Z23__spirv_ocl_fmax_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2)
+  %minResV = tail call fast spir_func noundef nofpclass(nan inf) <2 x float> @_Z16__spirv_ocl_fminDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2)
+  %minCommonResV = tail call spir_func noundef <2 x float> @_Z23__spirv_ocl_fmin_commonDv2_fDv2_f(<2 x float> noundef nofpclass(nan inf) %v1, <2 x float> noundef nofpclass(nan inf) %v2)
+  ret void
+}

From 8b445ab68452fc81a1f0e662c11c3989ca98928c Mon Sep 17 00:00:00 2001
From: Alexander Richardson
Date: Tue, 30 Sep 2025 08:16:46 -0700
Subject: [PATCH 259/878] [CodeGen] Fix performance regression introduced in
 b05101b

The isNormalValueType = false flag was not set for this pseudo value
type, which caused significant size increases for some classes: the
TargetLoweringBase class grew to 1.5 MB, because the size of that class
is quadratic in MVT::VALUETYPE_SIZE, which b05101b increased from 256
to 504.

Reported by: abadams

Fixes: b05101b ("[TableGen, CodeGen, CHERI] Add support for the cPTR wildcard value type.")

Reviewed By: nikic

Pull Request: https://github.com/llvm/llvm-project/pull/161313
---
 llvm/include/llvm/CodeGen/ValueTypes.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 9ea127dd15943..300addd7d4daf 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -367,11 +367,11 @@ def aarch64mfp8 : ValueType<8, 253>;  // 8-bit value in FPR (AArch64)
 def c64 : VTCheriCapability<64, 254>; // 64-bit CHERI capability value
 def c128 : VTCheriCapability<128, 255>; // 128-bit CHERI capability value
 
+let isNormalValueType = false in {
 // Pseudo valuetype mapped to the current CHERI capability pointer size.
 // Should only be used in TableGen.
 def cPTR : VTAny<503>;
 
-let isNormalValueType = false in {
 def token : ValueType<0, 504>; // TokenTy
 def MetadataVT : ValueType<0, 505> { // Metadata
   let LLVMName = "Metadata";

From 58b4951726aa1cf92ca08f9b87f5ea7d28700e75 Mon Sep 17 00:00:00 2001
From: Fabrice de Gans
Date: Tue, 30 Sep 2025 11:16:54 -0400
Subject: [PATCH 260/878] AArch64: Emit a minimal SEH prologue when needed
 (#158173)

In some cases, with very simple thunks, it is possible that the
`.seh_endprologue` is not emitted. This causes issues in the assembler
because the epilogue ends up starting before the prologue has ended.
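
For illustration, the rejected directive ordering looked roughly like this
(hypothetical assembly, not taken from the tests):

    .seh_proc thunk
    ...                     // no .seh_endprologue was ever emitted
    .seh_startepilogue      // assembler error: prologue has not ended
    add sp, sp, #48
    .seh_stackalloc 48
    .seh_endepilogue
    .seh_endproc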
Bug: swiftlang/llvm-project#11377 --- .../AArch64/AArch64PrologueEpilogue.cpp | 7 +++ .../AArch64/seh-minimal-prologue-epilogue.ll | 53 +++++++++++++++++++ ...ogue.ll => wincfi-minimal-seh-prologue.ll} | 5 +- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/seh-minimal-prologue-epilogue.ll rename llvm/test/CodeGen/AArch64/{wincfi-seh-only-in-epilogue.ll => wincfi-minimal-seh-prologue.ll} (78%) diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 7947469b6c04f..09b36433801a4 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -541,6 +541,13 @@ void AArch64PrologueEmitter::emitPrologue() { // to determine the end of the prologue. DebugLoc DL; + // In some cases, particularly with CallingConv::SwiftTail, it is possible to + // have a tail-call where the caller only needs to adjust the stack pointer in + // the epilogue. In this case, we still need to emit a SEH prologue sequence. + // See `seh-minimal-prologue-epilogue.ll` test cases. + if (AFI->getArgumentStackToRestore()) + HasWinCFI = true; + if (AFI->shouldSignReturnAddress(MF)) { // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions // are inserted by emitPacRetPlusLeafHardening(). diff --git a/llvm/test/CodeGen/AArch64/seh-minimal-prologue-epilogue.ll b/llvm/test/CodeGen/AArch64/seh-minimal-prologue-epilogue.ll new file mode 100644 index 0000000000000..cc71b8b3065ad --- /dev/null +++ b/llvm/test/CodeGen/AArch64/seh-minimal-prologue-epilogue.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple=aarch64-windows %s -o - | FileCheck %s + +; This test verifies that functions requiring Windows CFI that have minimal +; or no prologue instructions still emit proper SEH directives, specifically +; ensuring .seh_endprologue is emitted before .seh_startepilogue. 
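+;
+; The fix forces HasWinCFI whenever the callee has argument stack to restore,
+; so even an otherwise empty prologue still gets .seh_proc/.seh_endprologue.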
+; +; This reproduces the issue where Swift async functions with swifttailcc +; calling convention would fail with: +; "error: starting epilogue (.seh_startepilogue) before prologue has ended (.seh_endprologue)" + +; Test 1: Swift-style tail call function with minimal prologue +define swifttailcc void @test_swifttailcc_minimal(ptr %async_ctx, ptr %arg1, ptr %arg2) { +; CHECK-LABEL: test_swifttailcc_minimal: +; CHECK-NOT: .seh_proc test_swifttailcc_minimal +; CHECK-NOT: .seh_endprologue +; CHECK-NOT: .seh_startepilogue +; CHECK-NOT: .seh_endepilogue +; CHECK-NOT: .seh_endproc +entry: + %ptr1 = getelementptr inbounds i8, ptr %async_ctx, i64 16 + %ptr2 = getelementptr inbounds i8, ptr %async_ctx, i64 24 + store ptr %arg1, ptr %ptr1, align 8 + store ptr %arg2, ptr %ptr2, align 8 + musttail call swifttailcc void @external_swift_function(ptr %async_ctx, ptr %arg1) + ret void +} + +; Test 2: Function similar to the original failing case +define linkonce_odr hidden swifttailcc void @test_linkonce_swifttailcc(ptr swiftasync %async_ctx, ptr %arg1, ptr noalias dereferenceable(40) %arg2, ptr %arg3, i64 %value, ptr %arg4, ptr %arg5, ptr %arg6, i1 %flag, ptr %arg7, ptr noalias dereferenceable(40) %arg8) { +; CHECK-LABEL: test_linkonce_swifttailcc: +; CHECK-NEXT: .seh_proc +; CHECK: .seh_endprologue +; CHECK: .seh_startepilogue +; CHECK: .seh_endepilogue +; CHECK: .seh_endproc +entry: + %frame_ptr = getelementptr inbounds nuw i8, ptr %async_ctx, i64 16 + %ctx1 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 400 + %ctx2 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 1168 + %spill1 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2392 + store ptr %arg8, ptr %spill1, align 8 + %spill2 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2384 + store ptr %arg7, ptr %spill2, align 8 + %spill3 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2225 + store i1 %flag, ptr %spill3, align 1 + %spill4 = getelementptr inbounds nuw i8, ptr %async_ctx, i64 2376 + store ptr %arg6, ptr %spill4, align 8 + musttail call swifttailcc void @external_swift_continuation(ptr swiftasync %async_ctx, i64 0, i64 0) + ret void +} + +declare swifttailcc void @external_swift_function(ptr, ptr) +declare swifttailcc void @external_swift_continuation(ptr swiftasync, i64, i64) diff --git a/llvm/test/CodeGen/AArch64/wincfi-seh-only-in-epilogue.ll b/llvm/test/CodeGen/AArch64/wincfi-minimal-seh-prologue.ll similarity index 78% rename from llvm/test/CodeGen/AArch64/wincfi-seh-only-in-epilogue.ll rename to llvm/test/CodeGen/AArch64/wincfi-minimal-seh-prologue.ll index 7daceae3dd4c0..8308108b84f08 100644 --- a/llvm/test/CodeGen/AArch64/wincfi-seh-only-in-epilogue.ll +++ b/llvm/test/CodeGen/AArch64/wincfi-minimal-seh-prologue.ll @@ -5,8 +5,9 @@ entry: ret void } -; Check that there is no .seh_endprologue but there is seh_startepilogue/seh_endepilogue. -; CHECK-NOT: .seh_endprologue +; Check that there is a minimal SEH prologue with seh_startepilogue/seh_endepilogue. +; CHECK: .seh_proc test +; CHECK: .seh_endprologue ; CHECK: .seh_startepilogue ; CHECK: add sp, sp, #48 ; CHECK: .seh_stackalloc 48 From 981122696701d3c3897af267afd73bbefa61c3d4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 1 Oct 2025 00:18:51 +0900 Subject: [PATCH 261/878] PeepholeOpt: Try to constrain uses to support subregister (#161338) This allows removing a special case hack in ARM. ARM's implementation of getExtractSubregLikeInputs has the strange property that it reports a register with a class that does not support the reported subregister index. 
We can however reconstrain the register to support this usage. This is
an alternative to #159600. I've included the test, but the output is
different. In this version, the VMOVSR is replaced with an ordinary
subregister extract copy.

---
 llvm/lib/CodeGen/PeepholeOptimizer.cpp        | 24 ++++++++
 llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp   | 14 -----
 llvm/lib/Target/ARM/ARMBaseRegisterInfo.h     |  5 --
 llvm/test/CodeGen/ARM/issue159343.ll          | 55 +++++++++++++++++++
 llvm/test/CodeGen/ARM/pr159343.mir            | 31 +++++++++++
 llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll |  4 +-
 6 files changed, 112 insertions(+), 21 deletions(-)
 create mode 100644 llvm/test/CodeGen/ARM/issue159343.ll
 create mode 100644 llvm/test/CodeGen/ARM/pr159343.mir

diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index fb3e6482bb096..729a57ef23b1e 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1203,6 +1203,18 @@ bool PeepholeOptimizer::optimizeCoalescableCopyImpl(Rewriter &&CpyRewriter) {
     if (!NewSrc.Reg)
       continue;
 
+    if (NewSrc.SubReg) {
+      // Verify the register class supports the subregister index. ARM's
+      // copy-like queries return register:subreg pairs where the register's
+      // current class does not directly support the subregister index.
+      const TargetRegisterClass *RC = MRI->getRegClass(NewSrc.Reg);
+      const TargetRegisterClass *WithSubRC =
+          TRI->getSubClassWithSubReg(RC, NewSrc.SubReg);
+      if (!MRI->constrainRegClass(NewSrc.Reg, WithSubRC))
+        continue;
+      Changed = true;
+    }
+
     // Rewrite source.
     if (CpyRewriter.RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) {
       // We may have extended the live-range of NewSrc, account for that.
@@ -1275,6 +1287,18 @@ MachineInstr &PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike,
   const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg);
   Register NewVReg = MRI->createVirtualRegister(DefRC);
 
+  if (NewSrc.SubReg) {
+    const TargetRegisterClass *NewSrcRC = MRI->getRegClass(NewSrc.Reg);
+    const TargetRegisterClass *WithSubRC =
+        TRI->getSubClassWithSubReg(NewSrcRC, NewSrc.SubReg);
+
+    // The new source may not directly support the subregister, but we should
+    // be able to assume it is constrainable to support the subregister
+    // (otherwise ValueTracker was lying and reported a useless value).
+    if (!MRI->constrainRegClass(NewSrc.Reg, WithSubRC))
+      llvm_unreachable("replacement register cannot support subregister");
+  }
+
   MachineInstr *NewCopy =
       BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(),
              TII->get(TargetOpcode::COPY), NewVReg)
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index e94220af05a0d..2e8a676269a74 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -960,17 +960,3 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
   }
   return false;
 }
-
-bool ARMBaseRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
-                                               unsigned DefSubReg,
-                                               const TargetRegisterClass *SrcRC,
-                                               unsigned SrcSubReg) const {
-  // We can't extract an SPR from an arbitary DPR (as opposed to a DPR_VFP2).
- if (DefRC == &ARM::SPRRegClass && DefSubReg == 0 && - SrcRC == &ARM::DPRRegClass && - (SrcSubReg == ARM::ssub_0 || SrcSubReg == ARM::ssub_1)) - return false; - - return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, - SrcRC, SrcSubReg); -} diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 5b67b34089d7e..03b0fa0d1ee08 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -158,11 +158,6 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo { const TargetRegisterClass *NewRC, LiveIntervals &LIS) const override; - bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, - unsigned DefSubReg, - const TargetRegisterClass *SrcRC, - unsigned SrcSubReg) const override; - int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } }; diff --git a/llvm/test/CodeGen/ARM/issue159343.ll b/llvm/test/CodeGen/ARM/issue159343.ll new file mode 100644 index 0000000000000..03292582918a9 --- /dev/null +++ b/llvm/test/CodeGen/ARM/issue159343.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +; Make sure there's no assertion from peephole-opt introducing illegal +; subregister index uses. + +target triple = "thumbv7-unknown-linux-android29" + +define void @_ZN11VersionEdit10DecodeFromEv(i1 %call4, ptr %__profc__ZN11VersionEdit10DecodeFromEv) nounwind { +; CHECK-LABEL: _ZN11VersionEdit10DecodeFromEv: +; CHECK: @ %bb.0: @ %land.rhs.lr.ph +; CHECK-NEXT: lsls r0, r0, #31 +; CHECK-NEXT: beq .LBB0_2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: adr r0, .LCPI0_0 +; CHECK-NEXT: vld1.64 {d0, d1}, [r0:128] +; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: .LBB0_2: @ %select.false +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: .LBB0_3: @ %select.end +; CHECK-NEXT: vldr s5, .LCPI0_1 +; CHECK-NEXT: vldr s4, .LCPI0_2 +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vst1.64 {d2, d3}, [r1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.4: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .long 0x00000000 @ float 0 +; CHECK-NEXT: .LCPI0_2: +; CHECK-NEXT: .long 0x00000001 @ float 1.40129846E-45 +land.rhs.lr.ph: + br i1 %call4, label %sw.bb, label %while.cond.while.end_crit_edge.split.loop.exit43 + +while.cond.while.end_crit_edge.split.loop.exit43: ; preds = %land.rhs.lr.ph + %ext0 = extractelement <4 x i64> zeroinitializer, i64 0 + br label %while.cond.while.end_crit_edge + +while.cond.while.end_crit_edge: ; preds = %sw.bb, %while.cond.while.end_crit_edge.split.loop.exit43 + %pgocount5374.ph = phi i64 [ %ext1, %sw.bb ], [ %ext0, %while.cond.while.end_crit_edge.split.loop.exit43 ] + %ins = insertelement <2 x i64> splat (i64 1), i64 %pgocount5374.ph, i64 1 + store <2 x i64> %ins, ptr %__profc__ZN11VersionEdit10DecodeFromEv, align 8 + ret void + +sw.bb: ; preds = %land.rhs.lr.ph + %ext1 = extractelement <4 x i64> splat (i64 1), i64 0 + br label %while.cond.while.end_crit_edge +} + diff --git a/llvm/test/CodeGen/ARM/pr159343.mir b/llvm/test/CodeGen/ARM/pr159343.mir new file mode 100644 index 0000000000000..9b71b1ad94b2f --- /dev/null +++ b/llvm/test/CodeGen/ARM/pr159343.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -run-pass=peephole-opt 
-verify-machineinstrs -mtriple=thumbv7-unknown-linux-android29 %s -o - | FileCheck %s
+---
+name: Test_shouldRewriteCopySrc_Invalid_SubReg
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $r0, $r1
+
+    ; CHECK-LABEL: name: Test_shouldRewriteCopySrc_Invalid_SubReg
+    ; CHECK: liveins: $r0, $r1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:dpair = IMPLICIT_DEF
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:dpr_vfp2 = COPY [[DEF]].dsub_0
+    ; CHECK-NEXT: [[VMOVRRD:%[0-9]+]]:gpr, [[VMOVRRD1:%[0-9]+]]:gpr = VMOVRRD [[COPY]], 14 /* CC::al */, $noreg
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:spr = COPY [[COPY]].ssub_1
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:spr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:spr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:spr = IMPLICIT_DEF
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:mqpr = REG_SEQUENCE killed [[DEF2]], %subreg.ssub_0, killed [[DEF1]], %subreg.ssub_1, killed [[DEF3]], %subreg.ssub_2, [[COPY]].ssub_1, %subreg.ssub_3
+    ; CHECK-NEXT: VST1q64 $r1, 0, killed [[REG_SEQUENCE]], 14 /* CC::al */, $noreg
+    %0:dpair = IMPLICIT_DEF
+    %1:dpr = COPY %0.dsub_0
+    %2:gpr, %3:gpr = VMOVRRD killed %1, 14 /* CC::al */, $noreg
+    %4:spr = VMOVSR killed %3, 14 /* CC::al */, $noreg
+    %5:spr = IMPLICIT_DEF
+    %6:spr = IMPLICIT_DEF
+    %7:spr = IMPLICIT_DEF
+    %8:mqpr = REG_SEQUENCE killed %6, %subreg.ssub_0, killed %5, %subreg.ssub_1, killed %7, %subreg.ssub_2, killed %4, %subreg.ssub_3
+    VST1q64 $r1, 0, killed %8, 14 /* CC::al */, $noreg
+...
diff --git a/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll b/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll
index e653aaa316fed..2bf8f29eccb40 100644
--- a/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll
+++ b/llvm/test/CodeGen/ARM/shouldRewriteCopySrc.ll
@@ -12,8 +12,8 @@ define float @shouldRewriteCopySrc(double %arg) #0 {
 ; CHECK-NEXT:    @APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    @NO_APP
-; CHECK-NEXT:    vmov r0, r1, d16
-; CHECK-NEXT:    vmov s0, r0
+; CHECK-NEXT:    vmov.f64 d0, d16
+; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $d0
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 bb:

From 870e4f9a525d07d39b1e43db6c872d6c34c1bcd8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Tue, 30 Sep 2025 18:21:25 +0300
Subject: [PATCH 262/878] [libc++][test] Use
 ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS in more places (#144339)

ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS allows waiving asserts for
cases where we can't count allocations that happen within the libc++
shared library.

When compiling with optimization, it is possible that some calls end up
generated inline, where the overridden operator new/delete do get
called (counting those calls), whereas the compiler may decide to leave
some calls to the external definition (inside the shared library, where
we can't count the calls).

In particular, in one case, a non-optimized build calls
_ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEED1Ev
from the DLL, while it gets inlined (including direct calls to operator
delete) when built with optimization.

Therefore, for the cases where we can't count allocations internally
within the library, waive these asserts.

This fixes all testcases in MinGW mode, when built with optimization
enabled.
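
A minimal sketch of the asymmetry (hypothetical code, not taken from the
test suite):

    #include <string>

    void churn() {
      // Built with -O2 against a libc++ DLL:
      std::string s(64, 'x'); // The ctor can be inlined into this TU, so its
                              // allocation reaches the test's counting
                              // operator new and new_called increases.
    } // The dtor call may still bind to the DLL's out-of-line definition,
      // whose internal operator delete bypasses the counting override.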
---
 libcxx/test/std/containers/sequences/vector/common.h | 8 ++++----
 .../class.path/path.member/path.append.pass.cpp      | 5 +++++
 .../class.path/path.member/path.concat.pass.cpp      | 5 +++++
 libcxx/test/support/count_new.h                      | 4 ++++
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/libcxx/test/std/containers/sequences/vector/common.h b/libcxx/test/std/containers/sequences/vector/common.h
index 4af6559a06e73..34453f8889b73 100644
--- a/libcxx/test/std/containers/sequences/vector/common.h
+++ b/libcxx/test/std/containers/sequences/vector/common.h
@@ -214,10 +214,10 @@ struct throwing_iterator {
 };
 
 inline void check_new_delete_called() {
-  assert(globalMemCounter.new_called == globalMemCounter.delete_called);
-  assert(globalMemCounter.new_array_called == globalMemCounter.delete_array_called);
-  assert(globalMemCounter.aligned_new_called == globalMemCounter.aligned_delete_called);
-  assert(globalMemCounter.aligned_new_array_called == globalMemCounter.aligned_delete_array_called);
+  ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.new_called == globalMemCounter.delete_called);
+  ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.new_array_called == globalMemCounter.delete_array_called);
+  ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.aligned_new_called == globalMemCounter.aligned_delete_called);
+  ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.aligned_new_array_called == globalMemCounter.aligned_delete_array_called);
 }
 
 template
diff --git a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp
index 3442019a8360c..b3d96c283c9b7 100644
--- a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.append.pass.cpp
@@ -12,6 +12,11 @@
 // These tests require locale for non-char paths
 // UNSUPPORTED: no-localization
 
+// In MinGW mode, with optimizations enabled and a DLL, the number of counted
+// allocations mismatches, as some ctor/dtor calls are generated in the
+// calling code, and some are called from the DLL.
+// ADDITIONAL_COMPILE_FLAGS: -DALLOW_MISMATCHING_LIBRARY_INTERNAL_ALLOCATIONS
+
 // <filesystem>
 
 // class path
diff --git a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp
index 5596de7328da4..570d303985e86 100644
--- a/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp
+++ b/libcxx/test/std/input.output/filesystems/class.path/path.member/path.concat.pass.cpp
@@ -12,6 +12,11 @@
 // These tests require locale for non-char paths
 // UNSUPPORTED: no-localization
 
+// In MinGW mode, with optimizations enabled and a DLL, the number of counted
+// allocations mismatches, as some ctor/dtor calls are generated in the
+// calling code, and some are called from the DLL.
+// ADDITIONAL_COMPILE_FLAGS: -DALLOW_MISMATCHING_LIBRARY_INTERNAL_ALLOCATIONS
+
 // <filesystem>
 
 // class path
diff --git a/libcxx/test/support/count_new.h b/libcxx/test/support/count_new.h
index c8169d3acceab..f175bc2ffcd44 100644
--- a/libcxx/test/support/count_new.h
+++ b/libcxx/test/support/count_new.h
@@ -626,7 +626,11 @@ struct RequireAllocationGuard {
   void requireExactly(std::size_t N) { m_req_alloc = N; m_exactly = true; }
 
   ~RequireAllocationGuard() {
+#ifdef ALLOW_MISMATCHING_LIBRARY_INTERNAL_ALLOCATIONS
+    ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkOutstandingNewEq(static_cast<int>(m_outstanding_new_on_init)));
+#else
     assert(globalMemCounter.checkOutstandingNewEq(static_cast<int>(m_outstanding_new_on_init)));
+#endif
     std::size_t Expect = m_new_count_on_init + m_req_alloc;
     assert(globalMemCounter.checkNewCalledEq(static_cast<int>(Expect)) ||
            (!m_exactly && globalMemCounter.checkNewCalledGreaterThan(static_cast<int>(Expect))));

From e8e0b3dcd0657a75df00ff12769b9837985b385a Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Tue, 30 Sep 2025 17:18:38 +0200
Subject: [PATCH 263/878] [MemorySanitizer] Generate some test checks (NFC)

---
 .../MemorySanitizer/array_types.ll           | 195 ++++++++---
 .../Instrumentation/MemorySanitizer/bmi.ll   | 185 ++++++-----
 .../Instrumentation/MemorySanitizer/byval.ll | 304 +++++++++++++-----
 3 files changed, 478 insertions(+), 206 deletions(-)

diff --git a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll
index 236b019147036..ddebe3ee20038 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll
@@ -1,89 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s
-; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK %s --allow-empty
+; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK-ORIGIN %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define [2 x i32] @InsertValue(i32 %x, i32 %y) sanitize_memory {
+; CHECK-LABEL: define [2 x i32] @InsertValue(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[TMP0]], 0
+; CHECK-NEXT: [[A:%.*]] = insertvalue [2 x i32] undef, i32 [[X]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [2 x i32] [[TMP2]], i32 [[TMP1]], 1
+; CHECK-NEXT: [[B:%.*]] = insertvalue [2 x i32] [[A]], i32 [[Y]], 1
+; CHECK-NEXT: store [2 x i32] [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret [2 x i32] [[B]]
+;
+; CHECK-ORIGIN-LABEL: define [2 x i32] @InsertValue(
+; CHECK-ORIGIN-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]]
+; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
+; 
CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP4:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP1]], i32 0 +; CHECK-ORIGIN-NEXT: [[A:%.*]] = insertvalue [2 x i32] undef, i32 [[X]], 0 +; CHECK-ORIGIN-NEXT: [[TMP7:%.*]] = insertvalue [2 x i32] [[TMP4]], i32 [[TMP2]], 1 +; CHECK-ORIGIN-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-ORIGIN-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP3]], i32 [[TMP6]] +; CHECK-ORIGIN-NEXT: [[B:%.*]] = insertvalue [2 x i32] [[A]], i32 [[Y]], 1 +; CHECK-ORIGIN-NEXT: store [2 x i32] [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP9]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret [2 x i32] [[B]] +; entry: %a = insertvalue [2 x i32] undef, i32 %x, 0 %b = insertvalue [2 x i32] %a, i32 %y, 1 ret [2 x i32] %b } -; CHECK-LABEL: @InsertValue( -; CHECK-DAG: [[Sx:%.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: [[Sy:%.*]] = load i32, ptr {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK: [[A:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[Sx]], 0 -; CHECK: [[B:%.*]] = insertvalue [2 x i32] [[A]], i32 [[Sy]], 1 -; CHECK: store [2 x i32] [[B]], ptr {{.*}}@__msan_retval_tls -; CHECK: ret [2 x i32] - - define [2 x double] @InsertValueDouble(double %x, double %y) sanitize_memory { +; CHECK-LABEL: define [2 x double] @InsertValueDouble( +; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[TMP0]], 0 +; CHECK-NEXT: [[A:%.*]] = insertvalue [2 x double] undef, double [[X]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [2 x i64] [[TMP2]], i64 [[TMP1]], 1 +; CHECK-NEXT: [[B:%.*]] = insertvalue [2 x double] [[A]], double [[Y]], 1 +; CHECK-NEXT: store [2 x i64] [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret [2 x double] [[B]] +; +; CHECK-ORIGIN-LABEL: define [2 x double] @InsertValueDouble( +; CHECK-ORIGIN-SAME: double [[X:%.*]], double [[Y:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]] +; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP4:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP1]], i32 0 +; CHECK-ORIGIN-NEXT: [[A:%.*]] = insertvalue [2 x double] undef, double [[X]], 0 +; 
CHECK-ORIGIN-NEXT: [[TMP7:%.*]] = insertvalue [2 x i64] [[TMP4]], i64 [[TMP2]], 1 +; CHECK-ORIGIN-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-ORIGIN-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP3]], i32 [[TMP6]] +; CHECK-ORIGIN-NEXT: [[B:%.*]] = insertvalue [2 x double] [[A]], double [[Y]], 1 +; CHECK-ORIGIN-NEXT: store [2 x i64] [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP9]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret [2 x double] [[B]] +; entry: %a = insertvalue [2 x double] undef, double %x, 0 %b = insertvalue [2 x double] %a, double %y, 1 ret [2 x double] %b } -; CHECK-LABEL: @InsertValueDouble( -; CHECK-DAG: [[Sx:%.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: [[Sy:%.*]] = load i64, ptr {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK: [[A:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[Sx]], 0 -; CHECK: [[B:%.*]] = insertvalue [2 x i64] [[A]], i64 [[Sy]], 1 -; CHECK: store [2 x i64] [[B]], ptr {{.*}}@__msan_retval_tls -; CHECK: ret [2 x double] - - define i32 @ExtractValue([2 x i32] %a) sanitize_memory { +; CHECK-LABEL: define i32 @ExtractValue( +; CHECK-SAME: [2 x i32] [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load [2 x i32], ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue [2 x i32] [[TMP0]], 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [2 x i32] [[A]], 1 +; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[X]] +; +; CHECK-ORIGIN-LABEL: define i32 @ExtractValue( +; CHECK-ORIGIN-SAME: [2 x i32] [[A:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]] +; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load [2 x i32], ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = extractvalue [2 x i32] [[TMP0]], 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [2 x i32] [[A]], 1 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret i32 [[X]] +; entry: %x = extractvalue [2 x i32] %a, 1 ret i32 %x } -; CHECK-LABEL: @ExtractValue( -; CHECK: [[Sa:%.*]] = load [2 x i32], ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue [2 x i32] [[Sa]], 1 -; CHECK: store i32 [[Sx]], ptr @__msan_retval_tls -; CHECK: ret i32 - - ; Regression test for PR20493. 
%MyStruct = type { i32, i32, [3 x i32] } define i32 @ArrayInStruct(%MyStruct %s) sanitize_memory { +; CHECK-LABEL: define i32 @ArrayInStruct( +; CHECK-SAME: [[MYSTRUCT:%.*]] [[S:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load { i32, i32, [3 x i32] }, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i32, [3 x i32] } [[TMP1]], 2, 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [[MYSTRUCT]] [[S]], 2, 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[X]] +; +; CHECK-ORIGIN-LABEL: define i32 @ArrayInStruct( +; CHECK-ORIGIN-SAME: [[MYSTRUCT:%.*]] [[S:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load { i32, i32, [3 x i32] }, ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32, [3 x i32] } [[TMP1]], 2, 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [[MYSTRUCT]] [[S]], 2, 1 +; CHECK-ORIGIN-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret i32 [[X]] +; %x = extractvalue %MyStruct %s, 2, 1 ret i32 %x } -; CHECK-LABEL: @ArrayInStruct( -; CHECK: [[Ss:%.*]] = load { i32, i32, [3 x i32] }, ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue { i32, i32, [3 x i32] } [[Ss]], 2, 1 -; CHECK: store i32 [[Sx]], ptr @__msan_retval_tls -; CHECK: ret i32 - - define i32 @ArrayOfStructs([3 x { i32, i32 }] %a) sanitize_memory { +; CHECK-LABEL: define i32 @ArrayOfStructs( +; CHECK-SAME: [3 x { i32, i32 }] [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load [3 x { i32, i32 }], ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue [3 x { i32, i32 }] [[TMP1]], 2, 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [3 x { i32, i32 }] [[A]], 2, 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[X]] +; +; CHECK-ORIGIN-LABEL: define i32 @ArrayOfStructs( +; CHECK-ORIGIN-SAME: [3 x { i32, i32 }] [[A:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load [3 x { i32, i32 }], ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = extractvalue [3 x { i32, i32 }] [[TMP1]], 2, 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [3 x { i32, i32 }] [[A]], 2, 1 +; CHECK-ORIGIN-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret i32 [[X]] +; %x = extractvalue [3 x { i32, i32 }] %a, 2, 1 ret i32 %x } -; CHECK-LABEL: @ArrayOfStructs( -; CHECK: [[Ss:%.*]] = load [3 x { i32, i32 }], ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue [3 x { i32, i32 }] [[Ss]], 2, 1 -; CHECK: store i32 [[Sx]], ptr @__msan_retval_tls -; CHECK: ret i32 - - define <8 x i16> @ArrayOfVectors([3 x <8 x i16>] %a) sanitize_memory { +; CHECK-LABEL: define <8 x i16> @ArrayOfVectors( +; CHECK-SAME: [3 x <8 x i16>] [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load [3 x <8 x i16>], ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue [3 x <8 x i16>] [[TMP1]], 1 +; CHECK-NEXT: [[X:%.*]] = extractvalue [3 x <8 x 
i16>] [[A]], 1 +; CHECK-NEXT: store <8 x i16> [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[X]] +; +; CHECK-ORIGIN-LABEL: define <8 x i16> @ArrayOfVectors( +; CHECK-ORIGIN-SAME: [3 x <8 x i16>] [[A:%.*]]) #[[ATTR0]] { +; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load [3 x <8 x i16>], ptr @__msan_param_tls, align 8 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: call void @llvm.donothing() +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = extractvalue [3 x <8 x i16>] [[TMP1]], 1 +; CHECK-ORIGIN-NEXT: [[X:%.*]] = extractvalue [3 x <8 x i16>] [[A]], 1 +; CHECK-ORIGIN-NEXT: store <8 x i16> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-ORIGIN-NEXT: store i32 [[TMP2]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-ORIGIN-NEXT: ret <8 x i16> [[X]] +; %x = extractvalue [3 x <8 x i16>] %a, 1 ret <8 x i16> %x } -; CHECK-LABEL: @ArrayOfVectors( -; CHECK: [[Ss:%.*]] = load [3 x <8 x i16>], ptr @__msan_param_tls -; CHECK: [[Sx:%.*]] = extractvalue [3 x <8 x i16>] [[Ss]], 1 -; CHECK: store <8 x i16> [[Sx]], ptr @__msan_retval_tls -; CHECK: ret <8 x i16> diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll index 2f60bd8b357b8..f0f67fc8f1210 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target @@ -15,131 +16,171 @@ declare i64 @llvm.x86.bmi.pdep.64(i64, i64) declare i64 @llvm.x86.bmi.pext.64(i64, i64) define i32 @Test_bzhi_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_bzhi_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.bzhi.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.bzhi.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_bzhi_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.bzhi.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_bzhi_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 @Test_bzhi_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.bzhi.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.bzhi.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_bzhi_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.bzhi.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 define i32 @Test_bextr_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_bextr_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.bextr.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.bextr.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_bextr_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.bextr.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_bextr_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 @Test_bextr_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.bextr.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.bextr.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_bextr_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to 
i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.bextr.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_pdep_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pdep.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pdep.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_pdep_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.pdep.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 @Test_pdep_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pdep.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pdep.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_pdep_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.pdep.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory { +; CHECK-LABEL: define i32 @Test_pext_32( +; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pext.32(i32 [[TMP1]], i32 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i32 @llvm.x86.bmi.pext.32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[C]] +; entry: %c = tail call i32 @llvm.x86.bmi.pext.32(i32 %a, i32 %b) ret i32 %c } -; CHECK-LABEL: @Test_pext_32( -; CHECK-DAG: %[[SA:.*]] = load i32, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i32, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i32 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i32 -; CHECK-DAG: %[[X:.*]] = call i32 @llvm.x86.bmi.pext.32(i32 %[[SA]], i32 %b) -; CHECK-DAG: %[[S:.*]] = or i32 %[[SB1]], %[[X]] -; CHECK-DAG: store i32 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i32 define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory { +; CHECK-LABEL: define i64 @Test_pext_64( +; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pext.64(i64 [[TMP1]], i64 [[B]]) +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[C:%.*]] = tail call i64 @llvm.x86.bmi.pext.64(i64 [[A]], i64 [[B]]) +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[C]] +; entry: %c = tail call i64 @llvm.x86.bmi.pext.64(i64 %a, i64 %b) ret i64 %c } -; CHECK-LABEL: @Test_pext_64( -; CHECK-DAG: %[[SA:.*]] = load i64, ptr @__msan_param_tls -; CHECK-DAG: %[[SB:.*]] = load i64, {{.*}}@__msan_param_tls to i64), i64 8) -; CHECK-DAG: %[[SB0:.*]] = icmp ne i64 %[[SB]], 0 -; CHECK-DAG: %[[SB1:.*]] = sext i1 %[[SB0]] to i64 -; CHECK-DAG: %[[X:.*]] = call i64 @llvm.x86.bmi.pext.64(i64 %[[SA]], i64 %b) -; CHECK-DAG: %[[S:.*]] = or i64 %[[SB1]], %[[X]] -; CHECK-DAG: store i64 %[[S]], {{.*}}@__msan_retval_tls -; CHECK: ret i64 diff --git a/llvm/test/Instrumentation/MemorySanitizer/byval.ll b/llvm/test/Instrumentation/MemorySanitizer/byval.ll index 258cec866d6a8..69970896a0527 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/byval.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/byval.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -S -passes="msan" 2>&1 | FileCheck %s --implicit-check-not "call void @llvm.mem" --implicit-check-not " load" --implicit-check-not " store" +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -S -passes="msan" 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -7,16 +8,28 @@ declare void @FnByVal(ptr byval(i128) %p); declare void @Fn(ptr %p); define i128 @ByValArgument(i32, ptr byval(i128) %p) sanitize_memory { -; CHECK-LABEL: @ByValArgument( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[#]], ptr align 8 inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: [[X:%.*]] = load i128, ptr %p, align 8 -; CHECK: [[_MSLD:%.*]] = load i128, ptr %[[#]], align 8 -; CHECK: %[[#]] = load i32, ptr %[[#]], align 8 -; CHECK: store i128 [[_MSLD]], ptr @__msan_retval_tls, align 8 -; CHECK: store i32 %[[#]], ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i128 [[X]] +; CHECK-LABEL: define i128 @ByValArgument( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i128, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i128, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 8 +; CHECK-NEXT: store i128 [[_MSLD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP11]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i128 [[X]] ; entry: %x = load i128, ptr %p @@ -24,13 +37,20 @@ entry: } define i128 @ByValArgumentNoSanitize(i32, ptr byval(i128) %p) { -; CHECK-LABEL: @ByValArgumentNoSanitize( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 %[[#]], i8 0, i64 16, i1 false) -; CHECK: [[X:%.*]] = load i128, ptr %p, align 8 -; CHECK: store i128 0, ptr @__msan_retval_tls, align 8 -; CHECK: store i32 0, ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i128 [[X]] +; CHECK-LABEL: define i128 @ByValArgumentNoSanitize( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i128, ptr [[P]], align 8 +; CHECK-NEXT: store i128 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i128 [[X]] ; entry: %x = load i128, ptr %p @@ -38,13 +58,20 @@ entry: } define void @ByValForward(i32, ptr byval(i128) %p) sanitize_memory { -; CHECK-LABEL: @ByValForward( -; CHECK-NEXT: entry: -; CHECK: call 
void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[#]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForward( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn(ptr [[P]]) +; CHECK-NEXT: ret void ; entry: call void @Fn(ptr %p) @@ -52,12 +79,19 @@ entry: } define void @ByValForwardNoSanitize(i32, ptr byval(i128) %p) { -; CHECK-LABEL: @ByValForwardNoSanitize( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 %[[#]], i8 0, i64 16, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardNoSanitize( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn(ptr [[P]]) +; CHECK-NEXT: ret void ; entry: call void @Fn(ptr %p) @@ -65,14 +99,27 @@ entry: } define void @ByValForwardByVal(i32, ptr byval(i128) %p) sanitize_memory { -; CHECK-LABEL: @ByValForwardByVal( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[#]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr %[[#]], i64 16, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 %[[#]], i64 16, i1 false) -; CHECK: call void @FnByVal(ptr byval(i128) %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardByVal( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: 
[[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -4 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr [[TMP8]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 [[TMP11]], i64 16, i1 false) +; CHECK-NEXT: call void @FnByVal(ptr byval(i128) [[P]]) +; CHECK-NEXT: ret void ; entry: call void @FnByVal(ptr byval(i128) %p) @@ -80,12 +127,25 @@ entry: } define void @ByValForwardByValNoSanitize(i32, ptr byval(i128) %p) { -; CHECK-LABEL: @ByValForwardByValNoSanitize( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 %[[#]], i8 0, i64 16, i1 false) -; CHECK: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 16, i1 false) -; CHECK: call void @FnByVal(ptr byval(i128) %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardByValNoSanitize( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i128) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -4 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @FnByVal(ptr byval(i128) [[P]]) +; CHECK-NEXT: ret void ; entry: call void @FnByVal(ptr byval(i128) %p) @@ -96,16 +156,30 @@ declare void @FnByVal8(ptr byval(i8) %p); declare void @Fn8(ptr %p); define i8 @ByValArgument8(i32, ptr byval(i8) %p) sanitize_memory { -; CHECK-LABEL: @ByValArgument8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[#]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) -; CHECK: [[X:%.*]] = load i8, ptr %p, align 1 -; CHECK: [[_MSLD:%.*]] = 
load i8, ptr %[[#]], align 1 -; CHECK: %[[#]] = load i32, ptr %[[#]], align 4 -; CHECK: store i8 [[_MSLD]], ptr @__msan_retval_tls, align 8 -; CHECK: store i32 %[[#]], ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i8 [[X]] +; CHECK-LABEL: define i8 @ByValArgument8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: store i8 [[_MSLD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP13]], ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i8 [[X]] ; entry: %x = load i8, ptr %p @@ -113,13 +187,21 @@ entry: } define i8 @ByValArgumentNoSanitize8(i32, ptr byval(i8) %p) { -; CHECK-LABEL: @ByValArgumentNoSanitize8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 1 %[[#]], i8 0, i64 1, i1 false) -; CHECK: [[X:%.*]] = load i8, ptr %p, align 1 -; CHECK: store i8 0, ptr @__msan_retval_tls, align 8 -; CHECK: store i32 0, ptr @__msan_retval_origin_tls, align 4 -; CHECK: ret i8 [[X]] +; CHECK-LABEL: define i8 @ByValArgumentNoSanitize8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP3]], i8 0, i64 1, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4 +; CHECK-NEXT: ret i8 [[X]] ; entry: %x = load i8, ptr %p @@ -127,13 +209,21 @@ entry: } define void @ByValForward8(i32, ptr byval(i8) %p) sanitize_memory { -; CHECK-LABEL: @ByValForward8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[#]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr 
align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn8(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForward8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn8(ptr [[P]]) +; CHECK-NEXT: ret void ; entry: call void @Fn8(ptr %p) @@ -141,12 +231,20 @@ entry: } define void @ByValForwardNoSanitize8(i32, ptr byval(i8) %p) { -; CHECK-LABEL: @ByValForwardNoSanitize8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memset.p0.i64(ptr align 1 %[[#]], i8 0, i64 1, i1 false) -; CHECK: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK: call void @Fn8(ptr %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardNoSanitize8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP3]], i8 0, i64 1, i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @Fn8(ptr [[P]]) +; CHECK-NEXT: ret void ; entry: call void @Fn8(ptr %p) @@ -154,14 +252,28 @@ entry: } define void @ByValForwardByVal8(i32, ptr byval(i8) %p) sanitize_memory { -; CHECK-LABEL: @ByValForwardByVal8( -; CHECK-NEXT: entry: -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[#]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[#]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr %[[#]], i64 1, i1 false) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 %[[#]], i64 4, i1 false) -; CHECK: call void @FnByVal8(ptr byval(i8) %p) -; CHECK: ret void +; CHECK-LABEL: define void @ByValForwardByVal8( +; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] 
= add i64 [[TMP2]], 17592186044416
+; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false)
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 17592186044416
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr @__msan_param_tls, ptr [[TMP9]], i64 1, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @__msan_param_origin_tls, ptr align 4 [[TMP12]], i64 4, i1 false)
+; CHECK-NEXT: call void @FnByVal8(ptr byval(i8) [[P]])
+; CHECK-NEXT: ret void
 ;
 entry:
   call void @FnByVal8(ptr byval(i8) %p)
@@ -169,12 +281,26 @@ entry:
 }
 
 define void @ByValForwardByValNoSanitize8(i32, ptr byval(i8) %p) {
-; CHECK-LABEL: @ByValForwardByValNoSanitize8(
-; CHECK-NEXT: entry:
-; CHECK: call void @llvm.memset.p0.i64(ptr align 1 %[[#]], i8 0, i64 1, i1 false)
-; CHECK: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 1, i1 false)
-; CHECK: call void @FnByVal8(ptr byval(i8) %p)
-; CHECK: ret void
+; CHECK-LABEL: define void @ByValForwardByValNoSanitize8(
+; CHECK-SAME: i32 [[TMP0:%.*]], ptr byval(i8) [[P:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416
+; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP3]], i8 0, i64 1, i1 false)
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 17592186044416
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr @__msan_param_tls, i8 0, i64 1, i1 false)
+; CHECK-NEXT: call void @FnByVal8(ptr byval(i8) [[P]])
+; CHECK-NEXT: ret void
 ;
 entry:
   call void @FnByVal8(ptr byval(i8) %p)

From ccd06e48098b826cafcc2e553a8cb9081e0a06dc Mon Sep 17 00:00:00 2001
From: Hristo Hristov
Date: Tue, 30 Sep 2025 18:26:30 +0300
Subject: [PATCH 264/878] [libc++][istream] P3223R2: Making
 `std::istream::ignore` less surprising (#147007)

Implements https://wg21.link/P3223R2 as a DR, as recommended in
https://github.com/cplusplus/papers/issues/1871#issuecomment-2993018698.

Resolves the -1L ambiguity.
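To make the old surprise concrete (an illustrative sketch, not code from
this patch): on targets where plain char is signed, passing a delimiter such
as '\xA1' to the single int_type overload sign-extends it, so it can never
compare equal to what traits_type::to_int_type() produces for the characters
read, and ignore() runs to EOF. The new char_type overload maps the
delimiter through to_int_type() first:

    #include <cassert>
    #include <limits>
    #include <string>

    int main() {
      using traits = std::char_traits<char>;
      char delim = '\xA1';                       // byte 0xA1
      int old_path = delim;                      // integral conversion: -95 if char is signed
      int new_path = traits::to_int_type(delim); // always 161
      if (std::numeric_limits<char>::is_signed)
        assert(old_path != new_path); // why the old comparison never matched
    }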
Closes #148178 --- libcxx/docs/ReleaseNotes/22.rst | 1 + libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/include/istream | 6 +++ .../ignore.char_type.pass.cpp | 41 +++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/ignore.char_type.pass.cpp diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 87d86c1345618..8d023a14e89e6 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -41,6 +41,7 @@ Implemented Papers - P2321R2: ``zip`` (`Github `__) (The paper is partially implemented. ``zip_transform_view`` is implemented in this release) - P3044R2: sub-``string_view`` from ``string`` (`Github `__) +- P3223R2: Making ``std::istream::ignore`` less surprising (`Github `__) - P3168R2: Give ``std::optional`` Range Support (`Github `__) Improvements and New Features diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 9e1678f22c4be..4e0918b0246c1 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -151,7 +151,7 @@ "`P3111R8 `__","Atomic Reduction Operations","2025-06 (Sofia)","","","`#148174 `__","" "`P3060R3 `__","Add ``std::views::indices(n)``","2025-06 (Sofia)","","","`#148175 `__","" "`P2319R5 `__","Prevent ``path`` presentation problems","2025-06 (Sofia)","","","`#148177 `__","" -"`P3223R2 `__","Making ``std::istream::ignore`` less surprising","2025-06 (Sofia)","","","`#148178 `__","" +"`P3223R2 `__","Making ``std::istream::ignore`` less surprising","2025-06 (Sofia)","|Complete|","22","`#148178 `__","" "`P2781R9 `__","``std::constant_wrapper``","2025-06 (Sofia)","","","`#148179 `__","" "`P3697R1 `__","Minor additions to C++26 standard library hardening","2025-06 (Sofia)","","","`#148180 `__","" "`P3552R3 `__","Add a Coroutine Task Type","2025-06 (Sofia)","","","`#148182 `__","" diff --git a/libcxx/include/istream b/libcxx/include/istream index 93def61a8b477..7f15521f91a8a 100644 --- a/libcxx/include/istream +++ b/libcxx/include/istream @@ -70,6 +70,7 @@ public: basic_istream& getline(char_type* s, streamsize n, char_type delim); basic_istream& ignore(streamsize n = 1, int_type delim = traits_type::eof()); + basic_istream& ignore(streamsize n, char_type delim); // Since C++26, implemented as a DR int_type peek(); basic_istream& read (char_type* s, streamsize n); streamsize readsome(char_type* s, streamsize n); @@ -172,6 +173,7 @@ template # include <__type_traits/conjunction.h> # include <__type_traits/enable_if.h> # include <__type_traits/is_base_of.h> +# include <__type_traits/is_same.h> # include <__type_traits/make_unsigned.h> # include <__utility/declval.h> # include <__utility/forward.h> @@ -292,6 +294,10 @@ public: basic_istream& getline(char_type* __s, streamsize __n, char_type __dlm); basic_istream& ignore(streamsize __n = 1, int_type __dlm = traits_type::eof()); + template ::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI basic_istream& ignore(streamsize __n, char_type __delim) { + return ignore(__n, traits_type::to_int_type(__delim)); + } int_type peek(); basic_istream& read(char_type* __s, streamsize __n); streamsize readsome(char_type* __s, streamsize __n); diff --git a/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/ignore.char_type.pass.cpp b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/ignore.char_type.pass.cpp new file mode 100644 index 0000000000000..d0d174c1d4d87 --- 
/dev/null
+++ b/libcxx/test/std/input.output/iostream.format/input.streams/istream.unformatted/ignore.char_type.pass.cpp
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Requires 396145d in the built library.
+// XFAIL: using-built-library-before-llvm-9
+// XFAIL: FROZEN-CXX03-HEADERS-FIXME
+
+// <istream>
+
+// basic_istream& ignore(streamsize n, char_type delim);
+
+#include <cassert>
+#include <sstream>
+#include <string>
+
+#include "test_macros.h"
+
+int main(int, char**) {
+  std::istringstream in("\xF0\x9F\xA4\xA1 Clown Face");
+  in.ignore(100, '\xA1'); // Ignore up to '\xA1' delimiter,
+                          // previously might have ignored to EOF.
+
+  assert(in.gcount() == 4); // 4 bytes were ignored.
+  assert(in.peek() == ' '); // Next character is a space.
+
+  std::string str; // Read the next word.
+  in >> str;
+  assert(str == "Clown");
+
+  // Parameter value "-1L" doesn't cause ambiguity with the char_type overload.
+  in.ignore(100, -1L); // Ignore up to EOF, which is the default behavior.
+  assert(in.eof());    // Stream should be at EOF now.
+  assert(in.gcount() == 5);
+
+  return 0;
+}

From 042540ab66e5ebee650e45ba6bfa4e68a045ff0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix-Antoine=20Constantin?=
Date: Tue, 30 Sep 2025 11:27:46 -0400
Subject: [PATCH 265/878] =?UTF-8?q?[clang-tidy]=C2=A0New=20Option=20Invali?=
 =?UTF-8?q?d=20Enum=20Default=20Initialization=20(#159220)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added a new option `IgnoredEnums` to
bugprone-invalid-enum-default-initialization to limit the scope of the
analysis.

This is needed to remove warnings on enums like `std::errc` that don't
define an enumerator with the value 0 but are still value-initialized to
check whether calls like `std::from_chars` executed correctly.

The C++ Standard, section 22.13.2, mentions the following:

"[...] If the member ec of the return value is such that the value is equal
to the value of a value-initialized errc, the conversion was successful
[...]"

This means that a call to `std::errc{}` is clearly defined by the standard
and should not raise any warning under this check.
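For reference, the value-initialization idiom that the default `::std::errc`
exclusion keeps warning-free looks like this (a usage sketch, not code from
this patch):

    #include <cassert>
    #include <charconv>
    #include <string_view>

    int main() {
      std::string_view s = "42";
      int value = 0;
      auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value);
      // Per the wording quoted above, success is detected by comparing against
      // a value-initialized errc, even though errc has no 0-valued enumerator.
      if (ec == std::errc{})
        assert(value == 42);
      (void)ptr;
    }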
--- .../InvalidEnumDefaultInitializationCheck.cpp | 20 +++++-
 .../InvalidEnumDefaultInitializationCheck.h   |  4 ++
 clang-tools-extra/docs/ReleaseNotes.rst       |  4 ++
 .../invalid-enum-default-initialization.rst   | 12 ++++
 .../invalid-enum-default-initialization.cpp   | 65 ++++++++++++-------
 5 files changed, 79 insertions(+), 26 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp
index 1e657888b0fc0..4fc1b3b99ece4 100644
--- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "InvalidEnumDefaultInitializationCheck.h"
+#include "../utils/Matchers.h"
+#include "../utils/OptionsUtils.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/TypeVisitor.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
@@ -88,12 +90,24 @@ class FindEnumMember : public TypeVisitor<FindEnumMember, bool> {
 
 InvalidEnumDefaultInitializationCheck::InvalidEnumDefaultInitializationCheck(
     StringRef Name, ClangTidyContext *Context)
-    : ClangTidyCheck(Name, Context) {}
+    : ClangTidyCheck(Name, Context),
+      IgnoredEnums(
+          utils::options::parseStringList(Options.get("IgnoredEnums", ""))) {
+  IgnoredEnums.emplace_back("::std::errc");
+}
+
+void InvalidEnumDefaultInitializationCheck::storeOptions(
+    ClangTidyOptions::OptionMap &Opts) {
+  Options.store(Opts, "IgnoredEnums",
+                utils::options::serializeStringList(IgnoredEnums));
+}
 
 void InvalidEnumDefaultInitializationCheck::registerMatchers(
     MatchFinder *Finder) {
-  auto EnumWithoutZeroValue = enumType(
-      hasDeclaration(enumDecl(isCompleteAndHasNoZeroValue()).bind("enum")));
+  auto EnumWithoutZeroValue = enumType(hasDeclaration(
+      enumDecl(isCompleteAndHasNoZeroValue(),
+               unless(matchers::matchesAnyListedName(IgnoredEnums)))
+          .bind("enum")));
   auto EnumOrArrayOfEnum = qualType(hasUnqualifiedDesugaredType(
       anyOf(EnumWithoutZeroValue,
             arrayType(hasElementType(qualType(
diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h
index 4f1a4a2a21af3..5e2662f642cd7 100644
--- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h
@@ -24,6 +24,10 @@ class InvalidEnumDefaultInitializationCheck : public ClangTidyCheck {
                                         ClangTidyContext *Context);
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+
+private:
+  std::vector<llvm::StringRef> IgnoredEnums;
 };
 
 } // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index c3a6d2f9b2890..62e1987377989 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -253,6 +253,10 @@ Changes in existing checks
   ` check by adding detection for variables introduced by structured
   bindings.
 
+- Improved :doc:`bugprone-invalid-enum-default-initialization
+  <clang-tidy/checks/bugprone/invalid-enum-default-initialization>` with new
+  `IgnoredEnums` option to ignore specified enums during analysis.
+
 - Improved :doc:`bugprone-narrowing-conversions
   <clang-tidy/checks/bugprone/narrowing-conversions>` check by fixing false
   positive from analysis of a conditional expression in C.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst index a3bd2b6d85c37..45cb878383a7d 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/invalid-enum-default-initialization.rst @@ -19,6 +19,9 @@ The check emits a warning only if an ``enum`` variable is default-initialized value of 0. The type can be a scoped or non-scoped ``enum``. Unions are not handled by the check (if it contains a member of enumeration type). +Note that the ``enum`` ``std::errc`` is always ignored because it is expected to +be default initialized, despite not defining an enumerator with the value 0. + .. code-block:: c++ enum class Enum1: int { @@ -70,3 +73,12 @@ enum type) are set to 0. enum Enum1 Array3[2][2] = {{Enum1_A, Enum1_A}}; // warn: elements of second array are initialized to 0 struct Struct1 S1 = {1}; // warn: element 'b' is initialized to 0 + + +Options +------- + +.. option:: IgnoredEnums + + Semicolon-separated list of regexes specifying enums for which this check won't be + enforced. Default is `::std::errc`. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp index eb3d5632eaef7..85ff481aae301 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/invalid-enum-default-initialization.cpp @@ -1,4 +1,5 @@ -// RUN: %check_clang_tidy -std=c++17 %s bugprone-invalid-enum-default-initialization %t +// RUN: %check_clang_tidy -check-suffixes=,DEFAULT -std=c++17-or-later %s bugprone-invalid-enum-default-initialization %t +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-invalid-enum-default-initialization %t -- -config="{CheckOptions: {bugprone-invalid-enum-default-initialization.IgnoredEnums: '::MyEnum'}}" enum class Enum0: int { A = 0, @@ -24,10 +25,10 @@ Enum0 E0_6{Enum0::B}; Enum1 E1_1{}; // CHECK-NOTES: :[[@LINE-1]]:11: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here Enum1 E1_2 = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:14: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here Enum1 E1_3; Enum1 E1_4{0}; Enum1 E1_5{Enum1::A}; @@ -35,44 +36,44 @@ Enum1 E1_6{Enum1::B}; Enum2 E2_1{}; // CHECK-NOTES: :[[@LINE-1]]:11: warning: enum value of type 'Enum2' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here Enum2 E2_2 = Enum2(); // CHECK-NOTES: :[[@LINE-1]]:14: warning: enum value of type 'Enum2' initialized with invalid value of 0, enum doesn't have a zero-value enumerator -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here void f1() { static Enum1 S; // FIMXE: warn for this? 
Enum1 A; Enum1 B = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:13: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here int C = int(); } void f2() { Enum1 A{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 B = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:13: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 C[5] = {{}}; // CHECK-NOTES: :[[@LINE-1]]:16: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:17: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 D[5] = {}; // FIMXE: warn for this? // CHECK-NOTES: :[[@LINE-1]]:16: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here } struct S1 { Enum1 E_1{}; // CHECK-NOTES: :[[@LINE-1]]:12: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 E_2 = Enum1(); // CHECK-NOTES: :[[@LINE-1]]:15: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here Enum1 E_3; Enum1 E_4; Enum1 E_5; @@ -80,10 +81,10 @@ struct S1 { S1() : E_3{}, // CHECK-NOTES: :[[@LINE-1]]:8: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here E_4(), // CHECK-NOTES: :[[@LINE-1]]:8: warning: enum value of type 'Enum1' initialized with invalid value of 0, enum doesn't have a zero-value enumerator - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here E_5{Enum1::B} {} }; @@ -110,22 +111,22 @@ struct S5 { S2 VarS2{}; // CHECK-NOTES: :[[@LINE-1]]:9: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:9: warning: enum value of type 'Enum2' initialized with invalid value of 0 -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here S3 VarS3{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:10: warning: enum value of 
type 'Enum2' initialized with invalid value of 0 -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here S4 VarS4{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here // CHECK-NOTES: :[[@LINE-3]]:10: warning: enum value of type 'Enum2' initialized with invalid value of 0 -// CHECK-NOTES: :13:6: note: enum is defined here +// CHECK-NOTES: :14:6: note: enum is defined here S5 VarS5{}; // CHECK-NOTES: :[[@LINE-1]]:10: warning: enum value of type 'Enum1' initialized with invalid value of 0 -// CHECK-NOTES: :8:12: note: enum is defined here +// CHECK-NOTES: :9:12: note: enum is defined here enum class EnumFwd; @@ -139,7 +140,25 @@ template struct Templ { T Mem1{}; // CHECK-NOTES: :[[@LINE-1]]:9: warning: enum value of type 'Enum1' initialized with invalid value of 0 - // CHECK-NOTES: :8:12: note: enum is defined here + // CHECK-NOTES: :9:12: note: enum is defined here }; Templ TemplVar; + +enum MyEnum { + A = 1, + B +}; + +MyEnum MyEnumVar{}; +// CHECK-NOTES-DEFAULT: :[[@LINE-1]]:17: warning: enum value of type 'MyEnum' initialized with invalid value of 0, enum doesn't have a zero-value enumerator +// CHECK-NOTES-DEFAULT: :148:6: note: enum is defined here + +namespace std { + enum errc { + A = 1, + B + }; +} + +std::errc err{}; From 95069c119668a190afe766cab912a17a432a6f0f Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 30 Sep 2025 10:45:27 -0500 Subject: [PATCH 266/878] [flang][OpenMP] Remove unused DECLARE REDUCTION from openmp-utils.h, NFC (#161390) DECLARE REDUCTION is now handled by the generic code, and the special handling no longer applies. --- flang/include/flang/Parser/openmp-utils.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index b8f3559097750..f761332c9cfd7 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -39,7 +39,6 @@ struct ConstructId { } MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPDeclareReductionConstruct, D::OMPD_declare_reduction); MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); #undef MAKE_CONSTR_ID @@ -92,7 +91,6 @@ struct DirectiveNameScope { if constexpr (std::is_base_of_v) { return std::get(x.t).DirName(); } else if constexpr (std::is_same_v || - std::is_same_v || std::is_same_v) { return MakeName(std::get(x.t).source, ConstructId::id); } else { From 552ee3c160fe619e86e796b2fde2cc51592c3b5a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 30 Sep 2025 08:56:20 -0700 Subject: [PATCH 267/878] [ADT] Add const to AllocatorList::{empty,size} (#161320) While I am at it, this patch adds [[nodiscard]]. 
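As a quick illustration of what the annotations buy (a standalone sketch, not
the actual ADT header), a discarded call to a [[nodiscard]] accessor now draws
a compiler warning:

  #include <cstddef>

  struct List {
    [[nodiscard]] bool empty() const { return Size == 0; }
    [[nodiscard]] std::size_t size() const { return Size; }
    std::size_t Size = 0;
  };

  void use(List &L) {
    L.empty();       // warning: ignoring return value (probably meant clear())
    if (L.empty())   // intended usage compiles silently
      (void)L.size();
  }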
--- llvm/include/llvm/ADT/AllocatorList.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/ADT/AllocatorList.h b/llvm/include/llvm/ADT/AllocatorList.h
index 04d0afc9d076e..2716b83ca224a 100644
--- a/llvm/include/llvm/ADT/AllocatorList.h
+++ b/llvm/include/llvm/ADT/AllocatorList.h
@@ -155,8 +155,8 @@ template <class T, class AllocatorT> class AllocatorList : AllocatorT {
     std::swap(getAlloc(), RHS.getAlloc());
   }
 
-  bool empty() { return List.empty(); }
-  size_t size() { return List.size(); }
+  [[nodiscard]] bool empty() const { return List.empty(); }
+  [[nodiscard]] size_t size() const { return List.size(); }
 
   iterator begin() { return iterator(List.begin()); }
   iterator end() { return iterator(List.end()); }

From 0db995e60b6b52aa1debc685d364afa5bbfd220c Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Tue, 30 Sep 2025 08:56:27 -0700
Subject: [PATCH 268/878] [ADT] Add const to operator== in ArrayRef.h (#161321)

While I am at it, this patch adds [[nodiscard]].

---
 llvm/include/llvm/ADT/ArrayRef.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/ADT/ArrayRef.h b/llvm/include/llvm/ADT/ArrayRef.h
index fb91690bb0eb3..448d10013d371 100644
--- a/llvm/include/llvm/ADT/ArrayRef.h
+++ b/llvm/include/llvm/ADT/ArrayRef.h
@@ -547,7 +547,8 @@ namespace llvm {
   }
 
   template <typename T>
-  inline bool operator==(SmallVectorImpl<T> &LHS, ArrayRef<T> RHS) {
+  [[nodiscard]] inline bool operator==(const SmallVectorImpl<T> &LHS,
+                                       ArrayRef<T> RHS) {
     return ArrayRef<T>(LHS).equals(RHS);
   }
 
@@ -557,7 +558,8 @@ namespace llvm {
   }
 
   template <typename T>
-  inline bool operator!=(SmallVectorImpl<T> &LHS, ArrayRef<T> RHS) {
+  [[nodiscard]] inline bool operator!=(const SmallVectorImpl<T> &LHS,
+                                       ArrayRef<T> RHS) {
     return !(LHS == RHS);
   }

From e457307b8aedbcb70430f2f907057589fa363600 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Tue, 30 Sep 2025 08:56:36 -0700
Subject: [PATCH 269/878] [ADT] Make non-const functions forward to const
 versions (NFC) (#161323)

These functions all correspond to their respective const versions. This patch
uses the "const_cast" trick to forward to the const versions.

---
 llvm/include/llvm/Support/TrailingObjects.h | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h
index 3eb7c0bd1f379..dc03285c4994b 100644
--- a/llvm/include/llvm/Support/TrailingObjects.h
+++ b/llvm/include/llvm/Support/TrailingObjects.h
@@ -284,11 +284,8 @@ class TrailingObjects
   /// (which must be one of those specified in the class template). The
   /// array may have zero or more elements in it.
   template <typename T> T *getTrailingObjects() {
-    verifyTrailingObjectsAssertions();
-    // Forwards to an impl function with overloads, since member
-    // function templates can't be specialized.
-    return this->getTrailingObjectsImpl(
-        static_cast<BaseTy *>(this), TrailingObjectsBase::OverloadToken<T>());
+    return const_cast<T *>(
+        static_cast<const TrailingObjects *>(this)->getTrailingObjects<T>());
   }
 
   // getTrailingObjects() specialization for a single trailing type.
@@ -306,13 +303,8 @@ class TrailingObjects
   }
 
   FirstTrailingType *getTrailingObjects() {
-    static_assert(sizeof...(TrailingTys) == 1,
-                  "Can use non-templated getTrailingObjects() only when there "
-                  "is a single trailing type");
-    verifyTrailingObjectsAssertions();
-    return this->getTrailingObjectsImpl(
-        static_cast<BaseTy *>(this),
-        TrailingObjectsBase::OverloadToken<FirstTrailingType>());
+    return const_cast<FirstTrailingType *>(
+        static_cast<const TrailingObjects *>(this)->getTrailingObjects());
   }
 
   // Functions that return the trailing objects as ArrayRefs.
@@ -342,9 +334,8 @@ class TrailingObjects
   }
 
   template <typename T> T *getTrailingObjectsNonStrict() {
-    verifyTrailingObjectsAssertions();
-    return this->getTrailingObjectsImpl(
-        static_cast<BaseTy *>(this), TrailingObjectsBase::OverloadToken<T>());
+    return const_cast<T *>(static_cast<const TrailingObjects *>(this)
+                               ->getTrailingObjectsNonStrict<T>());
   }
 
   template <typename T>

From d7421e6a28594b4d43a67fad68cffa801474c98a Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Tue, 30 Sep 2025 08:56:44 -0700
Subject: [PATCH 270/878] [llvm] Proofread CIBestPractices.rst (#161324)

---
 llvm/docs/CIBestPractices.rst | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/docs/CIBestPractices.rst b/llvm/docs/CIBestPractices.rst
index 8301b95f54938..da92ed3660e55 100644
--- a/llvm/docs/CIBestPractices.rst
+++ b/llvm/docs/CIBestPractices.rst
@@ -9,11 +9,11 @@
 This document contains a list of guidelines and best practices to use when
 working on LLVM's CI systems. These are intended to keep our actions reliable,
 consistent, and secure.
 
-Github Actions Best Practices
+GitHub Actions Best Practices
 =============================
 
 This section contains information on best practices/guidelines when working on
-LLVM's github actions workflows.
+LLVM's GitHub actions workflows.
 
 Disabling Jobs In Forks
 -----------------------
@@ -35,7 +35,7 @@ jobs specified within a workflow:
 
     if: github.repository_owner == 'llvm'
 
 We choose to use ``github.repository_owner`` rather than ``github.repository``
-to enable these workflows to run in forks inside the LLVM organization such as
+to enable these workflows to run in forks inside the LLVM organization, such as
 the ClangIR fork.
 
 There are some exceptions to this rule where ``github.repository`` might be
@@ -46,7 +46,7 @@ release tasks, which should not run anywhere else.
 Hash Pinning Dependencies
 -------------------------
 
-Github Actions allows the use of actions from other repositories as steps in
+GitHub Actions allows the use of actions from other repositories as steps in
 jobs. We take advantage of various actions for a variety of different tasks,
 but especially tasks like checking out the repository, and
 downloading/uploading build caches. These actions are typically versioned with
@@ -59,9 +59,9 @@ just a release, which looks like the following:
 
     uses: actions/checkout@v4
 
 However, it is best practice to specify an exact commit SHA from which to pull
-the action from, noting the version in a comment:
+the action, noting the version in a comment:
 
-We plan on revisiting this recommendation once Github's immutable actions have
+We plan on revisiting this recommendation once GitHub's immutable actions have
 been rolled out as GA.
 
 .. code-block:: yaml
@@ -72,11 +72,11 @@ been rolled out as GA.
 
 This is beneficial for two reasons: reliability and security. Specifying an
 exact SHA rather than just a major version ensures we end up running the same
-action originally specified when the workflow as authored and/or updated,
+action originally specified when the workflow was authored and/or updated,
 and that no breaking changes sneak in from new versions of a workflow being
 released. However, this effect could also be achieved by specifying an exact
 dot release. The biggest reason to prefer hash pinned dependencies is security.
-Release assets on Github are mutable, allowing an attacker to change the code
+Release assets on GitHub are mutable, allowing an attacker to change the code
 within a specific version of an action after the fact, potentially stealing
 sensitive tokens and credentials.
 Hash pinning the dependencies prevents this
 as the hash would change with the code.
 
@@ -84,10 +84,10 @@ as the hash would change with the code.
 Using Versioned Runner Images
 -----------------------------
 
-Github actions allows the use of either specifically versioned runner images
+GitHub actions allows the use of either specifically versioned runner images
 (e.g., ``ubuntu-22.04``), or just the latest runner image (e.g.,
 ``ubuntu-latest``). It is best practice to use explicitly versioned
-runner images. This prevents breakages when Github rolls the latest runner
+runner images. This prevents breakages when GitHub rolls the latest runner
 image to a new version with potentially breaking changes, instead allowing us
 to explicitly opt-in to using the new image when we have done sufficient
 testing to ensure that our existing workflows work as expected in the new
@@ -112,7 +112,7 @@ the principle of least privilege.
 Ensuring Workflows Run on the Correct Events
 --------------------------------------------
 
-Github allows workflows to run on a multitude of events and it is important to
+GitHub allows workflows to run on a multitude of events, and it is important to
 configure a workflow such that it triggers on the correct events. There are
 two main best practices around events that trigger workflows:

From ab645f1dff323569ffe8a6fabedbae259101b1a7 Mon Sep 17 00:00:00 2001
From: YixingZhang007
Date: Tue, 30 Sep 2025 11:58:35 -0400
Subject: [PATCH 271/878] [SPIRV] Avoid OpQuantizeToF16 in SPIR-V kernel test
 (#158086)

This PR resolves the current failure in the `integer-casts.ll` SPIR-V test
during CI runs in `llvm-project`. The failure occurs because the SPIR-V
instruction `OpQuantizeToF16` requires the `Capability::Shader`. However, the
function in `integer-casts.ll` is written as a kernel function and executed in
a kernel environment. Therefore, `Capability::Kernel` is emitted instead of
`Capability::Shader`.

To fix this, we remove the `QuantizeToF16` test from `integer-casts.ll` in
this PR.
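The constraint at play can be summarized with a small standalone sketch; the
enum and helper below are illustrative stand-ins, not the actual SPIR-V
backend API:

  #include <cassert>

  enum class Capability { Kernel, Shader };

  // Per the message above, OpQuantizeToF16 is only legal in modules that
  // declare the Shader capability.
  bool quantizeToF16IsLegal(Capability declared) {
    return declared == Capability::Shader;
  }

  int main() {
    // A SPIR-V kernel module declares Capability::Kernel, so spirv-val
    // rejects the instruction -- the CI failure described above.
    assert(!quantizeToF16IsLegal(Capability::Kernel));
    assert(quantizeToF16IsLegal(Capability::Shader));
    return 0;
  }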
--- .../SPIRV/instructions/integer-casts.ll | 45 +++++++++---------- .../SPIRV/instructions/quantizeto16.ll | 15 +++++++ 2 files changed, 36 insertions(+), 24 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/instructions/quantizeto16.ll diff --git a/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll b/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll index 6a4b4f593bf3b..5fe2cc883ceb9 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/integer-casts.ll @@ -14,11 +14,11 @@ ; CHECK-DAG: OpName [[ZEXT8_16:%.*]] "u8tou16" ; CHECK-DAG: OpName [[ZEXT16_32:%.*]] "u16tou32" +; CHECK-DAG: OpName %[[#R16:]] "r16" ; CHECK-DAG: OpName %[[#R17:]] "r17" ; CHECK-DAG: OpName %[[#R18:]] "r18" ; CHECK-DAG: OpName %[[#R19:]] "r19" ; CHECK-DAG: OpName %[[#R20:]] "r20" -; CHECK-DAG: OpName %[[#R21:]] "r21" ; CHECK-DAG: OpName [[TRUNC32_16v4:%.*]] "i32toi16v4" ; CHECK-DAG: OpName [[TRUNC32_8v4:%.*]] "i32toi8v4" @@ -30,11 +30,11 @@ ; CHECK-DAG: OpName [[ZEXT8_16v4:%.*]] "u8tou16v4" ; CHECK-DAG: OpName [[ZEXT16_32v4:%.*]] "u16tou32v4" -; CHECK-DAG: OpDecorate %[[#R17]] FPRoundingMode RTZ -; CHECK-DAG: OpDecorate %[[#R18]] FPRoundingMode RTE -; CHECK-DAG: OpDecorate %[[#R19]] FPRoundingMode RTP -; CHECK-DAG: OpDecorate %[[#R20]] FPRoundingMode RTN -; CHECK-DAG: OpDecorate %[[#R21]] SaturatedConversion +; CHECK-DAG: OpDecorate %[[#R16]] FPRoundingMode RTZ +; CHECK-DAG: OpDecorate %[[#R17]] FPRoundingMode RTE +; CHECK-DAG: OpDecorate %[[#R18]] FPRoundingMode RTP +; CHECK-DAG: OpDecorate %[[#R19]] FPRoundingMode RTN +; CHECK-DAG: OpDecorate %[[#R20]] SaturatedConversion ; CHECK-DAG: [[F32:%.*]] = OpTypeFloat 32 ; CHECK-DAG: [[F16:%.*]] = OpTypeFloat 16 @@ -258,7 +258,6 @@ define <4 x i32> @u16tou32v4(<4 x i16> %a) { ; CHECK: %[[#]] = OpUConvert [[U32]] %[[#]] ; CHECK: %[[#]] = OpSConvert [[U32]] %[[#]] ; CHECK: %[[#]] = OpFConvert [[F16]] %[[#]] -; CHECK: %[[#]] = OpQuantizeToF16 [[F32]] %[[#]] ; CHECK: %[[#]] = OpSatConvertSToU [[U64]] %[[#]] ; CHECK: %[[#]] = OpSatConvertUToS [[U64]] %[[#]] ; CHECK: %[[#]] = OpConvertPtrToU [[U64]] [[Arg1]] @@ -267,11 +266,11 @@ define <4 x i32> @u16tou32v4(<4 x i16> %a) { ; CHECK: %[[#]] = OpSConvert [[U32v4]] %[[#]] ; CHECK: %[[#]] = OpConvertUToF [[F32]] %[[#]] ; CHECK: %[[#]] = OpConvertUToF [[F32]] %[[#]] +; CHECK: %[[#R16]] = OpFConvert [[F32v2]] %[[#]] ; CHECK: %[[#R17]] = OpFConvert [[F32v2]] %[[#]] ; CHECK: %[[#R18]] = OpFConvert [[F32v2]] %[[#]] ; CHECK: %[[#R19]] = OpFConvert [[F32v2]] %[[#]] -; CHECK: %[[#R20]] = OpFConvert [[F32v2]] %[[#]] -; CHECK: %[[#R21]] = OpConvertFToU [[U8]] %[[#]] +; CHECK: %[[#R20]] = OpConvertFToU [[U8]] %[[#]] ; CHECK: OpFunctionEnd define dso_local spir_kernel void @test_wrappers(ptr addrspace(4) %arg, i64 %arg_ptr, <4 x i8> %arg_v2) { %r1 = call spir_func i32 @__spirv_ConvertFToU(float 0.000000e+00) @@ -281,20 +280,19 @@ define dso_local spir_kernel void @test_wrappers(ptr addrspace(4) %arg, i64 %arg %r5 = call spir_func i32 @__spirv_UConvert(i64 1) %r6 = call spir_func i32 @__spirv_SConvert(i64 1) %r7 = call spir_func half @__spirv_FConvert(float 0.000000e+00) - %r8 = call spir_func float @__spirv_QuantizeToF16(float 0.000000e+00) - %r9 = call spir_func i64 @__spirv_SatConvertSToU(i64 1) - %r10 = call spir_func i64 @__spirv_SatConvertUToS(i64 1) - %r11 = call spir_func i64 @__spirv_ConvertPtrToU(ptr addrspace(4) %arg) - %r12 = call spir_func ptr addrspace(4) @__spirv_ConvertUToPtr(i64 %arg_ptr) - %r13 = call spir_func <4 x i32> @_Z22__spirv_UConvert_Rint2Dv2_a(<4 x 
i8> %arg_v2) - %r14 = call spir_func <4 x i32> @_Z22__spirv_SConvert_Rint2Dv2_a(<4 x i8> %arg_v2) - %r15 = call spir_func float @_Z30__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) - %r16 = call spir_func float @__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) - %r17 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtzDv2_DF16_(<2 x half> noundef ) - %r18 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rteDv2_DF16_(<2 x half> noundef ) - %r19 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtpDv2_DF16_(<2 x half> noundef ) - %r20 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtnDv2_DF16_(<2 x half> noundef ) - %r21 = call spir_func i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float noundef 42.0) + %r8 = call spir_func i64 @__spirv_SatConvertSToU(i64 1) + %r9 = call spir_func i64 @__spirv_SatConvertUToS(i64 1) + %r10 = call spir_func i64 @__spirv_ConvertPtrToU(ptr addrspace(4) %arg) + %r11 = call spir_func ptr addrspace(4) @__spirv_ConvertUToPtr(i64 %arg_ptr) + %r12 = call spir_func <4 x i32> @_Z22__spirv_UConvert_Rint2Dv2_a(<4 x i8> %arg_v2) + %r13 = call spir_func <4 x i32> @_Z22__spirv_SConvert_Rint2Dv2_a(<4 x i8> %arg_v2) + %r14 = call spir_func float @_Z30__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) + %r15 = call spir_func float @__spirv_ConvertUToF_Rfloat_rtz(i64 %arg_ptr) + %r16 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtzDv2_DF16_(<2 x half> noundef ) + %r17 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rteDv2_DF16_(<2 x half> noundef ) + %r18 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtpDv2_DF16_(<2 x half> noundef ) + %r19 = call spir_func <2 x float> @_Z28__spirv_FConvert_Rfloat2_rtnDv2_DF16_(<2 x half> noundef ) + %r20 = call spir_func i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float noundef 42.0) ret void } @@ -305,7 +303,6 @@ declare dso_local spir_func float @__spirv_ConvertUToF(i32) declare dso_local spir_func i32 @__spirv_UConvert(i64) declare dso_local spir_func i32 @__spirv_SConvert(i64) declare dso_local spir_func half @__spirv_FConvert(float) -declare dso_local spir_func float @__spirv_QuantizeToF16(float) declare dso_local spir_func i64 @__spirv_SatConvertSToU(i64) declare dso_local spir_func i64 @__spirv_SatConvertUToS(i64) declare dso_local spir_func i64 @__spirv_ConvertPtrToU(ptr addrspace(4)) diff --git a/llvm/test/CodeGen/SPIRV/instructions/quantizeto16.ll b/llvm/test/CodeGen/SPIRV/instructions/quantizeto16.ll new file mode 100644 index 0000000000000..0b12ba465b289 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/quantizeto16.ll @@ -0,0 +1,15 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %} + +; TODO: Implement support for the SPIR-V QuantizeToF16 operation +; XFAIL: * + +; CHECK-DAG: [[F32:%.*]] = OpTypeFloat 32 +; CHECK: %[[#]] = OpQuantizeToF16 [[F32]] %[[#]] +define spir_func void @test_wrappers() { + entry: + %r8 = call spir_func float @__spirv_QuantizeToF16(float 0.000000e+00) + ret void +} + +declare dso_local spir_func float @__spirv_QuantizeToF16(float) From 0898348abd388de716e527d4454fbe874334f53c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 Sep 2025 17:05:04 +0100 Subject: [PATCH 272/878] [LAA] Make blockNeedsPredication arguments const (NFC). The arguments aren't modified, mark them as const. This prepares for new users in a follow-up, which only have access to const versions of the arguments. 
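A self-contained sketch of the motivation (the types below are stand-ins for
the LLVM classes): a static helper taking non-const pointers cannot be called
from code that only holds const pointers without a cast, whereas the
const-qualified signature can:

  struct Block {};

  // Mirrors the new signature: the helper only inspects its arguments.
  static bool blockNeedsPredication(const Block *BB) { return BB != nullptr; }

  bool query(const Block *BB) {
    // With a non-const parameter this call would need const_cast<Block *>(BB).
    return blockNeedsPredication(BB);
  }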
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h | 5 +++-- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 52ab38583d5de..84b4ad7c1d5a9 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -724,8 +724,9 @@ class LoopAccessInfo { /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. - LLVM_ABI static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, - DominatorTree *DT); + LLVM_ABI static bool blockNeedsPredication(const BasicBlock *BB, + const Loop *TheLoop, + const DominatorTree *DT); /// Returns true if value \p V is loop invariant. LLVM_ABI bool isInvariant(Value *V) const; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 87fae92977cd2..05f7ac694ac9b 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2856,8 +2856,9 @@ void LoopAccessInfo::emitUnsafeDependenceRemark() { } } -bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, - DominatorTree *DT) { +bool LoopAccessInfo::blockNeedsPredication(const BasicBlock *BB, + const Loop *TheLoop, + const DominatorTree *DT) { assert(TheLoop->contains(BB) && "Unknown block used"); // Blocks that do not dominate the latch need predication. From 2802ab673e1613bd158bd2a0998c7604001fc7fb Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Tue, 30 Sep 2025 12:22:07 -0400 Subject: [PATCH 273/878] [PowerPC] Implement Elliptic Curve Cryptography (ECC) Instructions (#158362) New instructions added: * xxmulmul * xxmulmulhiadd * xxmulmulloadd * xxssumudm * xxssumudmc * xxssumudmcext * xsaddadduqm * xsaddaddsuqm * xsaddsubuqm * xsaddsubsuqm * xsmerge2t1uqm * xsmerge2t2uqm * xsmerge2t3uqm * xsmerge3t1uqm * xsrebase2t1uqm * xsrebase2t2uqm * xsrebase2t3uqm * xsrebase2t4uqm * xsrebase3t1uqm * xsrebase3t2uqm * xsrebase3t3uqm --- llvm/lib/Target/PowerPC/PPCInstrFuture.td | 175 ++++++++++++++++++ .../PowerPC/ppc-encoding-ISAFuture.txt | 63 +++++++ .../PowerPC/ppc64le-encoding-ISAFuture.txt | 63 +++++++ llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s | 86 +++++++++ 4 files changed, 387 insertions(+) diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index c3ab9651ff695..1aefea1a1c498 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -182,10 +182,113 @@ class XX3Form_XTAB6 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, let Inst{31} = XT{5}; } +class XX3Form_XTAB6_S xo, dag OOL, dag IOL, string asmstr, + list pattern> + : I<59, OOL, IOL, asmstr, NoItinerary> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{24...28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form_XTAB6_S3 xo, dag OOL, dag IOL, string asmstr, + list pattern> + : XX3Form_XTAB6_S { + + bits<3> S; + let Inst{21...23} = S; +} + +class XX3Form_XTAB6_3S1 xo, dag OOL, dag IOL, string asmstr, + list pattern> + : XX3Form_XTAB6_S { + + bits<1> S0; + bits<1> S1; + bits<1> S2; + + let Inst{21} = S0; + let Inst{22} = S1; + let Inst{23} = S2; +} + +class XX3Form_XTAB6_2S1 xo, dag OOL, dag IOL, string asmstr, + list pattern> + : 
XX3Form_XTAB6_S { + + bits<1> S1; + bits<1> S2; + + let Inst{21} = 0; + let Inst{22} = S1; + let Inst{23} = S2; +} + +class XX3Form_XTAB6_P xo, dag OOL, dag IOL, string asmstr, + list pattern> + : I<59, OOL, IOL, asmstr, NoItinerary> { + + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<1> P; + + let Pattern = pattern; + + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21} = P; + let Inst{22...28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +// Prefix instruction classes. + +class 8RR_XX4Form_XTABC6_P opcode, dag OOL, dag IOL, string asmstr, + InstrItinClass itin, list pattern> + : PI<1, opcode, OOL, IOL, asmstr, itin> { + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<6> XC; + bits<1> P; + + let Pattern = pattern; + + // The prefix. + let Inst{6...7} = 1; + let Inst{8...11} = 0; + + // The instruction. + let Inst{38...42} = XT{4...0}; + let Inst{43...47} = XA{4...0}; + let Inst{48...52} = XB{4...0}; + let Inst{53...57} = XC{4...0}; + let Inst{58} = 1; + let Inst{59} = P; + let Inst{60} = XC{5}; + let Inst{61} = XA{5}; + let Inst{62} = XB{5}; + let Inst{63} = XT{5}; +} + //-------------------------- Instruction definitions -------------------------// // Predicate combinations available: // [IsISAFuture] // [HasVSX, IsISAFuture] +// [HasVSX, PrefixInstrs, IsISAFuture] let Predicates = [IsISAFuture] in { defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT), @@ -294,6 +397,78 @@ let Predicates = [HasVSX, IsISAFuture] in { "xvmulhuw $XT, $XA, $XB", []>; def XVMULHUH: XX3Form_XTAB6<60, 122, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvmulhuh $XT, $XA, $XB", []>; + + // Elliptic Curve Cryptography Acceleration Instructions. + def XXMULMUL + : XX3Form_XTAB6_S3<1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u3imm:$S), + "xxmulmul $XT, $XA, $XB, $S", []>; + def XXMULMULHIADD + : XX3Form_XTAB6_3S1<9, (outs vsrc:$XT), + (ins vsrc:$XA, vsrc:$XB, u1imm:$S0, u1imm:$S1, + u1imm:$S2), + "xxmulmulhiadd $XT, $XA, $XB, $S0, $S1, $S2", []>; + def XXMULMULLOADD + : XX3Form_XTAB6_2S1<17, (outs vsrc:$XT), + (ins vsrc:$XA, vsrc:$XB, u1imm:$S1, u1imm:$S2), + "xxmulmulloadd $XT, $XA, $XB, $S1, $S2", []>; + def XXSSUMUDM + : XX3Form_XTAB6_P<25, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u1imm:$P), + "xxssumudm $XT, $XA, $XB, $P", []>; + def XXSSUMUDMC + : XX3Form_XTAB6_P<57, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u1imm:$P), + "xxssumudmc $XT, $XA, $XB, $P", []>; + def XSADDADDUQM + : XX3Form_XTAB6<59, 96, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddadduqm $XT, $XA, $XB", []>; + def XSADDADDSUQM + : XX3Form_XTAB6<59, 104, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddaddsuqm $XT, $XA, $XB", []>; + def XSADDSUBUQM + : XX3Form_XTAB6<59, 112, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddsubuqm $XT, $XA, $XB", []>; + def XSADDSUBSUQM + : XX3Form_XTAB6<59, 224, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsaddsubsuqm $XT, $XA, $XB", []>; + def XSMERGE2T1UQM + : XX3Form_XTAB6<59, 232, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge2t1uqm $XT, $XA, $XB", []>; + def XSMERGE2T2UQM + : XX3Form_XTAB6<59, 240, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge2t2uqm $XT, $XA, $XB", []>; + def XSMERGE2T3UQM + : XX3Form_XTAB6<59, 89, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge2t3uqm $XT, $XA, $XB", []>; + def XSMERGE3T1UQM + : XX3Form_XTAB6<59, 121, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsmerge3t1uqm $XT, $XA, $XB", []>; + def XSREBASE2T1UQM + : XX3Form_XTAB6<59, 145, (outs vsrc:$XT), 
(ins vsrc:$XA, vsrc:$XB), + "xsrebase2t1uqm $XT, $XA, $XB", []>; + def XSREBASE2T2UQM + : XX3Form_XTAB6<59, 177, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase2t2uqm $XT, $XA, $XB", []>; + def XSREBASE2T3UQM + : XX3Form_XTAB6<59, 209, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase2t3uqm $XT, $XA, $XB", []>; + def XSREBASE2T4UQM + : XX3Form_XTAB6<59, 217, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase2t4uqm $XT, $XA, $XB", []>; + def XSREBASE3T1UQM + : XX3Form_XTAB6<59, 241, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase3t1uqm $XT, $XA, $XB", []>; + def XSREBASE3T2UQM + : XX3Form_XTAB6<59, 249, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase3t2uqm $XT, $XA, $XB", []>; + def XSREBASE3T3UQM + : XX3Form_XTAB6<59, 195, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xsrebase3t3uqm $XT, $XA, $XB", []>; +} + +let Predicates = [HasVSX, PrefixInstrs, IsISAFuture] in { + def XXSSUMUDMCEXT + : 8RR_XX4Form_XTABC6_P< + 34, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC, u1imm:$P), + "xxssumudmcext $XT, $XA, $XB, $XC, $P", IIC_VecGeneral, []>; } //---------------------------- Anonymous Patterns ----------------------------// diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index a34e7f54c2234..cdfc8ce9e0ca5 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -273,3 +273,66 @@ #CHECK: xvmulhuh 4, 5, 7 0xf0,0x85,0x3b,0xd0 + +#CHECK: xxmulmul 8, 3, 4, 2 +0xed,0x03,0x22,0x08 + +#CHECK: xxmulmulhiadd 8, 3, 4, 1, 0, 1 +0xed,0x03,0x25,0x48 + +#CHECK: xxmulmulloadd 8, 3, 4, 1, 0 +0xed,0x03,0x22,0x88 + +#CHECK: xxssumudm 8, 3, 4, 1 +0xed,0x03,0x24,0xc8 + +#CHECK: xxssumudmc 8, 3, 4, 1 +0xed,0x03,0x25,0xc8 + +#CHECK: xxssumudmcext 8, 3, 4, 6, 0 +0x05,0x00,0x00,0x00,0x89,0x03,0x21,0xa0 + +#CHECK: xsaddadduqm 4, 5, 7 +0xec,0x85,0x3b,0x00 + +#CHECK: xsaddaddsuqm 4, 5, 7 +0xec,0x85,0x3b,0x40 + +#CHECK: xsaddsubuqm 4, 5, 7 +0xec,0x85,0x3b,0x80 + +#CHECK: xsaddsubsuqm 4, 5, 7 +0xec,0x85,0x3f,0x00 + +#CHECK: xsrebase2t1uqm 4, 5, 7 +0xec,0x85,0x3c,0x88 + +#CHECK: xsrebase2t2uqm 4, 5, 7 +0xec,0x85,0x3d,0x88 + +#CHECK: xsrebase2t3uqm 4, 5, 7 +0xec,0x85,0x3e,0x88 + +#CHECK: xsrebase2t4uqm 4, 5, 7 +0xec,0x85,0x3e,0xc8 + +#CHECK: xsrebase3t1uqm 4, 5, 7 +0xec,0x85,0x3f,0x88 + +#CHECK: xsrebase3t2uqm 4, 5, 7 +0xec,0x85,0x3f,0xc8 + +#CHECK: xsrebase3t3uqm 4, 5, 7 +0xec,0x85,0x3e,0x18 + +#CHECK: xsmerge2t1uqm 4, 5, 7 +0xec,0x85,0x3f,0x40 + +#CHECK: xsmerge2t2uqm 4, 5, 7 +0xec,0x85,0x3f,0x80 + +#CHECK: xsmerge2t3uqm 4, 5, 7 +0xec,0x85,0x3a,0xc8 + +#CHECK: xsmerge3t1uqm 4, 5, 7 +0xec,0x85,0x3b,0xc8 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index 9cefe2451b0e3..f7e314fc819e4 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -267,3 +267,66 @@ #CHECK: xvmulhuh 4, 5, 7 0xd0,0x3b,0x85,0xf0 + +#CHECK: xxmulmul 8, 3, 4, 2 +0x08,0x22,0x03,0xed + +#CHECK: xxmulmulhiadd 8, 3, 4, 1, 0, 1 +0x48,0x25,0x03,0xed + +#CHECK: xxmulmulloadd 8, 3, 4, 1, 0 +0x88,0x22,0x03,0xed + +#CHECK: xxssumudm 8, 3, 4, 1 +0xc8,0x24,0x03,0xed + +#CHECK: xxssumudmc 8, 3, 4, 1 +0xc8,0x25,0x03,0xed + +#CHECK: xxssumudmcext 8, 3, 4, 6, 0 +0x00,0x00,0x00,0x05,0xa0,0x21,0x03,0x89 + +#CHECK: xsaddadduqm 4, 5, 7 +0x00,0x3b,0x85,0xec + +#CHECK: xsaddaddsuqm 
4, 5, 7 +0x40,0x3b,0x85,0xec + +#CHECK: xsaddsubuqm 4, 5, 7 +0x80,0x3b,0x85,0xec + +#CHECK: xsaddsubsuqm 4, 5, 7 +0x00,0x3f,0x85,0xec + +#CHECK: xsrebase2t1uqm 4, 5, 7 +0x88,0x3c,0x85,0xec + +#CHECK: xsrebase2t2uqm 4, 5, 7 +0x88,0x3d,0x85,0xec + +#CHECK: xsrebase2t3uqm 4, 5, 7 +0x88,0x3e,0x85,0xec + +#CHECK: xsrebase2t4uqm 4, 5, 7 +0xc8,0x3e,0x85,0xec + +#CHECK: xsrebase3t1uqm 4, 5, 7 +0x88,0x3f,0x85,0xec + +#CHECK: xsrebase3t2uqm 4, 5, 7 +0xc8,0x3f,0x85,0xec + +#CHECK: xsrebase3t3uqm 4, 5, 7 +0x18,0x3e,0x85,0xec + +#CHECK: xsmerge2t1uqm 4, 5, 7 +0x40,0x3f,0x85,0xec + +#CHECK: xsmerge2t2uqm 4, 5, 7 +0x80,0x3f,0x85,0xec + +#CHECK: xsmerge2t3uqm 4, 5, 7 +0xc8,0x3a,0x85,0xec + +#CHECK: xsmerge3t1uqm 4, 5, 7 +0xc8,0x3b,0x85,0xec diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index f01d6fa697d89..29fedd7c20646 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -386,3 +386,89 @@ xvmulhuh 4, 5, 7 #CHECK-BE: xvmulhuh 4, 5, 7 # encoding: [0xf0,0x85,0x3b,0xd0] #CHECK-LE: xvmulhuh 4, 5, 7 # encoding: [0xd0,0x3b,0x85,0xf0] + + xxmulmul 8, 3, 4, 2 +#CHECK-BE: xxmulmul 8, 3, 4, 2 # encoding: [0xed,0x03,0x22,0x08] +#CHECK-LE: xxmulmul 8, 3, 4, 2 # encoding: [0x08,0x22,0x03,0xed] + + xxmulmulhiadd 8, 3, 4, 1, 0, 1 +#CHECK-BE: xxmulmulhiadd 8, 3, 4, 1, 0, 1 # encoding: [0xed,0x03,0x25,0x48] +#CHECK-LE: xxmulmulhiadd 8, 3, 4, 1, 0, 1 # encoding: [0x48,0x25,0x03,0xed] + + xxmulmulloadd 8, 3, 4, 1, 0 +#CHECK-BE: xxmulmulloadd 8, 3, 4, 1, 0 # encoding: [0xed,0x03,0x22,0x88] +#CHECK-LE: xxmulmulloadd 8, 3, 4, 1, 0 # encoding: [0x88,0x22,0x03,0xed] + + xxssumudm 8, 3, 4, 1 +#CHECK-BE: xxssumudm 8, 3, 4, 1 # encoding: [0xed,0x03,0x24,0xc8] +#CHECK-LE: xxssumudm 8, 3, 4, 1 # encoding: [0xc8,0x24,0x03,0xed] + + xxssumudmc 8, 3, 4, 1 +#CHECK-BE: xxssumudmc 8, 3, 4, 1 # encoding: [0xed,0x03,0x25,0xc8] +#CHECK-LE: xxssumudmc 8, 3, 4, 1 # encoding: [0xc8,0x25,0x03,0xed] + + xxssumudmcext 8, 3, 4, 6, 0 +# CHECK-BE: xxssumudmcext 8, 3, 4, 6, 0 # encoding: [0x05,0x00,0x00,0x00, +# CHECK-BE-SAME: 0x89,0x03,0x21,0xa0] +# CHECK-LE: xxssumudmcext 8, 3, 4, 6, 0 # encoding: [0x00,0x00,0x00,0x05, +# CHECK-LE-SAME: 0xa0,0x21,0x03,0x89] + + xsaddadduqm 4, 5, 7 +#CHECK-BE: xsaddadduqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0x00] +#CHECK-LE: xsaddadduqm 4, 5, 7 # encoding: [0x00,0x3b,0x85,0xec] + + xsaddaddsuqm 4, 5, 7 +#CHECK-BE: xsaddaddsuqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0x40] +#CHECK-LE: xsaddaddsuqm 4, 5, 7 # encoding: [0x40,0x3b,0x85,0xec] + + xsaddsubuqm 4, 5, 7 +#CHECK-BE: xsaddsubuqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0x80] +#CHECK-LE: xsaddsubuqm 4, 5, 7 # encoding: [0x80,0x3b,0x85,0xec] + + xsaddsubsuqm 4, 5, 7 +#CHECK-BE: xsaddsubsuqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x00] +#CHECK-LE: xsaddsubsuqm 4, 5, 7 # encoding: [0x00,0x3f,0x85,0xec] + + xsrebase2t1uqm 4, 5, 7 +#CHECK-BE: xsrebase2t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3c,0x88] +#CHECK-LE: xsrebase2t1uqm 4, 5, 7 # encoding: [0x88,0x3c,0x85,0xec] + + xsrebase2t2uqm 4, 5, 7 +#CHECK-BE: xsrebase2t2uqm 4, 5, 7 # encoding: [0xec,0x85,0x3d,0x88] +#CHECK-LE: xsrebase2t2uqm 4, 5, 7 # encoding: [0x88,0x3d,0x85,0xec] + + xsrebase2t3uqm 4, 5, 7 +#CHECK-BE: xsrebase2t3uqm 4, 5, 7 # encoding: [0xec,0x85,0x3e,0x88] +#CHECK-LE: xsrebase2t3uqm 4, 5, 7 # encoding: [0x88,0x3e,0x85,0xec] + + xsrebase2t4uqm 4, 5, 7 +#CHECK-BE: xsrebase2t4uqm 4, 5, 7 # encoding: [0xec,0x85,0x3e,0xc8] +#CHECK-LE: xsrebase2t4uqm 4, 5, 7 # encoding: [0xc8,0x3e,0x85,0xec] + + 
xsrebase3t1uqm 4, 5, 7 +#CHECK-BE: xsrebase3t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x88] +#CHECK-LE: xsrebase3t1uqm 4, 5, 7 # encoding: [0x88,0x3f,0x85,0xec] + + xsrebase3t2uqm 4, 5, 7 +#CHECK-BE: xsrebase3t2uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0xc8] +#CHECK-LE: xsrebase3t2uqm 4, 5, 7 # encoding: [0xc8,0x3f,0x85,0xec] + + xsrebase3t3uqm 4, 5, 7 +#CHECK-BE: xsrebase3t3uqm 4, 5, 7 # encoding: [0xec,0x85,0x3e,0x18] +#CHECK-LE: xsrebase3t3uqm 4, 5, 7 # encoding: [0x18,0x3e,0x85,0xec] + + xsmerge2t1uqm 4, 5, 7 +#CHECK-BE: xsmerge2t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x40] +#CHECK-LE: xsmerge2t1uqm 4, 5, 7 # encoding: [0x40,0x3f,0x85,0xec] + + xsmerge2t2uqm 4, 5, 7 +#CHECK-BE: xsmerge2t2uqm 4, 5, 7 # encoding: [0xec,0x85,0x3f,0x80] +#CHECK-LE: xsmerge2t2uqm 4, 5, 7 # encoding: [0x80,0x3f,0x85,0xec] + + xsmerge2t3uqm 4, 5, 7 +#CHECK-BE: xsmerge2t3uqm 4, 5, 7 # encoding: [0xec,0x85,0x3a,0xc8] +#CHECK-LE: xsmerge2t3uqm 4, 5, 7 # encoding: [0xc8,0x3a,0x85,0xec] + + xsmerge3t1uqm 4, 5, 7 +#CHECK-BE: xsmerge3t1uqm 4, 5, 7 # encoding: [0xec,0x85,0x3b,0xc8] +#CHECK-LE: xsmerge3t1uqm 4, 5, 7 # encoding: [0xc8,0x3b,0x85,0xec] From ccf1fb00fee3a79b09adacdc95a32660546f9f14 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Tue, 30 Sep 2025 09:44:23 -0700 Subject: [PATCH 274/878] [lld][macho][NFC] Factor count zeros into helper function (#161241) Move `llvm::countr_zero()` into a helper function to reduce code duplication between `CStringSection` and `DeduplicatedCStringSection`. More importantly, this moves a giant comment to that helper function since it pertains to both classes. --- lld/MachO/SyntheticSections.cpp | 57 +++++++++++++++++---------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 228b84db21c2a..5645d8a05a28f 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1685,31 +1685,7 @@ void CStringSection::writeTo(uint8_t *buf) const { } } -void CStringSection::finalizeContents() { - uint64_t offset = 0; - // TODO: Call buildCStringPriorities() to support cstring ordering when - // deduplication is off, although this may negatively impact build - // performance. - for (CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - // See comment above DeduplicatedCStringSection for how alignment is - // handled. - uint32_t pieceAlign = 1 - << llvm::countr_zero(isec->align | piece.inSecOff); - offset = alignToPowerOf2(offset, pieceAlign); - piece.outSecOff = offset; - isec->isFinal = true; - StringRef string = isec->getStringRef(i); - offset += string.size() + 1; // account for null terminator - } - } - size = offset; -} - -// Mergeable cstring literals are found under the __TEXT,__cstring section. In -// contrast to ELF, which puts strings that need different alignments into +// In contrast to ELF, which puts strings that need different alignments into // different sections, clang's Mach-O backend puts them all in one section. // Strings that need to be aligned have the .p2align directive emitted before // them, which simply translates into zero padding in the object file. In other @@ -1744,8 +1720,33 @@ void CStringSection::finalizeContents() { // requires its operand addresses to be 16-byte aligned). However, there will // typically also be other cstrings in the same file that aren't used via SIMD // and don't need this alignment. 
They will be emitted at some arbitrary address -// `A`, but ld64 will treat them as being 16-byte aligned with an offset of `16 -// % A`. +// `A`, but ld64 will treat them as being 16-byte aligned with an offset of +// `16 % A`. +static uint8_t getStringPieceAlignment(const CStringInputSection *isec, + const StringPiece &piece) { + return llvm::countr_zero(isec->align | piece.inSecOff); +} + +void CStringSection::finalizeContents() { + uint64_t offset = 0; + // TODO: Call buildCStringPriorities() to support cstring ordering when + // deduplication is off, although this may negatively impact build + // performance. + for (CStringInputSection *isec : inputs) { + for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { + if (!piece.live) + continue; + uint32_t pieceAlign = 1 << getStringPieceAlignment(isec, piece); + offset = alignToPowerOf2(offset, pieceAlign); + piece.outSecOff = offset; + isec->isFinal = true; + StringRef string = isec->getStringRef(i); + offset += string.size() + 1; // account for null terminator + } + } + size = offset; +} + void DeduplicatedCStringSection::finalizeContents() { // Find the largest alignment required for each string. for (const CStringInputSection *isec : inputs) { @@ -1754,7 +1755,7 @@ void DeduplicatedCStringSection::finalizeContents() { continue; auto s = isec->getCachedHashStringRef(i); assert(isec->align != 0); - uint8_t trailingZeros = llvm::countr_zero(isec->align | piece.inSecOff); + uint8_t trailingZeros = getStringPieceAlignment(isec, piece); auto it = stringOffsetMap.insert( std::make_pair(s, StringOffset(trailingZeros))); if (!it.second && it.first->second.trailingZeros < trailingZeros) From 71d8ddc78a9beeb307d7effab73f557c08b70a7e Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Tue, 30 Sep 2025 18:44:40 +0200 Subject: [PATCH 275/878] [CIR] Upstream ParenExpr for AggregateExpr (#160998) Upstream ParenExpr support for AggregateExpr --- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 4 +-- clang/test/CIR/CodeGen/struct.cpp | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 4a8aac900ee07..5596499ee94b5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -131,9 +131,7 @@ class AggExprEmitter : public StmtVisitor { std::string("AggExprEmitter::VisitStmt: ") + s->getStmtClassName()); } - void VisitParenExpr(ParenExpr *pe) { - cgf.cgm.errorNYI(pe->getSourceRange(), "AggExprEmitter: VisitParenExpr"); - } + void VisitParenExpr(ParenExpr *pe) { Visit(pe->getSubExpr()); } void VisitGenericSelectionExpr(GenericSelectionExpr *ge) { cgf.cgm.errorNYI(ge->getSourceRange(), "AggExprEmitter: VisitGenericSelectionExpr"); diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index ee6c4cab7341f..75374284d09d0 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -93,3 +93,39 @@ void f3() { // OGCG: %[[O:.*]] = alloca %struct.Outer, align 4 // OGCG: %[[O_I:.*]] = getelementptr inbounds nuw %struct.Outer, ptr %[[O]], i32 0, i32 0 // OGCG: %[[O_I_N:.*]] = getelementptr inbounds nuw %struct.Inner, ptr %[[O_I]], i32 0, i32 0 + +void paren_expr() { + struct Point { + int x; + int y; + }; + + Point a = (Point{}); + Point b = (a); +} + +// CIR: cir.func{{.*}} @_Z10paren_exprv() +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr, ["a", init] +// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_Point, !cir.ptr, 
["b", init] +// CIR: %[[X_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "x"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[X_ELEM_PTR]] : !s32i, !cir.ptr +// CIR: %[[Y_ELEM_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "y"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[Y_ELEM_PTR]] : !s32i, !cir.ptr +// CIR: cir.call @_ZZ10paren_exprvEN5PointC1ERKS_(%[[B_ADDR]], %[[A_ADDR]]) nothrow : (!cir.ptr, !cir.ptr) -> () + +// LLVM: define{{.*}} void @_Z10paren_exprv() +// LLVM: %[[A_ADDR:.*]] = alloca %struct.Point, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca %struct.Point, i64 1, align 4 +// LLVM: %[[X_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 0 +// LLVM: store i32 0, ptr %[[X_ELEM_PTR]], align 4 +// LLVM: %[[Y_ELEM_PTR:.*]] = getelementptr %struct.Point, ptr %[[A_ADDR]], i32 0, i32 1 +// LLVM: store i32 0, ptr %[[Y_ELEM_PTR]], align 4 +// LLVM: call void @_ZZ10paren_exprvEN5PointC1ERKS_(ptr %[[B_ADDR]], ptr %[[A_ADDR]]) + +// OGCG: define{{.*}} void @_Z10paren_exprv() +// OGCG: %[[A_ADDR:.*]] = alloca %struct.Point, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca %struct.Point, align 4 +// OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false) +// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[B_ADDR]], ptr align 4 %[[A_ADDR]], i64 8, i1 false) From 178651ac872f4ca3b2d77e0ff6ff87e83a066f6f Mon Sep 17 00:00:00 2001 From: sebvince <115461989+sebvince@users.noreply.github.com> Date: Tue, 30 Sep 2025 18:46:05 +0200 Subject: [PATCH 276/878] [MLIR][SCF] Add loops as parameter to LoopTerminator callback when using CustomOp. (#161386) This PR adds to the generateLoopTerminatorFn callback the loops generated by GenerateLoopHeaderFn. This is needed to correctly set the insertion point with scf.forall ops. --- mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h | 4 +++- mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp | 4 ++-- .../TilingInterface/TestTilingInterfaceTransformOps.cpp | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h index 668ee6386f71f..7c735d825b445 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h @@ -183,6 +183,7 @@ struct SCFTilingOptions { ArrayRef givenTileSizes, ValueRange destinationTensors)>; // Type of the callback function that generates the loop terminator. + // - `loops` : generated loops from the GenerateLoopHeaderFn callback // - `tiledResults` : Tiles of the result computed for the iteration space // tile. // - `resultOffsets` : For each of the `tiledResults`, the offset at which @@ -193,7 +194,8 @@ struct SCFTilingOptions { // tensor. 
// Returns the `CustomLoopHeaderInfo` object (described above) using GenerateLoopTerminatorFn = std::function loops, + ValueRange tiledResults, ArrayRef> resultOffsets, ArrayRef> resultSizes, ValueRange destinationTensors)>; diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 89e2c57d709dd..36685d3affe03 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -665,8 +665,8 @@ generateLoopNestUsingCustomOp( return failure(); } - if (failed(generateLoopTerminatorFn(rewriter, loc, tiledResults, - resultOffsets, resultSizes, + if (failed(generateLoopTerminatorFn(rewriter, loc, loopHeaderInfo->loops, + tiledResults, resultOffsets, resultSizes, loopHeaderInfo->destinationTensors))) { return failure(); } diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp index 7981c72c2f2c8..326fec3ee5cf0 100644 --- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp +++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp @@ -581,7 +581,8 @@ DiagnosedSilenceableFailure transform::TestTileUsingCustomLoopOp::apply( }; scf::SCFTilingOptions::GenerateLoopTerminatorFn terminatorFn = - [&](RewriterBase &rewriter, Location loc, ValueRange tiledResults, + [&](RewriterBase &rewriter, Location loc, + ArrayRef loops, ValueRange tiledResults, ArrayRef> resultOffsets, ArrayRef> resultSizes, ValueRange destinationTensors) -> LogicalResult { From c4e7da3da5fe1c685b6789e34c4dcf23aec0158a Mon Sep 17 00:00:00 2001 From: AZero13 Date: Tue, 30 Sep 2025 12:48:47 -0400 Subject: [PATCH 277/878] [AArch64] shouldFoldMaskToVariableShiftPair should be true for scalars up to the biggest legal type (#158069) For AArch64, we want to do this up to 64-bits. Otherwise this results in bloated code. --- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 10 ++ .../test/CodeGen/AArch64/and-mask-variable.ll | 80 +++++++++++++++ llvm/test/CodeGen/AArch64/extract-bits.ll | 98 ++++++++----------- llvm/test/CodeGen/AArch64/extract-lowbits.ll | 66 ++++++------- 4 files changed, 161 insertions(+), 93 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/and-mask-variable.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d8072d15853ee..e472e7d565d9b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -303,6 +303,16 @@ class AArch64TargetLowering : public TargetLowering { bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override; + /// Return true if it is profitable to fold a pair of shifts into a mask. 
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getScalarSizeInBits() <= 64; + } + bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override; diff --git a/llvm/test/CodeGen/AArch64/and-mask-variable.ll b/llvm/test/CodeGen/AArch64/and-mask-variable.ll new file mode 100644 index 0000000000000..f41cdc6dd241b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/and-mask-variable.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-elf -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define i32 @mask_pair(i32 %x, i32 %y) { +; CHECK-SD-LABEL: mask_pair: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr w8, w0, w1 +; CHECK-SD-NEXT: lsl w0, w8, w1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mask_pair: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-GI-NEXT: lsl w8, w8, w1 +; CHECK-GI-NEXT: and w0, w8, w0 +; CHECK-GI-NEXT: ret + %shl = shl nsw i32 -1, %y + %and = and i32 %shl, %x + ret i32 %and +} + +define i64 @mask_pair_64(i64 %x, i64 %y) { +; CHECK-SD-LABEL: mask_pair_64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr x8, x0, x1 +; CHECK-SD-NEXT: lsl x0, x8, x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mask_pair_64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov x8, #-1 // =0xffffffffffffffff +; CHECK-GI-NEXT: lsl x8, x8, x1 +; CHECK-GI-NEXT: and x0, x8, x0 +; CHECK-GI-NEXT: ret + %shl = shl nsw i64 -1, %y + %and = and i64 %shl, %x + ret i64 %and +} + +define i128 @mask_pair_128(i128 %x, i128 %y) { +; CHECK-SD-LABEL: mask_pair_128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov x8, #-1 // =0xffffffffffffffff +; CHECK-SD-NEXT: mvn w9, w2 +; CHECK-SD-NEXT: mov x10, #9223372036854775807 // =0x7fffffffffffffff +; CHECK-SD-NEXT: lsl x8, x8, x2 +; CHECK-SD-NEXT: lsr x9, x10, x9 +; CHECK-SD-NEXT: tst x2, #0x40 +; CHECK-SD-NEXT: orr x9, x8, x9 +; CHECK-SD-NEXT: csel x9, x8, x9, ne +; CHECK-SD-NEXT: csel x8, xzr, x8, ne +; CHECK-SD-NEXT: and x0, x8, x0 +; CHECK-SD-NEXT: and x1, x9, x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mask_pair_128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: mov x9, #-1 // =0xffffffffffffffff +; CHECK-GI-NEXT: sub x10, x2, #64 +; CHECK-GI-NEXT: sub x8, x8, x2 +; CHECK-GI-NEXT: lsl x11, x9, x2 +; CHECK-GI-NEXT: cmp x2, #64 +; CHECK-GI-NEXT: lsr x8, x9, x8 +; CHECK-GI-NEXT: lsl x9, x9, x10 +; CHECK-GI-NEXT: csel x10, x11, xzr, lo +; CHECK-GI-NEXT: orr x8, x8, x11 +; CHECK-GI-NEXT: and x0, x10, x0 +; CHECK-GI-NEXT: csel x8, x8, x9, lo +; CHECK-GI-NEXT: cmp x2, #0 +; CHECK-GI-NEXT: csinv x8, x8, xzr, ne +; CHECK-GI-NEXT: and x1, x8, x1 +; CHECK-GI-NEXT: ret + %shl = shl nsw i128 -1, %y + %and = and i128 %shl, %x + ret i128 %and +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/extract-bits.ll b/llvm/test/CodeGen/AArch64/extract-bits.ll index 8e822d19a19b9..5a96116142b51 100644 --- a/llvm/test/CodeGen/AArch64/extract-bits.ll +++ b/llvm/test/CodeGen/AArch64/extract-bits.ll @@ -532,11 +532,10 @@ define i32 @bextr64_32_b2(i64 %val, i64 %numskipbits, i8 %numlowbits) nounwind { define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_c0: ; CHECK: // %bb.0: -; CHECK-NEXT: neg w8, w2 -; CHECK-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-NEXT: lsr w10, w0, w1 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: lsr w8, w0, w1 +; CHECK-NEXT: neg w9, w2 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %shifted = lshr i32 %val, %numskipbits %numhighbits = sub i32 32, %numlowbits @@ -548,12 +547,11 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_c1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #32 // =0x20 -; CHECK-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-NEXT: lsr w10, w0, w1 -; CHECK-NEXT: sub w8, w8, w2 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: lsr w8, w0, w1 +; CHECK-NEXT: mov w9, #32 // =0x20 +; CHECK-NEXT: sub w9, w9, w2 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %skip = zext i8 %numskipbits to i32 %shifted = lshr i32 %val, %skip @@ -569,10 +567,9 @@ define i32 @bextr32_c2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: neg w9, w2 -; CHECK-NEXT: mov w10, #-1 // =0xffffffff -; CHECK-NEXT: lsr w9, w10, w9 ; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %val = load i32, ptr %w %shifted = lshr i32 %val, %numskipbits @@ -587,11 +584,10 @@ define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; CHECK: // %bb.0: ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: mov w9, #32 // =0x20 -; CHECK-NEXT: mov w10, #-1 // =0xffffffff ; CHECK-NEXT: sub w9, w9, w2 ; CHECK-NEXT: lsr w8, w8, w1 -; CHECK-NEXT: lsr w9, w10, w9 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %val = load i32, ptr %w %skip = zext i8 %numskipbits to i32 @@ -606,11 +602,10 @@ define i32 @bextr32_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr32_c4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: neg w8, w2 -; CHECK-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-NEXT: lsr w10, w0, w1 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: and w0, w10, w8 +; CHECK-NEXT: lsr w8, w0, w1 +; CHECK-NEXT: neg w9, w2 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %shifted = lshr i32 %val, %numskipbits %numhighbits = sub i32 32, %numlowbits @@ -624,11 +619,10 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_c0: ; CHECK: // %bb.0: -; CHECK-NEXT: neg x8, x2 -; CHECK-NEXT: mov x9, #-1 // =0xffffffffffffffff -; CHECK-NEXT: lsr x10, x0, x1 -; CHECK-NEXT: lsr x8, x9, x8 -; CHECK-NEXT: and x0, x8, x10 +; CHECK-NEXT: lsr x8, x0, x1 
+; CHECK-NEXT: neg x9, x2 +; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: lsr x0, x8, x9 ; CHECK-NEXT: ret %shifted = lshr i64 %val, %numskipbits %numhighbits = sub i64 64, %numlowbits @@ -640,13 +634,12 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_c1_indexzext: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #64 // =0x40 -; CHECK-NEXT: mov x9, #-1 // =0xffffffffffffffff ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: lsr x10, x0, x1 -; CHECK-NEXT: sub w8, w8, w2 -; CHECK-NEXT: lsr x8, x9, x8 -; CHECK-NEXT: and x0, x8, x10 +; CHECK-NEXT: lsr x8, x0, x1 +; CHECK-NEXT: mov w9, #64 // =0x40 +; CHECK-NEXT: sub w9, w9, w2 +; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: lsr x0, x8, x9 ; CHECK-NEXT: ret %skip = zext i8 %numskipbits to i64 %shifted = lshr i64 %val, %skip @@ -662,10 +655,9 @@ define i64 @bextr64_c2_load(ptr %w, i64 %numskipbits, i64 %numlowbits) nounwind ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: neg x9, x2 -; CHECK-NEXT: mov x10, #-1 // =0xffffffffffffffff -; CHECK-NEXT: lsr x9, x10, x9 ; CHECK-NEXT: lsr x8, x8, x1 -; CHECK-NEXT: and x0, x9, x8 +; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: lsr x0, x8, x9 ; CHECK-NEXT: ret %val = load i64, ptr %w %shifted = lshr i64 %val, %numskipbits @@ -679,13 +671,12 @@ define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; CHECK-LABEL: bextr64_c3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: mov w9, #64 // =0x40 -; CHECK-NEXT: mov x10, #-1 // =0xffffffffffffffff ; CHECK-NEXT: sub w9, w9, w2 -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: lsr x8, x8, x1 -; CHECK-NEXT: lsr x9, x10, x9 -; CHECK-NEXT: and x0, x9, x8 +; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: lsr x0, x8, x9 ; CHECK-NEXT: ret %val = load i64, ptr %w %skip = zext i8 %numskipbits to i64 @@ -700,11 +691,10 @@ define i64 @bextr64_c3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_c4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: neg x8, x2 -; CHECK-NEXT: mov x9, #-1 // =0xffffffffffffffff -; CHECK-NEXT: lsr x10, x0, x1 -; CHECK-NEXT: lsr x8, x9, x8 -; CHECK-NEXT: and x0, x10, x8 +; CHECK-NEXT: lsr x8, x0, x1 +; CHECK-NEXT: neg x9, x2 +; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: lsr x0, x8, x9 ; CHECK-NEXT: ret %shifted = lshr i64 %val, %numskipbits %numhighbits = sub i64 64, %numlowbits @@ -737,11 +727,10 @@ define i32 @bextr64_32_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_32_c1: ; CHECK: // %bb.0: -; CHECK-NEXT: neg w8, w2 -; CHECK-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-NEXT: lsr x10, x0, x1 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: lsr x8, x0, x1 +; CHECK-NEXT: neg w9, w2 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %shifted = lshr i64 %val, %numskipbits %truncshifted = trunc i64 %shifted to i32 @@ -756,11 +745,10 @@ define i32 @bextr64_32_c1(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind define i32 @bextr64_32_c2(i64 %val, i64 %numskipbits, i32 %numlowbits) nounwind { ; CHECK-LABEL: bextr64_32_c2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg w8, w2 -; CHECK-NEXT: mov w9, #-1 // 
=0xffffffff -; CHECK-NEXT: lsr x10, x0, x1 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: lsr x8, x0, x1 +; CHECK-NEXT: neg w9, w2 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %shifted = lshr i64 %val, %numskipbits %numhighbits = sub i32 32, %numlowbits diff --git a/llvm/test/CodeGen/AArch64/extract-lowbits.ll b/llvm/test/CodeGen/AArch64/extract-lowbits.ll index 4b8f3e86b5fef..368440c65df84 100644 --- a/llvm/test/CodeGen/AArch64/extract-lowbits.ll +++ b/llvm/test/CodeGen/AArch64/extract-lowbits.ll @@ -347,10 +347,9 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind { define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_c0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: lsr w8, w8, w9 -; CHECK-NEXT: and w0, w8, w0 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: lsl w9, w0, w8 +; CHECK-NEXT: lsr w0, w9, w8 ; CHECK-NEXT: ret %numhighbits = sub i32 32, %numlowbits %mask = lshr i32 -1, %numhighbits @@ -362,10 +361,9 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_c1_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #32 // =0x20 -; CHECK-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-NEXT: sub w8, w8, w1 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: and w0, w8, w0 +; CHECK-NEXT: lsl w9, w0, w8 +; CHECK-NEXT: lsr w0, w9, w8 ; CHECK-NEXT: ret %numhighbits = sub i8 32, %numlowbits %sh_prom = zext i8 %numhighbits to i32 @@ -377,11 +375,10 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind { define i32 @bzhi32_c2_load(ptr %w, i32 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_c2_load: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: ldr w10, [x0] -; CHECK-NEXT: lsr w8, w8, w9 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: lsl w8, w8, w9 +; CHECK-NEXT: lsr w0, w8, w9 ; CHECK-NEXT: ret %val = load i32, ptr %w %numhighbits = sub i32 32, %numlowbits @@ -394,11 +391,10 @@ define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_c3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #32 // =0x20 -; CHECK-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-NEXT: ldr w10, [x0] +; CHECK-NEXT: ldr w9, [x0] ; CHECK-NEXT: sub w8, w8, w1 -; CHECK-NEXT: lsr w8, w9, w8 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: lsl w9, w9, w8 +; CHECK-NEXT: lsr w0, w9, w8 ; CHECK-NEXT: ret %val = load i32, ptr %w %numhighbits = sub i8 32, %numlowbits @@ -411,10 +407,9 @@ define i32 @bzhi32_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind { ; CHECK-LABEL: bzhi32_c4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: neg w9, w1 -; CHECK-NEXT: lsr w8, w8, w9 -; CHECK-NEXT: and w0, w0, w8 +; CHECK-NEXT: neg w8, w1 +; CHECK-NEXT: lsl w9, w0, w8 +; CHECK-NEXT: lsr w0, w9, w8 ; CHECK-NEXT: ret %numhighbits = sub i32 32, %numlowbits %mask = lshr i32 -1, %numhighbits @@ -427,10 +422,9 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind { define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_c0: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: lsr x8, x8, x9 -; CHECK-NEXT: and x0, x8, x0 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: lsl x9, x0, x8 +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret 
%numhighbits = sub i64 64, %numlowbits %mask = lshr i64 -1, %numhighbits @@ -442,10 +436,9 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_c1_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64 // =0x40 -; CHECK-NEXT: mov x9, #-1 // =0xffffffffffffffff ; CHECK-NEXT: sub w8, w8, w1 -; CHECK-NEXT: lsr x8, x9, x8 -; CHECK-NEXT: and x0, x8, x0 +; CHECK-NEXT: lsl x9, x0, x8 +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret %numhighbits = sub i8 64, %numlowbits %sh_prom = zext i8 %numhighbits to i64 @@ -457,11 +450,10 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind { define i64 @bzhi64_c2_load(ptr %w, i64 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_c2_load: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: ldr x10, [x0] -; CHECK-NEXT: lsr x8, x8, x9 -; CHECK-NEXT: and x0, x8, x10 +; CHECK-NEXT: lsl x8, x8, x9 +; CHECK-NEXT: lsr x0, x8, x9 ; CHECK-NEXT: ret %val = load i64, ptr %w %numhighbits = sub i64 64, %numlowbits @@ -474,11 +466,10 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_c3_load_indexzext: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64 // =0x40 -; CHECK-NEXT: mov x9, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldr x10, [x0] +; CHECK-NEXT: ldr x9, [x0] ; CHECK-NEXT: sub w8, w8, w1 -; CHECK-NEXT: lsr x8, x9, x8 -; CHECK-NEXT: and x0, x8, x10 +; CHECK-NEXT: lsl x9, x9, x8 +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret %val = load i64, ptr %w %numhighbits = sub i8 64, %numlowbits @@ -491,10 +482,9 @@ define i64 @bzhi64_c3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind { ; CHECK-LABEL: bzhi64_c4_commutative: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff -; CHECK-NEXT: neg x9, x1 -; CHECK-NEXT: lsr x8, x8, x9 -; CHECK-NEXT: and x0, x0, x8 +; CHECK-NEXT: neg x8, x1 +; CHECK-NEXT: lsl x9, x0, x8 +; CHECK-NEXT: lsr x0, x9, x8 ; CHECK-NEXT: ret %numhighbits = sub i64 64, %numlowbits %mask = lshr i64 -1, %numhighbits From 9fd09f4bd9ae0ab48be3b8339527cef0806447d7 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 1 Oct 2025 01:55:15 +0900 Subject: [PATCH 278/878] [llvm-readobj][NFC] Restore and disable clang-format for machine type list (#160122) The original code was more readable, just disable `clang-format` for this code. 
See https://github.com/llvm/llvm-project/pull/159793 Signed-off-by: Sarnie, Nick --- llvm/tools/llvm-readobj/ELFDumper.cpp | 339 +++++++++++++------------- 1 file changed, 167 insertions(+), 172 deletions(-) diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 3092bfd42e25e..ab93316907cc6 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1133,180 +1133,175 @@ const EnumEntry C6000ElfOSABI[] = { {"C6000_LINUX", "Linux C6000", ELF::ELFOSABI_C6000_LINUX} }; +// clang-format off const EnumEntry ElfMachineType[] = { - ENUM_ENT(EM_NONE, "None"), - ENUM_ENT(EM_M32, "WE32100"), - ENUM_ENT(EM_SPARC, "Sparc"), - ENUM_ENT(EM_386, "Intel 80386"), - ENUM_ENT(EM_68K, "MC68000"), - ENUM_ENT(EM_88K, "MC88000"), - ENUM_ENT(EM_IAMCU, "EM_IAMCU"), - ENUM_ENT(EM_860, "Intel 80860"), - ENUM_ENT(EM_MIPS, "MIPS R3000"), - ENUM_ENT(EM_S370, "IBM System/370"), - ENUM_ENT(EM_MIPS_RS3_LE, "MIPS R3000 little-endian"), - ENUM_ENT(EM_PARISC, "HPPA"), - ENUM_ENT(EM_VPP500, "Fujitsu VPP500"), - ENUM_ENT(EM_SPARC32PLUS, "Sparc v8+"), - ENUM_ENT(EM_960, "Intel 80960"), - ENUM_ENT(EM_PPC, "PowerPC"), - ENUM_ENT(EM_PPC64, "PowerPC64"), - ENUM_ENT(EM_S390, "IBM S/390"), - ENUM_ENT(EM_SPU, "SPU"), - ENUM_ENT(EM_V800, "NEC V800 series"), - ENUM_ENT(EM_FR20, "Fujistsu FR20"), - ENUM_ENT(EM_RH32, "TRW RH-32"), - ENUM_ENT(EM_RCE, "Motorola RCE"), - ENUM_ENT(EM_ARM, "ARM"), - ENUM_ENT(EM_ALPHA, "EM_ALPHA"), - ENUM_ENT(EM_SH, "Hitachi SH"), - ENUM_ENT(EM_SPARCV9, "Sparc v9"), - ENUM_ENT(EM_TRICORE, "Siemens Tricore"), - ENUM_ENT(EM_ARC, "ARC"), - ENUM_ENT(EM_H8_300, "Hitachi H8/300"), - ENUM_ENT(EM_H8_300H, "Hitachi H8/300H"), - ENUM_ENT(EM_H8S, "Hitachi H8S"), - ENUM_ENT(EM_H8_500, "Hitachi H8/500"), - ENUM_ENT(EM_IA_64, "Intel IA-64"), - ENUM_ENT(EM_MIPS_X, "Stanford MIPS-X"), - ENUM_ENT(EM_COLDFIRE, "Motorola Coldfire"), - ENUM_ENT(EM_68HC12, "Motorola MC68HC12 Microcontroller"), - ENUM_ENT(EM_MMA, "Fujitsu Multimedia Accelerator"), - ENUM_ENT(EM_PCP, "Siemens PCP"), - ENUM_ENT(EM_NCPU, "Sony nCPU embedded RISC processor"), - ENUM_ENT(EM_NDR1, "Denso NDR1 microprocesspr"), - ENUM_ENT(EM_STARCORE, "Motorola Star*Core processor"), - ENUM_ENT(EM_ME16, "Toyota ME16 processor"), - ENUM_ENT(EM_ST100, "STMicroelectronics ST100 processor"), - ENUM_ENT(EM_TINYJ, "Advanced Logic Corp. TinyJ embedded processor"), - ENUM_ENT(EM_X86_64, "Advanced Micro Devices X86-64"), - ENUM_ENT(EM_PDSP, "Sony DSP processor"), - ENUM_ENT(EM_PDP10, "Digital Equipment Corp. PDP-10"), - ENUM_ENT(EM_PDP11, "Digital Equipment Corp. 
PDP-11"), - ENUM_ENT(EM_FX66, "Siemens FX66 microcontroller"), - ENUM_ENT(EM_ST9PLUS, "STMicroelectronics ST9+ 8/16 bit microcontroller"), - ENUM_ENT(EM_ST7, "STMicroelectronics ST7 8-bit microcontroller"), - ENUM_ENT(EM_68HC16, "Motorola MC68HC16 Microcontroller"), - ENUM_ENT(EM_68HC11, "Motorola MC68HC11 Microcontroller"), - ENUM_ENT(EM_68HC08, "Motorola MC68HC08 Microcontroller"), - ENUM_ENT(EM_68HC05, "Motorola MC68HC05 Microcontroller"), - ENUM_ENT(EM_SVX, "Silicon Graphics SVx"), - ENUM_ENT(EM_ST19, "STMicroelectronics ST19 8-bit microcontroller"), - ENUM_ENT(EM_VAX, "Digital VAX"), - ENUM_ENT(EM_CRIS, "Axis Communications 32-bit embedded processor"), - ENUM_ENT(EM_JAVELIN, "Infineon Technologies 32-bit embedded cpu"), - ENUM_ENT(EM_FIREPATH, "Element 14 64-bit DSP processor"), - ENUM_ENT(EM_ZSP, "LSI Logic's 16-bit DSP processor"), - ENUM_ENT(EM_MMIX, "Donald Knuth's educational 64-bit processor"), - ENUM_ENT(EM_HUANY, - "Harvard Universitys's machine-independent object format"), - ENUM_ENT(EM_PRISM, "Vitesse Prism"), - ENUM_ENT(EM_AVR, "Atmel AVR 8-bit microcontroller"), - ENUM_ENT(EM_FR30, "Fujitsu FR30"), - ENUM_ENT(EM_D10V, "Mitsubishi D10V"), - ENUM_ENT(EM_D30V, "Mitsubishi D30V"), - ENUM_ENT(EM_V850, "NEC v850"), - ENUM_ENT(EM_M32R, "Renesas M32R (formerly Mitsubishi M32r)"), - ENUM_ENT(EM_MN10300, "Matsushita MN10300"), - ENUM_ENT(EM_MN10200, "Matsushita MN10200"), - ENUM_ENT(EM_PJ, "picoJava"), - ENUM_ENT(EM_OPENRISC, "OpenRISC 32-bit embedded processor"), - ENUM_ENT(EM_ARC_COMPACT, "EM_ARC_COMPACT"), - ENUM_ENT(EM_XTENSA, "Tensilica Xtensa Processor"), - ENUM_ENT(EM_VIDEOCORE, "Alphamosaic VideoCore processor"), - ENUM_ENT(EM_TMM_GPP, "Thompson Multimedia General Purpose Processor"), - ENUM_ENT(EM_NS32K, "National Semiconductor 32000 series"), - ENUM_ENT(EM_TPC, "Tenor Network TPC processor"), - ENUM_ENT(EM_SNP1K, "EM_SNP1K"), - ENUM_ENT(EM_ST200, "STMicroelectronics ST200 microcontroller"), - ENUM_ENT(EM_IP2K, "Ubicom IP2xxx 8-bit microcontrollers"), - ENUM_ENT(EM_MAX, "MAX Processor"), - ENUM_ENT(EM_CR, "National Semiconductor CompactRISC"), - ENUM_ENT(EM_F2MC16, "Fujitsu F2MC16"), - ENUM_ENT(EM_MSP430, "Texas Instruments msp430 microcontroller"), - ENUM_ENT(EM_BLACKFIN, "Analog Devices Blackfin"), - ENUM_ENT(EM_SE_C33, "S1C33 Family of Seiko Epson processors"), - ENUM_ENT(EM_SEP, "Sharp embedded microprocessor"), - ENUM_ENT(EM_ARCA, "Arca RISC microprocessor"), - ENUM_ENT(EM_UNICORE, "Unicore"), - ENUM_ENT(EM_EXCESS, "eXcess 16/32/64-bit configurable embedded CPU"), - ENUM_ENT(EM_DXP, "Icera Semiconductor Inc. 
Deep Execution Processor"), - ENUM_ENT(EM_ALTERA_NIOS2, "Altera Nios"), - ENUM_ENT(EM_CRX, "National Semiconductor CRX microprocessor"), - ENUM_ENT(EM_XGATE, "Motorola XGATE embedded processor"), - ENUM_ENT(EM_C166, "Infineon Technologies xc16x"), - ENUM_ENT(EM_M16C, "Renesas M16C"), - ENUM_ENT(EM_DSPIC30F, - "Microchip Technology dsPIC30F Digital Signal Controller"), - ENUM_ENT(EM_CE, "Freescale Communication Engine RISC core"), - ENUM_ENT(EM_M32C, "Renesas M32C"), - ENUM_ENT(EM_TSK3000, "Altium TSK3000 core"), - ENUM_ENT(EM_RS08, "Freescale RS08 embedded processor"), - ENUM_ENT(EM_SHARC, "EM_SHARC"), - ENUM_ENT(EM_ECOG2, "Cyan Technology eCOG2 microprocessor"), - ENUM_ENT(EM_SCORE7, "SUNPLUS S+Core"), - ENUM_ENT(EM_DSP24, "New Japan Radio (NJR) 24-bit DSP Processor"), - ENUM_ENT(EM_VIDEOCORE3, "Broadcom VideoCore III processor"), - ENUM_ENT(EM_LATTICEMICO32, "Lattice Mico32"), - ENUM_ENT(EM_SE_C17, "Seiko Epson C17 family"), - ENUM_ENT(EM_TI_C6000, "Texas Instruments TMS320C6000 DSP family"), - ENUM_ENT(EM_TI_C2000, "Texas Instruments TMS320C2000 DSP family"), - ENUM_ENT(EM_TI_C5500, "Texas Instruments TMS320C55x DSP family"), - ENUM_ENT(EM_MMDSP_PLUS, - "STMicroelectronics 64bit VLIW Data Signal Processor"), - ENUM_ENT(EM_CYPRESS_M8C, "Cypress M8C microprocessor"), - ENUM_ENT(EM_R32C, "Renesas R32C series microprocessors"), - ENUM_ENT(EM_TRIMEDIA, "NXP Semiconductors TriMedia architecture family"), - ENUM_ENT(EM_HEXAGON, "Qualcomm Hexagon"), - ENUM_ENT(EM_8051, "Intel 8051 and variants"), - ENUM_ENT(EM_STXP7X, "STMicroelectronics STxP7x family"), - ENUM_ENT( - EM_NDS32, - "Andes Technology compact code size embedded RISC processor family"), - ENUM_ENT(EM_ECOG1, "Cyan Technology eCOG1 microprocessor"), - // FIXME: Following EM_ECOG1X definitions is dead code since EM_ECOG1X has - // an identical number to EM_ECOG1. - ENUM_ENT(EM_ECOG1X, "Cyan Technology eCOG1X family"), - ENUM_ENT(EM_MAXQ30, "Dallas Semiconductor MAXQ30 Core microcontrollers"), - ENUM_ENT(EM_XIMO16, "New Japan Radio (NJR) 16-bit DSP Processor"), - ENUM_ENT(EM_MANIK, "M2000 Reconfigurable RISC Microprocessor"), - ENUM_ENT(EM_CRAYNV2, "Cray Inc. 
NV2 vector architecture"), - ENUM_ENT(EM_RX, "Renesas RX"), - ENUM_ENT(EM_METAG, "Imagination Technologies Meta processor architecture"), - ENUM_ENT(EM_MCST_ELBRUS, - "MCST Elbrus general purpose hardware architecture"), - ENUM_ENT(EM_ECOG16, "Cyan Technology eCOG16 family"), - ENUM_ENT(EM_CR16, "National Semiconductor CompactRISC 16-bit processor"), - ENUM_ENT(EM_ETPU, "Freescale Extended Time Processing Unit"), - ENUM_ENT(EM_SLE9X, "Infineon Technologies SLE9X core"), - ENUM_ENT(EM_L10M, "EM_L10M"), - ENUM_ENT(EM_K10M, "EM_K10M"), - ENUM_ENT(EM_AARCH64, "AArch64"), - ENUM_ENT(EM_AVR32, "Atmel Corporation 32-bit microprocessor family"), - ENUM_ENT(EM_STM8, "STMicroeletronics STM8 8-bit microcontroller"), - ENUM_ENT(EM_TILE64, "Tilera TILE64 multicore architecture family"), - ENUM_ENT(EM_TILEPRO, "Tilera TILEPro multicore architecture family"), - ENUM_ENT(EM_MICROBLAZE, - "Xilinx MicroBlaze 32-bit RISC soft processor core"), - ENUM_ENT(EM_CUDA, "NVIDIA CUDA architecture"), - ENUM_ENT(EM_TILEGX, "Tilera TILE-Gx multicore architecture family"), - ENUM_ENT(EM_CLOUDSHIELD, "EM_CLOUDSHIELD"), - ENUM_ENT(EM_COREA_1ST, "EM_COREA_1ST"), - ENUM_ENT(EM_COREA_2ND, "EM_COREA_2ND"), - ENUM_ENT(EM_ARC_COMPACT2, "EM_ARC_COMPACT2"), - ENUM_ENT(EM_OPEN8, "EM_OPEN8"), - ENUM_ENT(EM_RL78, "Renesas RL78"), - ENUM_ENT(EM_VIDEOCORE5, "Broadcom VideoCore V processor"), - ENUM_ENT(EM_78KOR, "EM_78KOR"), - ENUM_ENT(EM_56800EX, "EM_56800EX"), - ENUM_ENT(EM_AMDGPU, "EM_AMDGPU"), - ENUM_ENT(EM_RISCV, "RISC-V"), - ENUM_ENT(EM_LANAI, "EM_LANAI"), - ENUM_ENT(EM_BPF, "EM_BPF"), - ENUM_ENT(EM_VE, "NEC SX-Aurora Vector Engine"), - ENUM_ENT(EM_LOONGARCH, "LoongArch"), - ENUM_ENT(EM_INTELGT, "Intel Graphics Technology"), + ENUM_ENT(EM_NONE, "None"), + ENUM_ENT(EM_M32, "WE32100"), + ENUM_ENT(EM_SPARC, "Sparc"), + ENUM_ENT(EM_386, "Intel 80386"), + ENUM_ENT(EM_68K, "MC68000"), + ENUM_ENT(EM_88K, "MC88000"), + ENUM_ENT(EM_IAMCU, "EM_IAMCU"), + ENUM_ENT(EM_860, "Intel 80860"), + ENUM_ENT(EM_MIPS, "MIPS R3000"), + ENUM_ENT(EM_S370, "IBM System/370"), + ENUM_ENT(EM_MIPS_RS3_LE, "MIPS R3000 little-endian"), + ENUM_ENT(EM_PARISC, "HPPA"), + ENUM_ENT(EM_VPP500, "Fujitsu VPP500"), + ENUM_ENT(EM_SPARC32PLUS, "Sparc v8+"), + ENUM_ENT(EM_960, "Intel 80960"), + ENUM_ENT(EM_PPC, "PowerPC"), + ENUM_ENT(EM_PPC64, "PowerPC64"), + ENUM_ENT(EM_S390, "IBM S/390"), + ENUM_ENT(EM_SPU, "SPU"), + ENUM_ENT(EM_V800, "NEC V800 series"), + ENUM_ENT(EM_FR20, "Fujistsu FR20"), + ENUM_ENT(EM_RH32, "TRW RH-32"), + ENUM_ENT(EM_RCE, "Motorola RCE"), + ENUM_ENT(EM_ARM, "ARM"), + ENUM_ENT(EM_ALPHA, "EM_ALPHA"), + ENUM_ENT(EM_SH, "Hitachi SH"), + ENUM_ENT(EM_SPARCV9, "Sparc v9"), + ENUM_ENT(EM_TRICORE, "Siemens Tricore"), + ENUM_ENT(EM_ARC, "ARC"), + ENUM_ENT(EM_H8_300, "Hitachi H8/300"), + ENUM_ENT(EM_H8_300H, "Hitachi H8/300H"), + ENUM_ENT(EM_H8S, "Hitachi H8S"), + ENUM_ENT(EM_H8_500, "Hitachi H8/500"), + ENUM_ENT(EM_IA_64, "Intel IA-64"), + ENUM_ENT(EM_MIPS_X, "Stanford MIPS-X"), + ENUM_ENT(EM_COLDFIRE, "Motorola Coldfire"), + ENUM_ENT(EM_68HC12, "Motorola MC68HC12 Microcontroller"), + ENUM_ENT(EM_MMA, "Fujitsu Multimedia Accelerator"), + ENUM_ENT(EM_PCP, "Siemens PCP"), + ENUM_ENT(EM_NCPU, "Sony nCPU embedded RISC processor"), + ENUM_ENT(EM_NDR1, "Denso NDR1 microprocesspr"), + ENUM_ENT(EM_STARCORE, "Motorola Star*Core processor"), + ENUM_ENT(EM_ME16, "Toyota ME16 processor"), + ENUM_ENT(EM_ST100, "STMicroelectronics ST100 processor"), + ENUM_ENT(EM_TINYJ, "Advanced Logic Corp. 
TinyJ embedded processor"), + ENUM_ENT(EM_X86_64, "Advanced Micro Devices X86-64"), + ENUM_ENT(EM_PDSP, "Sony DSP processor"), + ENUM_ENT(EM_PDP10, "Digital Equipment Corp. PDP-10"), + ENUM_ENT(EM_PDP11, "Digital Equipment Corp. PDP-11"), + ENUM_ENT(EM_FX66, "Siemens FX66 microcontroller"), + ENUM_ENT(EM_ST9PLUS, "STMicroelectronics ST9+ 8/16 bit microcontroller"), + ENUM_ENT(EM_ST7, "STMicroelectronics ST7 8-bit microcontroller"), + ENUM_ENT(EM_68HC16, "Motorola MC68HC16 Microcontroller"), + ENUM_ENT(EM_68HC11, "Motorola MC68HC11 Microcontroller"), + ENUM_ENT(EM_68HC08, "Motorola MC68HC08 Microcontroller"), + ENUM_ENT(EM_68HC05, "Motorola MC68HC05 Microcontroller"), + ENUM_ENT(EM_SVX, "Silicon Graphics SVx"), + ENUM_ENT(EM_ST19, "STMicroelectronics ST19 8-bit microcontroller"), + ENUM_ENT(EM_VAX, "Digital VAX"), + ENUM_ENT(EM_CRIS, "Axis Communications 32-bit embedded processor"), + ENUM_ENT(EM_JAVELIN, "Infineon Technologies 32-bit embedded cpu"), + ENUM_ENT(EM_FIREPATH, "Element 14 64-bit DSP processor"), + ENUM_ENT(EM_ZSP, "LSI Logic's 16-bit DSP processor"), + ENUM_ENT(EM_MMIX, "Donald Knuth's educational 64-bit processor"), + ENUM_ENT(EM_HUANY, "Harvard Universitys's machine-independent object format"), + ENUM_ENT(EM_PRISM, "Vitesse Prism"), + ENUM_ENT(EM_AVR, "Atmel AVR 8-bit microcontroller"), + ENUM_ENT(EM_FR30, "Fujitsu FR30"), + ENUM_ENT(EM_D10V, "Mitsubishi D10V"), + ENUM_ENT(EM_D30V, "Mitsubishi D30V"), + ENUM_ENT(EM_V850, "NEC v850"), + ENUM_ENT(EM_M32R, "Renesas M32R (formerly Mitsubishi M32r)"), + ENUM_ENT(EM_MN10300, "Matsushita MN10300"), + ENUM_ENT(EM_MN10200, "Matsushita MN10200"), + ENUM_ENT(EM_PJ, "picoJava"), + ENUM_ENT(EM_OPENRISC, "OpenRISC 32-bit embedded processor"), + ENUM_ENT(EM_ARC_COMPACT, "EM_ARC_COMPACT"), + ENUM_ENT(EM_XTENSA, "Tensilica Xtensa Processor"), + ENUM_ENT(EM_VIDEOCORE, "Alphamosaic VideoCore processor"), + ENUM_ENT(EM_TMM_GPP, "Thompson Multimedia General Purpose Processor"), + ENUM_ENT(EM_NS32K, "National Semiconductor 32000 series"), + ENUM_ENT(EM_TPC, "Tenor Network TPC processor"), + ENUM_ENT(EM_SNP1K, "EM_SNP1K"), + ENUM_ENT(EM_ST200, "STMicroelectronics ST200 microcontroller"), + ENUM_ENT(EM_IP2K, "Ubicom IP2xxx 8-bit microcontrollers"), + ENUM_ENT(EM_MAX, "MAX Processor"), + ENUM_ENT(EM_CR, "National Semiconductor CompactRISC"), + ENUM_ENT(EM_F2MC16, "Fujitsu F2MC16"), + ENUM_ENT(EM_MSP430, "Texas Instruments msp430 microcontroller"), + ENUM_ENT(EM_BLACKFIN, "Analog Devices Blackfin"), + ENUM_ENT(EM_SE_C33, "S1C33 Family of Seiko Epson processors"), + ENUM_ENT(EM_SEP, "Sharp embedded microprocessor"), + ENUM_ENT(EM_ARCA, "Arca RISC microprocessor"), + ENUM_ENT(EM_UNICORE, "Unicore"), + ENUM_ENT(EM_EXCESS, "eXcess 16/32/64-bit configurable embedded CPU"), + ENUM_ENT(EM_DXP, "Icera Semiconductor Inc. 
Deep Execution Processor"), + ENUM_ENT(EM_ALTERA_NIOS2, "Altera Nios"), + ENUM_ENT(EM_CRX, "National Semiconductor CRX microprocessor"), + ENUM_ENT(EM_XGATE, "Motorola XGATE embedded processor"), + ENUM_ENT(EM_C166, "Infineon Technologies xc16x"), + ENUM_ENT(EM_M16C, "Renesas M16C"), + ENUM_ENT(EM_DSPIC30F, "Microchip Technology dsPIC30F Digital Signal Controller"), + ENUM_ENT(EM_CE, "Freescale Communication Engine RISC core"), + ENUM_ENT(EM_M32C, "Renesas M32C"), + ENUM_ENT(EM_TSK3000, "Altium TSK3000 core"), + ENUM_ENT(EM_RS08, "Freescale RS08 embedded processor"), + ENUM_ENT(EM_SHARC, "EM_SHARC"), + ENUM_ENT(EM_ECOG2, "Cyan Technology eCOG2 microprocessor"), + ENUM_ENT(EM_SCORE7, "SUNPLUS S+Core"), + ENUM_ENT(EM_DSP24, "New Japan Radio (NJR) 24-bit DSP Processor"), + ENUM_ENT(EM_VIDEOCORE3, "Broadcom VideoCore III processor"), + ENUM_ENT(EM_LATTICEMICO32, "Lattice Mico32"), + ENUM_ENT(EM_SE_C17, "Seiko Epson C17 family"), + ENUM_ENT(EM_TI_C6000, "Texas Instruments TMS320C6000 DSP family"), + ENUM_ENT(EM_TI_C2000, "Texas Instruments TMS320C2000 DSP family"), + ENUM_ENT(EM_TI_C5500, "Texas Instruments TMS320C55x DSP family"), + ENUM_ENT(EM_MMDSP_PLUS, "STMicroelectronics 64bit VLIW Data Signal Processor"), + ENUM_ENT(EM_CYPRESS_M8C, "Cypress M8C microprocessor"), + ENUM_ENT(EM_R32C, "Renesas R32C series microprocessors"), + ENUM_ENT(EM_TRIMEDIA, "NXP Semiconductors TriMedia architecture family"), + ENUM_ENT(EM_HEXAGON, "Qualcomm Hexagon"), + ENUM_ENT(EM_8051, "Intel 8051 and variants"), + ENUM_ENT(EM_STXP7X, "STMicroelectronics STxP7x family"), + ENUM_ENT(EM_NDS32, "Andes Technology compact code size embedded RISC processor family"), + ENUM_ENT(EM_ECOG1, "Cyan Technology eCOG1 microprocessor"), + // FIXME: Following EM_ECOG1X definitions is dead code since EM_ECOG1X has + // an identical number to EM_ECOG1. + ENUM_ENT(EM_ECOG1X, "Cyan Technology eCOG1X family"), + ENUM_ENT(EM_MAXQ30, "Dallas Semiconductor MAXQ30 Core microcontrollers"), + ENUM_ENT(EM_XIMO16, "New Japan Radio (NJR) 16-bit DSP Processor"), + ENUM_ENT(EM_MANIK, "M2000 Reconfigurable RISC Microprocessor"), + ENUM_ENT(EM_CRAYNV2, "Cray Inc. 
NV2 vector architecture"), + ENUM_ENT(EM_RX, "Renesas RX"), + ENUM_ENT(EM_METAG, "Imagination Technologies Meta processor architecture"), + ENUM_ENT(EM_MCST_ELBRUS, "MCST Elbrus general purpose hardware architecture"), + ENUM_ENT(EM_ECOG16, "Cyan Technology eCOG16 family"), + ENUM_ENT(EM_CR16, "National Semiconductor CompactRISC 16-bit processor"), + ENUM_ENT(EM_ETPU, "Freescale Extended Time Processing Unit"), + ENUM_ENT(EM_SLE9X, "Infineon Technologies SLE9X core"), + ENUM_ENT(EM_L10M, "EM_L10M"), + ENUM_ENT(EM_K10M, "EM_K10M"), + ENUM_ENT(EM_AARCH64, "AArch64"), + ENUM_ENT(EM_AVR32, "Atmel Corporation 32-bit microprocessor family"), + ENUM_ENT(EM_STM8, "STMicroeletronics STM8 8-bit microcontroller"), + ENUM_ENT(EM_TILE64, "Tilera TILE64 multicore architecture family"), + ENUM_ENT(EM_TILEPRO, "Tilera TILEPro multicore architecture family"), + ENUM_ENT(EM_MICROBLAZE, "Xilinx MicroBlaze 32-bit RISC soft processor core"), + ENUM_ENT(EM_CUDA, "NVIDIA CUDA architecture"), + ENUM_ENT(EM_TILEGX, "Tilera TILE-Gx multicore architecture family"), + ENUM_ENT(EM_CLOUDSHIELD, "EM_CLOUDSHIELD"), + ENUM_ENT(EM_COREA_1ST, "EM_COREA_1ST"), + ENUM_ENT(EM_COREA_2ND, "EM_COREA_2ND"), + ENUM_ENT(EM_ARC_COMPACT2, "EM_ARC_COMPACT2"), + ENUM_ENT(EM_OPEN8, "EM_OPEN8"), + ENUM_ENT(EM_RL78, "Renesas RL78"), + ENUM_ENT(EM_VIDEOCORE5, "Broadcom VideoCore V processor"), + ENUM_ENT(EM_78KOR, "EM_78KOR"), + ENUM_ENT(EM_56800EX, "EM_56800EX"), + ENUM_ENT(EM_AMDGPU, "EM_AMDGPU"), + ENUM_ENT(EM_RISCV, "RISC-V"), + ENUM_ENT(EM_LANAI, "EM_LANAI"), + ENUM_ENT(EM_BPF, "EM_BPF"), + ENUM_ENT(EM_VE, "NEC SX-Aurora Vector Engine"), + ENUM_ENT(EM_LOONGARCH, "LoongArch"), + ENUM_ENT(EM_INTELGT, "Intel Graphics Technology"), }; +// clang-format on const EnumEntry ElfSymbolBindings[] = { {"Local", "LOCAL", ELF::STB_LOCAL}, From dd43a79ed0662b23d963bbba032670a6c9a1beb1 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Tue, 30 Sep 2025 09:57:07 -0700 Subject: [PATCH 279/878] [lld][MachO] Use llvm::Align and remove StringOffset type (#161253) Use `llvm::Align` instead of directly storing the shift amount for clarity. Also remove the `DeduplicatedCStringSection::StringOffset` in favor of simply storing the `uint64_t` offset since `trailingZeros` is not used outside of `finalizeContents()`. These two changes allow us to refactor `finalizeContents()`. No function change intended. Depends on https://github.com/llvm/llvm-project/pull/161241. --- lld/MachO/SyntheticSections.cpp | 65 ++++++++++++++------------------- lld/MachO/SyntheticSections.h | 12 +----- 2 files changed, 29 insertions(+), 48 deletions(-) diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 5645d8a05a28f..903ba78a27c75 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -848,8 +848,7 @@ void ObjCSelRefsHelper::initialize() { void ObjCSelRefsHelper::cleanup() { methnameToSelref.clear(); } ConcatInputSection *ObjCSelRefsHelper::makeSelRef(StringRef methname) { - auto methnameOffset = - in.objcMethnameSection->getStringOffset(methname).outSecOff; + auto methnameOffset = in.objcMethnameSection->getStringOffset(methname); size_t wordSize = target->wordSize; uint8_t *selrefData = bAlloc().Allocate(wordSize); @@ -1722,13 +1721,13 @@ void CStringSection::writeTo(uint8_t *buf) const { // and don't need this alignment. They will be emitted at some arbitrary address // `A`, but ld64 will treat them as being 16-byte aligned with an offset of // `16 % A`. 
-static uint8_t getStringPieceAlignment(const CStringInputSection *isec,
-                                       const StringPiece &piece) {
-  return llvm::countr_zero(isec->align | piece.inSecOff);
+static Align getStringPieceAlignment(const CStringInputSection *isec,
+                                     const StringPiece &piece) {
+  return llvm::Align(1ULL << llvm::countr_zero(isec->align | piece.inSecOff));
 }

 void CStringSection::finalizeContents() {
-  uint64_t offset = 0;
+  size = 0;
   // TODO: Call buildCStringPriorities() to support cstring ordering when
   // deduplication is off, although this may negatively impact build
   // performance.
   for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
     if (!piece.live)
       continue;
-    uint32_t pieceAlign = 1 << getStringPieceAlignment(isec, piece);
-    offset = alignToPowerOf2(offset, pieceAlign);
-    piece.outSecOff = offset;
-    isec->isFinal = true;
+    piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece));
     StringRef string = isec->getStringRef(i);
-    offset += string.size() + 1; // account for null terminator
+    size = piece.outSecOff + string.size() + 1; // account for null terminator
   }
+  isec->isFinal = true;
 }
-  size = offset;
 }

 void DeduplicatedCStringSection::finalizeContents() {
   // Find the largest alignment required for each string.
+  DenseMap<CachedHashStringRef, Align> strToAlignment;
   for (const CStringInputSection *isec : inputs) {
     for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) {
       if (!piece.live)
         continue;
       auto s = isec->getCachedHashStringRef(i);
       assert(isec->align != 0);
-      uint8_t trailingZeros = getStringPieceAlignment(isec, piece);
-      auto it = stringOffsetMap.insert(
-          std::make_pair(s, StringOffset(trailingZeros)));
-      if (!it.second && it.first->second.trailingZeros < trailingZeros)
-        it.first->second.trailingZeros = trailingZeros;
+      auto align = getStringPieceAlignment(isec, piece);
+      auto [it, wasInserted] = strToAlignment.try_emplace(s, align);
+      if (!wasInserted && it->second < align)
+        it->second = align;
     }
   }

@@ -1769,38 +1765,31 @@ void DeduplicatedCStringSection::finalizeContents() {
   for (auto &[isec, i] : priorityBuilder.buildCStringPriorities(inputs)) {
     auto &piece = isec->pieces[i];
     auto s = isec->getCachedHashStringRef(i);
-    auto it = stringOffsetMap.find(s);
-    assert(it != stringOffsetMap.end());
-    lld::macho::DeduplicatedCStringSection::StringOffset &offsetInfo =
-        it->second;
-    if (offsetInfo.outSecOff == UINT64_MAX) {
-      offsetInfo.outSecOff =
-          alignToPowerOf2(size, 1ULL << offsetInfo.trailingZeros);
-      size = offsetInfo.outSecOff + s.size() + 1; // account for null terminator
+    auto [it, wasInserted] = stringOffsetMap.try_emplace(s, /*placeholder*/ 0);
+    if (wasInserted) {
+      // Avoid computing the offset until we are sure we will need to
+      uint64_t offset = alignTo(size, strToAlignment.at(s));
+      it->second = offset;
+      size = offset + s.size() + 1; // account for null terminator
     }
-    piece.outSecOff = offsetInfo.outSecOff;
+    // If the string was already in stringOffsetMap, it is a duplicate and we
+    // only need to assign the offset.
+    piece.outSecOff = it->second;
   }
   for (CStringInputSection *isec : inputs)
     isec->isFinal = true;
 }

 void DeduplicatedCStringSection::writeTo(uint8_t *buf) const {
-  for (const auto &p : stringOffsetMap) {
-    StringRef data = p.first.val();
-    uint64_t off = p.second.outSecOff;
-    if (!data.empty())
-      memcpy(buf + off, data.data(), data.size());
-  }
+  for (const auto &[s, outSecOff] : stringOffsetMap)
+    if (s.size())
+      memcpy(buf + outSecOff, s.data(), s.size());
 }

-DeduplicatedCStringSection::StringOffset
-DeduplicatedCStringSection::getStringOffset(StringRef str) const {
+uint64_t DeduplicatedCStringSection::getStringOffset(StringRef str) const {
   // StringPiece uses 31 bits to store the hashes, so we replicate that
   uint32_t hash = xxh3_64bits(str) & 0x7fffffff;
-  auto offset = stringOffsetMap.find(CachedHashStringRef(str, hash));
-  assert(offset != stringOffsetMap.end() &&
-         "Looked-up strings should always exist in section");
-  return offset->second;
+  return stringOffsetMap.at(CachedHashStringRef(str, hash));
 }

 // This section is actually emitted as __TEXT,__const by ld64, but clang may
diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index 1abf3c210a64e..a37dd66107ee7 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -571,18 +571,10 @@ class DeduplicatedCStringSection final : public CStringSection {
   uint64_t getSize() const override { return size; }
   void finalizeContents() override;
   void writeTo(uint8_t *buf) const override;
-
-  struct StringOffset {
-    uint8_t trailingZeros;
-    uint64_t outSecOff = UINT64_MAX;
-
-    explicit StringOffset(uint8_t zeros) : trailingZeros(zeros) {}
-  };
-
-  StringOffset getStringOffset(StringRef str) const;
+  uint64_t getStringOffset(StringRef str) const;

 private:
-  llvm::DenseMap<llvm::CachedHashStringRef, StringOffset> stringOffsetMap;
+  llvm::DenseMap<llvm::CachedHashStringRef, uint64_t> stringOffsetMap;
   size_t size = 0;
 };

From 66af9423e882247ca2389d1d20c7ee9b21b50a82 Mon Sep 17 00:00:00 2001
From: Sang Ik Lee
Date: Tue, 30 Sep 2025 10:16:20 -0700
Subject: [PATCH 280/878] [MLIR][XeVM] Add XeVM special id ops. (#160735)

Add special GPU id and index ops.

---
 mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td | 33 +++++++++++++++++++
 mlir/test/Dialect/LLVMIR/xevm.mlir          | 36 +++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td
index 514b01a69fb9b..4f7a8421c07b9 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/XeVMOps.td
@@ -634,4 +634,37 @@ def XeVM_TargetAttr : XeVM_Attr<"XeVMTarget", "target"> {
   let genVerifyDecl = 1;
 }

+//===----------------------------------------------------------------------===//
+// XeVM special register op definitions
+//===----------------------------------------------------------------------===//
+
+class XeVM_SpecialIdRegisterOp<string mnemonic, list<Trait> traits = []>
+    : XeVM_Op<mnemonic, traits>,
+      Results<(outs AnyTypeOf<[I32, I64]>:$res)>,
+      Arguments<(ins OptionalAttr<LLVM_ConstantRangeAttr>:$range)> {
+  let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)";
+}
+
+multiclass XeVM_SpecialRegisterXYZ<string mnemonic, list<Trait> traits = []> {
+  def XOp : XeVM_SpecialIdRegisterOp<mnemonic # ".x", traits>;
+  def YOp : XeVM_SpecialIdRegisterOp<mnemonic # ".y", traits>;
+  def ZOp : XeVM_SpecialIdRegisterOp<mnemonic # ".z", traits>;
+}
+
+//===----------------------------------------------------------------------===//
+// Workitem index and range
+defm XeVM_WorkitemId : XeVM_SpecialRegisterXYZ<"local_id">;
+defm XeVM_WorkgroupDim : XeVM_SpecialRegisterXYZ<"local_size">;
+
+//===----------------------------------------------------------------------===//
+// Workgroup index and range
+defm XeVM_WorkgroupId : XeVM_SpecialRegisterXYZ<"group_id">;
+defm XeVM_GridDim : XeVM_SpecialRegisterXYZ<"group_count">;
+
+//===----------------------------------------------------------------------===//
+// Lane, Subgroup index and range
+def XeVM_LaneIdOp : XeVM_SpecialIdRegisterOp<"lane_id">;
+def XeVM_SubgroupIdOp : XeVM_SpecialIdRegisterOp<"subgroup_id">;
+def XeVM_SubgroupSizeOp : XeVM_SpecialIdRegisterOp<"subgroup_size">;
+
 #endif // XEVMIR_OPS
diff --git a/mlir/test/Dialect/LLVMIR/xevm.mlir b/mlir/test/Dialect/LLVMIR/xevm.mlir
index bb1f650a1cd12..66fb2949a270f 100644
--- a/mlir/test/Dialect/LLVMIR/xevm.mlir
+++ b/mlir/test/Dialect/LLVMIR/xevm.mlir
@@ -116,3 +116,39 @@ func.func @prefetch(%ptr: !llvm.ptr<1>) {
 // CHECK-LABEL: @xevm_module [#xevm.target] {
 gpu.module @xevm_module [#xevm.target]{
 }
+
+// -----
+// CHECK-LABEL: @xevm_special_ids
+llvm.func @xevm_special_ids() -> i32 {
+  // CHECK: xevm.local_id.x : i32
+  %1 = xevm.local_id.x : i32
+  // CHECK: xevm.local_id.y : i32
+  %2 = xevm.local_id.y : i32
+  // CHECK: xevm.local_id.z : i32
+  %3 = xevm.local_id.z : i32
+  // CHECK: xevm.local_size.x : i32
+  %4 = xevm.local_size.x : i32
+  // CHECK: xevm.local_size.y : i32
+  %5 = xevm.local_size.y : i32
+  // CHECK: xevm.local_size.z : i32
+  %6 = xevm.local_size.z : i32
+  // CHECK: xevm.group_id.x : i32
+  %7 = xevm.group_id.x : i32
+  // CHECK: xevm.group_id.y : i32
+  %8 = xevm.group_id.y : i32
+  // CHECK: xevm.group_id.z : i32
+  %9 = xevm.group_id.z : i32
+  // CHECK: xevm.group_count.x : i32
+  %10 = xevm.group_count.x : i32
+  // CHECK: xevm.group_count.y : i32
+  %11 = xevm.group_count.y : i32
+  // CHECK: xevm.group_count.z : i32
+  %12 = xevm.group_count.z : i32
+  // CHECK: xevm.lane_id : i32
+  %14 = xevm.lane_id : i32
+  // CHECK: xevm.subgroup_size : i32
+  %39 = xevm.subgroup_size : i32
+  // CHECK: xevm.subgroup_id : i32
+  %40 = xevm.subgroup_id : i32
+  llvm.return %1 : i32
+}

From ee8394d9469a2946ffe2e7d192c593ecf3f93098 Mon Sep 17 00:00:00 2001
From: Amr Hesham
Date: Tue, 30 Sep 2025 19:16:31 +0200
Subject: [PATCH 281/878] [CIR] Implement ChooseExpr for AggregateExpr
 (#160999)

Implement ChooseExpr support for aggregate expressions.

---
 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp |  4 +--
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      |  2 ++
 clang/test/CIR/CodeGen/struct.cpp             | 25 +++++++++++++++
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 5596499ee94b5..af42d1d882ae0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -211,9 +211,7 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
     cgf.cgm.errorNYI(e->getSourceRange(),
                      "AggExprEmitter: VisitAbstractConditionalOperator");
   }
-  void VisitChooseExpr(const ChooseExpr *e) {
-    cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitChooseExpr");
-  }
+  void VisitChooseExpr(const ChooseExpr *e) { Visit(e->getChosenSubExpr()); }

   void
  VisitCXXParenListInitExpr(CXXParenListInitExpr *e) {
    cgf.cgm.errorNYI(e->getSourceRange(),
                     "AggExprEmitter: VisitCXXParenListInitExpr");
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 0abb21a670719..9d98361a8b6a2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -841,6 +841,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) {
     return emitCastLValue(cast<CastExpr>(e));
   case Expr::MaterializeTemporaryExprClass:
    return emitMaterializeTemporaryExpr(cast<MaterializeTemporaryExpr>(e));
+  case Expr::ChooseExprClass:
+    return emitLValue(cast<ChooseExpr>(e)->getChosenSubExpr());
   }
 }
diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp
index 75374284d09d0..1dc16f3b79497 100644
--- a/clang/test/CIR/CodeGen/struct.cpp
+++ b/clang/test/CIR/CodeGen/struct.cpp
@@ -129,3 +129,28 @@ void paren_expr() {
 // OGCG: %[[B_ADDR:.*]] = alloca %struct.Point, align 4
 // OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false)
 // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[B_ADDR]], ptr align 4 %[[A_ADDR]], i64 8, i1 false)
+
+void choose_expr() {
+  CompleteS a;
+  CompleteS b;
+  CompleteS c = __builtin_choose_expr(true, a, b);
+}
+
+// CIR: cir.func{{.*}} @_Z11choose_exprv()
+// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr<!rec_CompleteS>, ["a"]
+// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr<!rec_CompleteS>, ["b"]
+// CIR: %[[C_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr<!rec_CompleteS>, ["c", init]
+// TODO(cir): Call to default copy constructor should be replaced by `cir.copy` op
+// CIR: cir.call @_ZN9CompleteSC1ERKS_(%[[C_ADDR]], %[[A_ADDR]]) nothrow : (!cir.ptr<!rec_CompleteS>, !cir.ptr<!rec_CompleteS>) -> ()
+
+// LLVM: define{{.*}} void @_Z11choose_exprv()
+// LLVM: %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
+// LLVM: %[[B_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
+// LLVM: %[[C_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4
+// LLVM: call void @_ZN9CompleteSC1ERKS_(ptr %[[C_ADDR]], ptr %[[A_ADDR]])
+
+// OGCG: define{{.*}} void @_Z11choose_exprv()
+// OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4
+// OGCG: %[[B_ADDR:.*]] = alloca %struct.CompleteS, align 4
+// OGCG: %[[C_ADDR:.*]] = alloca %struct.CompleteS, align 4
+// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[C_ADDR]], ptr align 4 %[[A_ADDR]], i64 8, i1 false)

From 2780c209e1e242fd9e7d71045f88fe4e824cee20 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 30 Sep 2025 10:34:41 -0700
Subject: [PATCH 282/878] [flang] Emit error on impossible-to-implement
 construct (#160384)

An assignment to a whole polymorphic allocatable changes its dynamic type
to the type of the right-hand side expression. But when the assignment is
under control of a WHERE statement, or a FORALL / DO CONCURRENT with a
mask expression, there is no interpretation of the assignment, as the
type of a variable must be the same for all of its elements.

There is no restriction in the standard against this usage, and no other
Fortran compiler complains about it. But it is not possible to implement
it in general, and the behavior produced by other compilers is not
reasonable, much less worthy of emulating. It's best to simply disallow
it with an error message.

Fixes https://github.com/llvm/llvm-project/issues/133669, or more
accurately, resolves it.
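
To make the rejected pattern concrete, here is a minimal sketch (the
variable names are illustrative only; the complete case appears in the new
bug133669.f90 test below):

    class(*), allocatable :: x(:), y(:)
    logical :: mask(2)
    ! ... allocate and define x, y, and mask ...
    where (mask) x = y  ! now an error: x's dynamic type would depend on the mask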
--- flang/docs/Extensions.md | 11 +++++++ flang/lib/Semantics/assignment.cpp | 6 +++- flang/test/Semantics/bug133669.f90 | 51 ++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 flang/test/Semantics/bug133669.f90 diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index c442a9cd6859e..9f9de6529dd03 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -557,6 +557,17 @@ end generic intrinsic function's inferred result type does not match an explicit declaration. This message is a warning. +* There is no restriction in the standard against assigning + to a whole polymorphic allocatable under control of a `WHERE` + construct or statement, but there is no good portable + behavior to implement and the standard isn't entirely clear + what it should mean. + (Other compilers allow it, but the results are never meaningful; + some never change the type, some change the type according to + the value of the last mask element, some treat these + assignment statements as no-ops, and the rest crash during compilation.) + The compiler flags this case as an error. + ## Standard features that might as well not be * f18 supports designators with constant expressions, properly diff --git a/flang/lib/Semantics/assignment.cpp b/flang/lib/Semantics/assignment.cpp index 88e08887160d9..f4aa496e485e1 100644 --- a/flang/lib/Semantics/assignment.cpp +++ b/flang/lib/Semantics/assignment.cpp @@ -41,7 +41,6 @@ class AssignmentContext { void PopWhereContext(); void Analyze(const parser::AssignmentStmt &); void Analyze(const parser::PointerAssignmentStmt &); - void Analyze(const parser::ConcurrentControl &); SemanticsContext &context() { return context_; } private: @@ -76,6 +75,11 @@ void AssignmentContext::Analyze(const parser::AssignmentStmt &stmt) { whole{evaluate::UnwrapWholeSymbolOrComponentDataRef(lhs)}) { if (IsAllocatable(whole->GetUltimate())) { flags.set(DefinabilityFlag::PotentialDeallocation); + if (IsPolymorphic(*whole) && whereDepth_ > 0) { + Say(lhsLoc, + "Assignment to whole polymorphic allocatable '%s' may not be nested in a WHERE statement or construct"_err_en_US, + whole->name()); + } } } if (auto whyNot{WhyNotDefinable(lhsLoc, scope, flags, lhs)}) { diff --git a/flang/test/Semantics/bug133669.f90 b/flang/test/Semantics/bug133669.f90 new file mode 100644 index 0000000000000..b4d55db193a2c --- /dev/null +++ b/flang/test/Semantics/bug133669.f90 @@ -0,0 +1,51 @@ +!RUN: %python %S/test_errors.py %s %flang_fc1 +module m + contains + subroutine s(x, y, mask) + class(*), allocatable, intent(in out) :: x(:), y(:) + logical, intent(in) :: mask(:) + select type(x) + type is(integer) + print *, 'before, x is integer', x + type is(real) + print *, 'before, x is real', x + class default + print *, 'before, x has some other type' + end select + select type(y) + type is(integer) + print *, 'y is integer', y + type is(real) + print *, 'y is real', y + end select + print *, 'mask', mask + !ERROR: Assignment to whole polymorphic allocatable 'x' may not be nested in a WHERE statement or construct + where(mask) x = y + select type(x) + type is(integer) + print *, 'after, x is integer', x + type is(real) + print *, 'after, x is real', x + class default + print *, 'before, x has some other type' + end select + print * + end +end + +program main + use m + class(*), allocatable :: x(:), y(:) + x = [1, 2] + y = [3., 4.] + call s(x, y, [.false., .false.]) + x = [1, 2] + y = [3., 4.] + call s(x, y, [.false., .true.]) + x = [1, 2] + y = [3., 4.] 
+  call s(x, y, [.true., .false.])
+  x = [1, 2]
+  y = [3., 4.]
+  call s(x, y, [.true., .true.])
+end program main

From 673e3051b0ededcf9e028a86faae6d2ba8b09daa Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 30 Sep 2025 10:35:06 -0700
Subject: [PATCH 283/878] [flang][runtime] Let more list-directed child input
 advance (#160590)

Whether list-directed child READ statements should be allowed to advance
to further records is neither explicit in the standard nor consistent in
existing Fortran implementations. We allow child namelist READ statements
to advance, but not other list-directed child input.

This patch refines our interpretation of this case. Child namelist READ
statements continue to be able to advance; in addition, non-namelist
child READ statements can now advance if their parent READ statement is a
list-directed input statement at the top level, or a child that could.
But non-namelist list-directed child input taking place in a context with
explicit format control won't advance to following records, so that the
format-controlled parent READ statement can retain control over record
advancement.

Also corrects two cases of record repositioning in numeric input editing,
which were failing under child input because they weren't allowing for
left tab limits.

Fixes https://github.com/llvm/llvm-project/issues/160351.

---
 flang-rt/include/flang-rt/runtime/io-error.h | 11 ++++++
 flang-rt/include/flang-rt/runtime/io-stmt.h  |  7 ++++
 flang-rt/lib/runtime/descriptor-io.cpp       |  6 ++-
 flang-rt/lib/runtime/edit-input.cpp          | 10 +++--
 flang-rt/lib/runtime/io-stmt.cpp             | 39 +++++++++++++-------
 flang/docs/Extensions.md                     | 11 ++++++
 6 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/io-error.h b/flang-rt/include/flang-rt/runtime/io-error.h
index 3e8401036f289..0ac1183131808 100644
--- a/flang-rt/include/flang-rt/runtime/io-error.h
+++ b/flang-rt/include/flang-rt/runtime/io-error.h
@@ -67,6 +67,17 @@ class IoErrorHandler : public Terminator {
   RT_API_ATTRS int GetIoStat() const { return ioStat_; }
   RT_API_ATTRS bool GetIoMsg(char *, std::size_t);

+  // Sets the HasEnd flag so that EOF isn't fatal; used to peek ahead
+  RT_API_ATTRS bool SetHasEnd(bool yes = true) {
+    bool oldValue{(flags_ & hasEnd) != 0};
+    if (yes) {
+      flags_ |= hasEnd;
+    } else {
+      flags_ &= ~hasEnd;
+    }
+    return oldValue;
+  }
+
 private:
   enum Flag : std::uint8_t {
     hasIoStat = 1, // IOSTAT=
diff --git a/flang-rt/include/flang-rt/runtime/io-stmt.h b/flang-rt/include/flang-rt/runtime/io-stmt.h
index 1c4f06c0a7082..1cb72d87d3dfa 100644
--- a/flang-rt/include/flang-rt/runtime/io-stmt.h
+++ b/flang-rt/include/flang-rt/runtime/io-stmt.h
@@ -703,6 +703,13 @@ class ChildListIoStatementState : public ChildIoStatementState<DIR, CHAR>,
   using ListDirectedStatementState<DIR>::GetNextDataEdit;
   RT_API_ATTRS bool AdvanceRecord(int = 1);
   RT_API_ATTRS int EndIoStatement();
+  RT_API_ATTRS bool CanAdvance() {
+    return DIR == Direction::Input &&
+        (canAdvance_ || this->mutableModes().inNamelist);
+  }
+
+private:
+  bool canAdvance_{false};
 };

 template <Direction DIR, typename CHAR>
diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp
index e00072510aff7..42ac4c0516637 100644
--- a/flang-rt/lib/runtime/descriptor-io.cpp
+++ b/flang-rt/lib/runtime/descriptor-io.cpp
@@ -47,9 +47,11 @@ static RT_API_ATTRS common::optional<bool> DefinedFormattedIo(
     const typeInfo::DerivedType &derived,
    const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
-  // Look at the next data edit descriptor. If this is list-directed I/O, the
-  // "maxRepeat=0" argument will prevent the input from advancing over an
-  // initial '(' that shouldn't be consumed now as the start of a real part.
+  // Look at the next data edit descriptor. If this is list-directed input,
+  // the "maxRepeat=0" argument will prevent the input from advancing over an
+  // initial '(' that shouldn't be consumed now as the start of a real part.
+  // It also allows reaching EOF without crashing, since the EOF only matters
+  // if a child READ is actually performed.
   common::optional<DataEdit> peek{io.GetNextDataEdit(/*maxRepeat=*/0)};
   if (peek &&
       (peek->descriptor == DataEdit::DefinedDerivedType ||
diff --git a/flang-rt/lib/runtime/edit-input.cpp b/flang-rt/lib/runtime/edit-input.cpp
index 6ab546ee59f74..436fc3894d902 100644
--- a/flang-rt/lib/runtime/edit-input.cpp
+++ b/flang-rt/lib/runtime/edit-input.cpp
@@ -53,11 +53,13 @@ static RT_API_ATTRS bool EditBOZInput(
     IoStatementState &io, const DataEdit &edit, void *n, std::size_t bytes) {
   // Skip leading white space & zeroes
   common::optional<int> remaining{io.CueUpInput(edit)};
-  auto start{io.GetConnectionState().positionInRecord};
+  const ConnectionState &connection{io.GetConnectionState()};
+  auto leftTabLimit{connection.leftTabLimit.value_or(0)};
+  auto start{connection.positionInRecord - leftTabLimit};
   common::optional<char32_t> next{io.NextInField(remaining, edit)};
   if (next.value_or('?') == '0') {
     do {
-      start = io.GetConnectionState().positionInRecord;
+      start = connection.positionInRecord - leftTabLimit;
       next = io.NextInField(remaining, edit);
     } while (next && *next == '0');
   }
@@ -447,7 +449,9 @@ static RT_API_ATTRS ScannedRealInput ScanRealInput(
   }
   // In list-directed input, a bad exponent is not consumed.
   auto nextBeforeExponent{next};
-  auto startExponent{io.GetConnectionState().positionInRecord};
+  const ConnectionState &connection{io.GetConnectionState()};
+  auto leftTabLimit{connection.leftTabLimit.value_or(0)};
+  auto startExponent{connection.positionInRecord - leftTabLimit};
   bool hasGoodExponent{false};
   if (next) {
     if (isHexadecimal) {
diff --git a/flang-rt/lib/runtime/io-stmt.cpp b/flang-rt/lib/runtime/io-stmt.cpp
index 7bcba5fe71ee4..b958f23cf5342 100644
--- a/flang-rt/lib/runtime/io-stmt.cpp
+++ b/flang-rt/lib/runtime/io-stmt.cpp
@@ -880,6 +880,9 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit(
     edit.descriptor = DataEdit::ListDirectedImaginaryPart;
   }
   auto fastField{io.GetUpcomingFastAsciiField()};
+  // Reaching EOF is okay when peeking at list-directed defined input;
+  // pretend that there's an END= in that case.
+  bool oldHasEnd{maxRepeat == 0 && !io.GetIoErrorHandler().SetHasEnd()};
   auto ch{io.GetNextNonBlank(byteCount, &fastField)};
   if (ch && *ch == comma && eatComma_) {
     // Consume comma & whitespace after previous item.
@@ -890,19 +893,23 @@
     ch = io.GetNextNonBlank(byteCount, &fastField);
   }
   eatComma_ = true;
-  if (!ch) {
-    return common::nullopt;
+  if (maxRepeat == 0 && !oldHasEnd) {
+    io.GetIoErrorHandler().SetHasEnd(false);
   }
-  if (*ch == '/') {
+  if (!ch) { // EOF
+    if (maxRepeat == 0) {
+      return edit; // DataEdit::ListDirected for look-ahead
+    } else {
+      return common::nullopt;
+    }
+  } else if (*ch == '/') {
    hitSlash_ = true;
     edit.descriptor = DataEdit::ListDirectedNullValue;
     return edit;
-  }
-  if (*ch == comma) { // separator: null value
+  } else if (*ch == comma) { // separator: null value
     edit.descriptor = DataEdit::ListDirectedNullValue;
     return edit;
-  }
-  if (imaginaryPart_) { // can't repeat components
+  } else if (imaginaryPart_) { // can't repeat components
     return edit;
   }
   if (*ch >= '0' && *ch <= '9' && fastField.MightBeRepetitionCount()) {
@@ -1103,10 +1110,19 @@ ChildListIoStatementState<DIR, CHAR>::ChildListIoStatementState(
     : ChildIoStatementState<DIR, CHAR>{child, sourceFile, sourceLine} {
 #if !defined(RT_DEVICE_AVOID_RECURSION)
   if constexpr (DIR == Direction::Input) {
-    if (auto *listInput{child.parent()
-                            .get_if<ListDirectedStatementState<Direction::Input>>()}) {
+    if (const auto *listInput{child.parent()
+                                  .get_if<ListDirectedStatementState<Direction::Input>>()}) {
       this->set_eatComma(listInput->eatComma());
       this->namelistGroup_ = listInput->namelistGroup();
+      if (auto *childListInput{child.parent()
+                                   .get_if<ChildListIoStatementState<Direction::Input>>()}) {
+        // Child list input whose parent is child list input: can advance
+        // if the parent can.
+        this->canAdvance_ = childListInput->CanAdvance();
+      } else {
+        // Child list input of top-level list input: can advance.
+        this->canAdvance_ = true;
+      }
     }
   }
 #else
@@ -1117,12 +1133,7 @@
 template <Direction DIR, typename CHAR>
 bool ChildListIoStatementState<DIR, CHAR>::AdvanceRecord(int n) {
 #if !defined(RT_DEVICE_AVOID_RECURSION)
@@ -890,19 +893,23 @@ ListDirectedStatementState::GetNextDataEdit( ch = io.GetNextNonBlank(byteCount, &fastField); } eatComma_ = true; - if (!ch) { - return common::nullopt; + if (maxRepeat == 0 && !oldHasEnd) { + io.GetIoErrorHandler().SetHasEnd(false); } - if (*ch == '/') { + if (!ch) { // EOF + if (maxRepeat == 0) { + return edit; // DataEdit::ListDirected for look-ahead + } else { + return common::nullopt; + } + } else if (*ch == '/') { hitSlash_ = true; edit.descriptor = DataEdit::ListDirectedNullValue; return edit; - } - if (*ch == comma) { // separator: null value + } else if (*ch == comma) { // separator: null value edit.descriptor = DataEdit::ListDirectedNullValue; return edit; - } - if (imaginaryPart_) { // can't repeat components + } else if (imaginaryPart_) { // can't repeat components return edit; } if (*ch >= '0' && *ch <= '9' && fastField.MightBeRepetitionCount()) { @@ -1103,10 +1110,19 @@ ChildListIoStatementState::ChildListIoStatementState( : ChildIoStatementState{child, sourceFile, sourceLine} { #if !defined(RT_DEVICE_AVOID_RECURSION) if constexpr (DIR == Direction::Input) { - if (auto *listInput{child.parent() + if (const auto *listInput{child.parent() .get_if>()}) { this->set_eatComma(listInput->eatComma()); this->namelistGroup_ = listInput->namelistGroup(); + if (auto *childListInput{child.parent() + .get_if>()}) { + // Child list input whose parent is child list input: can advance + // if the parent can. + this->canAdvance_ = childListInput->CanAdvance(); + } else { + // Child list input of top-level list input: can advance. + this->canAdvance_ = true; + } } } #else @@ -1117,12 +1133,7 @@ ChildListIoStatementState::ChildListIoStatementState( template bool ChildListIoStatementState::AdvanceRecord(int n) { #if !defined(RT_DEVICE_AVOID_RECURSION) - // Allow child NAMELIST input to advance - if (DIR == Direction::Input && this->mutableModes().inNamelist) { - return this->child().parent().AdvanceRecord(n); - } else { - return false; - } + return this->CanAdvance() && this->child().parent().AdvanceRecord(n); #else this->ReportUnsupportedChildIo(); #endif diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 9f9de6529dd03..420b7517922b7 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -928,6 +928,17 @@ print *, [(j,j=1,10)] and the portable interpretation across the most common Fortran compilers. +* `NAMELIST` child input statements are allowed to advance to further + input records. + Further, advancement is allowed when the parent input statement is + a non-child (top level) list-directed input statement, or, recursively, + an intermediate child list-directed input statement that can advance. + This means that non-`NAMELIST` list-directed child input statements are + not allowed to advance when they have an ancestor formatted input statement + that is not list-directed and there is no intervening `NAMELIST`. + This design allows format-driven input with `DT` editing to retain + control over advancement in child input, while otherwise allowing it. + ## De Facto Standard Features * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the From 2c58d192a2a6e54080ac36e0626487747dc7be5c Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 30 Sep 2025 10:35:37 -0700 Subject: [PATCH 284/878] [flang][runtime] Expand IOTYPE and V_LIST (#160744) The IOTYPE and V_LIST dummy arguments to a defined formatted I/O subroutine are extracted from a DT edit descriptor in a FORMAT. 
They are currently stored in the DataEdit structure, and their maximum sizes are rather small since DataEdits are sometimes returned or passed by value. This patch moves their storage into the FormattedIoStatementState structure and enlarges them a bit. Fixes https://github.com/llvm/llvm-project/issues/154954. --- .../include/flang-rt/runtime/format-implementation.h | 4 ++-- flang-rt/include/flang-rt/runtime/format.h | 3 +-- flang-rt/include/flang-rt/runtime/io-stmt.h | 10 ++++++++-- flang-rt/lib/runtime/descriptor-io.cpp | 7 ++++--- flang-rt/unittests/Runtime/Format.cpp | 2 +- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/flang-rt/include/flang-rt/runtime/format-implementation.h b/flang-rt/include/flang-rt/runtime/format-implementation.h index de06524de32d3..46134146f5c13 100644 --- a/flang-rt/include/flang-rt/runtime/format-implementation.h +++ b/flang-rt/include/flang-rt/runtime/format-implementation.h @@ -532,7 +532,7 @@ RT_API_ATTRS common::optional FormatControl::GetNextDataEdit( ReportBadFormat(context, "Excessive DT'iotype' in FORMAT", start); return common::nullopt; } - edit.ioType[edit.ioTypeChars++] = ch; + context.ioType[edit.ioTypeChars++] = ch; if (ch == quote) { ++offset_; } @@ -556,7 +556,7 @@ RT_API_ATTRS common::optional FormatControl::GetNextDataEdit( ReportBadFormat(context, "Excessive DT(v_list) in FORMAT", start); return common::nullopt; } - edit.vList[edit.vListEntries++] = n; + context.vList[edit.vListEntries++] = n; auto ch{static_cast(GetNextChar(context))}; if (ch != ',') { ok = ch == ')'; diff --git a/flang-rt/include/flang-rt/runtime/format.h b/flang-rt/include/flang-rt/runtime/format.h index 34e33edae546d..79a7dd713b1a1 100644 --- a/flang-rt/include/flang-rt/runtime/format.h +++ b/flang-rt/include/flang-rt/runtime/format.h @@ -86,12 +86,11 @@ struct DataEdit { // defined I/O data edit descriptor RT_OFFLOAD_VAR_GROUP_BEGIN static constexpr std::size_t maxIoTypeChars{32}; - static constexpr std::size_t maxVListEntries{4}; + static constexpr std::size_t maxVListEntries{16}; RT_OFFLOAD_VAR_GROUP_END std::uint8_t ioTypeChars{0}; std::uint8_t vListEntries{0}; char ioType[maxIoTypeChars]; - int vList[maxVListEntries]; }; // Generates a sequence of DataEdits from a FORMAT statement or diff --git a/flang-rt/include/flang-rt/runtime/io-stmt.h b/flang-rt/include/flang-rt/runtime/io-stmt.h index 1cb72d87d3dfa..3de2309954a40 100644 --- a/flang-rt/include/flang-rt/runtime/io-stmt.h +++ b/flang-rt/include/flang-rt/runtime/io-stmt.h @@ -61,8 +61,14 @@ using IoDirectionState = std::conditional_t; // Common state for all kinds of formatted I/O -template class FormattedIoStatementState {}; -template <> class FormattedIoStatementState { +struct DefinedIoArgs { + char ioType[DataEdit::maxIoTypeChars]; // IOTYPE string + int vList[DataEdit::maxVListEntries]; // V_LIST(:) values +}; +template +class FormattedIoStatementState : public DefinedIoArgs {}; +template <> +class FormattedIoStatementState : public DefinedIoArgs { public: RT_API_ATTRS std::size_t GetEditDescriptorChars() const; RT_API_ATTRS void GotChar(int); diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp index 42ac4c0516637..e599e624fe02e 100644 --- a/flang-rt/lib/runtime/descriptor-io.cpp +++ b/flang-rt/lib/runtime/descriptor-io.cpp @@ -64,10 +64,11 @@ static RT_API_ATTRS common::optional DefinedFormattedIo( : *io.GetNextDataEdit(1)}; char ioType[2 + edit.maxIoTypeChars]; auto ioTypeLen{std::size_t{2} /*"DT"*/ + edit.ioTypeChars}; + auto 
&definedIoArgs{*io.get_if()}; if (edit.descriptor == DataEdit::DefinedDerivedType) { ioType[0] = 'D'; ioType[1] = 'T'; - runtime::memcpy(ioType + 2, edit.ioType, edit.ioTypeChars); + runtime::memcpy(ioType + 2, definedIoArgs.ioType, edit.ioTypeChars); } else { runtime::strcpy( ioType, io.mutableModes().inNamelist ? "NAMELIST" : "LISTDIRECTED"); @@ -81,7 +82,7 @@ static RT_API_ATTRS common::optional DefinedFormattedIo( if (integer8) { // Convert v_list values to INTEGER(8) for (int j{0}; j < edit.vListEntries; ++j) { - vList64[j] = edit.vList[j]; + vList64[j] = definedIoArgs.vList[j]; } vListDesc.Establish( TypeCategory::Integer, sizeof(std::int64_t), nullptr, 1); @@ -91,7 +92,7 @@ static RT_API_ATTRS common::optional DefinedFormattedIo( static_cast(sizeof(std::int64_t))); } else { vListDesc.Establish(TypeCategory::Integer, sizeof(int), nullptr, 1); - vListDesc.set_base_addr(edit.vList); + vListDesc.set_base_addr(definedIoArgs.vList); vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); vListDesc.GetDimension(0).SetByteStride( static_cast(sizeof(int))); diff --git a/flang-rt/unittests/Runtime/Format.cpp b/flang-rt/unittests/Runtime/Format.cpp index fe7403f26700b..cd52dc8c54ed5 100644 --- a/flang-rt/unittests/Runtime/Format.cpp +++ b/flang-rt/unittests/Runtime/Format.cpp @@ -22,7 +22,7 @@ using namespace std::literals::string_literals; using ResultsTy = std::vector; // A test harness context for testing FormatControl -class TestFormatContext : public IoErrorHandler { +class TestFormatContext : public IoErrorHandler, public DefinedIoArgs { public: using CharType = char; TestFormatContext() : IoErrorHandler{"format.cpp", 1} {} From 10a9ec88501fa260002245fcd4d0c6f6ccae4e17 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 30 Sep 2025 10:36:10 -0700 Subject: [PATCH 285/878] [flang] Fix crash in structure constructor lowering (#160769) MLIR types created by lowering for structure constructors appear to be sensitive to the ordering of their components in the typed expression representation used for structure constructors and derived type constant values. At present, the components appear in source position order. When some ancestral types are defined in modules, this ordering can cause their components to be ordered after components defined in extended derived types. This can lead to crashes from incompatible MLIR types. To avoid this issue, sort structure constructor components first in ascending order of derived type extension depth; retain source position ordering for components in the same derived type and for error recovery situations. Fixes https://github.com/llvm/llvm-project/issues/143740. 
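As a standalone illustration of the ordering rule (a hypothetical sketch of the idea, not the flang code; the actual comparator follows in the diff below): components are keyed first by how deep their owning type sits in the extension chain, so a base type's components always come first, and source position only breaks ties within a single type.

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Hypothetical stand-ins for flang's symbol/scope machinery.
struct Component {
  std::string name;
  int extensionDepth; // 0 = ultimate base type, 1 = first extension, ...
  int sourcePosition; // declaration order within its own derived type
};

// Ancestors' components first; source order only breaks ties within a type.
void sortForConstructor(std::vector<Component> &components) {
  std::stable_sort(components.begin(), components.end(),
                   [](const Component &x, const Component &y) {
                     if (x.extensionDepth != y.extensionDepth)
                       return x.extensionDepth < y.extensionDepth;
                     return x.sourcePosition < y.sourcePosition;
                   });
}
```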
--- flang/lib/Evaluate/constant.cpp | 38 +++++++++++++++++++++ flang/test/Evaluate/Inputs/comporder1.mod | 6 ++++ flang/test/Evaluate/Inputs/comporder2.mod | 8 +++++ flang/test/Evaluate/comporder.f90 | 41 +++++++++++++++++++++++ 4 files changed, 93 insertions(+) create mode 100644 flang/test/Evaluate/Inputs/comporder1.mod create mode 100644 flang/test/Evaluate/Inputs/comporder2.mod create mode 100644 flang/test/Evaluate/comporder.f90 diff --git a/flang/lib/Evaluate/constant.cpp b/flang/lib/Evaluate/constant.cpp index 990339958399e..0fa397b9c356d 100644 --- a/flang/lib/Evaluate/constant.cpp +++ b/flang/lib/Evaluate/constant.cpp @@ -389,7 +389,45 @@ std::size_t Constant::CopyFrom(const Constant &source, return Base::CopyFrom(source, count, resultSubscripts, dimOrder); } +static std::optional DerivedTypeDepth(const semantics::Scope &scope) { + if (scope.IsDerivedType()) { + for (auto iter{scope.cbegin()}; iter != scope.cend(); ++iter) { + const Symbol &symbol{*iter->second}; + if (symbol.test(Symbol::Flag::ParentComp)) { + if (const semantics::DeclTypeSpec *type{symbol.GetType()}) { + if (const semantics::DerivedTypeSpec *derived{type->AsDerived()}) { + const semantics::Scope *parent{derived->scope()}; + if (!parent) { + parent = derived->typeSymbol().scope(); + } + if (parent) { + if (auto parentDepth{DerivedTypeDepth(*parent)}) { + return 1 + *parentDepth; + } + } + } + } + return std::nullopt; // error recovery + } + } + return 0; + } else { + return std::nullopt; // error recovery + } +} + bool ComponentCompare::operator()(SymbolRef x, SymbolRef y) const { + if (&x->owner() != &y->owner()) { + // Not components of the same derived type; put ancestors' components first. + if (auto xDepth{DerivedTypeDepth(x->owner())}) { + if (auto yDepth{DerivedTypeDepth(y->owner())}) { + if (*xDepth != *yDepth) { + return *xDepth < *yDepth; + } + } + } + } + // Same derived type, distinct instantiations, or error recovery. 
return semantics::SymbolSourcePositionCompare{}(x, y); } diff --git a/flang/test/Evaluate/Inputs/comporder1.mod b/flang/test/Evaluate/Inputs/comporder1.mod new file mode 100644 index 0000000000000..5c1a3c89d5e1e --- /dev/null +++ b/flang/test/Evaluate/Inputs/comporder1.mod @@ -0,0 +1,6 @@ +!mod$ v1 sum:64657f78d85da21d +module comporder1 +type::base +integer(4)::c1 +end type +end diff --git a/flang/test/Evaluate/Inputs/comporder2.mod b/flang/test/Evaluate/Inputs/comporder2.mod new file mode 100644 index 0000000000000..e228639669642 --- /dev/null +++ b/flang/test/Evaluate/Inputs/comporder2.mod @@ -0,0 +1,8 @@ +!mod$ v1 sum:3235f4a02cdad423 +!need$ 64657f78d85da21d n comporder1 +module comporder2 +use comporder1,only:base +type,extends(base)::intermediate +integer(4)::c2 +end type +end diff --git a/flang/test/Evaluate/comporder.f90 b/flang/test/Evaluate/comporder.f90 new file mode 100644 index 0000000000000..c57f68137e11b --- /dev/null +++ b/flang/test/Evaluate/comporder.f90 @@ -0,0 +1,41 @@ +!RUN: %flang_fc1 -fdebug-unparse -I%S/Inputs %s | FileCheck %s +program main + use comporder2 + type, extends(intermediate) :: last + integer c3 + end type + !CHECK:PRINT *, last(c1=1_4,c2=2_4,c3=3_4) + print *, last(1,2,3) + !CHECK:PRINT *, last(c1=11_4,c2=12_4,c3=13_4) + print *, last(c3=13,c2=12,c1=11) + !CHECK:PRINT *, last(c1=21_4,c2=22_4,c3=23_4) + print *, last(c3=23,c1=21,c2=22) + !CHECK:PRINT *, last(c1=31_4,c2=32_4,c3=33_4) + print *, last(c2=32,c3=33,c1=31) + !CHECK:PRINT *, last(c1=41_4,c2=42_4,c3=43_4) + print *, last(c2=42,c1=41,c3=43) + !CHECK:PRINT *, last(c1=51_4,c2=52_4,c3=53_4) + print *, last(c1=51,c3=53,c2=52) + !CHECK:PRINT *, last(c1=61_4,c2=62_4,c3=63_4) + print *, last(c1=61,c2=62,c3=63) + !CHECK:PRINT *, last(intermediate=intermediate(c1=71_4,c2=72_4),c3=73_4) + print *, last(c3=73,intermediate=intermediate(c2=72,c1=71)) + !CHECK:PRINT *, last(intermediate=intermediate(c1=81_4,c2=82_4),c3=83_4) + print *, last(c3=83,intermediate=intermediate(c1=81,c2=82)) + !CHECK:PRINT *, last(intermediate=intermediate(c1=91_4,c2=92_4),c3=93_4) + print *, last(intermediate(c2=92,c1=91),c3=93) + !CHECK:PRINT *, last(intermediate=intermediate(c1=101_4,c2=102_4),c3=103_4) + print *, last(intermediate(c1=101,c2=102),c3=103) + !CHECK:PRINT *, last(intermediate=intermediate(base=base(c1=111_4),c2=112_4),c3=113_4) + print *, last(c3=113,intermediate=intermediate(c2=112,base=base(c1=111))) + !CHECK:PRINT *, last(intermediate=intermediate(base=base(c1=121_4),c2=122_4),c3=123_4) + print *, last(c3=123,intermediate=intermediate(base(c1=121),c2=122)) + !CHECK:PRINT *, last(intermediate=intermediate(base=base(c1=131_4),c2=132_4),c3=133_4) + print *, last(intermediate(c2=132,base=base(c1=131)),c3=133) + !CHECK:PRINT *, last(intermediate=intermediate(base=base(c1=141_4),c2=142_4),c3=143_4) + print *, last(intermediate(base(c1=141),c2=142),c3=143) + !CHECK:PRINT *, last(base=base(c1=151_4),c2=152_4,c3=153_4) + print *, last(base(151),c3=153,c2=152) + !CHECK:PRINT *, last(base=base(c1=161_4),c2=162_4,c3=163_4) + print *, last(base(161),c2=162,c3=163) +end From 6399d4792d127520ed7281229755d18ff54da297 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Tue, 30 Sep 2025 10:36:21 -0700 Subject: [PATCH 286/878] [llvm] Use the VFS to make path absolute (#161271) For the redirecting VFS, the `'overlay-relative'` option controls whether external paths should be appended to the overlay directory. 
This didn't always work as expected: when the overlay file path itself was relative, its absolute path was decided by the real FS, not the underlying VFS, and the resulting external path didn't exist in the underlying VFS. This PR fixes this issue. --- llvm/lib/Support/VirtualFileSystem.cpp | 9 ++++++-- .../Support/VirtualFileSystemTest.cpp | 23 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 7ff62d43ba205..44d2ee7076fb2 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -1908,7 +1908,12 @@ class llvm::vfs::RedirectingFileSystemParser { FullPath = FS->getOverlayFileDir(); assert(!FullPath.empty() && "External contents prefix directory must exist"); - llvm::sys::path::append(FullPath, Value); + SmallString<256> AbsFullPath = Value; + if (FS->makeAbsolute(FullPath, AbsFullPath)) { + error(N, "failed to make 'external-contents' absolute"); + return nullptr; + } + FullPath = AbsFullPath; } else { FullPath = Value; } @@ -2204,7 +2209,7 @@ RedirectingFileSystem::create(std::unique_ptr Buffer, // FS->OverlayFileDir => //dummy.cache/vfs // SmallString<256> OverlayAbsDir = sys::path::parent_path(YAMLFilePath); - std::error_code EC = llvm::sys::fs::make_absolute(OverlayAbsDir); + std::error_code EC = FS->makeAbsolute(OverlayAbsDir); assert(!EC && "Overlay dir final path must be absolute"); (void)EC; FS->setOverlayFileDir(OverlayAbsDir); diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp index d47a4ee986778..f52f25f93744d 100644 --- a/llvm/unittests/Support/VirtualFileSystemTest.cpp +++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp @@ -1941,6 +1941,29 @@ TEST_F(VFSFromYAMLTest, ReturnsExternalPathVFSHit) { EXPECT_EQ(0, NumDiagnostics); } +TEST_F(VFSFromYAMLTest, RelativeFileDirWithOverlayRelativeSetting) { + auto Lower = makeIntrusiveRefCnt(); + Lower->addDirectory("//root/foo/bar"); + Lower->addRegularFile("//root/foo/bar/a"); + Lower->setCurrentWorkingDirectory("//root/foo"); + IntrusiveRefCntPtr FS = + getFromYAMLString("{\n" + " 'case-sensitive': false,\n" + " 'overlay-relative': true,\n" + " 'roots': [\n" + " { 'name': '//root/foo/bar/b', 'type': 'file',\n" + " 'external-contents': 'a'\n" + " }\n" + " ]\n" + "}", + Lower, "bar/overlay"); + + ASSERT_NE(FS.get(), nullptr); + ErrorOr S = FS->status("//root/foo/bar/b"); + ASSERT_FALSE(S.getError()); + EXPECT_EQ("//root/foo/bar/a", S->getName()); +} + TEST_F(VFSFromYAMLTest, RootRelativeToOverlayDirTest) { auto Lower = makeIntrusiveRefCnt(); Lower->addDirectory("//root/foo/bar"); From 52afb8dd112d52b620003998b5348b45b214e828 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 30 Sep 2025 10:36:33 -0700 Subject: [PATCH 287/878] [flang] Don't retain FIXED/FREE compiler directives (#160780) Some old code in the prescanner, antedating the current -E output mechanisms, retains the !DIR$ FIXED and !DIR$ FREE directives in the input, and will even generate them to append to the scanned source from source and include files to restore the fixed/free source form distinction. But these directives have not been needed since the -E output generator began generating source form insensitive output, and they can confuse the parser's error recovery when the appended directives follow the END statement. 
Change their handling so that they're read and respected by the prescanner but no longer retained in either the -E output or the cooked character stream passed on to the parser. Fixes a regression reported by @danielcchen after PR 159834. --- flang/lib/Parser/prescan.cpp | 33 ++++++++++++--------------- flang/lib/Parser/prescan.h | 2 +- flang/test/Preprocessing/fixed-free.f | 8 +++++++ 3 files changed, 24 insertions(+), 19 deletions(-) create mode 100644 flang/test/Preprocessing/fixed-free.f diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 3a9a475c365ee..865c149380d85 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -97,17 +97,7 @@ void Prescanner::Prescan(ProvenanceRange range) { while (!IsAtEnd()) { Statement(); } - if (inFixedForm_ != beganInFixedForm) { - std::string dir{"!dir$ "}; - if (beganInFixedForm) { - dir += "fixed"; - } else { - dir += "free"; - } - dir += '\n'; - TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()}; - tokens.Emit(cooked_); - } + inFixedForm_ = beganInFixedForm; } void Prescanner::Statement() { @@ -324,10 +314,11 @@ void Prescanner::Statement() { } NormalizeCompilerDirectiveCommentMarker(*preprocessed); preprocessed->ToLowerCase(); - SourceFormChange(preprocessed->ToString()); - CheckAndEmitLine( - preprocessed->ClipComment(*this, true /* skip first ! */), - newlineProvenance); + if (!SourceFormChange(preprocessed->ToString())) { + CheckAndEmitLine( + preprocessed->ClipComment(*this, true /* skip first ! */), + newlineProvenance); + } break; case LineClassification::Kind::Source: if (inFixedForm_) { @@ -370,14 +361,16 @@ void Prescanner::Statement() { } } tokens.ToLowerCase(); - SourceFormChange(tokens.ToString()); + if (!SourceFormChange(tokens.ToString())) { + CheckAndEmitLine(tokens, newlineProvenance); + } } else { // Kind::Source tokens.ToLowerCase(); if (inFixedForm_) { EnforceStupidEndStatementRules(tokens); } + CheckAndEmitLine(tokens, newlineProvenance); } - CheckAndEmitLine(tokens, newlineProvenance); } directiveSentinel_ = nullptr; } @@ -1774,11 +1767,15 @@ Prescanner::LineClassification Prescanner::ClassifyLine( return classification; } -void Prescanner::SourceFormChange(std::string &&dir) { +bool Prescanner::SourceFormChange(std::string &&dir) { if (dir == "!dir$ free") { inFixedForm_ = false; + return true; } else if (dir == "!dir$ fixed") { inFixedForm_ = true; + return true; + } else { + return false; } } diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index c181c03273ccc..fc38adb926530 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -225,7 +225,7 @@ class Prescanner { LineClassification ClassifyLine(const char *) const; LineClassification ClassifyLine( TokenSequence &, Provenance newlineProvenance) const; - void SourceFormChange(std::string &&); + bool SourceFormChange(std::string &&); bool CompilerDirectiveContinuation(TokenSequence &, const char *sentinel); bool SourceLineContinuation(TokenSequence &); diff --git a/flang/test/Preprocessing/fixed-free.f b/flang/test/Preprocessing/fixed-free.f new file mode 100644 index 0000000000000..95f63a4d71e4c --- /dev/null +++ b/flang/test/Preprocessing/fixed-free.f @@ -0,0 +1,8 @@ +!RUN: %flang -E %s 2>&1 | FileCheck %s +!RUN: %flang -fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s +!CHECK-NOT: dir$ +!CHECK-NOT: error: +!dir$ fixed + continue +!dir$ free + end From ed5e6b87013485c7b16d825e0cbf556a1e7e3e19 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 30 Sep 
2025 10:36:52 -0700 Subject: [PATCH 288/878] [flang] Catch calls to impure intrinsics from PURE subprograms (#160947) The code in expression semantics that catches a call to an impure procedure in a PURE context misses calls to impure intrinsics, since their designators have a SpecificIntrinsic rather than a Symbol. Replace the current check with a new one that uses the characteristics of the called procedure, which works for both intrinsic and non-intrinsic cases. Testing this change revealed that an explicit INTRINSIC statement wasn't doing the right thing for extension "dual" intrinsics that can be called as either a function or as a subroutine; the use of an INTRINSIC statement would disallow its use as a subroutine. I've fixed that here as well. Fixes https://github.com/llvm/llvm-project/issues/157124. --- flang/include/flang/Evaluate/intrinsics.h | 1 + flang/lib/Evaluate/intrinsics.cpp | 7 +++++-- flang/lib/Semantics/expression.cpp | 23 ++++++++++++++--------- flang/lib/Semantics/resolve-names.cpp | 3 ++- flang/test/Semantics/bug157124.f90 | 11 +++++++++++ 5 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 flang/test/Semantics/bug157124.f90 diff --git a/flang/include/flang/Evaluate/intrinsics.h b/flang/include/flang/Evaluate/intrinsics.h index dbe1ba7fe7ec1..fc1c8b2ba6ab7 100644 --- a/flang/include/flang/Evaluate/intrinsics.h +++ b/flang/include/flang/Evaluate/intrinsics.h @@ -86,6 +86,7 @@ class IntrinsicProcTable { bool IsIntrinsic(const std::string &) const; bool IsIntrinsicFunction(const std::string &) const; bool IsIntrinsicSubroutine(const std::string &) const; + bool IsDualIntrinsic(const std::string &) const; // Inquiry intrinsics are defined in section 16.7, table 16.1 IntrinsicClass GetIntrinsicClass(const std::string &) const; diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index c7f174f7989dd..fe679da4ff98b 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -1674,7 +1674,7 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"to", SameIntOrUnsigned, Rank::elemental, Optionality::required, common::Intent::Out}, {"topos", AnyInt}}, - {}, Rank::elemental, IntrinsicClass::elementalSubroutine}, // elemental + {}, Rank::elemental, IntrinsicClass::elementalSubroutine}, {"random_init", {{"repeatable", AnyLogical, Rank::scalar}, {"image_distinct", AnyLogical, Rank::scalar}}, @@ -2903,7 +2903,7 @@ bool IntrinsicProcTable::Implementation::IsDualIntrinsic( // Collection for some intrinsics with function and subroutine form, // in order to pass the semantic check. 
static const std::string dualIntrinsic[]{{"chdir"}, {"etime"}, {"fseek"}, - {"ftell"}, {"getcwd"}, {"hostnm"}, {"putenv"s}, {"rename"}, {"second"}, + {"ftell"}, {"getcwd"}, {"hostnm"}, {"putenv"}, {"rename"}, {"second"}, {"system"}, {"unlink"}}; return llvm::is_contained(dualIntrinsic, name); } @@ -3766,6 +3766,9 @@ bool IntrinsicProcTable::IsIntrinsicFunction(const std::string &name) const { bool IntrinsicProcTable::IsIntrinsicSubroutine(const std::string &name) const { return DEREF(impl_.get()).IsIntrinsicSubroutine(name); } +bool IntrinsicProcTable::IsDualIntrinsic(const std::string &name) const { + return DEREF(impl_.get()).IsDualIntrinsic(name); +} IntrinsicClass IntrinsicProcTable::GetIntrinsicClass( const std::string &name) const { diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 3f048ab6f7a4d..836500145e4a2 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -3644,19 +3644,24 @@ std::optional ExpressionAnalyzer::CheckCall( Say(callSite, "Assumed-length character function must be defined with a length to be called"_err_en_US); } + if (!chars->IsPure()) { + if (const semantics::Scope *pure{semantics::FindPureProcedureContaining( + context_.FindScope(callSite))}) { + std::string name; + if (procSymbol) { + name = "'"s + procSymbol->name().ToString() + "'"; + } else if (const auto *intrinsic{proc.GetSpecificIntrinsic()}) { + name = "'"s + intrinsic->name + "'"; + } + Say(callSite, + "Procedure %s referenced in pure subprogram '%s' must be pure too"_err_en_US, + name, DEREF(pure->symbol()).name()); + } + } ok &= semantics::CheckArguments(*chars, arguments, context_, context_.FindScope(callSite), treatExternalAsImplicit, /*ignoreImplicitVsExplicit=*/false, specificIntrinsic); } - if (procSymbol && !IsPureProcedure(*procSymbol)) { - if (const semantics::Scope * - pure{semantics::FindPureProcedureContaining( - context_.FindScope(callSite))}) { - Say(callSite, - "Procedure '%s' referenced in pure subprogram '%s' must be pure too"_err_en_US, - procSymbol->name(), DEREF(pure->symbol()).name()); - } - } if (ok && !treatExternalAsImplicit && procSymbol && !(chars && chars->HasExplicitInterface())) { if (const Symbol *global{FindGlobal(*procSymbol)}; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index ef0b8cdfd827b..d1150a9eb67f4 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5726,7 +5726,8 @@ void DeclarationVisitor::DeclareIntrinsic(const parser::Name &name) { } } if (!symbol.test(Symbol::Flag::Function) && - !symbol.test(Symbol::Flag::Subroutine)) { + !symbol.test(Symbol::Flag::Subroutine) && + !context().intrinsics().IsDualIntrinsic(name.source.ToString())) { if (context().intrinsics().IsIntrinsicFunction(name.source.ToString())) { symbol.set(Symbol::Flag::Function); } else if (context().intrinsics().IsIntrinsicSubroutine( diff --git a/flang/test/Semantics/bug157124.f90 b/flang/test/Semantics/bug157124.f90 new file mode 100644 index 0000000000000..92326dc9e7b69 --- /dev/null +++ b/flang/test/Semantics/bug157124.f90 @@ -0,0 +1,11 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +pure subroutine puresub + intrinsic sleep, chdir, get_command + character(80) str + !ERROR: Procedure 'impureexternal' referenced in pure subprogram 'puresub' must be pure too + call impureExternal ! implicit interface + !ERROR: Procedure 'sleep' referenced in pure subprogram 'puresub' must be pure too + call sleep(1) ! 
intrinsic subroutine, debatably impure + !ERROR: Procedure 'chdir' referenced in pure subprogram 'puresub' must be pure too + call chdir('.') ! "dual" function/subroutine, impure +end From 802530283a983a92d7848f26e6291c690667144e Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 30 Sep 2025 10:37:14 -0700 Subject: [PATCH 289/878] [flang] Improve presentation of errors after last source line (#161391) We don't emit source file names or line numbers for error messages at EOF. Detect these and handle them a little better, pointing at the newline at the end of the last source line instead. --- flang/include/flang/Parser/message.h | 2 ++ flang/lib/Parser/basic-parsers.h | 2 +- flang/lib/Parser/message.cpp | 19 ++++++++++++++++++- flang/test/Parser/recovery08.f90 | 11 +++++++++++ 4 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 flang/test/Parser/recovery08.f90 diff --git a/flang/include/flang/Parser/message.h b/flang/include/flang/Parser/message.h index 7da9e12999db1..224263e4be860 100644 --- a/flang/include/flang/Parser/message.h +++ b/flang/include/flang/Parser/message.h @@ -65,6 +65,8 @@ class MessageFixedText { return severity_ == Severity::Error || severity_ == Severity::Todo; } + static const MessageFixedText endOfFileMessage; // "end of file"_err_en_US + private: CharBlock text_; Severity severity_{Severity::None}; diff --git a/flang/lib/Parser/basic-parsers.h b/flang/lib/Parser/basic-parsers.h index 7e69d41debfcd..46d5168c80fe7 100644 --- a/flang/lib/Parser/basic-parsers.h +++ b/flang/lib/Parser/basic-parsers.h @@ -828,7 +828,7 @@ struct NextCh { if (std::optional result{state.GetNextChar()}) { return result; } - state.Say("end of file"_err_en_US); + state.Say(MessageFixedText::endOfFileMessage); return std::nullopt; } }; diff --git a/flang/lib/Parser/message.cpp b/flang/lib/Parser/message.cpp index 2a8101dd0b810..2c4f930c0b088 100644 --- a/flang/lib/Parser/message.cpp +++ b/flang/lib/Parser/message.cpp @@ -21,6 +21,10 @@ namespace Fortran::parser { +// The nextCh parser emits this, and Message::GetProvenanceRange() looks for it. +const MessageFixedText MessageFixedText::endOfFileMessage{ + "end of file"_err_en_US}; + llvm::raw_ostream &operator<<(llvm::raw_ostream &o, const MessageFixedText &t) { std::size_t n{t.text().size()}; for (std::size_t j{0}; j < n; ++j) { @@ -232,7 +236,20 @@ std::optional Message::GetProvenanceRange( const AllCookedSources &allCooked) const { return common::visit( common::visitors{ - [&](CharBlock cb) { return allCooked.GetProvenanceRange(cb); }, + [&](CharBlock cb) -> std::optional { + if (auto pr{allCooked.GetProvenanceRange(cb)}) { + return pr; + } else if (const auto *fixed{std::get_if(&text_)}; + fixed && + fixed->text() == MessageFixedText::endOfFileMessage.text() && + cb.begin() && cb.size() == 1) { + // Failure from "nextCh" due to reaching EOF. Back up one byte + // to the terminal newline so that the output looks better. + return allCooked.GetProvenanceRange(CharBlock{cb.begin() - 1, 1}); + } else { + return std::nullopt; + } + }, [](const ProvenanceRange &pr) { return std::make_optional(pr); }, }, location_); diff --git a/flang/test/Parser/recovery08.f90 b/flang/test/Parser/recovery08.f90 new file mode 100644 index 0000000000000..978e42bab9344 --- /dev/null +++ b/flang/test/Parser/recovery08.f90 @@ -0,0 +1,11 @@ +! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck %s +! CHECK: error: end of file +! CHECK: ^ +! CHECK: in the context: END PROGRAM statement +! CHECK: in the context: main program + + integer :: i + + ! 
Add empty lines for emphasis + + i = 5 From b40feb9c54469928368705a724db8b9d032e8725 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Tue, 30 Sep 2025 10:38:07 -0700 Subject: [PATCH 290/878] [llvm] Fix build after #161260 The modular build was failing due to a missing include. --- llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h index f53cee26739a4..43ef86b08517a 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h +++ b/llvm/include/llvm/Transforms/Instrumentation/GCOVProfiler.h @@ -14,6 +14,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Utils/Instrumentation.h" namespace llvm { From 72dafa1658d30395f626e08b1fddc034efe61eec Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Sep 2025 19:43:37 +0200 Subject: [PATCH 291/878] [TableGen, CHERI] Make CPtrWildcard test tolerant to unrelated changes (#161406) Changes to llvm/include/llvm/IR/Intrinsics.td may change the constants that are embedded in this test. Use wildcards, so that unrelated changes do not trip over this test failing. Fixes: https://github.com/llvm/llvm-project/pull/158426 --- llvm/test/TableGen/CPtrWildcard.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/TableGen/CPtrWildcard.td b/llvm/test/TableGen/CPtrWildcard.td index 96b51ae1044a3..230a6730c610a 100644 --- a/llvm/test/TableGen/CPtrWildcard.td +++ b/llvm/test/TableGen/CPtrWildcard.td @@ -5,19 +5,19 @@ // CHECK: static const unsigned char MatcherTable[] = { // CHECK-NEXT: /* 0*/ OPC_CheckOpcode, TARGET_VAL(ISD::INTRINSIC_WO_CHAIN), -// CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, 42, +// CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, [[#]], // CHECK-NEXT:/* 5*/ OPC_RecordChild1, // #0 = $src // CHECK-NEXT:/* 6*/ OPC_Scope, 9, /*->17*/ // 2 children in Scope // CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/126|128,1/*254*/, // CHECK-NEXT:/* 11*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C64_TO_I64), // CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0, -// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } 21:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8 +// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8 // CHECK-NEXT: // Dst: (C64_TO_I64:{ *:[i64] } ?:{ *:[c64] }:$src) // CHECK-NEXT:/* 17*/ /*Scope*/ 9, /*->27*/ // CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/127|128,1/*255*/, // CHECK-NEXT:/* 21*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C128_TO_I64), // CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0, -// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } 21:{ *:[iPTR] }, c128:{ *:[c128] }:$src) - Complexity = 8 +// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } [[#]]:{ *:[iPTR] }, c128:{ *:[c128] }:$src) - Complexity = 8 // CHECK-NEXT: // Dst: (C128_TO_I64:{ *:[i64] } ?:{ *:[c128] }:$src) // CHECK-NEXT:/* 27*/ 0, /*End of Scope*/ // CHECK-NEXT: 0 From df7ac0ec32e00df9a6c6924224c166c3316724a9 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Tue, 30 Sep 2025 10:46:21 -0700 Subject: [PATCH 292/878] [clang][modules] Virtualize module cache pruning (#149113) This PR virtualizes module cache pruning via the new `ModuleCache` interface. Currently this is an NFC, but I left a FIXME in `InProcessModuleCache` to make this more efficient for the dependency scanner. 
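On the caller side, the new seam looks roughly like this (a hypothetical helper for illustration only; the real call site is the `createASTReader()` change below). The point is that pruning policy now lives behind the `ModuleCache` interface, so the scanner's in-process cache can later substitute its own strategy.

```cpp
#include "clang/Lex/HeaderSearchOptions.h"
#include "clang/Serialization/ModuleCache.h"

// Any ModuleCache implementation may now interpret the pruning parameters
// as it sees fit; the caller no longer walks the cache directory itself.
void pruneImplicitModuleCache(clang::ModuleCache &MC,
                              const clang::HeaderSearchOptions &HSOpts) {
  if (!HSOpts.ModuleCachePath.empty())
    MC.maybePrune(HSOpts.ModuleCachePath, HSOpts.ModuleCachePruneInterval,
                  HSOpts.ModuleCachePruneAfter);
}
```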
--- .../include/clang/Serialization/ModuleCache.h | 9 +- clang/lib/Frontend/CompilerInstance.cpp | 93 +------------------ clang/lib/Serialization/ModuleCache.cpp | 86 +++++++++++++++++ .../InProcessModuleCache.cpp | 7 ++ 4 files changed, 105 insertions(+), 90 deletions(-) diff --git a/clang/include/clang/Serialization/ModuleCache.h b/clang/include/clang/Serialization/ModuleCache.h index 3117d954a09cc..ec052c5c18e0a 100644 --- a/clang/include/clang/Serialization/ModuleCache.h +++ b/clang/include/clang/Serialization/ModuleCache.h @@ -45,11 +45,15 @@ class ModuleCache : public RefCountedBase { /// were validated. virtual void updateModuleTimestamp(StringRef ModuleFilename) = 0; + /// Prune module files that haven't been accessed in a long time. + virtual void maybePrune(StringRef Path, time_t PruneInterval, + time_t PruneAfter) = 0; + /// Returns this process's view of the module cache. virtual InMemoryModuleCache &getInMemoryModuleCache() = 0; virtual const InMemoryModuleCache &getInMemoryModuleCache() const = 0; - // TODO: Virtualize writing/reading PCM files, pruning, etc. + // TODO: Virtualize writing/reading PCM files, etc. virtual ~ModuleCache() = default; }; @@ -59,6 +63,9 @@ class ModuleCache : public RefCountedBase { /// \c CompilerInstance instances participating in building modules for single /// translation unit in order to share the same \c InMemoryModuleCache. IntrusiveRefCntPtr createCrossProcessModuleCache(); + +/// Shared implementation of `ModuleCache::maybePrune()`. +void maybePruneImpl(StringRef Path, time_t PruneInterval, time_t PruneAfter); } // namespace clang #endif diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index c989ad2e5155c..b1fb905e3602f 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -1598,90 +1598,6 @@ static void checkConfigMacros(Preprocessor &PP, Module *M, } } -/// Write a new timestamp file with the given path. -static void writeTimestampFile(StringRef TimestampFile) { - std::error_code EC; - llvm::raw_fd_ostream Out(TimestampFile.str(), EC, llvm::sys::fs::OF_None); -} - -/// Prune the module cache of modules that haven't been accessed in -/// a long time. -static void pruneModuleCache(const HeaderSearchOptions &HSOpts) { - llvm::sys::fs::file_status StatBuf; - llvm::SmallString<128> TimestampFile; - TimestampFile = HSOpts.ModuleCachePath; - assert(!TimestampFile.empty()); - llvm::sys::path::append(TimestampFile, "modules.timestamp"); - - // Try to stat() the timestamp file. - if (std::error_code EC = llvm::sys::fs::status(TimestampFile, StatBuf)) { - // If the timestamp file wasn't there, create one now. - if (EC == std::errc::no_such_file_or_directory) { - writeTimestampFile(TimestampFile); - } - return; - } - - // Check whether the time stamp is older than our pruning interval. - // If not, do nothing. - time_t TimeStampModTime = - llvm::sys::toTimeT(StatBuf.getLastModificationTime()); - time_t CurrentTime = time(nullptr); - if (CurrentTime - TimeStampModTime <= time_t(HSOpts.ModuleCachePruneInterval)) - return; - - // Write a new timestamp file so that nobody else attempts to prune. - // There is a benign race condition here, if two Clang instances happen to - // notice at the same time that the timestamp is out-of-date. - writeTimestampFile(TimestampFile); - - // Walk the entire module cache, looking for unused module files and module - // indices. 
- std::error_code EC; - for (llvm::sys::fs::directory_iterator Dir(HSOpts.ModuleCachePath, EC), - DirEnd; - Dir != DirEnd && !EC; Dir.increment(EC)) { - // If we don't have a directory, there's nothing to look into. - if (!llvm::sys::fs::is_directory(Dir->path())) - continue; - - // Walk all of the files within this directory. - for (llvm::sys::fs::directory_iterator File(Dir->path(), EC), FileEnd; - File != FileEnd && !EC; File.increment(EC)) { - // We only care about module and global module index files. - StringRef Extension = llvm::sys::path::extension(File->path()); - if (Extension != ".pcm" && Extension != ".timestamp" && - llvm::sys::path::filename(File->path()) != "modules.idx") - continue; - - // Look at this file. If we can't stat it, there's nothing interesting - // there. - if (llvm::sys::fs::status(File->path(), StatBuf)) - continue; - - // If the file has been used recently enough, leave it there. - time_t FileAccessTime = llvm::sys::toTimeT(StatBuf.getLastAccessedTime()); - if (CurrentTime - FileAccessTime <= - time_t(HSOpts.ModuleCachePruneAfter)) { - continue; - } - - // Remove the file. - llvm::sys::fs::remove(File->path()); - - // Remove the timestamp file. - std::string TimpestampFilename = File->path() + ".timestamp"; - llvm::sys::fs::remove(TimpestampFilename); - } - - // If we removed all of the files in the directory, remove the directory - // itself. - if (llvm::sys::fs::directory_iterator(Dir->path(), EC) == - llvm::sys::fs::directory_iterator() && !EC) - llvm::sys::fs::remove(Dir->path()); - } -} - void CompilerInstance::createASTReader() { if (TheASTReader) return; @@ -1692,11 +1608,10 @@ void CompilerInstance::createASTReader() { // If we're implicitly building modules but not currently recursively // building a module, check whether we need to prune the module cache. if (getSourceManager().getModuleBuildStack().empty() && - !getPreprocessor().getHeaderSearchInfo().getModuleCachePath().empty() && - getHeaderSearchOpts().ModuleCachePruneInterval > 0 && - getHeaderSearchOpts().ModuleCachePruneAfter > 0) { - pruneModuleCache(getHeaderSearchOpts()); - } + !getPreprocessor().getHeaderSearchInfo().getModuleCachePath().empty()) + ModCache->maybePrune(getHeaderSearchOpts().ModuleCachePath, + getHeaderSearchOpts().ModuleCachePruneInterval, + getHeaderSearchOpts().ModuleCachePruneAfter); HeaderSearchOptions &HSOpts = getHeaderSearchOpts(); std::string Sysroot = HSOpts.Sysroot; diff --git a/clang/lib/Serialization/ModuleCache.cpp b/clang/lib/Serialization/ModuleCache.cpp index f42bdc16d815d..96687277ebafd 100644 --- a/clang/lib/Serialization/ModuleCache.cpp +++ b/clang/lib/Serialization/ModuleCache.cpp @@ -16,6 +16,87 @@ using namespace clang; +/// Write a new timestamp file with the given path. +static void writeTimestampFile(StringRef TimestampFile) { + std::error_code EC; + llvm::raw_fd_ostream Out(TimestampFile.str(), EC, llvm::sys::fs::OF_None); +} + +void clang::maybePruneImpl(StringRef Path, time_t PruneInterval, + time_t PruneAfter) { + if (PruneInterval <= 0 || PruneAfter <= 0) + return; + + llvm::SmallString<128> TimestampFile(Path); + llvm::sys::path::append(TimestampFile, "modules.timestamp"); + + // Try to stat() the timestamp file. + llvm::sys::fs::file_status StatBuf; + if (std::error_code EC = llvm::sys::fs::status(TimestampFile, StatBuf)) { + // If the timestamp file wasn't there, create one now. 
+ if (EC == std::errc::no_such_file_or_directory) + writeTimestampFile(TimestampFile); + return; + } + + // Check whether the time stamp is older than our pruning interval. + // If not, do nothing. + time_t TimestampModTime = + llvm::sys::toTimeT(StatBuf.getLastModificationTime()); + time_t CurrentTime = time(nullptr); + if (CurrentTime - TimestampModTime <= PruneInterval) + return; + + // Write a new timestamp file so that nobody else attempts to prune. + // There is a benign race condition here, if two Clang instances happen to + // notice at the same time that the timestamp is out-of-date. + writeTimestampFile(TimestampFile); + + // Walk the entire module cache, looking for unused module files and module + // indices. + std::error_code EC; + for (llvm::sys::fs::directory_iterator Dir(Path, EC), DirEnd; + Dir != DirEnd && !EC; Dir.increment(EC)) { + // If we don't have a directory, there's nothing to look into. + if (!llvm::sys::fs::is_directory(Dir->path())) + continue; + + // Walk all the files within this directory. + for (llvm::sys::fs::directory_iterator File(Dir->path(), EC), FileEnd; + File != FileEnd && !EC; File.increment(EC)) { + // We only care about module and global module index files. + StringRef Extension = llvm::sys::path::extension(File->path()); + if (Extension != ".pcm" && Extension != ".timestamp" && + llvm::sys::path::filename(File->path()) != "modules.idx") + continue; + + // Look at this file. If we can't stat it, there's nothing interesting + // there. + if (llvm::sys::fs::status(File->path(), StatBuf)) + continue; + + // If the file has been used recently enough, leave it there. + time_t FileAccessTime = llvm::sys::toTimeT(StatBuf.getLastAccessedTime()); + if (CurrentTime - FileAccessTime <= PruneAfter) + continue; + + // Remove the file. + llvm::sys::fs::remove(File->path()); + + // Remove the timestamp file. + std::string TimpestampFilename = File->path() + ".timestamp"; + llvm::sys::fs::remove(TimpestampFilename); + } + + // If we removed all the files in the directory, remove the directory + // itself. + if (llvm::sys::fs::directory_iterator(Dir->path(), EC) == + llvm::sys::fs::directory_iterator() && + !EC) + llvm::sys::fs::remove(Dir->path()); + } +} + namespace { class CrossProcessModuleCache : public ModuleCache { InMemoryModuleCache InMemory; @@ -53,6 +134,11 @@ class CrossProcessModuleCache : public ModuleCache { OS.clear_error(); // Avoid triggering a fatal error. } + void maybePrune(StringRef Path, time_t PruneInterval, + time_t PruneAfter) override { + maybePruneImpl(Path, PruneInterval, PruneAfter); + } + InMemoryModuleCache &getInMemoryModuleCache() override { return InMemory; } const InMemoryModuleCache &getInMemoryModuleCache() const override { return InMemory; diff --git a/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp b/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp index 80db2d47d940e..d1e543b438225 100644 --- a/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp +++ b/clang/lib/Tooling/DependencyScanning/InProcessModuleCache.cpp @@ -100,6 +100,13 @@ class InProcessModuleCache : public ModuleCache { Timestamp.store(llvm::sys::toTimeT(std::chrono::system_clock::now())); } + void maybePrune(StringRef Path, time_t PruneInterval, + time_t PruneAfter) override { + // FIXME: This only needs to be ran once per build, not in every + // compilation. Call it once per service. 
+ maybePruneImpl(Path, PruneInterval, PruneAfter); + } + InMemoryModuleCache &getInMemoryModuleCache() override { return InMemory; } const InMemoryModuleCache &getInMemoryModuleCache() const override { return InMemory; From ecea2b542b8bfb5e8244b3b1dd88a81014594797 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 30 Sep 2025 19:49:22 +0200 Subject: [PATCH 293/878] [MLIR] Fix gpu.launch attribution argument printing (#161408) This was broken and never tested. Not only could this crash with a stack-use-after-scope, but it also would have printed something like: ``` value of type 'memref<7x8xf64, #gpu.address_space>' at index: 12 ``` instead of the SSA value. It turns out gpu.func already has a very similar helper that we can reuse here. Fixes #161394 --- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 45 +++++++++---------------- mlir/test/Dialect/GPU/ops.mlir | 46 +++++++++++++++++++++----- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 43b02f16aa829..c0f9132de3db4 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -535,17 +535,26 @@ parseAttributions(OpAsmParser &parser, StringRef keyword, /*allowType=*/true); } -/// Prints a GPU function memory attribution. static void printAttributions(OpAsmPrinter &p, StringRef keyword, - ArrayRef<BlockArgument> values) { + ArrayRef<BlockArgument> values, + ArrayAttr attributes = {}) { if (values.empty()) return; - auto printBlockArg = [](BlockArgument v) { - return llvm::formatv("{} : {}", v, v.getType()); - }; - p << ' ' << keyword << '(' - << llvm::interleaved(llvm::map_range(values, printBlockArg)) << ')'; + p << ' ' << keyword << '('; + llvm::interleaveComma( + llvm::enumerate(values), p, [&p, attributes](auto pair) { + BlockArgument v = pair.value(); + p << v << " : " << v.getType(); + + size_t attributionIndex = pair.index(); + DictionaryAttr attrs; + if (attributes && attributionIndex < attributes.size()) + attrs = llvm::cast<DictionaryAttr>(attributes[attributionIndex]); + if (attrs) + p.printOptionalAttrDict(attrs.getValue()); + }); + p << ')'; } /// Verifies a GPU function memory attribution.
@@ -1649,28 +1658,6 @@ ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) { return parser.parseRegion(*body, entryArgs); } -static void printAttributions(OpAsmPrinter &p, StringRef keyword, - ArrayRef values, - ArrayAttr attributes) { - if (values.empty()) - return; - - p << ' ' << keyword << '('; - llvm::interleaveComma( - llvm::enumerate(values), p, [&p, attributes](auto pair) { - BlockArgument v = pair.value(); - p << v << " : " << v.getType(); - - size_t attributionIndex = pair.index(); - DictionaryAttr attrs; - if (attributes && attributionIndex < attributes.size()) - attrs = llvm::cast(attributes[attributionIndex]); - if (attrs) - p.printOptionalAttrDict(attrs.getValue()); - }); - p << ')'; -} - void GPUFuncOp::print(OpAsmPrinter &p) { p << ' '; p.printSymbolName(getName()); diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index e3e2474d917c8..7772e7a1681c4 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -68,6 +68,31 @@ module attributes {gpu.container_module} { return } + // CHECK-LABEL: func @launch_with_attributions( + func.func @launch_with_attributions(%blk : index, %thrd : index, %float : f32, %data : memref) { + // CHECK: gpu.launch + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %blk, %grid_y = %blk, %grid_z = %blk) + threads(%tx, %ty, %tz) in (%block_x = %thrd, %block_y = %thrd, %block_z = %thrd) + // CHECK-SAME: workgroup(%[[WGROUP1:.*]] : memref<42xf32, 3>, %[[WGROUP2:.*]] : memref<2xf32, 3>) + workgroup(%arg1: memref<42xf32, 3>, %arg2: memref<2xf32, 3>) + // CHECK-SAME: private(%[[PRIVATE1:.*]] : memref<2xf32, 5>, %[[PRIVATE2:.*]] : memref<1xf32, 5>) + private(%arg3: memref<2xf32, 5>, %arg4: memref<1xf32, 5>) + { + "use"(%float) : (f32) -> () + "use"(%data) : (memref) -> () + // CHECK: "use"(%[[WGROUP1]], %[[WGROUP2]]) + "use"(%arg1, %arg2) : (memref<42xf32, 3>, memref<2xf32, 3>) -> () + // CHECK: "use"(%[[PRIVATE1]]) + "use"(%arg3) : (memref<2xf32, 5>) -> () + // CHECK: "use"(%[[PRIVATE2]]) + "use"(%arg4) : (memref<1xf32, 5>) -> () + // CHECK: gpu.terminator + gpu.terminator + } + return + } + + gpu.module @kernels { gpu.func @kernel_1(%arg0 : f32, %arg1 : memref) kernel { %tIdX = gpu.thread_id x @@ -228,17 +253,20 @@ module attributes {gpu.container_module} { gpu.module @gpu_funcs { // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32) - // CHECK: workgroup - // CHECK: private - // CHECK: attributes gpu.func @kernel_1(%arg0: f32) - workgroup(%arg1: memref<42xf32, 3>) - private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>) + // CHECK: workgroup(%[[WGROUP1:.*]] : memref<42xf32, 3>, %[[WGROUP2:.*]] : memref<2xf32, 3>) + workgroup(%arg1: memref<42xf32, 3>, %arg2: memref<2xf32, 3>) + // CHECK: private(%[[PRIVATE1:.*]] : memref<2xf32, 5>, %[[PRIVATE2:.*]] : memref<1xf32, 5>) + private(%arg3: memref<2xf32, 5>, %arg4: memref<1xf32, 5>) kernel - attributes {foo="bar"} { - "use"(%arg1) : (memref<42xf32, 3>) -> () - "use"(%arg2) : (memref<2xf32, 5>) -> () - "use"(%arg3) : (memref<1xf32, 5>) -> () + // CHECK: attributes {foo = "bar"} + attributes {foo = "bar"} { + // CHECK: "use"(%[[WGROUP1]], %[[WGROUP2]]) + "use"(%arg1, %arg2) : (memref<42xf32, 3>, memref<2xf32, 3>) -> () + // CHECK: "use"(%[[PRIVATE1]]) + "use"(%arg3) : (memref<2xf32, 5>) -> () + // CHECK: "use"(%[[PRIVATE2]]) + "use"(%arg4) : (memref<1xf32, 5>) -> () gpu.return } From 4aba9f223fb79f08675d3f286602bf785553b4de Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 Sep 2025 10:53:18 -0700 Subject: [PATCH 294/878] [RISCV] Add 
missing CHECK lines for Zkt to sifive-p450/p470/p670 test. NFC (#161393) --- clang/test/Driver/riscv-cpus.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index 88ec766ff6966..cd92adc64a7d6 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -473,6 +473,7 @@ // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zba" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zbb" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zbs" +// MCPU-SIFIVE-P450-SAME: "-target-feature" "+zkt" // MCPU-SIFIVE-P450-SAME: "-target-abi" "lp64d" // RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p470 | FileCheck -check-prefix=MCPU-SIFIVE-P470 %s @@ -503,6 +504,7 @@ // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zba" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zbb" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zbs" +// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zkt" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvbb" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zvbc" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zve32f" @@ -564,6 +566,7 @@ // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zba" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zbb" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zbs" +// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zkt" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvbb" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvbc" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zve32f" From 0c1e7cc913cda9e473c8dea8f0f195bd63524abd Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Tue, 30 Sep 2025 10:54:47 -0700 Subject: [PATCH 295/878] [clang-sycl-linker] Generate SymbolTable for each image (#161287) This PR adds extraction of kernel names for each image and stores them to the Image's StringData field. --- .../clang-sycl-linker/ClangSYCLLinker.cpp | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp index fde6b55165868..8dd993fb04362 100644 --- a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp +++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp @@ -466,6 +466,13 @@ static Error runAOTCompile(StringRef InputFile, StringRef OutputFile, return createStringError(inconvertibleErrorCode(), "Unsupported arch"); } +// TODO: Consider using LLVM-IR metadata to identify globals of interest +bool isKernel(const Function &F) { + const CallingConv::ID CC = F.getCallingConv(); + return CC == CallingConv::SPIR_KERNEL || CC == CallingConv::AMDGPU_KERNEL || + CC == CallingConv::PTX_Kernel; +} + /// Performs the following steps: /// 1. Link input device code (user code and SYCL device library code). /// 2. Run SPIR-V code generation. @@ -486,6 +493,22 @@ Error runSYCLLink(ArrayRef Files, const ArgList &Args) { SmallVector SplitModules; SplitModules.emplace_back(*LinkedFile); + // Generate symbol table. 
+ SmallVector SymbolTable; + for (size_t I = 0, E = SplitModules.size(); I != E; ++I) { + Expected<std::unique_ptr<Module>> ModOrErr = + getBitcodeModule(SplitModules[I], C); + if (!ModOrErr) + return ModOrErr.takeError(); + + SmallVector Symbols; + for (Function &F : **ModOrErr) { + if (isKernel(F)) + Symbols.push_back(F.getName()); + } + SymbolTable.emplace_back(llvm::join(Symbols.begin(), Symbols.end(), "\n")); + } + bool IsAOTCompileNeeded = IsIntelOffloadArch( StringToOffloadArch(Args.getLastArgValue(OPT_arch_EQ))); @@ -523,12 +546,19 @@ Error runSYCLLink(ArrayRef Files, const ArgList &Args) { return createFileError(File, EC); } OffloadingImage TheImage{}; - TheImage.TheImageKind = IMG_Object; + // TODO: TheImageKind should be + // `IsAOTCompileNeeded ? IMG_Object : IMG_SPIRV;` + // For that we need to update SYCL Runtime to align with the ImageKind enum. + // Temporarily it is initialized to IMG_None, because in that case, SYCL + // Runtime has a heuristic to understand what the Image Kind is, so at least + // it works. + TheImage.TheImageKind = IMG_None; TheImage.TheOffloadKind = OFK_SYCL; TheImage.StringData["triple"] = Args.MakeArgString(Args.getLastArgValue(OPT_triple_EQ)); TheImage.StringData["arch"] = Args.MakeArgString(Args.getLastArgValue(OPT_arch_EQ)); + TheImage.StringData["symbols"] = SymbolTable[I]; TheImage.Image = std::move(*FileOrErr); llvm::SmallString<0> Buffer = OffloadBinary::write(TheImage); From 98766d288f0d7cadcf34f355d36e4deaf233d046 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Tue, 30 Sep 2025 11:09:01 -0700 Subject: [PATCH 296/878] [clang] Cleanup docs and code for legacy no_sanitize attributes (NFC). (#161311) Update generated docs for legacy attributes: * no_sanitize_(address|thread|memory) * no_address_safety_analysis Those are older forms of the no_sanitize("list", "of", "sanitizers") attribute. They were previously defined as various spellings of the same attribute, which made the auto-generated documentation confusing. Fix this by explicitly making them three different attributes. This also allows us to simplify the delegation to the new no_sanitize form slightly, as we can instead rely on auto-generated code to check that TSan and MSan can't be disabled for globals. --------- Co-authored-by: Erich Keane --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/Attr.td | 29 ++++++++--- clang/include/clang/Basic/AttrDocs.td | 11 ++-- clang/lib/Sema/SemaDeclAttr.cpp | 50 ++++++++++++------- ...a-attribute-supported-attributes-list.test | 4 +- 5 files changed, 65 insertions(+), 30 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 3b269ccd57718..97799aeadfc7e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -583,6 +583,7 @@ Moved checkers Sanitizers ---------- +- Improved documentation for legacy ``no_sanitize`` attributes. Python Binding Changes ---------------------- diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 2623f9ff6972f..de56bb38fd63e 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3921,16 +3921,31 @@ def NoSanitize : InheritableAttr { }]; } -// Attributes to disable a specific sanitizer. No new sanitizers should be added +// Attribute to disable AddressSanitizer. No new spellings should be added // to this list; the no_sanitize attribute should be extended instead.
-def NoSanitizeSpecific : InheritableAttr { +def NoSanitizeAddress : InheritableAttr { let Spellings = [GCC<"no_address_safety_analysis">, - GCC<"no_sanitize_address">, - GCC<"no_sanitize_thread">, - Clang<"no_sanitize_memory">]; + GCC<"no_sanitize_address">]; let Subjects = SubjectList<[Function, GlobalVar], ErrorDiag>; - let Documentation = [NoSanitizeAddressDocs, NoSanitizeThreadDocs, - NoSanitizeMemoryDocs]; + let Documentation = [NoSanitizeAddressDocs]; + let ASTNode = 0; +} + +// Attribute to disable ThreadSanitizer. No new spellings should be added +// to this list; the no_sanitize attribute should be extended instead. +def NoSanitizeThread : InheritableAttr { + let Spellings = [GCC<"no_sanitize_thread">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [NoSanitizeThreadDocs]; + let ASTNode = 0; +} + +// Attribute to disable MemorySanitizer. No new spellings should be added +// to this list; the no_sanitize attribute should be extended instead. +def NoSanitizeMemory : InheritableAttr { + let Spellings = [Clang<"no_sanitize_memory">]; + let Subjects = SubjectList<[Function], ErrorDiag>; + let Documentation = [NoSanitizeMemoryDocs]; let ASTNode = 0; } diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index ee212a9b50f36..20a52b49a8f10 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -13,16 +13,16 @@ // version control. // // To run clang-tblgen to generate the .rst file: -// clang-tblgen -gen-attr-docs -I /llvm/tools/clang/include -// /llvm/tools/clang/include/clang/Basic/Attr.td -o -// /llvm/tools/clang/docs/AttributeReference.rst +// clang-tblgen -gen-attr-docs -I /clang/include +// /clang/include/clang/Basic/Attr.td -o +// /clang/docs/AttributeReference.rst // // To run sphinx to generate the .html files (note that sphinx-build must be // available on the PATH): // Windows (from within the clang\docs directory): // make.bat html -// Non-Windows (from within the clang\docs directory): -// sphinx-build -b html _build/html +// Non-Windows (from within the clang/docs directory): +// sphinx-build -b html . _build/html def GlobalDocumentation { code Intro =[{.. @@ -3629,6 +3629,7 @@ instrumentations should not be applied. The attribute takes a list of string literals with the following accepted values: + * all values accepted by ``-fno-sanitize=``; * ``coverage``, to disable SanitizerCoverage instrumentation. 
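For reference, a user-level view of the spellings involved (my own example, not part of the patch): each legacy spelling now has its own attribute definition, while all of them still lower to the general `no_sanitize` form.

```c++
// Legacy spellings, now three distinct attribute definitions:
__attribute__((no_sanitize_address)) void f();  // maps to NoSanitizeAddress
__attribute__((no_sanitize_thread)) void g();   // maps to NoSanitizeThread
__attribute__((no_sanitize_memory)) void h();   // maps to NoSanitizeMemory
// Preferred general form, which the legacy spellings delegate to:
__attribute__((no_sanitize("address", "thread"))) void k();
```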
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index a8dfa4d7df2d5..328ccf6694073 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6361,19 +6361,8 @@ static void handleNoSanitizeAttr(Sema &S, Decl *D, const ParsedAttr &AL) { Sanitizers.size())); } -static void handleNoSanitizeSpecificAttr(Sema &S, Decl *D, - const ParsedAttr &AL) { - StringRef AttrName = AL.getAttrName()->getName(); - normalizeName(AttrName); - StringRef SanitizerName = llvm::StringSwitch(AttrName) - .Case("no_address_safety_analysis", "address") - .Case("no_sanitize_address", "address") - .Case("no_sanitize_thread", "thread") - .Case("no_sanitize_memory", "memory"); - if (isGlobalVar(D) && SanitizerName != "address") - S.Diag(D->getLocation(), diag::err_attribute_wrong_decl_type) - << AL << AL.isRegularKeywordAttribute() << ExpectedFunction; - +static AttributeCommonInfo +getNoSanitizeAttrInfo(const ParsedAttr &NoSanitizeSpecificAttr) { // FIXME: Rather than create a NoSanitizeSpecificAttr, this creates a // NoSanitizeAttr object; but we need to calculate the correct spelling list // index rather than incorrectly assume the index for NoSanitizeSpecificAttr @@ -6383,11 +6372,32 @@ static void handleNoSanitizeSpecificAttr(Sema &S, Decl *D, // getSpelling() or prettyPrint() on the resulting semantic attribute object // without failing assertions. unsigned TranslatedSpellingIndex = 0; - if (AL.isStandardAttributeSyntax()) + if (NoSanitizeSpecificAttr.isStandardAttributeSyntax()) TranslatedSpellingIndex = 1; - AttributeCommonInfo Info = AL; + AttributeCommonInfo Info = NoSanitizeSpecificAttr; Info.setAttributeSpellingListIndex(TranslatedSpellingIndex); + return Info; +} + +static void handleNoSanitizeAddressAttr(Sema &S, Decl *D, + const ParsedAttr &AL) { + StringRef SanitizerName = "address"; + AttributeCommonInfo Info = getNoSanitizeAttrInfo(AL); + D->addAttr(::new (S.Context) + NoSanitizeAttr(S.Context, Info, &SanitizerName, 1)); +} + +static void handleNoSanitizeThreadAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + StringRef SanitizerName = "thread"; + AttributeCommonInfo Info = getNoSanitizeAttrInfo(AL); + D->addAttr(::new (S.Context) + NoSanitizeAttr(S.Context, Info, &SanitizerName, 1)); +} + +static void handleNoSanitizeMemoryAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + StringRef SanitizerName = "memory"; + AttributeCommonInfo Info = getNoSanitizeAttrInfo(AL); D->addAttr(::new (S.Context) NoSanitizeAttr(S.Context, Info, &SanitizerName, 1)); } @@ -7513,8 +7523,14 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_NoSanitize: handleNoSanitizeAttr(S, D, AL); break; - case ParsedAttr::AT_NoSanitizeSpecific: - handleNoSanitizeSpecificAttr(S, D, AL); + case ParsedAttr::AT_NoSanitizeAddress: + handleNoSanitizeAddressAttr(S, D, AL); + break; + case ParsedAttr::AT_NoSanitizeThread: + handleNoSanitizeThreadAttr(S, D, AL); + break; + case ParsedAttr::AT_NoSanitizeMemory: + handleNoSanitizeMemoryAttr(S, D, AL); break; case ParsedAttr::AT_GuardedBy: handleGuardedByAttr(S, D, AL); diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index 37ff33e5a1523..73d4cb1769ed5 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -126,7 +126,9 @@ // CHECK-NEXT: NoProfileFunction (SubjectMatchRule_function) // CHECK-NEXT: NoRandomizeLayout 
(SubjectMatchRule_record)
// CHECK-NEXT: NoSanitize (SubjectMatchRule_function, SubjectMatchRule_objc_method, SubjectMatchRule_variable_is_global)
-// CHECK-NEXT: NoSanitizeSpecific (SubjectMatchRule_function, SubjectMatchRule_variable_is_global)
+// CHECK-NEXT: NoSanitizeAddress (SubjectMatchRule_function, SubjectMatchRule_variable_is_global)
+// CHECK-NEXT: NoSanitizeMemory (SubjectMatchRule_function)
+// CHECK-NEXT: NoSanitizeThread (SubjectMatchRule_function)
// CHECK-NEXT: NoSpeculativeLoadHardening (SubjectMatchRule_function, SubjectMatchRule_objc_method)
// CHECK-NEXT: NoSplitStack (SubjectMatchRule_function)
// CHECK-NEXT: NoStackProtector (SubjectMatchRule_function)

From 6caa0d05c2282feafa1479141c0009257639e48f Mon Sep 17 00:00:00 2001
From: Amr Hesham
Date: Tue, 30 Sep 2025 20:16:04 +0200
Subject: [PATCH 297/878] [CIR] Upstream RTTI Builder & RTTI for VTable
 Definitions (#160002)

Upstream the RTTI builder and its helpers, and use them in the vtable
definitions.

Issue https://github.com/llvm/llvm-project/issues/154992
---
 clang/include/clang/CIR/MissingFeatures.h     |    2 +
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |    5 +
 clang/lib/CIR/CodeGen/CIRGenCXXABI.h          |    3 +
 clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 1063 +++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        |    9 +-
 clang/lib/CIR/CodeGen/CIRGenModule.h          |   18 +
 clang/lib/CIR/CodeGen/CIRGenVTables.cpp       |   43 +
 clang/lib/CIR/CodeGen/CIRGenVTables.h         |    2 +
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |   10 +-
 clang/test/CIR/CodeGen/vtt.cpp                |  899 +++++++-------
 10 files changed, 1633 insertions(+), 421 deletions(-)

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 7e59989dc09f1..f09ec95d9ccf8 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -37,6 +37,8 @@ struct MissingFeatures {
   static bool opGlobalDLLImportExport() { return false; }
   static bool opGlobalPartition() { return false; }
   static bool opGlobalUsedOrCompilerUsed() { return false; }
+  static bool setDSOLocal() { return false; }
+  static bool setComdat() { return false; }

   static bool supportIFuncAttr() { return false; }
   static bool supportVisibility() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 6a1746a7ad0ac..58345b45c97bc 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -89,6 +89,11 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return cir::ConstRecordAttr::get(sTy, arrayAttr);
   }

+  cir::TypeInfoAttr getTypeInfo(mlir::ArrayAttr fieldsAttr) {
+    cir::ConstRecordAttr anonRecord = getAnonConstRecord(fieldsAttr);
+    return cir::TypeInfoAttr::get(anonRecord.getType(), fieldsAttr);
+  }
+
   std::string getUniqueAnonRecordName() { return getUniqueRecordName("anon"); }

   std::string getUniqueRecordName(const std::string &baseName) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
index ae922599809b8..1dee77425c30d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
@@ -114,6 +114,9 @@ class CIRGenCXXABI {

   virtual void emitRethrow(CIRGenFunction &cgf, bool isNoReturn) = 0;

+  virtual mlir::Attribute getAddrOfRTTIDescriptor(mlir::Location loc,
+                                                  QualType ty) = 0;
+
   /// Get the type of the implicit "this" parameter used by a method. May return
   /// zero if no specific type is applicable, e.g. if the ABI expects the "this"
   /// parameter to point to some artificial offset in a complete object due to
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index 0bf6cf556787c..debea8af66b50 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -103,6 +103,9 @@ class CIRGenItaniumCXXABI : public CIRGenCXXABI {
                             const CXXRecordDecl *rd) override;
   void emitVirtualInheritanceTables(const CXXRecordDecl *rd) override;

+  mlir::Attribute getAddrOfRTTIDescriptor(mlir::Location loc,
+                                          QualType ty) override;
+
   bool doStructorsInitializeVPtrs(const CXXRecordDecl *vtableClass) override {
     return true;
   }
@@ -111,6 +114,34 @@ class CIRGenItaniumCXXABI : public CIRGenCXXABI {
   getVirtualBaseClassOffset(mlir::Location loc, CIRGenFunction &cgf,
                             Address thisAddr, const CXXRecordDecl *classDecl,
                             const CXXRecordDecl *baseClassDecl) override;
+
+  /**************************** RTTI Uniqueness ******************************/
+protected:
+  /// Returns true if the ABI requires RTTI type_info objects to be unique
+  /// across a program.
+  virtual bool shouldRTTIBeUnique() const { return true; }
+
+public:
+  /// What sort of unique-RTTI behavior should we use?
+  enum RTTIUniquenessKind {
+    /// We are guaranteeing, or need to guarantee, that the RTTI string
+    /// is unique.
+    RUK_Unique,
+
+    /// We are not guaranteeing uniqueness for the RTTI string, so we
+    /// can demote to hidden visibility but must use string comparisons.
+    RUK_NonUniqueHidden,
+
+    /// We are not guaranteeing uniqueness for the RTTI string, so we
+    /// have to use string comparisons, but we also have to emit it with
+    /// non-hidden visibility.
+    RUK_NonUniqueVisible
+  };
+
+  /// Return the required visibility status for the given type and linkage in
+  /// the current ABI.
+  RTTIUniquenessKind
+  classifyRTTIUniqueness(QualType canTy, cir::GlobalLinkageKind linkage) const;
 };
 } // namespace

@@ -424,6 +455,1038 @@ void CIRGenItaniumCXXABI::emitVirtualInheritanceTables(
   vtables.emitVTTDefinition(vtt, cgm.getVTableLinkage(rd), rd);
 }

+namespace {
+class CIRGenItaniumRTTIBuilder {
+  CIRGenModule &cgm;                 // Per-module state.
+  const CIRGenItaniumCXXABI &cxxABI; // Per-module state.
+
+  /// The fields of the RTTI descriptor currently being built.
+  SmallVector<mlir::Attribute, 16> fields;
+
+  // Returns the mangled type name of the given type.
+  cir::GlobalOp getAddrOfTypeName(mlir::Location loc, QualType ty,
+                                  cir::GlobalLinkageKind linkage);
+
+  /// Returns the constant for the RTTI descriptor of the given type.
+  mlir::Attribute getAddrOfExternalRTTIDescriptor(mlir::Location loc,
+                                                  QualType ty);
+
+  /// Build the vtable pointer for the given type.
+  void buildVTablePointer(mlir::Location loc, const Type *ty);
+
+  /// Build an abi::__si_class_type_info, used for single inheritance, according
+  /// to the Itanium C++ ABI, 2.9.5p6b.
+  void buildSIClassTypeInfo(mlir::Location loc, const CXXRecordDecl *rd);
+
+  /// Build an abi::__vmi_class_type_info, used for
+  /// classes with bases that do not satisfy the abi::__si_class_type_info
+  /// constraints, according to the Itanium C++ ABI, 2.9.5p5c.
+  void buildVMIClassTypeInfo(mlir::Location loc, const CXXRecordDecl *rd);
+
+public:
+  CIRGenItaniumRTTIBuilder(const CIRGenItaniumCXXABI &abi, CIRGenModule &cgm)
+      : cgm(cgm), cxxABI(abi) {}
+
+  /// Build the RTTI type info struct for the given type, or
+  /// link to an existing RTTI descriptor if one already exists.
+ mlir::Attribute buildTypeInfo(mlir::Location loc, QualType ty); + + /// Build the RTTI type info struct for the given type. + mlir::Attribute buildTypeInfo(mlir::Location loc, QualType ty, + cir::GlobalLinkageKind linkage, + mlir::SymbolTable::Visibility visibility); +}; +} // namespace + +// TODO(cir): Will be removed after sharing them with the classical codegen +namespace { + +// Pointer type info flags. +enum { + /// PTI_Const - Type has const qualifier. + PTI_Const = 0x1, + + /// PTI_Volatile - Type has volatile qualifier. + PTI_Volatile = 0x2, + + /// PTI_Restrict - Type has restrict qualifier. + PTI_Restrict = 0x4, + + /// PTI_Incomplete - Type is incomplete. + PTI_Incomplete = 0x8, + + /// PTI_ContainingClassIncomplete - Containing class is incomplete. + /// (in pointer to member). + PTI_ContainingClassIncomplete = 0x10, + + /// PTI_TransactionSafe - Pointee is transaction_safe function (C++ TM TS). + // PTI_TransactionSafe = 0x20, + + /// PTI_Noexcept - Pointee is noexcept function (C++1z). + PTI_Noexcept = 0x40, +}; + +// VMI type info flags. +enum { + /// VMI_NonDiamondRepeat - Class has non-diamond repeated inheritance. + VMI_NonDiamondRepeat = 0x1, + + /// VMI_DiamondShaped - Class is diamond shaped. + VMI_DiamondShaped = 0x2 +}; + +// Base class type info flags. +enum { + /// BCTI_Virtual - Base class is virtual. + BCTI_Virtual = 0x1, + + /// BCTI_Public - Base class is public. + BCTI_Public = 0x2 +}; + +/// Given a builtin type, returns whether the type +/// info for that type is defined in the standard library. +/// TODO(cir): this can unified with LLVM codegen +static bool typeInfoIsInStandardLibrary(const BuiltinType *ty) { + // Itanium C++ ABI 2.9.2: + // Basic type information (e.g. for "int", "bool", etc.) will be kept in + // the run-time support library. Specifically, the run-time support + // library should contain type_info objects for the types X, X* and + // X const*, for every X in: void, std::nullptr_t, bool, wchar_t, char, + // unsigned char, signed char, short, unsigned short, int, unsigned int, + // long, unsigned long, long long, unsigned long long, float, double, + // long double, char16_t, char32_t, and the IEEE 754r decimal and + // half-precision floating point types. + // + // GCC also emits RTTI for __int128. + // FIXME: We do not emit RTTI information for decimal types here. + + // Types added here must also be added to emitFundamentalRTTIDescriptors. 
+  switch (ty->getKind()) {
+  case BuiltinType::WasmExternRef:
+  case BuiltinType::HLSLResource:
+    llvm_unreachable("NYI");
+  case BuiltinType::Void:
+  case BuiltinType::NullPtr:
+  case BuiltinType::Bool:
+  case BuiltinType::WChar_S:
+  case BuiltinType::WChar_U:
+  case BuiltinType::Char_U:
+  case BuiltinType::Char_S:
+  case BuiltinType::UChar:
+  case BuiltinType::SChar:
+  case BuiltinType::Short:
+  case BuiltinType::UShort:
+  case BuiltinType::Int:
+  case BuiltinType::UInt:
+  case BuiltinType::Long:
+  case BuiltinType::ULong:
+  case BuiltinType::LongLong:
+  case BuiltinType::ULongLong:
+  case BuiltinType::Half:
+  case BuiltinType::Float:
+  case BuiltinType::Double:
+  case BuiltinType::LongDouble:
+  case BuiltinType::Float16:
+  case BuiltinType::Float128:
+  case BuiltinType::Ibm128:
+  case BuiltinType::Char8:
+  case BuiltinType::Char16:
+  case BuiltinType::Char32:
+  case BuiltinType::Int128:
+  case BuiltinType::UInt128:
+    return true;
+
+#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix)                   \
+  case BuiltinType::Id:
+#include "clang/Basic/OpenCLImageTypes.def"
+#define EXT_OPAQUE_TYPE(ExtType, Id, Ext) case BuiltinType::Id:
+#include "clang/Basic/OpenCLExtensionTypes.def"
+  case BuiltinType::OCLSampler:
+  case BuiltinType::OCLEvent:
+  case BuiltinType::OCLClkEvent:
+  case BuiltinType::OCLQueue:
+  case BuiltinType::OCLReserveID:
+#define SVE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AArch64ACLETypes.def"
+#define PPC_VECTOR_TYPE(Name, Id, Size) case BuiltinType::Id:
+#include "clang/Basic/PPCTypes.def"
+#define RVV_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/RISCVVTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId, Width, Align) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
+  case BuiltinType::ShortAccum:
+  case BuiltinType::Accum:
+  case BuiltinType::LongAccum:
+  case BuiltinType::UShortAccum:
+  case BuiltinType::UAccum:
+  case BuiltinType::ULongAccum:
+  case BuiltinType::ShortFract:
+  case BuiltinType::Fract:
+  case BuiltinType::LongFract:
+  case BuiltinType::UShortFract:
+  case BuiltinType::UFract:
+  case BuiltinType::ULongFract:
+  case BuiltinType::SatShortAccum:
+  case BuiltinType::SatAccum:
+  case BuiltinType::SatLongAccum:
+  case BuiltinType::SatUShortAccum:
+  case BuiltinType::SatUAccum:
+  case BuiltinType::SatULongAccum:
+  case BuiltinType::SatShortFract:
+  case BuiltinType::SatFract:
+  case BuiltinType::SatLongFract:
+  case BuiltinType::SatUShortFract:
+  case BuiltinType::SatUFract:
+  case BuiltinType::SatULongFract:
+  case BuiltinType::BFloat16:
+    return false;
+
+  case BuiltinType::Dependent:
+#define BUILTIN_TYPE(Id, SingletonId)
+#define PLACEHOLDER_TYPE(Id, SingletonId) case BuiltinType::Id:
+#include "clang/AST/BuiltinTypes.def"
+    llvm_unreachable("asking for RTTI for a placeholder type!");
+
+  case BuiltinType::ObjCId:
+  case BuiltinType::ObjCClass:
+  case BuiltinType::ObjCSel:
+    llvm_unreachable("FIXME: Objective-C types are unsupported!");
+  }
+
+  llvm_unreachable("Invalid BuiltinType Kind!");
+}
+
+static bool typeInfoIsInStandardLibrary(const PointerType *pointerTy) {
+  QualType pointeeTy = pointerTy->getPointeeType();
+  const auto *builtinTy = dyn_cast<BuiltinType>(pointeeTy);
+  if (!builtinTy)
+    return false;
+
+  // Check the qualifiers.
+  Qualifiers quals = pointeeTy.getQualifiers();
+  quals.removeConst();
+
+  if (!quals.empty())
+    return false;
+
+  return typeInfoIsInStandardLibrary(builtinTy);
+}
+
+/// IsStandardLibraryRTTIDescriptor - Returns whether the type
+/// information for the given type exists in the standard library.
+static bool isStandardLibraryRttiDescriptor(QualType ty) {
+  // Type info for builtin types is defined in the standard library.
+  if (const auto *builtinTy = dyn_cast<BuiltinType>(ty))
+    return typeInfoIsInStandardLibrary(builtinTy);
+
+  // Type info for some pointer types to builtin types is defined in the
+  // standard library.
+  if (const auto *pointerTy = dyn_cast<PointerType>(ty))
+    return typeInfoIsInStandardLibrary(pointerTy);
+
+  return false;
+}
+
+/// ShouldUseExternalRTTIDescriptor - Returns whether the type information for
+/// the given type exists somewhere else, and so we should not emit the type
+/// information in this translation unit. Assumes that it is not a
+/// standard-library type.
+static bool shouldUseExternalRttiDescriptor(CIRGenModule &cgm, QualType ty) {
+  ASTContext &context = cgm.getASTContext();
+
+  // If RTTI is disabled, assume it might be disabled in the
+  // translation unit that defines any potential key function, too.
+  if (!context.getLangOpts().RTTI)
+    return false;
+
+  if (const auto *recordTy = dyn_cast<RecordType>(ty)) {
+    const CXXRecordDecl *rd =
+        cast<CXXRecordDecl>(recordTy->getOriginalDecl())->getDefinitionOrSelf();
+    if (!rd->hasDefinition())
+      return false;
+
+    if (!rd->isDynamicClass())
+      return false;
+
+    // FIXME: this may need to be reconsidered if the key function
+    // changes.
+    // N.B. We must always emit the RTTI data ourselves if there exists a key
+    // function.
+    bool isDLLImport = rd->hasAttr<DLLImportAttr>();
+
+    // Don't import the RTTI but emit it locally.
+    if (cgm.getTriple().isOSCygMing())
+      return false;
+
+    if (cgm.getVTables().isVTableExternal(rd)) {
+      if (cgm.getTarget().hasPS4DLLImportExport())
+        return true;
+
+      return !isDLLImport || cgm.getTriple().isWindowsItaniumEnvironment();
+    }
+
+    if (isDLLImport)
+      return true;
+  }
+
+  return false;
+}
+
+/// Contains virtual and non-virtual bases seen when traversing a class
+/// hierarchy.
+struct SeenBases {
+  llvm::SmallPtrSet<const CXXRecordDecl *, 8> nonVirtualBases;
+  llvm::SmallPtrSet<const CXXRecordDecl *, 8> virtualBases;
+};
+
+/// Compute the value of the flags member in abi::__vmi_class_type_info.
+///
+static unsigned computeVmiClassTypeInfoFlags(const CXXBaseSpecifier *base,
+                                             SeenBases &bases) {
+
+  unsigned flags = 0;
+  auto *baseDecl = base->getType()->castAsCXXRecordDecl();
+
+  if (base->isVirtual()) {
+    // Mark the virtual base as seen.
+    if (!bases.virtualBases.insert(baseDecl).second) {
+      // If this virtual base has been seen before, then the class is diamond
+      // shaped.
+      flags |= VMI_DiamondShaped;
+    } else {
+      if (bases.nonVirtualBases.count(baseDecl))
+        flags |= VMI_NonDiamondRepeat;
+    }
+  } else {
+    // Mark the non-virtual base as seen.
+    if (!bases.nonVirtualBases.insert(baseDecl).second) {
+      // If this non-virtual base has been seen before, then the class has non-
+      // diamond shaped repeated inheritance.
+      flags |= VMI_NonDiamondRepeat;
+    } else {
+      if (bases.virtualBases.count(baseDecl))
+        flags |= VMI_NonDiamondRepeat;
+    }
+  }
+
+  // Walk all bases.
+  for (const auto &bs : baseDecl->bases())
+    flags |= computeVmiClassTypeInfoFlags(&bs, bases);
+
+  return flags;
+}
+
+static unsigned computeVmiClassTypeInfoFlags(const CXXRecordDecl *rd) {
+  unsigned flags = 0;
+  SeenBases bases;
+
+  // Walk all bases.
+  for (const auto &bs : rd->bases())
+    flags |= computeVmiClassTypeInfoFlags(&bs, bases);
+
+  return flags;
+}
+
+// Return whether the given record decl has a "single,
+// public, non-virtual base at offset zero (i.e. the derived class is dynamic
+// iff the base is)", according to Itanium C++ ABI, 2.9.5p6b.
+// TODO(cir): this can be unified with LLVM codegen
+static bool canUseSingleInheritance(const CXXRecordDecl *rd) {
+  // Check the number of bases.
+  if (rd->getNumBases() != 1)
+    return false;
+
+  // Get the base.
+  CXXRecordDecl::base_class_const_iterator base = rd->bases_begin();
+
+  // Check that the base is not virtual.
+  if (base->isVirtual())
+    return false;
+
+  // Check that the base is public.
+  if (base->getAccessSpecifier() != AS_public)
+    return false;
+
+  // Check that the class is dynamic iff the base is.
+  auto *baseDecl = base->getType()->castAsCXXRecordDecl();
+  return baseDecl->isEmpty() ||
+         baseDecl->isDynamicClass() == rd->isDynamicClass();
+}
+
+/// IsIncompleteClassType - Returns whether the given record type is incomplete.
+static bool isIncompleteClassType(const RecordType *recordTy) {
+  return !recordTy->getOriginalDecl()
+              ->getDefinitionOrSelf()
+              ->isCompleteDefinition();
+}
+
+/// Returns whether the given type contains an
+/// incomplete class type. This is true if
+///
+///   * The given type is an incomplete class type.
+///   * The given type is a pointer type whose pointee type contains an
+///     incomplete class type.
+///   * The given type is a member pointer type whose class is an incomplete
+///     class type.
+///   * The given type is a member pointer type whose pointee type contains an
+///     incomplete class type.
+static bool containsIncompleteClassType(QualType ty) {
+  if (const auto *recordTy = dyn_cast<RecordType>(ty)) {
+    if (isIncompleteClassType(recordTy))
+      return true;
+  }
+
+  if (const auto *pointerTy = dyn_cast<PointerType>(ty))
+    return containsIncompleteClassType(pointerTy->getPointeeType());
+
+  if (const auto *memberPointerTy = dyn_cast<MemberPointerType>(ty)) {
+    // Check if the class type is incomplete.
+    if (!memberPointerTy->getMostRecentCXXRecordDecl()->hasDefinition())
+      return true;
+
+    return containsIncompleteClassType(memberPointerTy->getPointeeType());
+  }
+
+  return false;
+}
+
+const char *vTableClassNameForType(const CIRGenModule &cgm, const Type *ty) {
+  // abi::__class_type_info.
+  static const char *const classTypeInfo =
+      "_ZTVN10__cxxabiv117__class_type_infoE";
+  // abi::__si_class_type_info.
+  static const char *const siClassTypeInfo =
+      "_ZTVN10__cxxabiv120__si_class_type_infoE";
+  // abi::__vmi_class_type_info.
+ static const char *const vmiClassTypeInfo = + "_ZTVN10__cxxabiv121__vmi_class_type_infoE"; + + switch (ty->getTypeClass()) { +#define TYPE(Class, Base) +#define ABSTRACT_TYPE(Class, Base) +#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base) case Type::Class: +#define NON_CANONICAL_TYPE(Class, Base) case Type::Class: +#define DEPENDENT_TYPE(Class, Base) case Type::Class: +#include "clang/AST/TypeNodes.inc" + llvm_unreachable("Non-canonical and dependent types shouldn't get here"); + + case Type::LValueReference: + case Type::RValueReference: + llvm_unreachable("References shouldn't get here"); + + case Type::Auto: + case Type::DeducedTemplateSpecialization: + llvm_unreachable("Undeduced type shouldn't get here"); + + case Type::Pipe: + llvm_unreachable("Pipe types shouldn't get here"); + + case Type::ArrayParameter: + llvm_unreachable("Array Parameter types should not get here."); + + case Type::Builtin: + case Type::BitInt: + // GCC treats vector and complex types as fundamental types. + case Type::Vector: + case Type::ExtVector: + case Type::ConstantMatrix: + case Type::Complex: + case Type::Atomic: + // FIXME: GCC treats block pointers as fundamental types?! + case Type::BlockPointer: + cgm.errorNYI("VTableClassNameForType: __fundamental_type_info"); + break; + case Type::ConstantArray: + case Type::IncompleteArray: + case Type::VariableArray: + cgm.errorNYI("VTableClassNameForType: __array_type_info"); + break; + + case Type::FunctionNoProto: + case Type::FunctionProto: + cgm.errorNYI("VTableClassNameForType: __function_type_info"); + break; + + case Type::Enum: + cgm.errorNYI("VTableClassNameForType: Enum"); + break; + + case Type::Record: { + const CXXRecordDecl *rd = + cast(cast(ty)->getOriginalDecl()) + ->getDefinitionOrSelf(); + + if (!rd->hasDefinition() || !rd->getNumBases()) { + return classTypeInfo; + } + + if (canUseSingleInheritance(rd)) { + return siClassTypeInfo; + } + + return vmiClassTypeInfo; + } + + case Type::ObjCObject: + cgm.errorNYI("VTableClassNameForType: ObjCObject"); + break; + + case Type::ObjCInterface: + cgm.errorNYI("VTableClassNameForType: ObjCInterface"); + break; + + case Type::ObjCObjectPointer: + case Type::Pointer: + cgm.errorNYI("VTableClassNameForType: __pointer_type_info"); + break; + + case Type::MemberPointer: + cgm.errorNYI("VTableClassNameForType: __pointer_to_member_type_info"); + break; + + case Type::HLSLAttributedResource: + case Type::HLSLInlineSpirv: + llvm_unreachable("HLSL doesn't support virtual functions"); + } + + return nullptr; +} +} // namespace + +/// Return the linkage that the type info and type info name constants +/// should have for the given type. +static cir::GlobalLinkageKind getTypeInfoLinkage(CIRGenModule &cgm, + QualType ty) { + // In addition, it and all of the intermediate abi::__pointer_type_info + // structs in the chain down to the abi::__class_type_info for the + // incomplete class type must be prevented from resolving to the + // corresponding type_info structs for the complete class type, possibly + // by making them local static objects. Finally, a dummy class RTTI is + // generated for the incomplete type that will not resolve to the final + // complete class RTTI (because the latter need not exist), possibly by + // making it a local static object. 
+  if (containsIncompleteClassType(ty))
+    return cir::GlobalLinkageKind::InternalLinkage;
+
+  switch (ty->getLinkage()) {
+  case Linkage::Invalid:
+    llvm_unreachable("Linkage hasn't been computed!");
+
+  case Linkage::None:
+  case Linkage::Internal:
+  case Linkage::UniqueExternal:
+    return cir::GlobalLinkageKind::InternalLinkage;
+
+  case Linkage::VisibleNone:
+  case Linkage::Module:
+  case Linkage::External:
+    // RTTI is not enabled, which means that this type info struct is going
+    // to be used for exception handling. Give it linkonce_odr linkage.
+    if (!cgm.getLangOpts().RTTI)
+      return cir::GlobalLinkageKind::LinkOnceODRLinkage;
+
+    if (const RecordType *record = dyn_cast<RecordType>(ty)) {
+      const CXXRecordDecl *rd =
+          cast<CXXRecordDecl>(record->getOriginalDecl())->getDefinitionOrSelf();
+      if (rd->hasAttr<WeakAttr>())
+        return cir::GlobalLinkageKind::WeakODRLinkage;
+
+      if (cgm.getTriple().isWindowsItaniumEnvironment())
+        if (rd->hasAttr<DLLImportAttr>() &&
+            shouldUseExternalRttiDescriptor(cgm, ty))
+          return cir::GlobalLinkageKind::ExternalLinkage;
+
+      // MinGW always uses LinkOnceODRLinkage for type info.
+      if (rd->isDynamicClass() && !cgm.getASTContext()
+                                       .getTargetInfo()
+                                       .getTriple()
+                                       .isWindowsGNUEnvironment())
+        return cgm.getVTableLinkage(rd);
+    }
+
+    return cir::GlobalLinkageKind::LinkOnceODRLinkage;
+  }
+
+  llvm_unreachable("Invalid linkage!");
+}
+
+cir::GlobalOp
+CIRGenItaniumRTTIBuilder::getAddrOfTypeName(mlir::Location loc, QualType ty,
+                                            cir::GlobalLinkageKind linkage) {
+  CIRGenBuilderTy &builder = cgm.getBuilder();
+  SmallString<256> name;
+  llvm::raw_svector_ostream out(name);
+  cgm.getCXXABI().getMangleContext().mangleCXXRTTIName(ty, out);
+
+  // We know that the mangled name of the type starts at index 4 of the
+  // mangled name of the typename, so we can just index into it in order to
+  // get the mangled name of the type.
+  mlir::Attribute init = builder.getString(
+      name.substr(4), cgm.convertType(cgm.getASTContext().CharTy),
+      std::nullopt);
+
+  CharUnits align =
+      cgm.getASTContext().getTypeAlignInChars(cgm.getASTContext().CharTy);
+
+  // builder.getString can return a #cir.zero if the string given to it only
+  // contains null bytes. However, type names cannot be full of null bytes.
+  // So casting init to a ConstArrayAttr should be safe.
+  auto initStr = cast<cir::ConstArrayAttr>(init);
+
+  cir::GlobalOp gv = cgm.createOrReplaceCXXRuntimeVariable(
+      loc, name, initStr.getType(), linkage, align);
+  CIRGenModule::setInitializer(gv, init);
+  return gv;
+}
+
+mlir::Attribute
+CIRGenItaniumRTTIBuilder::getAddrOfExternalRTTIDescriptor(mlir::Location loc,
+                                                          QualType ty) {
+  // Mangle the RTTI name.
+  SmallString<256> name;
+  llvm::raw_svector_ostream out(name);
+  cgm.getCXXABI().getMangleContext().mangleCXXRTTI(ty, out);
+  CIRGenBuilderTy &builder = cgm.getBuilder();
+
+  // Look for an existing global.
+  cir::GlobalOp gv = dyn_cast_or_null<cir::GlobalOp>(
+      mlir::SymbolTable::lookupSymbolIn(cgm.getModule(), name));
+
+  if (!gv) {
+    // Create a new global variable.
+    // From LLVM codegen => Note for the future: If we would ever like to do
+    // deferred emission of RTTI, check if emitting vtables opportunistically
+    // needs any adjustment.
+    gv = CIRGenModule::createGlobalOp(cgm, loc, name, builder.getUInt8PtrTy(),
+                                      /*isConstant=*/true);
+    const CXXRecordDecl *rd = ty->getAsCXXRecordDecl();
+    cgm.setGVProperties(gv, rd);
+
+    // Import the typeinfo symbol when all non-inline virtual methods are
+    // imported.
+ if (cgm.getTarget().hasPS4DLLImportExport()) { + cgm.errorNYI("getAddrOfExternalRTTIDescriptor: hasPS4DLLImportExport"); + } + } + + return builder.getGlobalViewAttr(builder.getUInt8PtrTy(), gv); +} + +void CIRGenItaniumRTTIBuilder::buildVTablePointer(mlir::Location loc, + const Type *ty) { + CIRGenBuilderTy &builder = cgm.getBuilder(); + const char *vTableName = vTableClassNameForType(cgm, ty); + + // Check if the alias exists. If it doesn't, then get or create the global. + if (cgm.getItaniumVTableContext().isRelativeLayout()) { + cgm.errorNYI("buildVTablePointer: isRelativeLayout"); + return; + } + + mlir::Type vtableGlobalTy = builder.getPointerTo(builder.getUInt8PtrTy()); + llvm::Align align = cgm.getDataLayout().getABITypeAlign(vtableGlobalTy); + cir::GlobalOp vTable = cgm.createOrReplaceCXXRuntimeVariable( + loc, vTableName, vtableGlobalTy, cir::GlobalLinkageKind::ExternalLinkage, + CharUnits::fromQuantity(align)); + + // The vtable address point is 2. + mlir::Attribute field{}; + if (cgm.getItaniumVTableContext().isRelativeLayout()) { + cgm.errorNYI("buildVTablePointer: isRelativeLayout"); + } else { + SmallVector offsets{ + cgm.getBuilder().getI32IntegerAttr(2)}; + auto indices = mlir::ArrayAttr::get(builder.getContext(), offsets); + field = cgm.getBuilder().getGlobalViewAttr(cgm.getBuilder().getUInt8PtrTy(), + vTable, indices); + } + + assert(field && "expected attribute"); + fields.push_back(field); +} + +/// Build an abi::__si_class_type_info, used for single inheritance, according +/// to the Itanium C++ ABI, 2.95p6b. +void CIRGenItaniumRTTIBuilder::buildSIClassTypeInfo(mlir::Location loc, + const CXXRecordDecl *rd) { + // Itanium C++ ABI 2.9.5p6b: + // It adds to abi::__class_type_info a single member pointing to the + // type_info structure for the base type, + mlir::Attribute baseTypeInfo = + CIRGenItaniumRTTIBuilder(cxxABI, cgm) + .buildTypeInfo(loc, rd->bases_begin()->getType()); + fields.push_back(baseTypeInfo); +} + +/// Build an abi::__vmi_class_type_info, used for +/// classes with bases that do not satisfy the abi::__si_class_type_info +/// constraints, according to the Itanium C++ ABI, 2.9.5p5c. +void CIRGenItaniumRTTIBuilder::buildVMIClassTypeInfo(mlir::Location loc, + const CXXRecordDecl *rd) { + mlir::Type unsignedIntLTy = + cgm.convertType(cgm.getASTContext().UnsignedIntTy); + + // Itanium C++ ABI 2.9.5p6c: + // __flags is a word with flags describing details about the class + // structure, which may be referenced by using the __flags_masks + // enumeration. These flags refer to both direct and indirect bases. + unsigned flags = computeVmiClassTypeInfoFlags(rd); + fields.push_back(cir::IntAttr::get(unsignedIntLTy, flags)); + + // Itanium C++ ABI 2.9.5p6c: + // __base_count is a word with the number of direct proper base class + // descriptions that follow. + fields.push_back(cir::IntAttr::get(unsignedIntLTy, rd->getNumBases())); + + if (!rd->getNumBases()) + return; + + // Now add the base class descriptions. + + // Itanium C++ ABI 2.9.5p6c: + // __base_info[] is an array of base class descriptions -- one for every + // direct proper base. Each description is of the type: + // + // struct abi::__base_class_type_info { + // public: + // const __class_type_info *__base_type; + // long __offset_flags; + // + // enum __offset_flags_masks { + // __virtual_mask = 0x1, + // __public_mask = 0x2, + // __offset_shift = 8 + // }; + // }; + + // If we're in mingw and 'long' isn't wide enough for a pointer, use 'long + // long' instead of 'long' for __offset_flags. 
libstdc++abi uses long long on + // LLP64 platforms. + // FIXME: Consider updating libc++abi to match, and extend this logic to all + // LLP64 platforms. + QualType offsetFlagsTy = cgm.getASTContext().LongTy; + const TargetInfo &ti = cgm.getASTContext().getTargetInfo(); + if (ti.getTriple().isOSCygMing() && + ti.getPointerWidth(LangAS::Default) > ti.getLongWidth()) + offsetFlagsTy = cgm.getASTContext().LongLongTy; + mlir::Type offsetFlagsLTy = cgm.convertType(offsetFlagsTy); + + for (const CXXBaseSpecifier &base : rd->bases()) { + // The __base_type member points to the RTTI for the base type. + fields.push_back(CIRGenItaniumRTTIBuilder(cxxABI, cgm) + .buildTypeInfo(loc, base.getType())); + + CXXRecordDecl *baseDecl = base.getType()->castAsCXXRecordDecl(); + int64_t offsetFlags = 0; + + // All but the lower 8 bits of __offset_flags are a signed offset. + // For a non-virtual base, this is the offset in the object of the base + // subobject. For a virtual base, this is the offset in the virtual table of + // the virtual base offset for the virtual base referenced (negative). + CharUnits offset; + if (base.isVirtual()) + offset = cgm.getItaniumVTableContext().getVirtualBaseOffsetOffset( + rd, baseDecl); + else { + const ASTRecordLayout &layout = + cgm.getASTContext().getASTRecordLayout(rd); + offset = layout.getBaseClassOffset(baseDecl); + } + offsetFlags = uint64_t(offset.getQuantity()) << 8; + + // The low-order byte of __offset_flags contains flags, as given by the + // masks from the enumeration __offset_flags_masks. + if (base.isVirtual()) + offsetFlags |= BCTI_Virtual; + if (base.getAccessSpecifier() == AS_public) + offsetFlags |= BCTI_Public; + + fields.push_back(cir::IntAttr::get(offsetFlagsLTy, offsetFlags)); + } +} + +mlir::Attribute CIRGenItaniumRTTIBuilder::buildTypeInfo(mlir::Location loc, + QualType ty) { + // We want to operate on the canonical type. + ty = ty.getCanonicalType(); + + // Check if we've already emitted an RTTI descriptor for this type. + SmallString<256> name; + llvm::raw_svector_ostream out(name); + cgm.getCXXABI().getMangleContext().mangleCXXRTTI(ty, out); + + auto oldGV = dyn_cast_or_null( + mlir::SymbolTable::lookupSymbolIn(cgm.getModule(), name)); + + if (oldGV && !oldGV.isDeclaration()) { + assert(!oldGV.hasAvailableExternallyLinkage() && + "available_externally typeinfos not yet implemented"); + return cgm.getBuilder().getGlobalViewAttr(cgm.getBuilder().getUInt8PtrTy(), + oldGV); + } + + // Check if there is already an external RTTI descriptor for this type. + if (isStandardLibraryRttiDescriptor(ty) || + shouldUseExternalRttiDescriptor(cgm, ty)) + return getAddrOfExternalRTTIDescriptor(loc, ty); + + // Emit the standard library with external linkage. + cir::GlobalLinkageKind linkage = getTypeInfoLinkage(cgm, ty); + + // Give the type_info object and name the formal visibility of the + // type itself. + assert(!cir::MissingFeatures::hiddenVisibility()); + assert(!cir::MissingFeatures::protectedVisibility()); + + mlir::SymbolTable::Visibility symVisibility; + if (cir::isLocalLinkage(linkage)) + // If the linkage is local, only default visibility makes sense. 
+ symVisibility = mlir::SymbolTable::Visibility::Public; + else if (cxxABI.classifyRTTIUniqueness(ty, linkage) == + CIRGenItaniumCXXABI::RUK_NonUniqueHidden) { + cgm.errorNYI( + "buildTypeInfo: classifyRTTIUniqueness == RUK_NonUniqueHidden"); + symVisibility = CIRGenModule::getMLIRVisibility(ty->getVisibility()); + } else + symVisibility = CIRGenModule::getMLIRVisibility(ty->getVisibility()); + + return buildTypeInfo(loc, ty, linkage, symVisibility); +} + +mlir::Attribute CIRGenItaniumRTTIBuilder::buildTypeInfo( + mlir::Location loc, QualType ty, cir::GlobalLinkageKind linkage, + mlir::SymbolTable::Visibility visibility) { + CIRGenBuilderTy &builder = cgm.getBuilder(); + + assert(!cir::MissingFeatures::setDLLStorageClass()); + + // Add the vtable pointer. + buildVTablePointer(loc, cast(ty)); + + // And the name. + cir::GlobalOp typeName = getAddrOfTypeName(loc, ty, linkage); + mlir::Attribute typeNameField; + + // If we're supposed to demote the visibility, be sure to set a flag + // to use a string comparison for type_info comparisons. + CIRGenItaniumCXXABI::RTTIUniquenessKind rttiUniqueness = + cxxABI.classifyRTTIUniqueness(ty, linkage); + if (rttiUniqueness != CIRGenItaniumCXXABI::RUK_Unique) { + // The flag is the sign bit, which on ARM64 is defined to be clear + // for global pointers. This is very ARM64-specific. + cgm.errorNYI( + "buildTypeInfo: rttiUniqueness != CIRGenItaniumCXXABI::RUK_Unique"); + } else { + typeNameField = + builder.getGlobalViewAttr(builder.getUInt8PtrTy(), typeName); + } + + fields.push_back(typeNameField); + + switch (ty->getTypeClass()) { +#define TYPE(Class, Base) +#define ABSTRACT_TYPE(Class, Base) +#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base) case Type::Class: +#define NON_CANONICAL_TYPE(Class, Base) case Type::Class: +#define DEPENDENT_TYPE(Class, Base) case Type::Class: +#include "clang/AST/TypeNodes.inc" + llvm_unreachable("Non-canonical and dependent types shouldn't get here"); + + // GCC treats vector types as fundamental types. + case Type::Builtin: + case Type::Vector: + case Type::ExtVector: + case Type::ConstantMatrix: + case Type::Complex: + case Type::BlockPointer: + // Itanium C++ ABI 2.9.5p4: + // abi::__fundamental_type_info adds no data members to std::type_info. + break; + + case Type::LValueReference: + case Type::RValueReference: + llvm_unreachable("References shouldn't get here"); + + case Type::Auto: + case Type::DeducedTemplateSpecialization: + llvm_unreachable("Undeduced type shouldn't get here"); + + case Type::Pipe: + break; + + case Type::BitInt: + break; + + case Type::ConstantArray: + case Type::IncompleteArray: + case Type::VariableArray: + case Type::ArrayParameter: + // Itanium C++ ABI 2.9.5p5: + // abi::__array_type_info adds no data members to std::type_info. + break; + + case Type::FunctionNoProto: + case Type::FunctionProto: + // Itanium C++ ABI 2.9.5p5: + // abi::__function_type_info adds no data members to std::type_info. + break; + + case Type::Enum: + // Itanium C++ ABI 2.9.5p5: + // abi::__enum_type_info adds no data members to std::type_info. + break; + + case Type::Record: { + const auto *rd = + cast(cast(ty)->getOriginalDecl()) + ->getDefinitionOrSelf(); + if (!rd->hasDefinition() || !rd->getNumBases()) { + // We don't need to emit any fields. 
+ break; + } + + if (canUseSingleInheritance(rd)) { + buildSIClassTypeInfo(loc, rd); + } else { + buildVMIClassTypeInfo(loc, rd); + } + + break; + } + + case Type::ObjCObject: + case Type::ObjCInterface: + cgm.errorNYI("buildTypeInfo: ObjCObject & ObjCInterface"); + break; + + case Type::ObjCObjectPointer: + cgm.errorNYI("buildTypeInfo: ObjCObjectPointer"); + break; + + case Type::Pointer: + cgm.errorNYI("buildTypeInfo: Pointer"); + break; + + case Type::MemberPointer: + cgm.errorNYI("buildTypeInfo: MemberPointer"); + break; + + case Type::Atomic: + // No fields, at least for the moment. + break; + + case Type::HLSLAttributedResource: + case Type::HLSLInlineSpirv: + llvm_unreachable("HLSL doesn't support RTTI"); + } + + assert(!cir::MissingFeatures::opGlobalDLLImportExport()); + cir::TypeInfoAttr init = builder.getTypeInfo(builder.getArrayAttr(fields)); + + SmallString<256> name; + llvm::raw_svector_ostream out(name); + cgm.getCXXABI().getMangleContext().mangleCXXRTTI(ty, out); + + // Create new global and search for an existing global. + auto oldGV = dyn_cast_or_null( + mlir::SymbolTable::lookupSymbolIn(cgm.getModule(), name)); + + cir::GlobalOp gv = + CIRGenModule::createGlobalOp(cgm, loc, name, init.getType(), + /*isConstant=*/true); + + // Export the typeinfo in the same circumstances as the vtable is + // exported. + if (cgm.getTarget().hasPS4DLLImportExport()) { + cgm.errorNYI("buildTypeInfo: target hasPS4DLLImportExport"); + return {}; + } + + // If there's already an old global variable, replace it with the new one. + if (oldGV) { + // Replace occurrences of the old variable if needed. + gv.setName(oldGV.getName()); + if (!oldGV->use_empty()) { + cgm.errorNYI("buildTypeInfo: old GV !use_empty"); + return {}; + } + oldGV->erase(); + } + + if (cgm.supportsCOMDAT() && cir::isWeakForLinker(gv.getLinkage())) { + assert(!cir::MissingFeatures::setComdat()); + cgm.errorNYI("buildTypeInfo: supportsCOMDAT & isWeakForLinker"); + return {}; + } + + CharUnits align = cgm.getASTContext().toCharUnitsFromBits( + cgm.getTarget().getPointerAlign(LangAS::Default)); + gv.setAlignmentAttr(cgm.getSize(align)); + + // The Itanium ABI specifies that type_info objects must be globally + // unique, with one exception: if the type is an incomplete class + // type or a (possibly indirect) pointer to one. That exception + // affects the general case of comparing type_info objects produced + // by the typeid operator, which is why the comparison operators on + // std::type_info generally use the type_info name pointers instead + // of the object addresses. However, the language's built-in uses + // of RTTI generally require class types to be complete, even when + // manipulating pointers to those class types. This allows the + // implementation of dynamic_cast to rely on address equality tests, + // which is much faster. + + // All of this is to say that it's important that both the type_info + // object and the type_info name be uniqued when weakly emitted. 
+
+  mlir::SymbolTable::setSymbolVisibility(typeName, visibility);
+  assert(!cir::MissingFeatures::setDLLStorageClass());
+  assert(!cir::MissingFeatures::opGlobalPartition());
+  assert(!cir::MissingFeatures::setDSOLocal());
+
+  mlir::SymbolTable::setSymbolVisibility(gv, visibility);
+  assert(!cir::MissingFeatures::setDLLStorageClass());
+  assert(!cir::MissingFeatures::opGlobalPartition());
+  assert(!cir::MissingFeatures::setDSOLocal());
+
+  CIRGenModule::setInitializer(gv, init);
+  return builder.getGlobalViewAttr(builder.getUInt8PtrTy(), gv);
+}
+
+mlir::Attribute CIRGenItaniumCXXABI::getAddrOfRTTIDescriptor(mlir::Location loc,
+                                                             QualType ty) {
+  return CIRGenItaniumRTTIBuilder(*this, cgm).buildTypeInfo(loc, ty);
+}
+
+/// What sort of uniqueness rules should we use for the RTTI for the
+/// given type?
+CIRGenItaniumCXXABI::RTTIUniquenessKind
+CIRGenItaniumCXXABI::classifyRTTIUniqueness(
+    QualType canTy, cir::GlobalLinkageKind linkage) const {
+  if (shouldRTTIBeUnique())
+    return RUK_Unique;
+
+  // It's only necessary for linkonce_odr or weak_odr linkage.
+  if (linkage != cir::GlobalLinkageKind::LinkOnceODRLinkage &&
+      linkage != cir::GlobalLinkageKind::WeakODRLinkage)
+    return RUK_Unique;
+
+  // It's only necessary with default visibility.
+  if (canTy->getVisibility() != DefaultVisibility)
+    return RUK_Unique;
+
+  // If we're not required to publish this symbol, hide it.
+  if (linkage == cir::GlobalLinkageKind::LinkOnceODRLinkage)
+    return RUK_NonUniqueHidden;
+
+  // If we're required to publish this symbol, as we might be under an
+  // explicit instantiation, leave it with default visibility but
+  // enable string-comparisons.
+  assert(linkage == cir::GlobalLinkageKind::WeakODRLinkage);
+  return RUK_NonUniqueVisible;
+}
+
 void CIRGenItaniumCXXABI::emitDestructorCall(
     CIRGenFunction &cgf, const CXXDestructorDecl *dd, CXXDtorType type,
     bool forVirtualBase, bool delegating, Address thisAddr, QualType thisTy) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index c977ff9f06de6..8eb48f6d0fb46 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -2184,8 +2184,13 @@ mlir::Attribute CIRGenModule::getAddrOfRTTIDescriptor(mlir::Location loc,
   if (!shouldEmitRTTI(forEh))
     return builder.getConstNullPtrAttr(builder.getUInt8PtrTy());

-  errorNYI(loc, "getAddrOfRTTIDescriptor");
-  return mlir::Attribute();
+  if (forEh && ty->isObjCObjectPointerType() &&
+      langOpts.ObjCRuntime.isGNUFamily()) {
+    errorNYI(loc, "getAddrOfRTTIDescriptor: Objc PtrType & Objc RT GNU");
+    return {};
+  }
+
+  return getCXXABI().getAddrOfRTTIDescriptor(loc, ty);
 }

 // TODO(cir): this can be shared with LLVM codegen.
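As a reading aid for the fields that buildVMIClassTypeInfo assembles, the
emitted descriptor follows the abi::__vmi_class_type_info layout from the
Itanium C++ ABI. A rough C++ mirror (a sketch for orientation, not a
declaration from this patch) looks like this:

    // Layout per Itanium C++ ABI 2.9.5p6c; field names follow the ABI.
    struct __vmi_class_type_info /* : public std::type_info */ {
      const void *vtable;        // address point (slot 2) into the
                                 // _ZTVN10__cxxabiv121__vmi_class_type_infoE vtable
      const char *__type_name;   // mangled name, e.g. "1D" from _ZTS1D
      unsigned int __flags;      // VMI_DiamondShaped and/or VMI_NonDiamondRepeat
      unsigned int __base_count; // number of direct proper bases
      struct __base_class_type_info {
        const void *__base_type; // _ZTI* descriptor of the base
        long __offset_flags;     // (offset << 8) | __virtual_mask | __public_mask
      } __base_info[1];          // one entry per direct base
    };

For the class D in the vtt.cpp test below, this arithmetic explains the two
checked __offset_flags constants: B sits at offset 0 and is public, giving
(0 << 8) | 2 = 2, while C sits at offset 16, giving (16 << 8) | 2 = 4098.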
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 073e8d96b773b..006111d19d65f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -256,6 +256,24 @@ class CIRGenModule : public CIRGenTypeCache {
   mlir::Attribute getAddrOfRTTIDescriptor(mlir::Location loc, QualType ty,
                                           bool forEH = false);

+  static mlir::SymbolTable::Visibility getMLIRVisibility(Visibility v) {
+    switch (v) {
+    case DefaultVisibility:
+      return mlir::SymbolTable::Visibility::Public;
+    case HiddenVisibility:
+      return mlir::SymbolTable::Visibility::Private;
+    case ProtectedVisibility:
+      // The distinction between ProtectedVisibility and DefaultVisibility is
+      // that symbols with ProtectedVisibility, while visible to the dynamic
+      // linker like DefaultVisibility, are guaranteed to always dynamically
+      // resolve to a symbol in the current shared object. There is currently
+      // no equivalent MLIR visibility, so we fall back on the fact that the
+      // symbol is visible.
+      return mlir::SymbolTable::Visibility::Public;
+    }
+    llvm_unreachable("unknown visibility!");
+  }
+
   /// Return a constant array for the given string.
   mlir::Attribute getConstantArrayFromStringLiteral(const StringLiteral *e);

diff --git a/clang/lib/CIR/CodeGen/CIRGenVTables.cpp b/clang/lib/CIR/CodeGen/CIRGenVTables.cpp
index af8f5ae2cc0a5..94d856b41b3ce 100644
--- a/clang/lib/CIR/CodeGen/CIRGenVTables.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenVTables.cpp
@@ -47,6 +47,49 @@ cir::RecordType CIRGenVTables::getVTableType(const VTableLayout &layout) {
   return cgm.getBuilder().getAnonRecordTy(tys, /*incomplete=*/false);
 }

+/// At this point in the translation unit, does it appear that we can
+/// rely on the vtable being defined elsewhere in the program?
+///
+/// The response is really only definitive when called at the end of
+/// the translation unit.
+///
+/// The only semantic restriction here is that the object file should
+/// not contain a vtable definition when that vtable is defined
+/// strongly elsewhere. Otherwise, we'd just like to avoid emitting
+/// vtables when unnecessary.
+/// TODO(cir): this should be merged into a common AST helper for codegen.
+bool CIRGenVTables::isVTableExternal(const CXXRecordDecl *rd) {
+  assert(rd->isDynamicClass() && "Non-dynamic classes have no VTable.");
+
+  // We always synthesize vtables if they are needed in the MS ABI. MSVC doesn't
+  // emit them even if there is an explicit template instantiation.
+  if (cgm.getTarget().getCXXABI().isMicrosoft())
+    return false;
+
+  // If we have an explicit instantiation declaration (and not a
+  // definition), the vtable is defined elsewhere.
+  TemplateSpecializationKind tsk = rd->getTemplateSpecializationKind();
+  if (tsk == TSK_ExplicitInstantiationDeclaration)
+    return true;
+
+  // Otherwise, if the class is an instantiated template, the
+  // vtable must be defined here.
+  if (tsk == TSK_ImplicitInstantiation ||
+      tsk == TSK_ExplicitInstantiationDefinition)
+    return false;
+
+  // Otherwise, if the class doesn't have a key function (possibly
+  // anymore), the vtable must be defined here.
+  const CXXMethodDecl *keyFunction =
+      cgm.getASTContext().getCurrentKeyFunction(rd);
+  if (!keyFunction)
+    return false;
+
+  // Otherwise, if we don't have a definition of the key function, the
+  // vtable must be defined somewhere else.
+  return !keyFunction->hasBody();
+}
+
 /// This is a callback from Sema to tell us that a particular vtable is
 /// required to be emitted in this translation unit.
/// diff --git a/clang/lib/CIR/CodeGen/CIRGenVTables.h b/clang/lib/CIR/CodeGen/CIRGenVTables.h index e19242c651034..9c425ab43b3d9 100644 --- a/clang/lib/CIR/CodeGen/CIRGenVTables.h +++ b/clang/lib/CIR/CodeGen/CIRGenVTables.h @@ -100,6 +100,8 @@ class CIRGenVTables { /// is enabled) and the VTT (if the class has virtual bases). void generateClassData(const CXXRecordDecl *rd); + bool isVTableExternal(const clang::CXXRecordDecl *rd); + /// Returns the type of a vtable with the given layout. Normally a struct of /// arrays of pointers, with one struct element for each vtable in the vtable /// group. diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 876948d53010b..bd6d6e3a6ed09 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -222,8 +222,9 @@ class CIRAttrToValue { return llvm::TypeSwitch(attr) .Case([&](auto attrT) { return visitCirAttr(attrT); }) + cir::ConstPtrAttr, cir::GlobalViewAttr, cir::TypeInfoAttr, + cir::VTableAttr, cir::ZeroAttr>( + [&](auto attrT) { return visitCirAttr(attrT); }) .Default([&](auto attrT) { return mlir::Value(); }); } @@ -1694,7 +1695,7 @@ CIRToLLVMGlobalOpLowering::matchAndRewriteRegionInitializedGlobal( // TODO: Generalize this handling when more types are needed here. assert((isa(init))); + cir::TypeInfoAttr, cir::VTableAttr, cir::ZeroAttr>(init))); // TODO(cir): once LLVM's dialect has proper equivalent attributes this // should be updated. For now, we use a custom op to initialize globals @@ -1749,7 +1750,8 @@ mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite( } else if (mlir::isa(init.value())) { + cir::TypeInfoAttr, cir::VTableAttr, cir::ZeroAttr>( + init.value())) { // TODO(cir): once LLVM's dialect has proper equivalent attributes this // should be updated. For now, we use a custom op to initialize globals // to the appropriate value. diff --git a/clang/test/CIR/CodeGen/vtt.cpp b/clang/test/CIR/CodeGen/vtt.cpp index 9d88acef91eef..baab972bce696 100644 --- a/clang/test/CIR/CodeGen/vtt.cpp +++ b/clang/test/CIR/CodeGen/vtt.cpp @@ -1,9 +1,16 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fno-rtti -fclangir -emit-cir %s -o %t.cir -// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: FileCheck --check-prefixes=CIR-NO-RTTI,CIR-COMMON --input-file=%t.cir %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fno-rtti -fclangir -emit-llvm %s -o %t-cir.ll -// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: FileCheck --check-prefixes=LLVM-NO-RTTI,LLVM-COMMON --input-file=%t-cir.ll %s // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fno-rtti -emit-llvm %s -o %t.ll -// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s +// RUN: FileCheck --check-prefixes=OGCG-NO-RTTI,OGCG-COMMON --input-file=%t.ll %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefixes=CIR-RTTI,CIR-COMMON --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefixes=LLVM-RTTI,LLVM-COMMON --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefixes=OGCG-RTTI,OGCG-COMMON --input-file=%t.ll %s // Note: This test will be expanded to verify VTT emission and VTT implicit // argument handling. For now, it's just test the record layout. 
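As context for the checks that follow, the class hierarchy can be
reconstructed from the mangled names in the test (the actual definitions are
in the part of the test not shown here, so treat this sketch as inferred;
data members are omitted):

    struct A { virtual void v(); /* ... */ };              // _ZN1A1vEv
    struct B : virtual A { virtual void w(); /* ... */ };  // _ZN1B1wEv
    struct C : virtual A { virtual void x(); /* ... */ };  // _ZN1C1xEv
    struct D : B, C { virtual void y(); };                 // _ZN1D1yEv, the key
                                                           // function, defined
                                                           // out of line above

The seven VTT entries checked below then decompose as: the address point of
D's primary vtable, two address points into the construction vtable for
B-in-D (one for the B subobject, one for its virtual A base), two for
C-in-D, and finally the address points of D's own secondary vtables for the
virtual A base and for the C subobject.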
@@ -39,449 +46,511 @@ void f(D *d) {} // Trigger vtable and VTT emission for D. void D::y() {} -// CIR: !rec_A2Ebase = !cir.record -// CIR: !rec_B2Ebase = !cir.record -// CIR: !rec_C2Ebase = !cir.record -// CIR: !rec_A = !cir.record}> -// CIR: !rec_B = !cir.record, !rec_A2Ebase, !cir.array}> -// CIR: !rec_C = !cir.record -// CIR: !rec_D = !cir.record +// CIR-COMMON: !rec_A2Ebase = !cir.record +// CIR-COMMON: !rec_B2Ebase = !cir.record +// CIR-COMMON: !rec_C2Ebase = !cir.record +// CIR-COMMON: !rec_A = !cir.record}> +// CIR-COMMON: !rec_B = !cir.record, !rec_A2Ebase, !cir.array}> +// CIR-COMMON: !rec_C = !cir.record +// CIR-COMMON: !rec_D = !cir.record -// CIR: !rec_anon_struct = !cir.record x 5>, !cir.array x 4>, !cir.array x 4>}> -// CIR: !rec_anon_struct1 = !cir.record x 4>, !cir.array x 4>}> +// CIR-RTTI: ![[REC_TYPE_INFO_VTABLE:.*]]= !cir.record, !cir.ptr, !u32i, !u32i, !cir.ptr, !s64i, !cir.ptr, !s64i}> +// CIR-COMMON: ![[REC_D_VTABLE:.*]] = !cir.record x 5>, !cir.array x 4>, !cir.array x 4>}> +// CIR-COMMON: ![[REC_B_OR_C_IN_D_VTABLE:.*]]= !cir.record x 4>, !cir.array x 4>}> // Vtable for D -// CIR: cir.global{{.*}} @_ZTV1D = #cir.vtable<{ -// CIR-SAME: #cir.const_array<[ -// CIR-SAME: #cir.ptr<40 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1B1wEv> : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1D1yEv> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 5>, -// CIR-SAME: #cir.const_array<[ -// CIR-SAME: #cir.ptr<24 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr<-16 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1C1xEv> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 4>, -// CIR-SAME: #cir.const_array<[ -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.ptr<-40 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1A1vEv> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 4> -// CIR-SAME: }> : !rec_anon_struct {alignment = 8 : i64} - -// LLVM: @_ZTV1D = global { [5 x ptr], [4 x ptr], [4 x ptr] } { -// LLVM-SAME: [5 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv, ptr @_ZN1D1yEv], -// LLVM-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN1C1xEv], -// LLVM-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] -// LLVM-SAME: }, align 8 - -// OGCG: @_ZTV1D = unnamed_addr constant { [5 x ptr], [4 x ptr], [4 x ptr] } { -// OGCG-SAME: [5 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv, ptr @_ZN1D1yEv], -// OGCG-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN1C1xEv], -// OGCG-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] -// OGCG-SAME: }, align 8 + +// CIR-COMMON: cir.global{{.*}} @_ZTV1D = #cir.vtable<{ +// CIR-COMMON-SAME: #cir.const_array<[ +// CIR-COMMON-SAME: #cir.ptr<40 : i64> : !cir.ptr, +// CIR-COMMON-SAME: #cir.ptr : !cir.ptr, +// CIR-NO-RTTI-SAME: #cir.ptr : !cir.ptr, +// CIR-RTTI-SAME: #cir.global_view<@_ZTI1D> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZN1B1wEv> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZN1D1yEv> : !cir.ptr +// CIR-COMMON-SAME: ]> : !cir.array x 5>, +// CIR-COMMON-SAME: #cir.const_array<[ +// CIR-COMMON-SAME: #cir.ptr<24 : i64> : !cir.ptr, +// CIR-COMMON-SAME: #cir.ptr<-16 : i64> : !cir.ptr, +// CIR-NO-RTTI-SAME: #cir.ptr : !cir.ptr, +// CIR-RTTI-SAME: #cir.global_view<@_ZTI1D> : !cir.ptr, +// 
CIR-COMMON-SAME: #cir.global_view<@_ZN1C1xEv> : !cir.ptr +// CIR-COMMON-SAME: ]> : !cir.array x 4>, +// CIR-COMMON-SAME: #cir.const_array<[ +// CIR-COMMON-SAME: #cir.ptr : !cir.ptr, +// CIR-COMMON-SAME: #cir.ptr<-40 : i64> : !cir.ptr, +// CIR-NO-RTTI-SAME: #cir.ptr : !cir.ptr, +// CIR-RTTI-SAME: #cir.global_view<@_ZTI1D> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZN1A1vEv> : !cir.ptr +// CIR-COMMON-SAME: ]> : !cir.array x 4> +// CIR-COMMON-SAME: }> : ![[REC_D_VTABLE]] {alignment = 8 : i64} + +// LLVM-COMMON: @_ZTV1D = global { [5 x ptr], [4 x ptr], [4 x ptr] } { +// LLVM-NO-RTTI-SAME: [5 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv, ptr @_ZN1D1yEv], +// LLVM-RTTI-SAME: [5 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr @_ZTI1D, ptr @_ZN1B1wEv, ptr @_ZN1D1yEv], +// LLVM-NO-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN1C1xEv], +// LLVM-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1D, ptr @_ZN1C1xEv], +// LLVM-NO-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] +// LLVM-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr @_ZTI1D, ptr @_ZN1A1vEv] +// LLVM-COMMON-SAME: }, align 8 + +// OGCG-COMMON: @_ZTV1D = unnamed_addr constant { [5 x ptr], [4 x ptr], [4 x ptr] } { +// OGCG-NO-RTTI-SAME: [5 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv, ptr @_ZN1D1yEv], +// OGCG-RTTI-SAME: [5 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr @_ZTI1D, ptr @_ZN1B1wEv, ptr @_ZN1D1yEv], +// OGCG-NO-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr inttoptr (i64 -16 to ptr), ptr null, ptr @_ZN1C1xEv], +// OGCG-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1D, ptr @_ZN1C1xEv], +// OGCG-NO-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] +// OGCG-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr @_ZTI1D, ptr @_ZN1A1vEv] +// OGCG-COMMON-SAME: }, align 8 // VTT for D -// CIR: cir.global{{.*}} @_ZTT1D = #cir.const_array<[ -// CIR-SAME: #cir.global_view<@_ZTV1D, [0 : i32, 3 : i32]> : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZTC1D0_1B, [0 : i32, 3 : i32]> : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZTC1D0_1B, [1 : i32, 3 : i32]> : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZTC1D16_1C, [0 : i32, 3 : i32]> : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZTC1D16_1C, [1 : i32, 3 : i32]> : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZTV1D, [2 : i32, 3 : i32]> : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZTV1D, [1 : i32, 3 : i32]> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 7> {alignment = 8 : i64} - -// LLVM: @_ZTT1D = global [7 x ptr] [ -// LLVM-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 24), -// LLVM-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D0_1B, i64 24), -// LLVM-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D0_1B, i64 56), -// LLVM-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D16_1C, i64 24), -// LLVM-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D16_1C, i64 56), -// LLVM-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 96), -// LLVM-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 64) -// LLVM-SAME: ], align 8 - -// OGCG: @_ZTT1D = unnamed_addr constant [7 x ptr] [ -// OGCG-SAME: ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 3), -// OGCG-SAME: ptr getelementptr 
inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D0_1B, i32 0, i32 0, i32 3), -// OGCG-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D0_1B, i32 0, i32 1, i32 3), -// OGCG-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D16_1C, i32 0, i32 0, i32 3), -// OGCG-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D16_1C, i32 0, i32 1, i32 3), -// OGCG-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 2, i32 3), -// OGCG-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 3) -// OGCG-SAME: ], align 8 + +// CIR-COMMON: cir.global{{.*}} @_ZTT1D = #cir.const_array<[ +// CIR-COMMON-SAME: #cir.global_view<@_ZTV1D, [0 : i32, 3 : i32]> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZTC1D0_1B, [0 : i32, 3 : i32]> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZTC1D0_1B, [1 : i32, 3 : i32]> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZTC1D16_1C, [0 : i32, 3 : i32]> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZTC1D16_1C, [1 : i32, 3 : i32]> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZTV1D, [2 : i32, 3 : i32]> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZTV1D, [1 : i32, 3 : i32]> : !cir.ptr +// CIR-COMMON-SAME: ]> : !cir.array x 7> {alignment = 8 : i64} + +// LLVM-COMMON: @_ZTT1D = global [7 x ptr] [ +// LLVM-COMMON-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 24), +// LLVM-COMMON-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D0_1B, i64 24), +// LLVM-COMMON-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D0_1B, i64 56), +// LLVM-COMMON-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D16_1C, i64 24), +// LLVM-COMMON-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTC1D16_1C, i64 56), +// LLVM-COMMON-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 96), +// LLVM-COMMON-SAME: ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 64) +// LLVM-COMMON-SAME: ], align 8 + +// OGCG-COMMON: @_ZTT1D = unnamed_addr constant [7 x ptr] [ +// OGCG-COMMON-SAME: ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 3), +// OGCG-COMMON-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D0_1B, i32 0, i32 0, i32 3), +// OGCG-COMMON-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D0_1B, i32 0, i32 1, i32 3), +// OGCG-COMMON-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D16_1C, i32 0, i32 0, i32 3), +// OGCG-COMMON-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [4 x ptr], [4 x ptr] }, ptr @_ZTC1D16_1C, i32 0, i32 1, i32 3), +// OGCG-COMMON-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 2, i32 3), +// OGCG-COMMON-SAME: ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 3) +// OGCG-COMMON-SAME: ], align 8 // Construction vtable for B-in-D -// CIR: cir.global{{.*}} @_ZTC1D0_1B = #cir.vtable<{ -// CIR-SAME: #cir.const_array<[ -// CIR-SAME: #cir.ptr<40 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1B1wEv> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 4>, -// CIR-SAME: #cir.const_array<[ -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: 
#cir.ptr<-40 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1A1vEv> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 4> -// CIR-SAME: }> : !rec_anon_struct1 {alignment = 8 : i64} - -// LLVM: @_ZTC1D0_1B = global { [4 x ptr], [4 x ptr] } { -// LLVM-SAME: [4 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv], -// LLVM-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] -// LLVM-SAME: }, align 8 - -// OGCG: @_ZTC1D0_1B = unnamed_addr constant { [4 x ptr], [4 x ptr] } { -// OGCG-SAME: [4 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv], -// OGCG-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] -// OGCG-SAME: }, align 8 + +// CIR-COMMON: cir.global{{.*}} @_ZTC1D0_1B = #cir.vtable<{ +// CIR-COMMON-SAME: #cir.const_array<[ +// CIR-COMMON-SAME: #cir.ptr<40 : i64> : !cir.ptr, +// CIR-COMMON-SAME: #cir.ptr : !cir.ptr, +// CIR-NO-RTTI-SAME: #cir.ptr : !cir.ptr, +// CIR-RTTI-SAME: #cir.global_view<@_ZTI1B> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZN1B1wEv> : !cir.ptr +// CIR-COMMON-SAME: ]> : !cir.array x 4>, +// CIR-COMMON-SAME: #cir.const_array<[ +// CIR-COMMON-SAME: #cir.ptr : !cir.ptr, +// CIR-COMMON-SAME: #cir.ptr<-40 : i64> : !cir.ptr, +// CIR-NO-RTTI-SAME: #cir.ptr : !cir.ptr, +// CIR-RTTI-SAME: #cir.global_view<@_ZTI1B> : !cir.ptr, +// CIR-COMMON-SAME: #cir.global_view<@_ZN1A1vEv> : !cir.ptr +// CIR-COMMON-SAME: ]> : !cir.array x 4> +// CIR-COMMON-SAME: }> : ![[REC_B_OR_C_IN_D_VTABLE]] + +// LLVM-COMMON: @_ZTC1D0_1B = global { [4 x ptr], [4 x ptr] } { +// LLVM-NO-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv], +// LLVM-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr @_ZTI1B, ptr @_ZN1B1wEv], +// LLVM-NO-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] +// LLVM-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr @_ZTI1B, ptr @_ZN1A1vEv] +// LLVM-COMMON-SAME: }, align 8 + +// OGCG-COMMON: @_ZTC1D0_1B = unnamed_addr constant { [4 x ptr], [4 x ptr] } { +// OGCG-NO-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr null, ptr @_ZN1B1wEv], +// OGCG-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 40 to ptr), ptr null, ptr @_ZTI1B, ptr @_ZN1B1wEv], +// OGCG-NO-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr null, ptr @_ZN1A1vEv] +// OGCG-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -40 to ptr), ptr @_ZTI1B, ptr @_ZN1A1vEv] +// OGCG-COMMON-SAME: }, align 8 + +// CIR-RTTI: cir.global{{.*}} @_ZTI1B : !cir.ptr + +// LLVM-RTTI: @_ZTI1B = external global ptr + +// OGCG-RTTI: @_ZTI1B = external constant ptr // Construction vtable for C-in-D -// CIR: cir.global{{.*}} @_ZTC1D16_1C = #cir.vtable<{ -// CIR-SAME: #cir.const_array<[ -// CIR-SAME: #cir.ptr<24 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1C1xEv> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 4>, -// CIR-SAME: #cir.const_array<[ -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.ptr<-24 : i64> : !cir.ptr, -// CIR-SAME: #cir.ptr : !cir.ptr, -// CIR-SAME: #cir.global_view<@_ZN1A1vEv> : !cir.ptr -// CIR-SAME: ]> : !cir.array x 4> -// CIR-SAME: }> : !rec_anon_struct1 {alignment = 8 : i64} - -// LLVM: @_ZTC1D16_1C = global { [4 x ptr], [4 x ptr] } { -// LLVM-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr null, ptr @_ZN1C1xEv], -// LLVM-SAME: [4 x ptr] [ptr null, 
ptr inttoptr (i64 -24 to ptr), ptr null, ptr @_ZN1A1vEv]
-// LLVM-SAME: }, align 8
-
-// OGCG: @_ZTC1D16_1C = unnamed_addr constant { [4 x ptr], [4 x ptr] } {
-// OGCG-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr null, ptr @_ZN1C1xEv],
-// OGCG-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -24 to ptr), ptr null, ptr @_ZN1A1vEv]
-// OGCG-SAME: }, align 8
+
+// CIR-COMMON: cir.global{{.*}} @_ZTC1D16_1C = #cir.vtable<{
+// CIR-COMMON-SAME: #cir.const_array<[
+// CIR-COMMON-SAME: #cir.ptr<24 : i64> : !cir.ptr,
+// CIR-NO-RTTI-SAME: #cir.ptr : !cir.ptr,
+// CIR-RTTI-SAME: #cir.global_view<@_ZTI1C> : !cir.ptr,
+// CIR-COMMON-SAME: #cir.global_view<@_ZN1C1xEv> : !cir.ptr
+// CIR-COMMON-SAME: ]> : !cir.array x 4>,
+// CIR-COMMON-SAME: #cir.const_array<[
+// CIR-COMMON-SAME: #cir.ptr : !cir.ptr,
+// CIR-COMMON-SAME: #cir.ptr<-24 : i64> : !cir.ptr,
+// CIR-NO-RTTI-SAME: #cir.ptr : !cir.ptr,
+// CIR-RTTI-SAME: #cir.global_view<@_ZTI1C> : !cir.ptr,
+// CIR-COMMON-SAME: #cir.global_view<@_ZN1A1vEv> : !cir.ptr
+// CIR-COMMON-SAME: ]> : !cir.array x 4>}>
+// CIR-COMMON-SAME: : ![[REC_B_OR_C_IN_D_VTABLE]]
+
+// LLVM-COMMON: @_ZTC1D16_1C = global { [4 x ptr], [4 x ptr] } {
+// LLVM-NO-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr null, ptr @_ZN1C1xEv],
+// LLVM-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr @_ZTI1C, ptr @_ZN1C1xEv],
+// LLVM-NO-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -24 to ptr), ptr null, ptr @_ZN1A1vEv]
+// LLVM-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -24 to ptr), ptr @_ZTI1C, ptr @_ZN1A1vEv]
+// LLVM-COMMON-SAME: }, align 8
+
+// OGCG-COMMON: @_ZTC1D16_1C = unnamed_addr constant { [4 x ptr], [4 x ptr] } {
+// OGCG-NO-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr null, ptr @_ZN1C1xEv],
+// OGCG-RTTI-SAME: [4 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr @_ZTI1C, ptr @_ZN1C1xEv],
+// OGCG-NO-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -24 to ptr), ptr null, ptr @_ZN1A1vEv]
+// OGCG-RTTI-SAME: [4 x ptr] [ptr null, ptr inttoptr (i64 -24 to ptr), ptr @_ZTI1C, ptr @_ZN1A1vEv]
+// OGCG-COMMON-SAME: }, align 8
+
+// RTTI class type info for D
+
+// CIR-RTTI: cir.global{{.*}} @_ZTVN10__cxxabiv121__vmi_class_type_infoE : !cir.ptr>
+
+// CIR-RTTI: cir.global{{.*}} @_ZTS1D = #cir.const_array<"1D" : !cir.array> : !cir.array
+
+// CIR-RTTI: cir.global{{.*}} @_ZTI1D = #cir.typeinfo<{
+// CIR-RTTI-SAME: #cir.global_view<@_ZTVN10__cxxabiv121__vmi_class_type_infoE, [2 : i32]> : !cir.ptr,
+// CIR-RTTI-SAME: #cir.global_view<@_ZTS1D> : !cir.ptr,
+// CIR-RTTI-SAME: #cir.int<2> : !u32i, #cir.int<2> : !u32i,
+// CIR-RTTI-SAME: #cir.global_view<@_ZTI1B> : !cir.ptr,
+// CIR-RTTI-SAME: #cir.int<2> : !s64i,
+// CIR-RTTI-SAME: #cir.global_view<@_ZTI1C> : !cir.ptr,
+// CIR-RTTI-SAME: #cir.int<4098> : !s64i}> : !rec_anon_struct
+
+// CIR-RTTI: cir.global{{.*}} @_ZTV1A : !rec_anon_struct3
+
+// LLVM-RTTI: @_ZTVN10__cxxabiv121__vmi_class_type_infoE = external global ptr
+// LLVM-RTTI: @_ZTS1D = global [2 x i8] c"1D", align 1
+
+// LLVM-RTTI: @_ZTI1D = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64 } {
+// LLVM-RTTI-SAME: ptr getelementptr (i8, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 16),
+// LLVM-RTTI-SAME: ptr @_ZTS1D, i32 2, i32 2, ptr @_ZTI1B, i64 2, ptr @_ZTI1C, i64 4098 }
+
+// OGCG-RTTI: @_ZTI1D = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64 } {
+// OGCG-RTTI-SAME: ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2),
+// OGCG-RTTI-SAME: ptr @_ZTS1D, i32
2, i32 2, ptr @_ZTI1B, i64 2, ptr @_ZTI1C, i64 4098 }, align 8 + +// OGCG-RTTI: @_ZTVN10__cxxabiv121__vmi_class_type_infoE = external global [0 x ptr] +// OGCG-RTTI: @_ZTS1D = constant [3 x i8] c"1D\00", align 1 +// OGCG-RTTI: @_ZTV1A = external unnamed_addr constant { [3 x ptr] }, align 8 D::D() {} // In CIR, this gets emitted after the B and C constructors. See below. // Base (C2) constructor for D -// OGCG: define {{.*}} void @_ZN1DC2Ev(ptr {{.*}} %[[THIS_ARG:.*]], ptr {{.*}} %[[VTT_ARG:.*]]) -// OGCG: %[[THIS_ADDR:.*]] = alloca ptr -// OGCG: %[[VTT_ADDR:.*]] = alloca ptr -// OGCG: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// OGCG: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] -// OGCG: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// OGCG: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] -// OGCG: %[[B_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 1 -// OGCG: call void @_ZN1BC2Ev(ptr {{.*}} %[[THIS]], ptr {{.*}} %[[B_VTT]]) -// OGCG: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16 -// OGCG: %[[C_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 3 -// OGCG: call void @_ZN1CC2Ev(ptr {{.*}} %[[C_ADDR]], ptr {{.*}} %[[C_VTT]]) -// OGCG: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] -// OGCG: store ptr %[[VPTR]], ptr %[[THIS]] -// OGCG: %[[D_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 5 -// OGCG: %[[D_VPTR:.*]] = load ptr, ptr %[[D_VPTR_ADDR]] -// OGCG: %[[D_VPTR_ADDR2:.*]] = load ptr, ptr %[[THIS]] -// OGCG: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[D_VPTR_ADDR2]], i64 -24 -// OGCG: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] -// OGCG: %[[BASE_PTR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] -// OGCG: store ptr %[[D_VPTR]], ptr %[[BASE_PTR]] -// OGCG: %[[C_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 6 -// OGCG: %[[C_VPTR:.*]] = load ptr, ptr %[[C_VPTR_ADDR]] -// OGCG: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16 -// OGCG: store ptr %[[C_VPTR]], ptr %[[C_ADDR]] - +// OGCG-COMMON: define {{.*}} void @_ZN1DC2Ev(ptr {{.*}} %[[THIS_ARG:.*]], ptr {{.*}} %[[VTT_ARG:.*]]) +// OGCG-COMMON: %[[THIS_ADDR:.*]] = alloca ptr +// OGCG-COMMON: %[[VTT_ADDR:.*]] = alloca ptr +// OGCG-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// OGCG-COMMON: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] +// OGCG-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// OGCG-COMMON: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] +// OGCG-COMMON: %[[B_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 1 +// OGCG-COMMON: call void @_ZN1BC2Ev(ptr {{.*}} %[[THIS]], ptr {{.*}} %[[B_VTT]]) +// OGCG-COMMON: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16 +// OGCG-COMMON: %[[C_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 3 +// OGCG-COMMON: call void @_ZN1CC2Ev(ptr {{.*}} %[[C_ADDR]], ptr {{.*}} %[[C_VTT]]) +// OGCG-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] +// OGCG-COMMON: store ptr %[[VPTR]], ptr %[[THIS]] +// OGCG-COMMON: %[[D_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 5 +// OGCG-COMMON: %[[D_VPTR:.*]] = load ptr, ptr %[[D_VPTR_ADDR]] +// OGCG-COMMON: %[[D_VPTR_ADDR2:.*]] = load ptr, ptr %[[THIS]] +// OGCG-COMMON: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[D_VPTR_ADDR2]], i64 -24 +// OGCG-COMMON: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] +// OGCG-COMMON: %[[BASE_PTR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] +// OGCG-COMMON: store ptr %[[D_VPTR]], ptr %[[BASE_PTR]] +// OGCG-COMMON: 
%[[C_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 6 +// OGCG-COMMON: %[[C_VPTR:.*]] = load ptr, ptr %[[C_VPTR_ADDR]] +// OGCG-COMMON: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16 +// OGCG-COMMON: store ptr %[[C_VPTR]], ptr %[[C_ADDR]] // Base (C2) constructor for B -// CIR: cir.func {{.*}} @_ZN1BC2Ev -// CIR-SAME: %[[THIS_ARG:.*]]: !cir.ptr -// CIR-SAME: %[[VTT_ARG:.*]]: !cir.ptr> -// CIR: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] -// CIR: %[[VTT_ADDR:.*]] = cir.alloca {{.*}} ["vtt", init] -// CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] -// CIR: cir.store %[[VTT_ARG]], %[[VTT_ADDR]] -// CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] -// CIR: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] -// CIR: %[[VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> -// CIR: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr -// CIR: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] -// CIR: %[[B_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] -// CIR: cir.store{{.*}} %[[VPTR]], %[[B_VPTR_ADDR]] -// CIR: %[[B_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> -// CIR: %[[B_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[B_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr -// CIR: %[[B_VPTR:.*]] = cir.load{{.*}} %[[B_VPTR_ADDR]] -// CIR: %[[B_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] -// CIR: %[[VPTR:.*]] = cir.load{{.*}} %[[B_VPTR_ADDR]] -// CIR: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr -// CIR: %[[CONST_24:.*]] = cir.const #cir.int<-24> -// CIR: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr -// CIR: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i -// CIR: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr -// CIR: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] -// CIR: cir.store{{.*}} %[[B_VPTR]], %[[BASE_VPTR_ADDR]] - -// LLVM: define {{.*}} void @_ZN1BC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]) -// LLVM: %[[THIS_ADDR:.*]] = alloca ptr -// LLVM: %[[VTT_ADDR:.*]] = alloca ptr -// LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// LLVM: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] -// LLVM: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// LLVM: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] -// LLVM: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] -// LLVM: store ptr %[[VPTR]], ptr %[[THIS]] -// LLVM: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 1 -// LLVM: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] -// LLVM: %[[VPTR:.*]] = load ptr, ptr %[[THIS]] -// LLVM: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 -// LLVM: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] -// LLVM: %[[BASE_PTR:.*]] = getelementptr i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] -// LLVM: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] - -// OGCG: define {{.*}} void @_ZN1BC2Ev(ptr {{.*}} %[[THIS_ARG:.*]], ptr {{.*}} %[[VTT_ARG:.*]]) -// OGCG: %[[THIS_ADDR:.*]] = alloca ptr -// OGCG: %[[VTT_ADDR:.*]] = alloca ptr -// OGCG: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// OGCG: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] -// OGCG: %[[THIS:.*]] = load ptr, 
ptr %[[THIS_ADDR]] -// OGCG: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] -// OGCG: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] -// OGCG: store ptr %[[VPTR]], ptr %[[THIS]] -// OGCG: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 1 -// OGCG: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] -// OGCG: %[[VPTR:.*]] = load ptr, ptr %[[THIS]] -// OGCG: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 -// OGCG: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] -// OGCG: %[[BASE_PTR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] -// OGCG: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] +// CIR-COMMON: cir.func {{.*}} @_ZN1BC2Ev +// CIR-COMMON-SAME: %[[THIS_ARG:.*]]: !cir.ptr +// CIR-COMMON-SAME: %[[VTT_ARG:.*]]: !cir.ptr> +// CIR-COMMON: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CIR-COMMON: %[[VTT_ADDR:.*]] = cir.alloca {{.*}} ["vtt", init] +// CIR-COMMON: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] +// CIR-COMMON: cir.store %[[VTT_ARG]], %[[VTT_ADDR]] +// CIR-COMMON: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] +// CIR-COMMON: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] +// CIR-COMMON: %[[VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] +// CIR-COMMON: %[[B_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] +// CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[B_VPTR_ADDR]] +// CIR-COMMON: %[[B_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> +// CIR-COMMON: %[[B_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[B_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[B_VPTR:.*]] = cir.load{{.*}} %[[B_VPTR_ADDR]] +// CIR-COMMON: %[[B_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] +// CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[B_VPTR_ADDR]] +// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr +// CIR-COMMON: %[[CONST_24:.*]] = cir.const #cir.int<-24> +// CIR-COMMON: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i +// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr +// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] +// CIR-COMMON: cir.store{{.*}} %[[B_VPTR]], %[[BASE_VPTR_ADDR]] + +// LLVM-COMMON: define {{.*}} void @_ZN1BC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]) +// LLVM-COMMON: %[[THIS_ADDR:.*]] = alloca ptr +// LLVM-COMMON: %[[VTT_ADDR:.*]] = alloca ptr +// LLVM-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// LLVM-COMMON: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] +// LLVM-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// LLVM-COMMON: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] +// LLVM-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] +// LLVM-COMMON: store ptr %[[VPTR]], ptr %[[THIS]] +// LLVM-COMMON: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 1 +// LLVM-COMMON: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] +// LLVM-COMMON: %[[VPTR:.*]] = load 
ptr, ptr %[[THIS]] +// LLVM-COMMON: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 +// LLVM-COMMON: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] +// LLVM-COMMON: %[[BASE_PTR:.*]] = getelementptr i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] +// LLVM-COMMON: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] + +// OGCG-COMMON: define {{.*}} void @_ZN1BC2Ev(ptr {{.*}} %[[THIS_ARG:.*]], ptr {{.*}} %[[VTT_ARG:.*]]) +// OGCG-COMMON: %[[THIS_ADDR:.*]] = alloca ptr +// OGCG-COMMON: %[[VTT_ADDR:.*]] = alloca ptr +// OGCG-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// OGCG-COMMON: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] +// OGCG-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// OGCG-COMMON: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] +// OGCG-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] +// OGCG-COMMON: store ptr %[[VPTR]], ptr %[[THIS]] +// OGCG-COMMON: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 1 +// OGCG-COMMON: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] +// OGCG-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[THIS]] +// OGCG-COMMON: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 +// OGCG-COMMON: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] +// OGCG-COMMON: %[[BASE_PTR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] +// OGCG-COMMON: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] // Base (C2) constructor for C -// CIR: cir.func {{.*}} @_ZN1CC2Ev -// CIR-SAME: %[[THIS_ARG:.*]]: !cir.ptr -// CIR-SAME: %[[VTT_ARG:.*]]: !cir.ptr> -// CIR: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] -// CIR: %[[VTT_ADDR:.*]] = cir.alloca {{.*}} ["vtt", init] -// CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] -// CIR: cir.store %[[VTT_ARG]], %[[VTT_ADDR]] -// CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] -// CIR: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] -// CIR: %[[VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> -// CIR: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr -// CIR: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] -// CIR: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] -// CIR: cir.store{{.*}} %[[VPTR]], %[[C_VPTR_ADDR]] -// CIR: %[[C_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> -// CIR: %[[C_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[C_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr -// CIR: %[[C_VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] -// CIR: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] -// CIR: %[[VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] -// CIR: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr -// CIR: %[[CONST_24:.*]] = cir.const #cir.int<-24> -// CIR: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr -// CIR: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i -// CIR: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr -// CIR: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] -// CIR: cir.store{{.*}} %[[C_VPTR]], %[[BASE_VPTR_ADDR]] - -// LLVM: define {{.*}} void @_ZN1CC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]) -// LLVM: %[[THIS_ADDR:.*]] = alloca ptr -// 
LLVM: %[[VTT_ADDR:.*]] = alloca ptr -// LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// LLVM: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] -// LLVM: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// LLVM: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] -// LLVM: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] -// LLVM: store ptr %[[VPTR]], ptr %[[THIS]] -// LLVM: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 1 -// LLVM: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] -// LLVM: %[[VPTR:.*]] = load ptr, ptr %[[THIS]] -// LLVM: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 -// LLVM: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] -// LLVM: %[[BASE_PTR:.*]] = getelementptr i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] -// LLVM: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] - -// OGCG: define {{.*}} void @_ZN1CC2Ev(ptr {{.*}} %[[THIS_ARG:.*]], ptr {{.*}} %[[VTT_ARG:.*]]) -// OGCG: %[[THIS_ADDR:.*]] = alloca ptr -// OGCG: %[[VTT_ADDR:.*]] = alloca ptr -// OGCG: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// OGCG: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] -// OGCG: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// OGCG: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] -// OGCG: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] -// OGCG: store ptr %[[VPTR]], ptr %[[THIS]] -// OGCG: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 1 -// OGCG: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] -// OGCG: %[[VPTR:.*]] = load ptr, ptr %[[THIS]] -// OGCG: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 -// OGCG: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] -// OGCG: %[[BASE_PTR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] -// OGCG: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] +// CIR-COMMON: cir.func {{.*}} @_ZN1CC2Ev +// CIR-COMMON-SAME: %[[THIS_ARG:.*]]: !cir.ptr +// CIR-COMMON-SAME: %[[VTT_ARG:.*]]: !cir.ptr> +// CIR-COMMON: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CIR-COMMON: %[[VTT_ADDR:.*]] = cir.alloca {{.*}} ["vtt", init] +// CIR-COMMON: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] +// CIR-COMMON: cir.store %[[VTT_ARG]], %[[VTT_ADDR]] +// CIR-COMMON: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] +// CIR-COMMON: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] +// CIR-COMMON: %[[VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] +// CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[C_VPTR_ADDR]] +// CIR-COMMON: %[[C_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[C_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[C_VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] +// CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] +// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr +// CIR-COMMON: %[[CONST_24:.*]] = cir.const #cir.int<-24> +// CIR-COMMON: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i +// CIR-COMMON: 
%[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr +// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] +// CIR-COMMON: cir.store{{.*}} %[[C_VPTR]], %[[BASE_VPTR_ADDR]] + +// LLVM-COMMON: define {{.*}} void @_ZN1CC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]) +// LLVM-COMMON: %[[THIS_ADDR:.*]] = alloca ptr +// LLVM-COMMON: %[[VTT_ADDR:.*]] = alloca ptr +// LLVM-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// LLVM-COMMON: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] +// LLVM-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// LLVM-COMMON: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] +// LLVM-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] +// LLVM-COMMON: store ptr %[[VPTR]], ptr %[[THIS]] +// LLVM-COMMON: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 1 +// LLVM-COMMON: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] +// LLVM-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[THIS]] +// LLVM-COMMON: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 +// LLVM-COMMON: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] +// LLVM-COMMON: %[[BASE_PTR:.*]] = getelementptr i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] +// LLVM-COMMON: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] + +// OGCG-COMMON: define {{.*}} void @_ZN1CC2Ev(ptr {{.*}} %[[THIS_ARG:.*]], ptr {{.*}} %[[VTT_ARG:.*]]) +// OGCG-COMMON: %[[THIS_ADDR:.*]] = alloca ptr +// OGCG-COMMON: %[[VTT_ADDR:.*]] = alloca ptr +// OGCG-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// OGCG-COMMON: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] +// OGCG-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// OGCG-COMMON: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] +// OGCG-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] +// OGCG-COMMON: store ptr %[[VPTR]], ptr %[[THIS]] +// OGCG-COMMON: %[[B_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i64 1 +// OGCG-COMMON: %[[B_VPTR:.*]] = load ptr, ptr %[[B_VPTR_ADDR]] +// OGCG-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[THIS]] +// OGCG-COMMON: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[VPTR]], i64 -24 +// OGCG-COMMON: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] +// OGCG-COMMON: %[[BASE_PTR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] +// OGCG-COMMON: store ptr %[[B_VPTR]], ptr %[[BASE_PTR]] // Base (C2) constructor for D -// CIR: cir.func {{.*}} @_ZN1DC2Ev -// CIR-SAME: %[[THIS_ARG:.*]]: !cir.ptr -// CIR-SAME: %[[VTT_ARG:.*]]: !cir.ptr> -// CIR: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] -// CIR: %[[VTT_ADDR:.*]] = cir.alloca {{.*}} ["vtt", init] -// CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] -// CIR: cir.store %[[VTT_ARG]], %[[VTT_ADDR]] -// CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] -// CIR: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] -// CIR: %[[B_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [0] -> !cir.ptr -// CIR: %[[B_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> -// CIR: cir.call @_ZN1BC2Ev(%[[B_ADDR]], %[[B_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () -// CIR: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr -// CIR: %[[C_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 3 -> !cir.ptr> -// CIR: cir.call @_ZN1CC2Ev(%[[C_ADDR]], %[[C_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () 
-// CIR: %[[D_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> -// CIR: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[D_VTT]] : !cir.ptr>), !cir.ptr -// CIR: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] : !cir.ptr, !cir.vptr -// CIR: %[[D_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] -// CIR: cir.store{{.*}} %[[VPTR]], %[[D_VPTR_ADDR]] -// CIR: %[[D_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 5 -> !cir.ptr> -// CIR: %[[D_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[D_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr -// CIR: %[[D_VPTR:.*]] = cir.load{{.*}} %[[D_VPTR_ADDR]] : !cir.ptr, !cir.vptr -// CIR: %[[D_VPTR_ADDR2:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr -// CIR: %[[VPTR2:.*]] = cir.load{{.*}} %[[D_VPTR_ADDR2]] : !cir.ptr, !cir.vptr -// CIR: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR2]] : !cir.vptr), !cir.ptr -// CIR: %[[CONST_24:.*]] = cir.const #cir.int<-24> : !s64i -// CIR: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr -// CIR: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i -// CIR: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr -// CIR: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr -// CIR: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] -// CIR: cir.store{{.*}} %[[D_VPTR]], %[[BASE_VPTR_ADDR]] -// CIR: %[[C_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 6 -> !cir.ptr> -// CIR: %[[C_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[C_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr -// CIR: %[[C_VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] : !cir.ptr, !cir.vptr -// CIR: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr -// CIR: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[C_ADDR]] : !cir.ptr -> !cir.ptr -// CIR: cir.store{{.*}} %[[C_VPTR]], %[[C_VPTR_ADDR]] : !cir.vptr, !cir.ptr - -// LLVM: define {{.*}} void @_ZN1DC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]) { -// LLVM: %[[THIS_ADDR:.*]] = alloca ptr -// LLVM: %[[VTT_ADDR:.*]] = alloca ptr -// LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// LLVM: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] -// LLVM: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// LLVM: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] -// LLVM: %[[B_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 1 -// LLVM: call void @_ZN1BC2Ev(ptr %[[THIS]], ptr %[[B_VTT]]) -// LLVM: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 16 -// LLVM: %[[C_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 3 -// LLVM: call void @_ZN1CC2Ev(ptr %[[C_ADDR]], ptr %[[C_VTT]]) -// LLVM: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] -// LLVM: store ptr %[[VPTR]], ptr %[[THIS]] -// LLVM: %[[D_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 5 -// LLVM: %[[D_VPTR:.*]] = load ptr, ptr %[[D_VPTR_ADDR]] -// LLVM: %[[D_VPTR_ADDR2:.*]] = load ptr, ptr %[[THIS]] -// LLVM: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[D_VPTR_ADDR2]], i64 -24 -// LLVM: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] -// LLVM: %[[BASE_PTR:.*]] = getelementptr i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] -// LLVM: store ptr %[[D_VPTR]], ptr %[[BASE_PTR]] -// LLVM: %[[C_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 6 -// 
LLVM: %[[C_VPTR:.*]] = load ptr, ptr %[[C_VPTR_ADDR]] -// LLVM: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 16 -// LLVM: store ptr %[[C_VPTR]], ptr %[[C_ADDR]] +// CIR-COMMON: cir.func {{.*}} @_ZN1DC2Ev +// CIR-COMMON-SAME: %[[THIS_ARG:.*]]: !cir.ptr +// CIR-COMMON-SAME: %[[VTT_ARG:.*]]: !cir.ptr> +// CIR-COMMON: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CIR-COMMON: %[[VTT_ADDR:.*]] = cir.alloca {{.*}} ["vtt", init] +// CIR-COMMON: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] +// CIR-COMMON: cir.store %[[VTT_ARG]], %[[VTT_ADDR]] +// CIR-COMMON: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] +// CIR-COMMON: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] +// CIR-COMMON: %[[B_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [0] -> !cir.ptr +// CIR-COMMON: %[[B_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> +// CIR-COMMON: cir.call @_ZN1BC2Ev(%[[B_ADDR]], %[[B_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () +// CIR-COMMON: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr +// CIR-COMMON: %[[C_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 3 -> !cir.ptr> +// CIR-COMMON: cir.call @_ZN1CC2Ev(%[[C_ADDR]], %[[C_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () +// CIR-COMMON: %[[D_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[D_VTT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] : !cir.ptr, !cir.vptr +// CIR-COMMON: %[[D_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] +// CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[D_VPTR_ADDR]] +// CIR-COMMON: %[[D_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 5 -> !cir.ptr> +// CIR-COMMON: %[[D_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[D_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[D_VPTR:.*]] = cir.load{{.*}} %[[D_VPTR_ADDR]] : !cir.ptr, !cir.vptr +// CIR-COMMON: %[[D_VPTR_ADDR2:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr +// CIR-COMMON: %[[VPTR2:.*]] = cir.load{{.*}} %[[D_VPTR_ADDR2]] : !cir.ptr, !cir.vptr +// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR2]] : !cir.vptr), !cir.ptr +// CIR-COMMON: %[[CONST_24:.*]] = cir.const #cir.int<-24> : !s64i +// CIR-COMMON: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i +// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr +// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] +// CIR-COMMON: cir.store{{.*}} %[[D_VPTR]], %[[BASE_VPTR_ADDR]] +// CIR-COMMON: %[[C_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 6 -> !cir.ptr> +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[C_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[C_VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] : !cir.ptr, !cir.vptr +// CIR-COMMON: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[C_ADDR]] : !cir.ptr -> !cir.ptr +// CIR-COMMON: 
cir.store{{.*}} %[[C_VPTR]], %[[C_VPTR_ADDR]] : !cir.vptr, !cir.ptr + +// LLVM-COMMON: define {{.*}} void @_ZN1DC2Ev(ptr %[[THIS_ARG:.*]], ptr %[[VTT_ARG:.*]]) { +// LLVM-COMMON: %[[THIS_ADDR:.*]] = alloca ptr +// LLVM-COMMON: %[[VTT_ADDR:.*]] = alloca ptr +// LLVM-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// LLVM-COMMON: store ptr %[[VTT_ARG]], ptr %[[VTT_ADDR]] +// LLVM-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// LLVM-COMMON: %[[VTT:.*]] = load ptr, ptr %[[VTT_ADDR]] +// LLVM-COMMON: %[[B_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 1 +// LLVM-COMMON: call void @_ZN1BC2Ev(ptr %[[THIS]], ptr %[[B_VTT]]) +// LLVM-COMMON: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 16 +// LLVM-COMMON: %[[C_VTT:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 3 +// LLVM-COMMON: call void @_ZN1CC2Ev(ptr %[[C_ADDR]], ptr %[[C_VTT]]) +// LLVM-COMMON: %[[VPTR:.*]] = load ptr, ptr %[[VTT]] +// LLVM-COMMON: store ptr %[[VPTR]], ptr %[[THIS]] +// LLVM-COMMON: %[[D_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 5 +// LLVM-COMMON: %[[D_VPTR:.*]] = load ptr, ptr %[[D_VPTR_ADDR]] +// LLVM-COMMON: %[[D_VPTR_ADDR2:.*]] = load ptr, ptr %[[THIS]] +// LLVM-COMMON: %[[BASE_OFFSET_ADDR:.*]] = getelementptr i8, ptr %[[D_VPTR_ADDR2]], i64 -24 +// LLVM-COMMON: %[[BASE_OFFSET:.*]] = load i64, ptr %[[BASE_OFFSET_ADDR]] +// LLVM-COMMON: %[[BASE_PTR:.*]] = getelementptr i8, ptr %[[THIS]], i64 %[[BASE_OFFSET]] +// LLVM-COMMON: store ptr %[[D_VPTR]], ptr %[[BASE_PTR]] +// LLVM-COMMON: %[[C_VPTR_ADDR:.*]] = getelementptr inbounds ptr, ptr %[[VTT]], i32 6 +// LLVM-COMMON: %[[C_VPTR:.*]] = load ptr, ptr %[[C_VPTR_ADDR]] +// LLVM-COMMON: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 16 +// LLVM-COMMON: store ptr %[[C_VPTR]], ptr %[[C_ADDR]] // The C2 constructor for D gets emitted earlier in OGCG, see above. 
// Base (C2) constructor for A -// CIR: cir.func {{.*}} @_ZN1AC2Ev -// CIR-SAME: %[[THIS_ARG:.*]]: !cir.ptr -// CIR: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] -// CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] -// CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] -// CIR: %[[VPTR:.*]] = cir.vtable.address_point(@_ZTV1A, address_point = ) : !cir.vptr -// CIR: %[[VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr -// CIR: cir.store{{.*}} %[[VPTR]], %[[VPTR_ADDR]] : !cir.vptr, !cir.ptr - -// LLVM: define {{.*}} void @_ZN1AC2Ev(ptr %[[THIS_ARG:.*]]) { -// LLVM: %[[THIS_ADDR:.*]] = alloca ptr, i64 1, align 8 -// LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]], align 8 -// LLVM: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8 -// LLVM: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1A, i64 16), ptr %[[THIS]] +// CIR-COMMON: cir.func {{.*}} @_ZN1AC2Ev +// CIR-COMMON-SAME: %[[THIS_ARG:.*]]: !cir.ptr +// CIR-COMMON: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CIR-COMMON: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] +// CIR-COMMON: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] +// CIR-COMMON: %[[VPTR:.*]] = cir.vtable.address_point(@_ZTV1A, address_point = ) : !cir.vptr +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr +// CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[VPTR_ADDR]] : !cir.vptr, !cir.ptr + +// LLVM-COMMON: define {{.*}} void @_ZN1AC2Ev(ptr %[[THIS_ARG:.*]]) { +// LLVM-COMMON: %[[THIS_ADDR:.*]] = alloca ptr, i64 1, align 8 +// LLVM-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]], align 8 +// LLVM-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]], align 8 +// LLVM-COMMON: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1A, i64 16), ptr %[[THIS]] // The C2 constructor for A gets emitted later in OGCG, see below. 
// Complete (C1) constructor for D -// CIR: cir.func {{.*}} @_ZN1DC1Ev -// CIR-SAME: %[[THIS_ARG:.*]]: !cir.ptr -// CIR: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] -// CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] -// CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] -// CIR: %[[A_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [40] -> !cir.ptr -// CIR: cir.call @_ZN1AC2Ev(%[[A_ADDR]]) nothrow : (!cir.ptr) -> () -// CIR: %[[B_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [0] -> !cir.ptr -// CIR: %[[B_VTT:.*]] = cir.vtt.address_point @_ZTT1D, offset = 1 -> !cir.ptr> -// CIR: cir.call @_ZN1BC2Ev(%[[B_ADDR]], %[[B_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () -// CIR: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr -// CIR: %[[C_VTT:.*]] = cir.vtt.address_point @_ZTT1D, offset = 3 -> !cir.ptr> -// CIR: cir.call @_ZN1CC2Ev(%[[C_ADDR]], %[[C_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () -// CIR: %[[D_VPTR:.*]] = cir.vtable.address_point(@_ZTV1D, address_point = ) : !cir.vptr -// CIR: %[[VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr -// CIR: cir.store{{.*}} %[[D_VPTR]], %[[VPTR_ADDR]] : !cir.vptr, !cir.ptr -// CIR: %[[A_VPTR:.*]] = cir.vtable.address_point(@_ZTV1D, address_point = ) : !cir.vptr -// CIR: %[[A_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [40] -> !cir.ptr -// CIR: %[[A_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[A_ADDR]] : !cir.ptr -> !cir.ptr -// CIR: cir.store{{.*}} %[[A_VPTR]], %[[A_VPTR_ADDR]] : !cir.vptr, !cir.ptr -// CIR: %[[C_VPTR:.*]] = cir.vtable.address_point(@_ZTV1D, address_point = ) : !cir.vptr -// CIR: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr -// CIR: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[C_ADDR]] : !cir.ptr -> !cir.ptr -// CIR: cir.store{{.*}} %[[C_VPTR]], %[[C_VPTR_ADDR]] : !cir.vptr, !cir.ptr - -// LLVM: define {{.*}} void @_ZN1DC1Ev(ptr %[[THIS_ARG:.*]]) -// LLVM: %[[THIS_ADDR:.*]] = alloca ptr -// LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// LLVM: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// LLVM: %[[A_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 40 -// LLVM: call void @_ZN1AC2Ev(ptr %[[A_ADDR]]) -// LLVM: call void @_ZN1BC2Ev(ptr %[[THIS]], ptr getelementptr inbounds nuw (i8, ptr @_ZTT1D, i64 8)) -// LLVM: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 16 -// LLVM: call void @_ZN1CC2Ev(ptr %[[C_ADDR]], ptr getelementptr inbounds nuw (i8, ptr @_ZTT1D, i64 24)) -// LLVM: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 24), ptr %[[THIS]] -// LLVM: %[[A_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 40 -// LLVM: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 96), ptr %[[A_ADDR]] -// LLVM: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 16 -// LLVM: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 64), ptr %[[C_ADDR]] - -// OGCG: define {{.*}} void @_ZN1DC1Ev(ptr {{.*}} %[[THIS_ARG:.*]]) -// OGCG: %[[THIS_ADDR:.*]] = alloca ptr -// OGCG: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// OGCG: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// OGCG: %[[A_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 40 -// OGCG: call void @_ZN1AC2Ev(ptr {{.*}} %[[A_ADDR]]) -// OGCG: call void @_ZN1BC2Ev(ptr {{.*}} %[[THIS]], ptr {{.*}} getelementptr inbounds ([7 x ptr], ptr @_ZTT1D, i64 0, i64 1)) -// OGCG: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16 -// OGCG: call void @_ZN1CC2Ev(ptr {{.*}} %[[C_ADDR]], ptr {{.*}} getelementptr inbounds ([7 x 
ptr], ptr @_ZTT1D, i64 0, i64 3)) -// OGCG: store ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 3), ptr %[[THIS]] -// OGCG: %[[A_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 40 -// OGCG: store ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 2, i32 3), ptr %[[A_ADDR]] -// OGCG: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16 -// OGCG: store ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 3), ptr %[[C_ADDR]] - -// OGCG: define {{.*}} void @_ZN1AC2Ev(ptr {{.*}} %[[THIS_ARG:.*]]) -// OGCG: %[[THIS_ADDR:.*]] = alloca ptr -// OGCG: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] -// OGCG: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] -// OGCG: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2), ptr %[[THIS]] +// CIR-COMMON: cir.func {{.*}} @_ZN1DC1Ev +// CIR-COMMON-SAME: %[[THIS_ARG:.*]]: !cir.ptr +// CIR-COMMON: %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init] +// CIR-COMMON: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] +// CIR-COMMON: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] +// CIR-COMMON: %[[A_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [40] -> !cir.ptr +// CIR-COMMON: cir.call @_ZN1AC2Ev(%[[A_ADDR]]) nothrow : (!cir.ptr) -> () +// CIR-COMMON: %[[B_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [0] -> !cir.ptr +// CIR-COMMON: %[[B_VTT:.*]] = cir.vtt.address_point @_ZTT1D, offset = 1 -> !cir.ptr> +// CIR-COMMON: cir.call @_ZN1BC2Ev(%[[B_ADDR]], %[[B_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () +// CIR-COMMON: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr +// CIR-COMMON: %[[C_VTT:.*]] = cir.vtt.address_point @_ZTT1D, offset = 3 -> !cir.ptr> +// CIR-COMMON: cir.call @_ZN1CC2Ev(%[[C_ADDR]], %[[C_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () +// CIR-COMMON: %[[D_VPTR:.*]] = cir.vtable.address_point(@_ZTV1D, address_point = ) : !cir.vptr +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr +// CIR-COMMON: cir.store{{.*}} %[[D_VPTR]], %[[VPTR_ADDR]] : !cir.vptr, !cir.ptr +// CIR-COMMON: %[[A_VPTR:.*]] = cir.vtable.address_point(@_ZTV1D, address_point = ) : !cir.vptr +// CIR-COMMON: %[[A_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [40] -> !cir.ptr +// CIR-COMMON: %[[A_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[A_ADDR]] : !cir.ptr -> !cir.ptr +// CIR-COMMON: cir.store{{.*}} %[[A_VPTR]], %[[A_VPTR_ADDR]] : !cir.vptr, !cir.ptr +// CIR-COMMON: %[[C_VPTR:.*]] = cir.vtable.address_point(@_ZTV1D, address_point = ) : !cir.vptr +// CIR-COMMON: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[C_ADDR]] : !cir.ptr -> !cir.ptr +// CIR-COMMON: cir.store{{.*}} %[[C_VPTR]], %[[C_VPTR_ADDR]] : !cir.vptr, !cir.ptr + +// LLVM-COMMON: define {{.*}} void @_ZN1DC1Ev(ptr %[[THIS_ARG:.*]]) +// LLVM-COMMON: %[[THIS_ADDR:.*]] = alloca ptr +// LLVM-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// LLVM-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// LLVM-COMMON: %[[A_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 40 +// LLVM-COMMON: call void @_ZN1AC2Ev(ptr %[[A_ADDR]]) +// LLVM-COMMON: call void @_ZN1BC2Ev(ptr %[[THIS]], ptr getelementptr inbounds nuw (i8, ptr @_ZTT1D, i64 8)) +// LLVM-COMMON: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 
16
+// LLVM-COMMON: call void @_ZN1CC2Ev(ptr %[[C_ADDR]], ptr getelementptr inbounds nuw (i8, ptr @_ZTT1D, i64 24))
+// LLVM-COMMON: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 24), ptr %[[THIS]]
+// LLVM-COMMON: %[[A_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 40
+// LLVM-COMMON: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 96), ptr %[[A_ADDR]]
+// LLVM-COMMON: %[[C_ADDR:.*]] = getelementptr i8, ptr %[[THIS]], i32 16
+// LLVM-COMMON: store ptr getelementptr inbounds nuw (i8, ptr @_ZTV1D, i64 64), ptr %[[C_ADDR]]
+
+// OGCG-COMMON: define {{.*}} void @_ZN1DC1Ev(ptr {{.*}} %[[THIS_ARG:.*]])
+// OGCG-COMMON: %[[THIS_ADDR:.*]] = alloca ptr
+// OGCG-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]]
+// OGCG-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]]
+// OGCG-COMMON: %[[A_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 40
+// OGCG-COMMON: call void @_ZN1AC2Ev(ptr {{.*}} %[[A_ADDR]])
+// OGCG-COMMON: call void @_ZN1BC2Ev(ptr {{.*}} %[[THIS]], ptr {{.*}} getelementptr inbounds ([7 x ptr], ptr @_ZTT1D, i64 0, i64 1))
+// OGCG-COMMON: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16
+// OGCG-COMMON: call void @_ZN1CC2Ev(ptr {{.*}} %[[C_ADDR]], ptr {{.*}} getelementptr inbounds ([7 x ptr], ptr @_ZTT1D, i64 0, i64 3))
+// OGCG-COMMON: store ptr getelementptr inbounds inrange(-24, 16) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 0, i32 3), ptr %[[THIS]]
+// OGCG-COMMON: %[[A_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 40
+// OGCG-COMMON: store ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 2, i32 3), ptr %[[A_ADDR]]
+// OGCG-COMMON: %[[C_ADDR:.*]] = getelementptr inbounds i8, ptr %[[THIS]], i64 16
+// OGCG-COMMON: store ptr getelementptr inbounds inrange(-24, 8) ({ [5 x ptr], [4 x ptr], [4 x ptr] }, ptr @_ZTV1D, i32 0, i32 1, i32 3), ptr %[[C_ADDR]]
+
+// OGCG-COMMON: define {{.*}} void @_ZN1AC2Ev(ptr {{.*}} %[[THIS_ARG:.*]])
+// OGCG-COMMON: %[[THIS_ADDR:.*]] = alloca ptr
+// OGCG-COMMON: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]]
+// OGCG-COMMON: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]]
+// OGCG-COMMON: store ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 2), ptr %[[THIS]]

From d884b55ea40e455bcef162d6c6e3176c078f41ef Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko
Date: Tue, 30 Sep 2025 21:24:44 +0300
Subject: [PATCH 298/878] [BOLT] Introduce helpers to match `MCInst`s one at a time (NFC) (#138883)

Introduce a low-level instruction matching DSL to capture and/or match
the operands of MCInst, one instruction at a time. Unlike the existing
`MCPlusBuilder::MCInstMatcher` machinery, this DSL is intended for use
cases where precise control over the instruction order is required. For
example, when validating PtrAuth hardening, all registers are usually
considered unsafe after a function call, even though callee-saved
registers should preserve their old values _under normal operation_.
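Without the DSL, the same match has to be spelled out as explicit opcode
and operand comparisons. For contrast, a hand-rolled version of the
two-instruction match shown below might look roughly like this (a sketch
only; MaybeAdd and MaybeBr are the same hypothetical instructions as in
the example, and the operand positions are assumed from the ADDXrs/BR
pattern being matched):

  // Match "ADDXrs Xm, Xn, Xm, #0" by hand, capturing Xm and Xn.
  if (MaybeAdd.getOpcode() != AArch64::ADDXrs ||
      !MaybeAdd.getOperand(0).isReg() || !MaybeAdd.getOperand(1).isReg() ||
      !MaybeAdd.getOperand(2).isReg() || !MaybeAdd.getOperand(3).isImm())
    return AArch64::NoRegister;
  MCPhysReg Xm = MaybeAdd.getOperand(0).getReg();
  MCPhysReg Xn = MaybeAdd.getOperand(1).getReg();
  if (MaybeAdd.getOperand(2).getReg() != Xm ||
      MaybeAdd.getOperand(3).getImm() != 0)
    return AArch64::NoRegister;
  // Match "BR Xm" by hand.
  if (MaybeBr.getOpcode() != AArch64::BR ||
      !MaybeBr.getOperand(0).isReg() ||
      MaybeBr.getOperand(0).getReg() != Xm)
    return AArch64::NoRegister;
  // Reject the case where both operands matched the same register.
  if (Xm == Xn)
    return AArch64::NoRegister;
  return Xm;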
Usage example:

  // Bring the short names into the local scope:
  using namespace LowLevelInstMatcherDSL;
  // Declare the registers to capture:
  Reg Xn, Xm;
  // Capture the 0th and 1st operands, match the 2nd operand against the
  // just captured Xm register, match the 3rd operand against literal 0:
  if (!matchInst(MaybeAdd, AArch64::ADDXrs, Xm, Xn, Xm, Imm(0)))
    return AArch64::NoRegister;
  // Match the 0th operand against Xm:
  if (!matchInst(MaybeBr, AArch64::BR, Xm))
    return AArch64::NoRegister;
  // Manually check that Xm and Xn did not match the same register:
  if (Xm.get() == Xn.get())
    return AArch64::NoRegister;
  // Return the matched register:
  return Xm.get();
---
 bolt/include/bolt/Core/MCInstUtils.h          | 166 ++++++++++++++++++
 .../Target/AArch64/AArch64MCPlusBuilder.cpp   |  91 ++++------
 2 files changed, 201 insertions(+), 56 deletions(-)

diff --git a/bolt/include/bolt/Core/MCInstUtils.h b/bolt/include/bolt/Core/MCInstUtils.h
index eb56629c61c7d..291e31e0e0fdf 100644
--- a/bolt/include/bolt/Core/MCInstUtils.h
+++ b/bolt/include/bolt/Core/MCInstUtils.h
@@ -10,6 +10,7 @@
 #define BOLT_CORE_MCINSTUTILS_H
 
 #include "bolt/Core/BinaryBasicBlock.h"
+#include "bolt/Core/MCPlus.h"
 #include
 #include
 
@@ -175,6 +176,171 @@ static inline raw_ostream &operator<<(raw_ostream &OS,
   return Ref.print(OS);
 }
 
+/// Instruction-matching helpers operating on a single instruction at a time.
+///
+/// The idea is to make low-level instruction matching as readable as possible.
+/// The classes contained in this namespace are intended to be used as a
+/// domain-specific language to match MCInst with the particular opcode and
+/// operands.
+///
+/// The goals of this DSL include
+/// * matching a single instruction against the template consisting of the
+///   particular target-specific opcode and a pattern of operands
+/// * matching operands against the known values (such as 42, AArch64::X1 or
+///   "the value of --brk-operand=N command line argument")
+/// * capturing operands of an instruction ("whatever is the destination
+///   register of AArch64::ADDXri instruction, store it to Xd variable to be
+///   queried later")
+/// * expressing repeated operands of a single matched instruction (such as
+///   "ADDXri Xd, Xd, 42, 0" for an arbitrary register Xd) as well as across
+///   multiple calls to matchInst(), which is naturally achieved by sequentially
+///   capturing the operands and matching operands against the known values
+/// * matching multi-instruction code patterns by sequentially calling
+///   matchInst() while passing around already matched operands
+///
+/// The non-goals (compared to MCPlusBuilder::MCInstMatcher) include
+/// * matching an arbitrary tree of instructions in a single matchInst() call
+/// * encapsulation of target-specific knowledge ("match an increment of Xm
+///   by 42")
+///
+/// Unlike MCPlusBuilder::MCInstMatcher, this DSL focuses on use cases where
+/// precise control over the instruction order is important. For example,
+/// let's consider a target-specific function that has to match two particular
+/// instructions against this pattern (for two different registers Xm and Xn)
+///
+///   ADDXrs Xm, Xn, Xm, #0
+///   BR Xm
+///
+/// and return the register holding the branch target. Assuming the instructions
+/// are available as MaybeAdd and MaybeBr, the following code can be used:
+///
+///   // Bring the short names into the local scope:
+///   using namespace LowLevelInstMatcherDSL;
+///   // Declare the registers to capture:
+///   Reg Xn, Xm;
+///   // Capture the 0th and 1st operands, match the 2nd operand against the
+///   // just captured Xm register, match the 3rd operand against literal 0:
+///   if (!matchInst(MaybeAdd, AArch64::ADDXrs, Xm, Xn, Xm, Imm(0)))
+///     return AArch64::NoRegister;
+///   // Match the 0th operand against Xm:
+///   if (!matchInst(MaybeBr, AArch64::BR, Xm))
+///     return AArch64::NoRegister;
+///   // Manually check that Xm and Xn did not match the same register:
+///   if (Xm.get() == Xn.get())
+///     return AArch64::NoRegister;
+///   // Return the matched register:
+///   return Xm.get();
+///
+namespace LowLevelInstMatcherDSL {
+
+// The base class to match an operand of type T.
+//
+// The subclasses of OpMatcher are intended to be allocated on the stack and
+// to only be used by passing them to matchInst() and by calling their get()
+// function, thus the peculiar `mutable` specifiers: to make the calling code
+// compact and readable, the templated matchInst() function has to accept both
+// long-lived Imm/Reg wrappers declared as local variables (intended to capture
+// the first operand's value and match the subsequent operands, whether inside
+// a single instruction or across multiple instructions), as well as temporary
+// wrappers around literal values to match, e.g. Imm(42) or Reg(AArch64::XZR).
+template <typename T> class OpMatcher {
+  mutable std::optional<T> Value;
+  mutable std::optional<T> SavedValue;
+
+  // Remember/restore the last Value - to be called by matchInst.
+  void remember() const { SavedValue = Value; }
+  void restore() const { Value = SavedValue; }
+
+  template <class... OpMatchers>
+  friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...);
+
+protected:
+  OpMatcher(std::optional<T> ValueToMatch) : Value(ValueToMatch) {}
+
+  bool matchValue(T OpValue) const {
+    // Check that OpValue does not contradict the existing Value.
+    bool MatchResult = !Value || *Value == OpValue;
+    // If MatchResult is false, all matchers will be reset before returning from
+    // matchInst, including this one, thus no need to assign conditionally.
+    Value = OpValue;
+
+    return MatchResult;
+  }
+
+public:
+  /// Returns the captured value.
+  T get() const {
+    assert(Value.has_value());
+    return *Value;
+  }
+};
+
+class Reg : public OpMatcher<MCPhysReg> {
+  bool matches(const MCOperand &Op) const {
+    if (!Op.isReg())
+      return false;
+
+    return matchValue(Op.getReg());
+  }
+
+  template <class... OpMatchers>
+  friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...);
+
+public:
+  Reg(std::optional<MCPhysReg> RegToMatch = std::nullopt)
+      : OpMatcher<MCPhysReg>(RegToMatch) {}
+};
+
+class Imm : public OpMatcher<int64_t> {
+  bool matches(const MCOperand &Op) const {
+    if (!Op.isImm())
+      return false;
+
+    return matchValue(Op.getImm());
+  }
+
+  template <class... OpMatchers>
+  friend bool matchInst(const MCInst &, unsigned, const OpMatchers &...);
+
+public:
+  Imm(std::optional<int64_t> ImmToMatch = std::nullopt)
+      : OpMatcher<int64_t>(ImmToMatch) {}
+};
+
+/// Tries to match Inst and updates Ops on success.
+///
+/// If Inst has the specified Opcode and its operand list prefix matches Ops,
+/// this function returns true and updates Ops, otherwise false is returned and
+/// values of Ops are kept as before matchInst was called.
+///
+/// Please note that while Ops are technically passed by a const reference to
+/// make invocations like `matchInst(MI, Opcode, Imm(42))` possible, all their
+/// fields are marked mutable.
+template <class... OpMatchers>
+bool matchInst(const MCInst &Inst, unsigned Opcode, const OpMatchers &...Ops) {
+  if (Inst.getOpcode() != Opcode)
+    return false;
+  assert(sizeof...(Ops) <= MCPlus::getNumPrimeOperands(Inst) &&
+         "Too many operands are matched for the Opcode");
+
+  // Ask each matcher to remember its current value in case of rollback.
+  (Ops.remember(), ...);
+
+  // Check if all matchers match the corresponding operands.
+  auto It = Inst.begin();
+  auto AllMatched = (Ops.matches(*(It++)) && ... && true);
+
+  // If match failed, restore the original captured values.
+  if (!AllMatched) {
+    (Ops.restore(), ...);
+    return false;
+  }
+
+  return true;
+}
+
+} // namespace LowLevelInstMatcherDSL
+
 } // namespace bolt
 } // namespace llvm
 
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index a6589f8f9ee42..f271867cb2004 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -19,6 +19,7 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/MCInstUtils.h"
 #include "bolt/Core/MCPlusBuilder.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCContext.h"
@@ -401,81 +402,59 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
 
     // Iterate over the instructions of BB in reverse order, matching opcodes
    // and operands.
-    MCPhysReg TestedReg = 0;
-    MCPhysReg ScratchReg = 0;
+
     auto It = BB.end();
-    auto StepAndGetOpcode = [&It, &BB]() -> int {
-      if (It == BB.begin())
-        return -1;
-      --It;
-      return It->getOpcode();
+    auto StepBack = [&]() {
+      while (It != BB.begin()) {
+        --It;
+        // Skip any CFI instructions, but no other pseudos are expected here.
+        if (!isCFI(*It))
+          return true;
+      }
+      return false;
     };
-
-    switch (StepAndGetOpcode()) {
-    default:
-      // Not matched the branch instruction.
+    // Step to the last non-CFI instruction.
+    if (!StepBack())
      return std::nullopt;
-    case AArch64::Bcc:
-      // Bcc EQ, .Lon_success
-      if (It->getOperand(0).getImm() != AArch64CC::EQ)
-        return std::nullopt;
-      // Not checking .Lon_success (see above).
-      // SUBSXrs XZR, TestedReg, ScratchReg, 0 (used by "CMP reg, reg" alias)
-      if (StepAndGetOpcode() != AArch64::SUBSXrs ||
-          It->getOperand(0).getReg() != AArch64::XZR ||
-          It->getOperand(3).getImm() != 0)
+    using namespace llvm::bolt::LowLevelInstMatcherDSL;
+    Reg TestedReg;
+    Reg ScratchReg;
+
+    if (matchInst(*It, AArch64::Bcc, Imm(AArch64CC::EQ) /*, .Lon_success*/)) {
+      if (!StepBack() || !matchInst(*It, AArch64::SUBSXrs, Reg(AArch64::XZR),
+                                    TestedReg, ScratchReg, Imm(0)))
         return std::nullopt;
-      TestedReg = It->getOperand(1).getReg();
-      ScratchReg = It->getOperand(2).getReg();
 
      // Either XPAC(I|D) ScratchReg, ScratchReg
      // or XPACLRI
-      switch (StepAndGetOpcode()) {
-      default:
+      if (!StepBack())
        return std::nullopt;
-      case AArch64::XPACLRI:
+      if (matchInst(*It, AArch64::XPACLRI)) {
        // No operands to check, but using XPACLRI forces TestedReg to be X30.
-        if (TestedReg != AArch64::LR)
-          return std::nullopt;
-        break;
-      case AArch64::XPACI:
-      case AArch64::XPACD:
-        if (It->getOperand(0).getReg() != ScratchReg ||
-            It->getOperand(1).getReg() != ScratchReg)
+        if (TestedReg.get() != AArch64::LR)
           return std::nullopt;
-        break;
+      } else if (!matchInst(*It, AArch64::XPACI, ScratchReg, ScratchReg) &&
+                 !matchInst(*It, AArch64::XPACD, ScratchReg, ScratchReg)) {
+        return std::nullopt;
       }
-      // ORRXrs ScratchReg, XZR, TestedReg, 0 (used by "MOV reg, reg" alias)
-      if (StepAndGetOpcode() != AArch64::ORRXrs)
+      if (!StepBack() || !matchInst(*It, AArch64::ORRXrs, ScratchReg,
+                                    Reg(AArch64::XZR), TestedReg, Imm(0)))
         return std::nullopt;
-      if (It->getOperand(0).getReg() != ScratchReg ||
-          It->getOperand(1).getReg() != AArch64::XZR ||
-          It->getOperand(2).getReg() != TestedReg ||
-          It->getOperand(3).getImm() != 0)
-        return std::nullopt;
-
-      return std::make_pair(TestedReg, &*It);
-    case AArch64::TBZX:
-      // TBZX ScratchReg, 62, .Lon_success
-      ScratchReg = It->getOperand(0).getReg();
-      if (It->getOperand(1).getImm() != 62)
-        return std::nullopt;
-      // Not checking .Lon_success (see above).
+      return std::make_pair(TestedReg.get(), &*It);
+    }
-      // EORXrs ScratchReg, TestedReg, TestedReg, 1
-      if (StepAndGetOpcode() != AArch64::EORXrs)
-        return std::nullopt;
-      TestedReg = It->getOperand(1).getReg();
-      if (It->getOperand(0).getReg() != ScratchReg ||
-          It->getOperand(2).getReg() != TestedReg ||
-          It->getOperand(3).getImm() != 1)
+    if (matchInst(*It, AArch64::TBZX, ScratchReg, Imm(62) /*, .Lon_success*/)) {
+      if (!StepBack() || !matchInst(*It, AArch64::EORXrs, ScratchReg, TestedReg,
+                                    TestedReg, Imm(1)))
         return std::nullopt;
-      return std::make_pair(TestedReg, &*It);
+      return std::make_pair(TestedReg.get(), &*It);
     }
+
+    return std::nullopt;
   }
 
   std::optional<MCPhysReg> getAuthCheckedReg(const MCInst &Inst,

From 3a8d771612c6d9d95c2e020aa37dd3674279432f Mon Sep 17 00:00:00 2001
From: Victor Chernyakin
Date: Tue, 30 Sep 2025 11:30:07 -0700
Subject: [PATCH 299/878] [llvm][NFC] Simplify implementation of `isa`
 (#161403)

Using a fold instead of template recursion.
---
 llvm/include/llvm/Support/Casting.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h
index 66fdcb44ea2c0..2a9a149327d83 100644
--- a/llvm/include/llvm/Support/Casting.h
+++ b/llvm/include/llvm/Support/Casting.h
@@ -544,14 +544,9 @@ struct CastInfo> : public OptionalValueCast {
 ///
 ///  if (isa<Type>(myVal)) { ... }
 ///  if (isa<Type0, Type1, Type2>(myVal)) { ... }
-template <typename To, typename From>
-[[nodiscard]] inline bool isa(const From &Val) {
-  return CastInfo<To, const From>::isPossible(Val);
-}
-
-template <typename First, typename Second, typename... Rest, typename From>
+template <typename... Types, typename From>
 [[nodiscard]] inline bool isa(const From &Val) {
-  return isa<First>(Val) || isa<Second, Rest...>(Val);
+  return (CastInfo<Types, const From>::isPossible(Val) || ...);
 }
 
 /// cast - Return the argument parameter cast to the specified type. This

From 5665b1bf9da5c0918f8babec13f2ff537078a233 Mon Sep 17 00:00:00 2001
From: Julian Lettner
Date: Tue, 30 Sep 2025 11:32:49 -0700
Subject: [PATCH 300/878] [lldb][NFC] Fix spelling of function in log message
 (#161261)

Fix spelling of `GetMemoryRegionInfo` function in log message and
comment, and reformat the code.
---
 lldb/tools/debugserver/source/MacOSX/MachTask.mm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lldb/tools/debugserver/source/MacOSX/MachTask.mm b/lldb/tools/debugserver/source/MacOSX/MachTask.mm
index 8ae9d4df99657..e2395cffb3565 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachTask.mm
+++ b/lldb/tools/debugserver/source/MacOSX/MachTask.mm
@@ -213,7 +213,7 @@
 }
 
 //----------------------------------------------------------------------
-// MachTask::MemoryRegionInfo
+// MachTask::GetMemoryRegionInfo
 //----------------------------------------------------------------------
 int MachTask::GetMemoryRegionInfo(nub_addr_t addr, DNBRegionInfo *region_info) {
   task_t task = TaskPort();
@@ -221,9 +221,9 @@
     return -1;
 
   int ret = m_vm_memory.GetMemoryRegionInfo(task, addr, region_info);
-  DNBLogThreadedIf(LOG_MEMORY, "MachTask::MemoryRegionInfo ( addr = 0x%8.8llx "
-                               ") => %i (start = 0x%8.8llx, size = 0x%8.8llx, "
-                               "permissions = %u)",
+  DNBLogThreadedIf(LOG_MEMORY,
+                   "MachTask::GetMemoryRegionInfo ( addr = 0x%8.8llx ) => %i "
+                   "(start = 0x%8.8llx, size = 0x%8.8llx, permissions = %u)",
                    (uint64_t)addr, ret, (uint64_t)region_info->addr,
                    (uint64_t)region_info->size, region_info->permissions);
   return ret;

From 280abaf9da0121011863ad095991c7d95fc504ae Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Tue, 30 Sep 2025 19:35:12 +0100
Subject: [PATCH 301/878] [VPlan] Handle scalar-VF in transforms (NFC)
 (#161365)

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp   | 8 +++-----
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++++
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5a08e4d25cfa5..40f0ca6b941cb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8165,14 +8165,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
     VFRange SubRange = {VF, MaxVFTimes2};
     if (auto Plan = tryToBuildVPlanWithVPRecipes(
            std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
-      bool HasScalarVF = Plan->hasScalarVFOnly();
       // Now optimize the initial VPlan.
-      if (!HasScalarVF)
-        VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
-                                 *Plan, CM.getMinimalBitwidths());
+      VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
+                               *Plan, CM.getMinimalBitwidths());
       VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
       // TODO: try to put it close to addActiveLaneMask().
-      if (CM.foldTailWithEVL() && !HasScalarVF)
+      if (CM.foldTailWithEVL())
        VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
                                 *Plan, CM.getMaxSafeElements());
       assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 969dce4bc98ae..a73b083cff7fd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2124,6 +2124,8 @@ static void licm(VPlan &Plan) {
 
 void VPlanTransforms::truncateToMinimalBitwidths(
     VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
+  if (Plan.hasScalarVFOnly())
+    return;
   // Keep track of created truncates, so they can be re-used. Note that we
  // cannot use RAUW after creating a new truncate, as this could make
   // other uses have different types for their operands, making them invalidly
@@ -2704,6 +2706,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
 ///
 void VPlanTransforms::addExplicitVectorLength(
     VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
+  if (Plan.hasScalarVFOnly())
+    return;
   VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
 
From 4064c0eab46e34fd07dbca4b2f665582f4528afb Mon Sep 17 00:00:00 2001
From: Eugene Epshteyn
Date: Tue, 30 Sep 2025 14:40:35 -0400
Subject: [PATCH 302/878] [flang] Implemented a warning about contiguity of
 compile time constant values (#161084)

Implemented `common::UsageWarning::ConstantIsContiguous` to warn about
the following case:
```
integer, parameter :: num = 3
integer, parameter :: arr(num)=[(i, i=1,num)]
logical, parameter :: result=is_contiguous(arr(num:1:-1))
end
```
Here, while the array section is discontiguous, `arr` is a compile-time
constant, so the array section created at compile time will end up being
contiguous and `result` will be "true". If `arr` weren't a constant, the
result at runtime would have been "false".
---
 flang/include/flang/Support/Fortran-features.h | 2 +-
 flang/lib/Evaluate/fold-logical.cpp            | 8 ++++++++
 flang/test/Semantics/contiguous-warn.f90       | 6 ++++++
 3 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Semantics/contiguous-warn.f90

diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
index 2bbc2385777da..51364d552be64 100644
--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -76,7 +76,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
     IndexVarRedefinition, IncompatibleImplicitInterfaces, CdefinedInit,
     VectorSubscriptFinalization, UndefinedFunctionResult, UselessIomsg,
     MismatchingDummyProcedure, SubscriptedEmptyArray, UnsignedLiteralTruncation,
-    CompatibleDeclarationsFromDistinctModules,
+    CompatibleDeclarationsFromDistinctModules, ConstantIsContiguous,
     NullActualForDefaultIntentAllocatable, UseAssociationIntoSameNameSubprogram,
     HostAssociatedIntentOutInSpecExpr, NonVolatilePointerToVolatile,
     RealConstantWidening, VolatileOrAsynchronousTemporary)
diff --git a/flang/lib/Evaluate/fold-logical.cpp b/flang/lib/Evaluate/fold-logical.cpp
index c64f79e06a8ac..449c316802d6a 100644
--- a/flang/lib/Evaluate/fold-logical.cpp
+++ b/flang/lib/Evaluate/fold-logical.cpp
@@ -799,12 +799,20 @@ Expr<Type<TypeCategory::Logical, KIND>> FoldIntrinsicFunction(
       }
   } else if (name == "is_contiguous") {
     if (args.at(0)) {
+      auto warnContiguous{[&]() {
+        if (auto source{args[0]->sourceLocation()}) {
+          context.Warn(common::UsageWarning::ConstantIsContiguous, *source,
+              "is_contiguous() is always true for named constants and subobjects of named constants"_warn_en_US);
+        }
+      }};
       if (auto *expr{args[0]->UnwrapExpr()}) {
         if (auto contiguous{IsContiguous(*expr, context)}) {
+          warnContiguous();
           return Expr<T>{*contiguous};
         }
       } else if (auto *assumedType{args[0]->GetAssumedTypeDummy()}) {
         if (auto contiguous{IsContiguous(*assumedType, context)}) {
+          warnContiguous();
           return Expr<T>{*contiguous};
         }
       }
diff --git a/flang/test/Semantics/contiguous-warn.f90 b/flang/test/Semantics/contiguous-warn.f90
new file mode 100644
index 0000000000000..2eb1f1c0857f7
--- /dev/null
+++ b/flang/test/Semantics/contiguous-warn.f90
@@ -0,0 +1,6 @@
+!
RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic -Werror +integer, parameter :: num = 3 +integer, parameter :: arr(num)=[(i, i=1,num)] +!WARNING: is_contiguous() is always true for named constants and subobjects of named constants [-Wconstant-is-contiguous] +logical, parameter :: result=is_contiguous(arr(num:1:-1)) +end From 4e5928689f2399dc6aede8dde2536a98a96a1802 Mon Sep 17 00:00:00 2001 From: Jeaye Wilkerson Date: Tue, 30 Sep 2025 11:41:00 -0700 Subject: [PATCH 303/878] Fix `run_clang_repl` output when not present (#161412) On the happy path, when `clang-repl` is present, we will invoke it in order to determine if the host supports JIT features. That will return a string containing "true". However, in cases where `clang-repl` is not present or we fail to invoke it, we previously returned `False`, which would then trigger a failure with our substring check. This PR updates the function to return `""` instead, so the substring check is still valid. This is related to https://github.com/llvm/llvm-project/pull/157359, where the original change was introduced. --- clang/test/lit.cfg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index 64e2bbad8f3b2..e6c79d7a71b51 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -145,7 +145,7 @@ def run_clang_repl(args): clang_repl_exe = lit.util.which("clang-repl", config.clang_tools_dir) if not clang_repl_exe: - return False + return "" try: clang_repl_cmd = subprocess.Popen( @@ -153,7 +153,7 @@ def run_clang_repl(args): ) except OSError: print("could not exec clang-repl") - return False + return "" clang_repl_out = clang_repl_cmd.stdout.read().decode("ascii") clang_repl_cmd.wait() From fad2a4c00db68ee10b506b05531cdc6458d9be37 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 30 Sep 2025 11:46:41 -0700 Subject: [PATCH 304/878] [flang] Add #include to fix MSVC build (#161415) flang/lib/Evaluate/constant.cpp apparently needs this #include for MSVC builds but somehow not for others. 
---
 flang/lib/Evaluate/constant.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/lib/Evaluate/constant.cpp b/flang/lib/Evaluate/constant.cpp
index 0fa397b9c356d..8923ab114c737 100644
--- a/flang/lib/Evaluate/constant.cpp
+++ b/flang/lib/Evaluate/constant.cpp
@@ -10,6 +10,7 @@
 #include "flang/Evaluate/expression.h"
 #include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/type.h"
+#include "flang/Semantics/scope.h"
 #include <string>
 
 namespace Fortran::evaluate {

From 67141c74272838919985ce1931c42365b1790c6a Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Tue, 30 Sep 2025 14:57:26 -0400
Subject: [PATCH 305/878] [NFC] Remove trailing whitespaces from
 `clang/include/clang/Basic/Attr.td`

---
 clang/include/clang/Basic/Attr.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index de56bb38fd63e..fe3ca70a98e2d 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1470,7 +1470,7 @@ def Constructor : InheritableAttr {
   let TemplateDependent = 1;
   let Documentation = [CtorDtorDocs];
   let AdditionalMembers = [{
-    static constexpr unsigned DefaultPriority = 65535; 
+    static constexpr unsigned DefaultPriority = 65535;
   }];
 }
@@ -1815,7 +1815,7 @@ def Destructor : InheritableAttr {
   let TemplateDependent = 1;
   let Documentation = [CtorDtorDocs];
   let AdditionalMembers = [{
-    static constexpr unsigned int DefaultPriority = 65535; 
+    static constexpr unsigned int DefaultPriority = 65535;
   }];
 }

From 9d42c752569f3141ca42b75dd37f45e771ffa7a0 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 30 Sep 2025 20:04:28 +0100
Subject: [PATCH 306/878] [LAA] Fix picking context instr in evaluatePtrAddRec
 for multiple preds.

A loop may have more than one predecessor outside the loop. In that
case, just pick the first non-phi instruction in the loop header.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |   9 +-
 .../early-exit-runtime-checks.ll              | 132 ++++++++++++++++++
 2 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 05f7ac694ac9b..47dccde45337b 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -234,9 +234,14 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
 
   // Check if we have a suitable dereferenceable assumption we can use.
  if (!StartPtrV->canBeFreed()) {
+    Instruction *CtxI = &*L->getHeader()->getFirstNonPHIIt();
+    if (BasicBlock *LoopPred = L->getLoopPredecessor()) {
+      if (isa<BranchInst>(LoopPred->getTerminator()))
+        CtxI = LoopPred->getTerminator();
+    }
+
     RetainedKnowledge DerefRK = getKnowledgeValidInContext(
-        StartPtrV, {Attribute::Dereferenceable}, *AC,
-        L->getLoopPredecessor()->getTerminator(), DT);
+        StartPtrV, {Attribute::Dereferenceable}, *AC, CtxI, DT);
     if (DerefRK) {
       DerefBytesSCEV =
           SE.getUMaxExpr(DerefBytesSCEV, SE.getSCEV(DerefRK.IRArgValue));
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
index 207a44d5d08d4..a08f859858d24 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
@@ -565,6 +565,138 @@ e.2:
   ret void
 }
 
+define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_multiple_predecessors(ptr %A, ptr %B, i1 %c) nosync nofree {
+; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_multiple_predecessors'
+; CHECK-NEXT:    loop.header:
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group GRP0:
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group GRP1:
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group GRP0:
+; CHECK-NEXT:          (Low: %B High: (2000 + %B))
+; CHECK-NEXT:            Member: {%B,+,4}<%loop.header>
+; CHECK-NEXT:        Group GRP1:
+; CHECK-NEXT:          (Low: %A High: (2000 + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<%loop.header>
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ] + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ] + br i1 %c, label %then, label %else + +then: + br label %loop.header + +else: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + %l = load i32, ptr %gep.A, align 4 + store i32 0, ptr %gep.B, align 4 + %cntable.c.1 = icmp ult i64 %iv, 1000 + %iv.next = add nuw nsw i64 %iv, 1 + br i1 %cntable.c.1, label %b2, label %e.1 + +b2: + %uncntable.c.0 = icmp eq i32 %l, 0 + br i1 %uncntable.c.0, label %e.2, label %b3 + +b3: + %cntable.c.2 = icmp eq i64 %iv.next, 500 + br i1 %cntable.c.2, label %cleanup4, label %latch + +latch: + br label %loop.header + +cleanup4: + ret void + +e.1: + ret void + +e.2: + ret void +} + +define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_multiple_predecessors_no_valid(ptr %A, ptr %B, i1 %c) nosync nofree { +; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_known_deref_via_assumption_multiple_predecessors_no_valid' +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group GRP0: +; CHECK-NEXT: %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv +; CHECK-NEXT: Against group GRP1: +; CHECK-NEXT: %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %B High: inttoptr (i64 -1 to ptr)) +; CHECK-NEXT: Member: {%B,+,4}<%loop.header> +; CHECK-NEXT: Group GRP1: +; CHECK-NEXT: (Low: %A High: (2000 + %A)) +; CHECK-NEXT: Member: {%A,+,4}<%loop.header> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %A, i64 2000) ] + br i1 %c, label %then, label %else + +then: + call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %B, i64 2000) ] + br label %loop.header + +else: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %then ], [ 0, %else ], [ %iv.next, %latch ] + %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv + %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv + %l = load i32, ptr %gep.A, align 4 + store i32 0, ptr %gep.B, align 4 + %cntable.c.1 = icmp ult i64 %iv, 1000 + %iv.next = add nuw nsw i64 %iv, 1 + br i1 %cntable.c.1, label %b2, label %e.1 + +b2: + %uncntable.c.0 = icmp eq i32 %l, 0 + br i1 %uncntable.c.0, label %e.2, label %b3 + +b3: + %cntable.c.2 = icmp eq i64 %iv.next, 500 + br i1 %cntable.c.2, label %cleanup4, label %latch + +latch: + br label %loop.header + +cleanup4: + ret void + +e.1: + ret void + +e.2: + ret void +} + define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small(ptr %A, ptr %B) nosync nofree { ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations_deref_via_assumption_too_small' ; CHECK-NEXT: loop.header: From 6ca835b7f4349ad55c8e8afdf0669927b6b284b4 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Wed, 1 Oct 2025 03:13:29 +0800 Subject: [PATCH 307/878] [compiler-rt][asan] Add wcscpy/wcsncpy; enable wcscat/wcsncat on Windows (#160493) Summary - Add ASan interceptors for wcscpy/wcsncpy on all platforms. - Enable wcscat/wcsncat on Windows (already enabled on POSIX via sanitizer_common). Motivation - Use of wchar string APIs is common on Windows; improve parity with char* string checks. Changes - Implement wcscpy/wcsncpy in asan_interceptors.cpp; check overlap and mark read/write ranges in bytes. - wcsncpy: compute write size in bytes (size * sizeof(wchar_t)) to avoid missed overflows when sizeof(wchar_t) != 1. - Use MaybeRealWcsnlen when available to bound reads. - Register Windows static thunk for wcscpy/wcsncpy/wcscat/wcsncat; rely on sanitizer_common interceptors for wcscat/wcsncat. - Tests: add wcscpy/wcsncpy/wcscat/wcsncat; flush stdout before crash; use resilient FileCheck patterns (reuse [[ADDR]], wildcard for function suffixes and paths, flexible line numbers). Testing - AArch64 Linux: new tests pass with check-asan locally. Follow-up to and based on prior work in PR #90909 (author: branh, Microsoft); builds on that work and addresses review feedback. Thanks! 
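
To make the byte-sizing point concrete, here is a minimal sketch (illustrative
only; it condenses the added wcsncpy test) of an overflow that is caught only
when the written range is computed as size * sizeof(wchar_t) bytes rather than
size bytes:
```cpp
#include <wchar.h>

int main() {
  const wchar_t *src = L"X means dog"; // 11 wide characters + L'\0'
  wchar_t dst[7];
  // wcsncpy writes exactly 15 wide characters (copying 12 from src and
  // padding with L'\0'), i.e. 15 * sizeof(wchar_t) bytes -- 60 bytes on a
  // typical Linux target, 30 on Windows -- overflowing the 7-element buffer.
  wcsncpy(dst, src, 15);
  return 0;
}
```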
--------- Signed-off-by: Yixuan Cao --- compiler-rt/lib/asan/asan_interceptors.cpp | 46 +++++++++++++++++-- compiler-rt/lib/asan/asan_interceptors.h | 1 + .../asan/asan_win_static_runtime_thunk.cpp | 4 ++ .../sanitizer_platform_interceptors.h | 2 +- compiler-rt/test/asan/TestCases/wcscat.cpp | 26 +++++++++++ compiler-rt/test/asan/TestCases/wcscpy.cpp | 23 ++++++++++ compiler-rt/test/asan/TestCases/wcsncat.cpp | 27 +++++++++++ compiler-rt/test/asan/TestCases/wcsncpy.cpp | 25 ++++++++++ 8 files changed, 150 insertions(+), 4 deletions(-) create mode 100644 compiler-rt/test/asan/TestCases/wcscat.cpp create mode 100644 compiler-rt/test/asan/TestCases/wcscpy.cpp create mode 100644 compiler-rt/test/asan/TestCases/wcsncat.cpp create mode 100644 compiler-rt/test/asan/TestCases/wcsncpy.cpp diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index 7c9a08b9083a2..0f613f0fdc30b 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -58,13 +58,20 @@ namespace __asan { static inline uptr MaybeRealStrnlen(const char *s, uptr maxlen) { #if SANITIZER_INTERCEPT_STRNLEN - if (REAL(strnlen)) { + if (REAL(strnlen)) return REAL(strnlen)(s, maxlen); - } -#endif +# endif return internal_strnlen(s, maxlen); } +static inline uptr MaybeRealWcsnlen(const wchar_t* s, uptr maxlen) { +# if SANITIZER_INTERCEPT_WCSNLEN + if (REAL(wcsnlen)) + return REAL(wcsnlen)(s, maxlen); +# endif + return internal_wcsnlen(s, maxlen); +} + void SetThreadName(const char *name) { AsanThread *t = GetCurrentThread(); if (t) @@ -570,6 +577,20 @@ INTERCEPTOR(char *, strcpy, char *to, const char *from) { return REAL(strcpy)(to, from); } +INTERCEPTOR(wchar_t*, wcscpy, wchar_t* to, const wchar_t* from) { + void* ctx; + ASAN_INTERCEPTOR_ENTER(ctx, wcscpy); + if (!TryAsanInitFromRtl()) + return REAL(wcscpy)(to, from); + if (flags()->replace_str) { + uptr size = (internal_wcslen(from) + 1) * sizeof(wchar_t); + CHECK_RANGES_OVERLAP("wcscpy", to, size, from, size); + ASAN_READ_RANGE(ctx, from, size); + ASAN_WRITE_RANGE(ctx, to, size); + } + return REAL(wcscpy)(to, from); +} + // Windows doesn't always define the strdup identifier, // and when it does it's a macro defined to either _strdup // or _strdup_dbg, _strdup_dbg ends up calling _strdup, so @@ -633,6 +654,20 @@ INTERCEPTOR(char*, strncpy, char *to, const char *from, usize size) { return REAL(strncpy)(to, from, size); } +INTERCEPTOR(wchar_t*, wcsncpy, wchar_t* to, const wchar_t* from, uptr size) { + void* ctx; + ASAN_INTERCEPTOR_ENTER(ctx, wcsncpy); + AsanInitFromRtl(); + if (flags()->replace_str) { + uptr from_size = + Min(size, MaybeRealWcsnlen(from, size) + 1) * sizeof(wchar_t); + CHECK_RANGES_OVERLAP("wcsncpy", to, from_size, from, from_size); + ASAN_READ_RANGE(ctx, from, from_size); + ASAN_WRITE_RANGE(ctx, to, size * sizeof(wchar_t)); + } + return REAL(wcsncpy)(to, from, size); +} + template static ALWAYS_INLINE auto StrtolImpl(void *ctx, Fn real, const char *nptr, char **endptr, int base) @@ -809,6 +844,11 @@ void InitializeAsanInterceptors() { ASAN_INTERCEPT_FUNC(strncat); ASAN_INTERCEPT_FUNC(strncpy); ASAN_INTERCEPT_FUNC(strdup); + + // Intercept wcs* functions. 
+  ASAN_INTERCEPT_FUNC(wcscpy);
+  ASAN_INTERCEPT_FUNC(wcsncpy);
+
 # if ASAN_INTERCEPT___STRDUP
   ASAN_INTERCEPT_FUNC(__strdup);
 #endif
diff --git a/compiler-rt/lib/asan/asan_interceptors.h b/compiler-rt/lib/asan/asan_interceptors.h
index 3e2386eaf8092..2d551cfafd1f5 100644
--- a/compiler-rt/lib/asan/asan_interceptors.h
+++ b/compiler-rt/lib/asan/asan_interceptors.h
@@ -129,6 +129,7 @@
 DECLARE_REAL(char*, strchr, const char *str, int c)
 DECLARE_REAL(SIZE_T, strlen, const char *s)
 DECLARE_REAL(char*, strncpy, char *to, const char *from, SIZE_T size)
 DECLARE_REAL(SIZE_T, strnlen, const char *s, SIZE_T maxlen)
+DECLARE_REAL(SIZE_T, wcsnlen, const wchar_t* s, SIZE_T maxlen)
 DECLARE_REAL(char*, strstr, const char *s1, const char *s2)
 
 # if !SANITIZER_APPLE
diff --git a/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp b/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
index 4a69b66574039..46e0e90738f24 100644
--- a/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
+++ b/compiler-rt/lib/asan/asan_win_static_runtime_thunk.cpp
@@ -63,6 +63,10 @@
 INTERCEPT_LIBRARY_FUNCTION_ASAN(strpbrk);
 INTERCEPT_LIBRARY_FUNCTION_ASAN(strspn);
 INTERCEPT_LIBRARY_FUNCTION_ASAN(strstr);
 INTERCEPT_LIBRARY_FUNCTION_ASAN(strtok);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(wcscat);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(wcscpy);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(wcsncat);
+INTERCEPT_LIBRARY_FUNCTION_ASAN(wcsncpy);
 INTERCEPT_LIBRARY_FUNCTION_ASAN(wcslen);
 INTERCEPT_LIBRARY_FUNCTION_ASAN(wcsnlen);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 29987decdff45..88ecd7e16306a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -551,7 +551,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_MALLOC_USABLE_SIZE (!SI_MAC && !SI_NETBSD)
 #define SANITIZER_INTERCEPT_MCHECK_MPROBE SI_LINUX_NOT_ANDROID
 #define SANITIZER_INTERCEPT_WCSLEN 1
-#define SANITIZER_INTERCEPT_WCSCAT SI_POSIX
+#define SANITIZER_INTERCEPT_WCSCAT (SI_POSIX || SI_WINDOWS)
 #define SANITIZER_INTERCEPT_WCSDUP SI_POSIX
 #define SANITIZER_INTERCEPT_SIGNAL_AND_SIGACTION (!SI_WINDOWS && SI_NOT_FUCHSIA)
 #define SANITIZER_INTERCEPT_BSD_SIGNAL SI_ANDROID
diff --git a/compiler-rt/test/asan/TestCases/wcscat.cpp b/compiler-rt/test/asan/TestCases/wcscat.cpp
new file mode 100644
index 0000000000000..dcdff88c18ef1
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/wcscat.cpp
@@ -0,0 +1,26 @@
+// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+
+#include <stdio.h>
+#include <wchar.h>
+
+int main() {
+  wchar_t *start = L"X means ";
+  wchar_t *append = L"dog";
+  wchar_t goodDst[12];
+  wcscpy(goodDst, start);
+  wcscat(goodDst, append);
+
+  wchar_t badDst[9];
+  wcscpy(badDst, start);
+  printf("Good so far.\n");
+  // CHECK: Good so far.
+  fflush(stdout);
+  wcscat(badDst, append); // Boom!
+  // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+  // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0
+  // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcscat{{.*}}sanitizer_common_interceptors.inc:{{[0-9]+}}
+  printf("Should have failed with ASAN error.\n");
+}
\ No newline at end of file
diff --git a/compiler-rt/test/asan/TestCases/wcscpy.cpp b/compiler-rt/test/asan/TestCases/wcscpy.cpp
new file mode 100644
index 0000000000000..414d83303a960
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/wcscpy.cpp
@@ -0,0 +1,23 @@
+// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+
+#include <stdio.h>
+#include <wchar.h>
+
+int main() {
+  wchar_t *src = L"X means dog";
+  wchar_t goodDst[12];
+  wcscpy(goodDst, src);
+
+  wchar_t badDst[7];
+  printf("Good so far.\n");
+  // CHECK: Good so far.
+  fflush(stdout);
+  wcscpy(badDst, src); // Boom!
+  // CHECK:ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+  // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0
+  // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcscpy{{.*}}asan_interceptors.cpp:{{[0-9]+}}
+  printf("Should have failed with ASAN error.\n");
+}
\ No newline at end of file
diff --git a/compiler-rt/test/asan/TestCases/wcsncat.cpp b/compiler-rt/test/asan/TestCases/wcsncat.cpp
new file mode 100644
index 0000000000000..3ab7fc8f55d63
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/wcsncat.cpp
@@ -0,0 +1,27 @@
+// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+
+#include <stdio.h>
+#include <wchar.h>
+
+int main() {
+  wchar_t *start = L"X means ";
+  wchar_t *append = L"dog";
+  wchar_t goodDst[15];
+  wcscpy(goodDst, start);
+  wcsncat(goodDst, append, 5);
+
+  wchar_t badDst[11];
+  wcscpy(badDst, start);
+  wcsncat(badDst, append, 1);
+  printf("Good so far.\n");
+  // CHECK: Good so far.
+  fflush(stdout);
+  wcsncat(badDst, append, 3); // Boom!
+  // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+  // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0
+  // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcsncat{{.*}}sanitizer_common_interceptors.inc:{{[0-9]+}}
+  printf("Should have failed with ASAN error.\n");
+}
\ No newline at end of file
diff --git a/compiler-rt/test/asan/TestCases/wcsncpy.cpp b/compiler-rt/test/asan/TestCases/wcsncpy.cpp
new file mode 100644
index 0000000000000..6177b72990a0a
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/wcsncpy.cpp
@@ -0,0 +1,25 @@
+// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK
+
+#include <stdio.h>
+#include <wchar.h>
+
+int main() {
+  wchar_t *src = L"X means dog";
+  wchar_t goodDst[12];
+  wcsncpy(goodDst, src, 12);
+
+  wchar_t badDst[7];
+  wcsncpy(badDst, src, 7); // This should still work.
+  printf("Good so far.\n");
+  // CHECK: Good so far.
+  fflush(stdout);
+
+  wcsncpy(badDst, src, 15); // Boom!
+  // CHECK:ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}}
+  // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0
+  // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcsncpy{{.*}}asan_interceptors.cpp:{{[0-9]+}}
+  printf("Should have failed with ASAN error.\n");
+}
\ No newline at end of file

From 8425004ce65abc34c6f55fbf751f101536e9c07d Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 30 Sep 2025 12:42:47 -0700
Subject: [PATCH 308/878] [clang-doc] Suppress long-name test on windows
 (#161424)

This seems to have broken some buildbots for a long time, so just
suppress it for now until we determine how/why.
---
 clang-tools-extra/test/clang-doc/long-name.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang-tools-extra/test/clang-doc/long-name.cpp b/clang-tools-extra/test/clang-doc/long-name.cpp
index db96fc4aebe5a..e29c468ecc4da 100644
--- a/clang-tools-extra/test/clang-doc/long-name.cpp
+++ b/clang-tools-extra/test/clang-doc/long-name.cpp
@@ -1,3 +1,5 @@
+// FIXME: This test seems to break on windows, so disable it for now.
+// UNSUPPORTED: system-windows
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --output=%t --format=mustache --executor=standalone %s
 // RUN: ls %t/json | FileCheck %s -check-prefix=CHECK-JSON

From e27e9ca5d8cdf85f8c8466a792730ff1df313072 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 30 Sep 2025 13:04:37 -0700
Subject: [PATCH 309/878] [flang] Attempt to work around MSVC build problem
 (#161426)

Move a function that seems to be running into an MSVC problem from the
source file where I created it to another one (tools.cpp) that is
already known to be able to access the semantics::Scope type.
---
 flang/include/flang/Evaluate/tools.h |  3 +++
 flang/lib/Evaluate/constant.cpp      | 28 ----------------------------
 flang/lib/Evaluate/tools.cpp         | 27 +++++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 5f2f199e778c7..f9d74db1df03b 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1521,6 +1521,9 @@ bool IsVarSubexpressionOf(
 // it returns std::nullopt.
 std::optional<Expr<SomeType>> GetConvertInput(const Expr<SomeType> &x);
 
+// How many ancestors does a derived type have?
+std::optional<int> DerivedTypeDepth(const semantics::Scope &);
+
 } // namespace Fortran::evaluate
 
 namespace Fortran::semantics {
diff --git a/flang/lib/Evaluate/constant.cpp b/flang/lib/Evaluate/constant.cpp
index 8923ab114c737..f57dd825a7a7c 100644
--- a/flang/lib/Evaluate/constant.cpp
+++ b/flang/lib/Evaluate/constant.cpp
@@ -10,7 +10,6 @@
 #include "flang/Evaluate/expression.h"
 #include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/type.h"
-#include "flang/Semantics/scope.h"
 #include <string>
 
 namespace Fortran::evaluate {
@@ -390,33 +389,6 @@ std::size_t Constant<T>::CopyFrom(const Constant<T> &source,
   return Base::CopyFrom(source, count, resultSubscripts, dimOrder);
 }
 
-static std::optional<int> DerivedTypeDepth(const semantics::Scope &scope) {
-  if (scope.IsDerivedType()) {
-    for (auto iter{scope.cbegin()}; iter != scope.cend(); ++iter) {
-      const Symbol &symbol{*iter->second};
-      if (symbol.test(Symbol::Flag::ParentComp)) {
-        if (const semantics::DeclTypeSpec *type{symbol.GetType()}) {
-          if (const semantics::DerivedTypeSpec *derived{type->AsDerived()}) {
-            const semantics::Scope *parent{derived->scope()};
-            if (!parent) {
-              parent = derived->typeSymbol().scope();
-            }
-            if (parent) {
-              if (auto parentDepth{DerivedTypeDepth(*parent)}) {
-                return 1 + *parentDepth;
-              }
-            }
-          }
-        }
-        return std::nullopt; // error recovery
-      }
-    }
-    return 0;
-  } else {
-    return std::nullopt; // error recovery
-  }
-}
-
 bool ComponentCompare::operator()(SymbolRef x, SymbolRef y) const {
   if (&x->owner() != &y->owner()) {
     // Not components of the same derived type; put ancestors' components first.
diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp
index 1f3cbbf6a0c36..6d0da63ead07a 100644
--- a/flang/lib/Evaluate/tools.cpp
+++ b/flang/lib/Evaluate/tools.cpp
@@ -1950,6 +1950,33 @@ bool IsVarSubexpressionOf(
   return VariableFinder{sub}(super);
 }
 
+std::optional<int> DerivedTypeDepth(const semantics::Scope &scope) {
+  if (scope.IsDerivedType()) {
+    for (auto iter{scope.cbegin()}; iter != scope.cend(); ++iter) {
+      const Symbol &symbol{*iter->second};
+      if (symbol.test(Symbol::Flag::ParentComp)) {
+        if (const semantics::DeclTypeSpec *type{symbol.GetType()}) {
+          if (const semantics::DerivedTypeSpec *derived{type->AsDerived()}) {
+            const semantics::Scope *parent{derived->scope()};
+            if (!parent) {
+              parent = derived->typeSymbol().scope();
+            }
+            if (parent) {
+              if (auto parentDepth{DerivedTypeDepth(*parent)}) {
+                return 1 + *parentDepth;
+              }
+            }
+          }
+        }
+        return std::nullopt; // error recovery
+      }
+    }
+    return 0;
+  } else {
+    return std::nullopt; // error recovery
+  }
+}
+
 } // namespace Fortran::evaluate
 
 namespace Fortran::semantics {

From 1ff3e2e2805fe4a8284a688fbd7b3863826d9629 Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Tue, 30 Sep 2025 16:06:36 -0400
Subject: [PATCH 310/878] [MLIR][Standalone] gate wheel build behind
 MLIR_ENABLE_BINDINGS_PYTHON=ON (#161427)

Unless MLIR_ENABLE_BINDINGS_PYTHON=ON,
[StandalonePythonModules](https://github.com/llvm/llvm-project/blob/main/mlir/examples/standalone/pyproject.toml#L38)
isn't a valid target.
---
 mlir/test/Examples/standalone/lit.local.cfg  | 2 ++
 mlir/test/Examples/standalone/test.wheel.toy | 1 +
 2 files changed, 3 insertions(+)

diff --git a/mlir/test/Examples/standalone/lit.local.cfg b/mlir/test/Examples/standalone/lit.local.cfg
index ac03503e46ea3..6cf89358f8992 100644
--- a/mlir/test/Examples/standalone/lit.local.cfg
+++ b/mlir/test/Examples/standalone/lit.local.cfg
@@ -17,3 +17,5 @@ config.substitutions.append(("%cmake_build_type", config.cmake_build_type))
 
 if not config.llvm_shared_libs_build:
     config.available_features.add("non-shared-libs-build")
+if config.enable_bindings_python:
+    config.available_features.add("bindings-python")
diff --git a/mlir/test/Examples/standalone/test.wheel.toy b/mlir/test/Examples/standalone/test.wheel.toy
index 17d8cb5b246c9..5ff927129793b 100644
--- a/mlir/test/Examples/standalone/test.wheel.toy
+++ b/mlir/test/Examples/standalone/test.wheel.toy
@@ -3,6 +3,7 @@
 # C/Users/ContainerAdministrator/AppData/Local/Temp.
 # UNSUPPORTED: target={{.*(windows).*}}
 # REQUIRES: non-shared-libs-build
+# REQUIRES: bindings-python
 
 # RUN: export CMAKE_BUILD_TYPE=%cmake_build_type
 # RUN: export CMAKE_CXX_COMPILER=%host_cxx

From e83a3b8614afbb707a4f3492e0fccd7e4c1d99b7 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Tue, 30 Sep 2025 21:09:27 +0100
Subject: [PATCH 311/878] [AMDGPU] Introduce and use NotUseRealTrue16Insts.
 NFC. (#161373)

This removes ~2000 lines from both AMDGPUGenDAGISel.inc and
AMDGPUGenGlobalISel.inc.
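
The single negated predicate covers exactly the two cases the removed
`foreach` enumerated, because useRealTrue16Insts() holds only when
hasTrue16BitInsts() does. A standalone sketch of that equivalence (plain C++
with boolean stand-ins for the subtarget queries; not the actual TableGen
output):
```cpp
#include <cassert>

// has  ~ Subtarget->hasTrue16BitInsts()
// real ~ FeatureRealTrue16Insts; useReal == has && real.
static bool oldGuard(bool has, bool real) {
  bool useReal = has && real;
  // NotHasTrue16BitInsts OR UseFakeTrue16Insts -- the two foreach cases.
  return !has || (has && !useReal);
}

static bool newGuard(bool has, bool real) {
  // NotUseRealTrue16Insts: !Subtarget->useRealTrue16Insts().
  return !(has && real);
}

int main() {
  for (bool has : {false, true})
    for (bool real : {false, true})
      assert(oldGuard(has, real) == newGuard(has, real));
  return 0;
}
```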
--- llvm/lib/Target/AMDGPU/AMDGPU.td | 2 + llvm/lib/Target/AMDGPU/DSInstructions.td | 14 ++--- llvm/lib/Target/AMDGPU/FLATInstructions.td | 12 ++-- llvm/lib/Target/AMDGPU/SIInstructions.td | 64 ++++++++------------- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 3 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 3 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 6 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 6 +- 8 files changed, 43 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index eaa1870f4be28..7003a40a940aa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -2589,6 +2589,8 @@ def NotHasTrue16BitInsts : True16PredicateClass<"!Subtarget->hasTrue16BitInsts() // only allow 32-bit registers in operands and use low halves thereof. def UseRealTrue16Insts : True16PredicateClass<"Subtarget->useRealTrue16Insts()">, AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts)>; +def NotUseRealTrue16Insts : True16PredicateClass<"!Subtarget->useRealTrue16Insts()">, + AssemblerPredicate<(not (all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts))>; def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && " "!Subtarget->useRealTrue16Insts()">, AssemblerPredicate<(all_of FeatureTrue16BitInsts)>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index f2e432fa8d7f5..b2ff5a11aec6e 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -969,10 +969,9 @@ multiclass DSReadPat_t16 { } let OtherPredicates = [NotLDSRequiresM0Init] in { - foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in - let True16Predicate = p in { - def : DSReadPat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; - } + let True16Predicate = NotUseRealTrue16Insts in { + def : DSReadPat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; + } let True16Predicate = UseRealTrue16Insts in { def : DSReadPat(!cast(inst)#"_t16"), vt, !cast(frag)>; } @@ -1050,10 +1049,9 @@ multiclass DSWritePat_t16 { } let OtherPredicates = [NotLDSRequiresM0Init] in { - foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in - let True16Predicate = p in { - def : DSWritePat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; - } + let True16Predicate = NotUseRealTrue16Insts in { + def : DSWritePat(!cast(inst)#"_gfx9"), vt, !cast(frag)>; + } let True16Predicate = UseRealTrue16Insts in { def : DSWritePat(!cast(inst)#"_t16"), vt, !cast(frag)>; } diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 9f33bac4c56ea..5a22b23cecf86 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1982,8 +1982,7 @@ defm : FlatLoadPats ; defm : FlatLoadPats ; defm : FlatLoadPats ; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { defm : FlatLoadPats ; defm : FlatLoadPats ; defm : FlatLoadPats ; @@ -2127,8 +2126,7 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -2187,8 +2185,7 @@ defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in 
-let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in { +let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = NotUseRealTrue16Insts in { defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; @@ -2356,8 +2353,7 @@ defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; defm : ScratchFLATLoadPats ; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 59fd2f10ccacd..be084a952bc41 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1466,8 +1466,7 @@ class VOPSelectPat_t16 : GCNPat < def : VOPSelectModsPat ; def : VOPSelectModsPat ; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : VOPSelectPat ; def : VOPSelectPat ; } // End True16Predicate = p @@ -2137,8 +2136,7 @@ def : GCNPat < >; foreach fp16vt = [f16, bf16] in { -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let SubtargetPredicate = p in { +let SubtargetPredicate = NotUseRealTrue16Insts in { def : GCNPat < (fabs (fp16vt VGPR_32:$src)), (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) @@ -2230,8 +2228,7 @@ def : GCNPat < } foreach fp16vt = [f16, bf16] in { -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat < (fcopysign fp16vt:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) @@ -2354,23 +2351,21 @@ def : GCNPat < (S_MOV_B32 $ga) >; -foreach pred = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in { - let True16Predicate = pred in { - def : GCNPat < - (VGPRImm<(i16 imm)>:$imm), - (V_MOV_B32_e32 imm:$imm) - >; +let True16Predicate = NotUseRealTrue16Insts in { + def : GCNPat < + (VGPRImm<(i16 imm)>:$imm), + (V_MOV_B32_e32 imm:$imm) + >; - // FIXME: Workaround for ordering issue with peephole optimizer where - // a register class copy interferes with immediate folding. Should - // use s_mov_b32, which can be shrunk to s_movk_i32 + // FIXME: Workaround for ordering issue with peephole optimizer where + // a register class copy interferes with immediate folding. 
Should + // use s_mov_b32, which can be shrunk to s_movk_i32 - foreach vt = [f16, bf16] in { - def : GCNPat < - (VGPRImm<(vt fpimm)>:$imm), - (V_MOV_B32_e32 (vt (bitcast_fpimm_to_i32 $imm))) - >; - } + foreach vt = [f16, bf16] in { + def : GCNPat < + (VGPRImm<(vt fpimm)>:$imm), + (V_MOV_B32_e32 (vt (bitcast_fpimm_to_i32 $imm))) + >; } } @@ -2859,8 +2854,7 @@ def : GCNPat< (i32 (DivergentSextInreg i32:$src)), (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat < (i16 (DivergentSextInreg i16:$src)), (V_BFE_I32_e64 $src, (i32 0), (i32 1)) @@ -3205,8 +3199,7 @@ def : GCNPat< } } // AddedComplexity = 1 -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat< (i32 (DivergentUnaryFrag i16:$src)), (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) @@ -3416,8 +3409,7 @@ def : GCNPat < // Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) // The 12s emit 0s. -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat < (i16 (bswap i16:$a)), (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) @@ -3670,8 +3662,7 @@ def : GCNPat < (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat < (v2i16 (DivergentBinFrag (i16 0), (i16 VGPR_32:$src1))), (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) @@ -3707,8 +3698,7 @@ def : GCNPat < (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) >; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat < (vecTy (DivergentBinFrag (Ty VGPR_32:$src0), (Ty undef))), (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) @@ -3735,8 +3725,7 @@ def : GCNPat < >; let SubtargetPredicate = HasVOP3PInsts in { -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = NotUseRealTrue16Insts in def : GCNPat < (v2i16 (DivergentBinFrag (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))), (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) @@ -3766,8 +3755,7 @@ def : GCNPat < (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { // Take the lower 16 bits from each VGPR_32 and concat them def : GCNPat < (vecTy (DivergentBinFrag (Ty VGPR_32:$a), (Ty VGPR_32:$b))), @@ -3838,8 +3826,7 @@ def : GCNPat < >; // Take the upper 16 bits from each VGPR_32 and concat them -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = NotUseRealTrue16Insts in def : GCNPat < (vecTy (DivergentBinFrag (Ty !if(!eq(Ty, i16), @@ -3881,8 +3868,7 @@ def : GCNPat < (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat < (v2f16 (scalar_to_vector f16:$src0)), (COPY $src0) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 6230c17e20804..77df72111605e 100644 --- 
a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1561,8 +1561,7 @@ def : GCNPat < } // End OtherPredicates = [isGFX8Plus] -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let OtherPredicates = [isGFX8Plus, p] in { +let OtherPredicates = [isGFX8Plus, NotUseRealTrue16Insts] in { def : GCNPat< (i32 (anyext i16:$src)), (COPY $src) diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 37d92bc5076de..30dab55df7c29 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1378,8 +1378,7 @@ class ZExt_i16_i1_Pat : GCNPat < $src) >; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in { +let True16Predicate = NotUseRealTrue16Insts in { def : GCNPat < (and i16:$src0, i16:$src1), (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index e6a7c35dce0be..4a2b54dde68d3 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -387,8 +387,7 @@ let SchedRW = [Write64Bit] in { } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = NotUseRealTrue16Insts in def : GCNPat< (i32 (DivergentUnaryFrag i16:$src)), (i32 (V_BFE_I32_e64 i16:$src, (i32 0), (i32 0x10))) @@ -501,8 +500,7 @@ def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = NotUseRealTrue16Insts in def : GCNPat< (i64 (DivergentUnaryFrag i16:$src)), (REG_SEQUENCE VReg_64, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 52ee1e874ad86..5daf860d540ca 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -402,8 +402,7 @@ defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profi defm : MadFmaMixFP32Pats; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = NotUseRealTrue16Insts in defm : MadFmaMixFP16Pats; let True16Predicate = UseRealTrue16Insts in defm : MadFmaMixFP16Pats_t16; @@ -428,8 +427,7 @@ defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Pro } // End isCommutable = 1 defm : MadFmaMixFP32Pats; -foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let True16Predicate = p in +let True16Predicate = NotUseRealTrue16Insts in defm : MadFmaMixFP16Pats; let True16Predicate = UseRealTrue16Insts in defm : MadFmaMixFP16Pats_t16; From fe9fba8d24f4e7a0cca26fceb621cfa4276d793b Mon Sep 17 00:00:00 2001 From: Henrich Lauko Date: Tue, 30 Sep 2025 22:25:00 +0200 Subject: [PATCH 312/878] [OpenACC][CIR] Fix transform inclusive scan init parameter (#161428) This fixes macos build, where otherwise the compilation yields an error: `no viable conversion from 'bool' to 'typename iterator_traits::value_type` --- clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index e41c2d85fbd5d..fc28ac552224c 100644 --- 
a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -89,7 +89,7 @@ mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca( std::transform_inclusive_scan( resultTypes.begin(), resultTypes.end(), std::back_inserter(allocasLeftArr), std::plus{}, - [](QualType ty) { return !ty->isConstantArrayType(); }); + [](QualType ty) { return !ty->isConstantArrayType(); }, false); // Keep track of the number of 'elements' that we're allocating. Individual // allocas should multiply this by the size of its current allocation. From a099c91eb8e275c385859f7b3ddee3f0b08db558 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 Sep 2025 22:06:16 +0100 Subject: [PATCH 313/878] [LAA] Add tests for using inbounds flags only used in predicated blocks. Test for https://github.com/llvm/llvm-project/issues/160912. --- .../inbounds-gep-in-predicated-blocks.ll | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll new file mode 100644 index 0000000000000..6eed0ec864820 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes='print' -disable-output %s 2>&1 | FileCheck %s + +; unsigned long long s0 = 0, s1 = 0; +; for (int i = 0; i < 100; i++) { +; if (i % 4 == 0) { +; A[s0] = 2; // A[0], A[4], A[8], A[12], ... +; A[s1] = 1; // A[0], A[8], A[16], A[24], ... +; } +; s0 += (1ULL << 62) + 1; +; s1 += (1ULL << 62) + 2; +; } +; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine +; there are no dependences), as the pointers are not dereferenced in all loop iterations. +define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { +; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block' +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop.header + +loop.header: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ] + %offset.0 = phi i64 [ 0, %entry ], [ %offset.0.next, %loop.latch ] + %offset.1 = phi i64 [ 0, %entry ], [ %offset.1.next, %loop.latch ] + %idx.0 = getelementptr inbounds i8, ptr %A, i64 %offset.0 + %idx.1 = getelementptr inbounds i8, ptr %A, i64 %offset.1 + %mask = and i64 %i, 3 + %cond = icmp eq i64 %mask, 0 + br i1 %cond, label %if.then, label %loop.latch + +if.then: + store i8 2, ptr %idx.0 + store i8 1, ptr %idx.1 + br label %loop.latch + +loop.latch: + %i.next = add nuw nsw i64 %i, 1 + %offset.0.next = add i64 %offset.0, 4611686018427387905 ; 2^62 + 1 + %offset.1.next = add i64 %offset.1, 4611686018427387906 ; 2^62 + 2 + %cond.exit = icmp eq i64 %i.next, 100 + br i1 %cond.exit, label %exit, label %loop.header + +exit: + ret void +} + +define void @test_header_existing(ptr %src, ptr %dst, i64 %start) { +; CHECK-LABEL: 'test_header_existing' +; CHECK-NEXT: loop.header: +; CHECK-NEXT: Memory dependences are safe with run-time checks +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group GRP0: +; CHECK-NEXT: ptr %dst +; CHECK-NEXT: Against group GRP1: +; CHECK-NEXT: %gep.src = getelementptr nusw { i8, i8, i32 }, ptr %src, i64 %iv.next +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %dst High: (1 + %dst)) +; CHECK-NEXT: Member: %dst +; CHECK-NEXT: Group GRP1: +; CHECK-NEXT: (Low: (8 + (8 * %start) + %src) High: (809 + %src)) +; CHECK-NEXT: Member: {(8 + (8 * %start) + %src),+,8}<%loop.header> +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ %start, %entry ], [ %iv.next, %loop.latch ] + %ec = icmp eq i64 %iv, 100 + br i1 %ec, label %exit, label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr nusw { i8, i8, i32 }, ptr %src, i64 %iv.next + %l = load i8, ptr %gep.src, align 1 + store i8 %l, ptr %dst, align 1 + br label %loop.header + +exit: + ret void +} From f61be4352592639a0903e67a9b5d3ec664ad4d23 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 30 Sep 2025 22:13:05 +0100 Subject: [PATCH 314/878] Revert "[VPlan] Compute cost of more replicating loads/stores in ::computeCost. (#160053)" This reverts commit b4be7ecaf06bfcb4aa8d47c4fda1eed9bbe4ae77. See https://github.com/llvm/llvm-project/issues/161404 for a crash exposed by the change. Revert while I investigate. 
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  16 +--
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   7 +-
 llvm/lib/Transforms/Vectorize/VPlanHelpers.h  |  16 +--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 118 ++----------------
 4 files changed, 27 insertions(+), 130 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 40f0ca6b941cb..12fb46da8e71a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3903,8 +3903,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
     if (VF.isScalar())
       continue;
 
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
     precomputeCosts(*Plan, VF, CostCtx);
     auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(Iter)) {
@@ -4161,8 +4160,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
     // Add on other costs that are modelled in VPlan, but not in the legacy
     // cost model.
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind,
-                          *CM.PSE.getSE());
+    VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind);
     VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
     assert(VectorRegion && "Expected to have a vector region!");
     for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(
@@ -6837,7 +6835,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
 
 InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                                ElementCount VF) const {
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
@@ -7070,8 +7068,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   // simplifications not accounted for in the legacy cost model. If that's the
   // case, don't trigger the assertion, as the extra simplifications may cause a
   // different VF to be picked by the VPlan-based cost model.
-  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind,
-                        *CM.PSE.getSE());
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
   // Verify that the VPlan-based and legacy cost models agree, except for VPlans
  // with early exits and plans with additional VPlan simplifications. The
if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, - *CM.PSE.getSE()); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -10058,7 +10054,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind, *CM.PSE.getSE()); + CM.CostKind); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 728d29107808d..81f1956c96254 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1750,8 +1750,7 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef Operands, ElementCount VF, - bool AlwaysIncludeReplicatingR) { + Type *ResultTy, ArrayRef Operands, ElementCount VF) { if (VF.isScalar()) return 0; @@ -1771,9 +1770,7 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet UniqueOperands; SmallVector Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || - (!AlwaysIncludeReplicatingR && - isa(Op)) || + if (Op->isLiveIn() || isa(Op) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 2a8baec74b72b..fe59774b7c838 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -349,14 +349,12 @@ struct VPCostContext { LoopVectorizationCostModel &CM; SmallPtrSet SkipCostComputation; TargetTransformInfo::TargetCostKind CostKind; - ScalarEvolution &SE; VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, const VPlan &Plan, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind, - ScalarEvolution &SE) + TargetTransformInfo::TargetCostKind CostKind) : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM), - CostKind(CostKind), SE(SE) {} + CostKind(CostKind) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -376,12 +374,10 @@ struct VPCostContext { /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy /// and \p Operands with \p VF. This is a convenience wrapper for the - /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR - /// is true, always compute the cost of scalarizing replicating operands. - InstructionCost - getScalarizationOverhead(Type *ResultTy, ArrayRef Operands, - ElementCount VF, - bool AlwaysIncludeReplicatingR = false); + /// type-based getScalarizationOverhead API. + InstructionCost getScalarizationOverhead(Type *ResultTy, + ArrayRef Operands, + ElementCount VF); }; /// This class can be used to assign names to VPValues. 
For VPValues without diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ee03729f150b2..3a55710d59b08 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3098,61 +3098,6 @@ bool VPReplicateRecipe::shouldPack() const { }); } -/// Returns true if \p Ptr is a pointer computation for which the legacy cost -/// model computes a SCEV expression when computing the address cost. -static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) { - auto *PtrR = Ptr->getDefiningRecipe(); - if (!PtrR || !((isa(PtrR) && - cast(PtrR)->getOpcode() == - Instruction::GetElementPtr) || - isa(PtrR))) - return false; - - // We are looking for a GEP where all indices are either loop invariant or - // inductions. - for (VPValue *Opd : drop_begin(PtrR->operands())) { - if (!Opd->isDefinedOutsideLoopRegions() && - !isa(Opd)) - return false; - } - - return true; -} - -/// Returns true if \p V is used as part of the address of another load or -/// store. -static bool isUsedByLoadStoreAddress(const VPUser *V) { - SmallPtrSet Seen; - SmallVector WorkList = {V}; - - while (!WorkList.empty()) { - auto *Cur = dyn_cast(WorkList.pop_back_val()); - if (!Cur || !Seen.insert(Cur).second) - continue; - - for (VPUser *U : Cur->users()) { - if (auto *InterleaveR = dyn_cast(U)) - if (InterleaveR->getAddr() == Cur) - return true; - if (auto *RepR = dyn_cast(U)) { - if (RepR->getOpcode() == Instruction::Load && - RepR->getOperand(0) == Cur) - return true; - if (RepR->getOpcode() == Instruction::Store && - RepR->getOperand(1) == Cur) - return true; - } - if (auto *MemR = dyn_cast(U)) { - if (MemR->getAddr() == Cur && MemR->isConsecutive()) - return true; - } - } - - append_range(WorkList, cast(Cur)->users()); - } - return false; -} - InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { Instruction *UI = cast(getUnderlyingValue()); @@ -3260,58 +3205,21 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, } case Instruction::Load: case Instruction::Store: { - if (VF.isScalable() && !isSingleScalar()) - return InstructionCost::getInvalid(); - + if (isSingleScalar()) { + bool IsLoad = UI->getOpcode() == Instruction::Load; + Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0)); + Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1)); + const Align Alignment = getLoadStoreAlignment(UI); + unsigned AS = getLoadStoreAddressSpace(UI); + TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); + InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( + UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI); + return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( + ScalarPtrTy, nullptr, nullptr, Ctx.CostKind); + } // TODO: See getMemInstScalarizationCost for how to handle replicating and // predicated cases. - const VPRegionBlock *ParentRegion = getParent()->getParent(); - if (ParentRegion && ParentRegion->isReplicator()) - break; - - bool IsLoad = UI->getOpcode() == Instruction::Load; - const VPValue *PtrOp = getOperand(!IsLoad); - // TODO: Handle cases where we need to pass a SCEV to - // getAddressComputationCost. - if (shouldUseAddressAccessSCEV(PtrOp)) - break; - - Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? 
this : getOperand(0)); - Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp); - const Align Alignment = getLoadStoreAlignment(UI); - unsigned AS = getLoadStoreAddressSpace(UI); - TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0)); - InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost( - UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo); - - Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF); - - InstructionCost ScalarCost = - ScalarMemOpCost + Ctx.TTI.getAddressComputationCost( - PtrTy, &Ctx.SE, nullptr, Ctx.CostKind); - if (isSingleScalar()) - return ScalarCost; - - SmallVector OpsToScalarize; - Type *ResultTy = Type::getVoidTy(PtrTy->getContext()); - // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we - // don't assign scalarization overhead in general, if the target prefers - // vectorized addressing or the loaded value is used as part of an address - // of another load or store. - bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing(); - if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) { - bool EfficientVectorLoadStore = - Ctx.TTI.supportsEfficientVectorElementLoadStore(); - if (!(IsLoad && !PreferVectorizedAddressing) && - !(!IsLoad && EfficientVectorLoadStore)) - append_range(OpsToScalarize, operands()); - - if (!EfficientVectorLoadStore) - ResultTy = Ctx.Types.inferScalarType(this); - } - - return (ScalarCost * VF.getFixedValue()) + - Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true); + break; } } From ca84f2aa3be6e46a4dccb1bec56b93f2bb3d8ef0 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 30 Sep 2025 14:20:13 -0700 Subject: [PATCH 315/878] [CIR] Upstream support for generating global ctor regions (#161298) This adds support for handling global variables with non-trivial constructors. The constructor call is emitted in CIR as a 'ctor' region associated with the global definition. This form of global definition cannot be lowered to LLVM IR yet. A later change will add support in LoweringPrepare to move the ctor code into a __cxx_global_var_init() function and add that function to the list of global ctors, but for now we must stop at the initial CIR generation.
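As a rough sketch of the two models (this mirrors the NeedsCtor example from the new global-init.cpp test below; cxx_global_var_init_sketch is a hypothetical stand-in for the compiler-synthesized initializer, not a function this patch emits):

    #include <new>

    struct NeedsCtor {
      NeedsCtor() {} // user-provided, hence non-trivial
    };
    NeedsCtor needsCtor; // global requiring dynamic initialization

    // Classic codegen conceptually outlines the constructor call into a
    // synthesized function like this and registers it in llvm.global_ctors;
    // with this patch, CIR instead keeps the equivalent call in a 'ctor'
    // region attached to the cir.global for needsCtor until LoweringPrepare
    // performs the same outlining.
    void cxx_global_var_init_sketch() {
      ::new (static_cast<void *>(&needsCtor)) NeedsCtor();
    }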
--- clang/include/clang/CIR/MissingFeatures.h | 1 - clang/lib/CIR/CodeGen/CIRGenCXX.cpp | 139 ++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp | 28 ++++ clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 4 +- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 10 +- clang/lib/CIR/CodeGen/CIRGenFunction.h | 12 +- clang/lib/CIR/CodeGen/CIRGenModule.cpp | 19 ++- clang/lib/CIR/CodeGen/CIRGenModule.h | 7 + clang/lib/CIR/CodeGen/CMakeLists.txt | 1 + .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 5 + clang/test/CIR/CodeGen/global-init.cpp | 17 +++ 11 files changed, 230 insertions(+), 13 deletions(-) create mode 100644 clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp create mode 100644 clang/test/CIR/CodeGen/global-init.cpp diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index f09ec95d9ccf8..7a6c084f51cd7 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -248,7 +248,6 @@ struct MissingFeatures { static bool metaDataNode() { return false; } static bool moduleNameHash() { return false; } static bool msabi() { return false; } - static bool needsGlobalCtorDtor() { return false; } static bool nrvo() { return false; } static bool objCBlocks() { return false; } static bool objCGC() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp index da507d6f28335..d5b35c25c83ba 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp @@ -15,10 +15,89 @@ #include "clang/AST/GlobalDecl.h" #include "clang/CIR/MissingFeatures.h" +#include "llvm/Support/SaveAndRestore.h" using namespace clang; using namespace clang::CIRGen; +static void emitDeclInit(CIRGenFunction &cgf, const VarDecl *varDecl, + cir::GlobalOp globalOp) { + assert((varDecl->hasGlobalStorage() || + (varDecl->hasLocalStorage() && + cgf.getContext().getLangOpts().OpenCLCPlusPlus)) && + "VarDecl must have global or local (in the case of OpenCL) storage!"); + assert(!varDecl->getType()->isReferenceType() && + "Should not call emitDeclInit on a reference!"); + + CIRGenBuilderTy &builder = cgf.getBuilder(); + + // Set up the ctor region. + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::Block *block = builder.createBlock(&globalOp.getCtorRegion()); + CIRGenFunction::LexicalScope lexScope{cgf, globalOp.getLoc(), + builder.getInsertionBlock()}; + lexScope.setAsGlobalInit(); + builder.setInsertionPointToStart(block); + + Address declAddr(cgf.cgm.getAddrOfGlobalVar(varDecl), + cgf.cgm.getASTContext().getDeclAlign(varDecl)); + + QualType type = varDecl->getType(); + LValue lv = cgf.makeAddrLValue(declAddr, type); + + const Expr *init = varDecl->getInit(); + switch (CIRGenFunction::getEvaluationKind(type)) { + case cir::TEK_Scalar: + assert(!cir::MissingFeatures::objCGC()); + cgf.emitScalarInit(init, cgf.getLoc(varDecl->getLocation()), lv, false); + break; + case cir::TEK_Complex: + cgf.cgm.errorNYI(varDecl->getSourceRange(), "complex global initializer"); + break; + case cir::TEK_Aggregate: + assert(!cir::MissingFeatures::aggValueSlotGC()); + cgf.emitAggExpr(init, + AggValueSlot::forLValue(lv, AggValueSlot::IsDestructed, + AggValueSlot::IsNotAliased, + AggValueSlot::DoesNotOverlap)); + break; + } + + // Finish the ctor region. 
+ builder.setInsertionPointToEnd(block); + cir::YieldOp::create(builder, globalOp.getLoc()); +} + +static void emitDeclDestroy(CIRGenFunction &cgf, const VarDecl *vd, + cir::GlobalOp addr) { + // Honor __attribute__((no_destroy)) and bail instead of attempting + // to emit a reference to a possibly nonexistent destructor, which + // in turn can cause a crash. This will result in a global constructor + // that isn't balanced out by a destructor call as intended by the + // attribute. This also checks for -fno-c++-static-destructors and + // bails even if the attribute is not present. + QualType::DestructionKind dtorKind = vd->needsDestruction(cgf.getContext()); + + // FIXME: __attribute__((cleanup)) ? + + switch (dtorKind) { + case QualType::DK_none: + return; + + case QualType::DK_cxx_destructor: + break; + + case QualType::DK_objc_strong_lifetime: + case QualType::DK_objc_weak_lifetime: + case QualType::DK_nontrivial_c_struct: + // We don't care about releasing objects during process teardown. + assert(!vd->getTLSKind() && "should have rejected this"); + return; + } + + cgf.cgm.errorNYI(vd->getSourceRange(), "global with destructor"); +} + cir::FuncOp CIRGenModule::codegenCXXStructor(GlobalDecl gd) { const CIRGenFunctionInfo &fnInfo = getTypes().arrangeCXXStructorDeclaration(gd); @@ -38,3 +117,63 @@ cir::FuncOp CIRGenModule::codegenCXXStructor(GlobalDecl gd) { assert(!cir::MissingFeatures::opFuncAttributesForDefinition()); return fn; } + +// Global variables requiring non-trivial initialization are handled +// differently in CIR than in classic codegen. Classic codegen emits +// a global init function (__cxx_global_var_init) and inserts +// initialization for each global there. In CIR, we attach a ctor +// region to the global variable and insert the initialization code +// into the ctor region. This will be moved into the +// __cxx_global_var_init function during the LoweringPrepare pass. +void CIRGenModule::emitCXXGlobalVarDeclInit(const VarDecl *varDecl, + cir::GlobalOp addr, + bool performInit) { + QualType ty = varDecl->getType(); + + // TODO: handle address space + // The address space of a static local variable (addr) may be different + // from the address space of the "this" argument of the constructor. In that + // case, we need an addrspacecast before calling the constructor. + // + // struct StructWithCtor { + // __device__ StructWithCtor() {...} + // }; + // __device__ void foo() { + // __shared__ StructWithCtor s; + // ... + // } + // + // For example, in the above CUDA code, the static local variable s has a + // "shared" address space qualifier, but the constructor of StructWithCtor + // expects "this" in the "generic" address space. + assert(!cir::MissingFeatures::addressSpace()); + + // Create a CIRGenFunction to emit the initializer. While this isn't a true + // function, the handling works the same way. + CIRGenFunction cgf{*this, builder, true}; + llvm::SaveAndRestore savedCGF(curCGF, &cgf); + curCGF->curFn = addr; + + CIRGenFunction::SourceLocRAIIObject fnLoc{cgf, + getLoc(varDecl->getLocation())}; + + assert(!cir::MissingFeatures::astVarDeclInterface()); + + if (!ty->isReferenceType()) { + assert(!cir::MissingFeatures::openMP()); + + bool needsDtor = varDecl->needsDestruction(getASTContext()) == + QualType::DK_cxx_destructor; + // PerformInit, constant store invariant / destroy handled below. 
+ if (performInit) + emitDeclInit(cgf, varDecl, addr); + + if (varDecl->getType().isConstantStorage(getASTContext(), true, !needsDtor)) + errorNYI(varDecl->getSourceRange(), "global with constant storage"); + else + emitDeclDestroy(cgf, varDecl, addr); + return; + } + + errorNYI(varDecl->getSourceRange(), "global with reference type"); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp new file mode 100644 index 0000000000000..d1efed80aaf0e --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenDeclCXX.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code dealing with code generation of C++ declarations +// +//===----------------------------------------------------------------------===// + +#include "CIRGenModule.h" +#include "clang/AST/Attr.h" +#include "clang/Basic/LangOptions.h" + +using namespace clang; +using namespace clang::CIRGen; + +void CIRGenModule::emitCXXGlobalVarDeclInitFunc(const VarDecl *vd, + cir::GlobalOp addr, + bool performInit) { + assert(!cir::MissingFeatures::cudaSupport()); + + assert(!cir::MissingFeatures::deferredCXXGlobalInit()); + + emitCXXGlobalVarDeclInit(vd, addr, performInit); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 178b276f19d41..e20a4fc3c63aa 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -775,7 +775,9 @@ class ConstExprEmitter } mlir::Attribute VisitCXXConstructExpr(CXXConstructExpr *e, QualType ty) { - cgm.errorNYI(e->getBeginLoc(), "ConstExprEmitter::VisitCXXConstructExpr"); + if (!e->getConstructor()->isTrivial()) + return nullptr; + cgm.errorNYI(e->getBeginLoc(), "trivial constructor const handling"); return {}; } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 9d98361a8b6a2..a404c0c08d893 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -342,10 +342,12 @@ void CIRGenFunction::LexicalScope::cleanup() { cir::ReturnOp CIRGenFunction::LexicalScope::emitReturn(mlir::Location loc) { CIRGenBuilderTy &builder = cgf.getBuilder(); - if (!cgf.curFn.getFunctionType().hasVoidReturn()) { + auto fn = dyn_cast(cgf.curFn); + assert(fn && "emitReturn from non-function"); + if (!fn.getFunctionType().hasVoidReturn()) { // Load the value from `__retval` and return it via the `cir.return` op. auto value = builder.create( - loc, cgf.curFn.getFunctionType().getReturnType(), *cgf.fnRetAlloca); + loc, fn.getFunctionType().getReturnType(), *cgf.fnRetAlloca); return builder.create(loc, llvm::ArrayRef(value.getResult())); } @@ -459,7 +461,9 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, const auto *md = cast(d); if (md->getParent()->isLambda() && md->getOverloadedOperator() == OO_Call) { // We're in a lambda. - curFn.setLambda(true); + auto fn = dyn_cast(curFn); + assert(fn && "lambda in non-function region"); + fn.setLambda(true); // Figure out the captures. 
md->getParent()->getCaptureFields(lambdaCaptureFields, diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index ef07db3d48ffc..c0ed8b4006ec5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -98,8 +98,10 @@ class CIRGenFunction : public CIRGenTypeCache { /// This is the inner-most code context, which includes blocks. const clang::Decl *curCodeDecl = nullptr; - /// The function for which code is currently being generated. - cir::FuncOp curFn; + /// The current function or global initializer that is generated code for. + /// This is usually a cir::FuncOp, but it can also be a cir::GlobalOp for + /// global initializers. + mlir::Operation *curFn = nullptr; using DeclMapTy = llvm::DenseMap; /// This keeps track of the CIR allocas or globals for local C @@ -116,7 +118,11 @@ class CIRGenFunction : public CIRGenTypeCache { CIRGenModule &getCIRGenModule() { return cgm; } const CIRGenModule &getCIRGenModule() const { return cgm; } - mlir::Block *getCurFunctionEntryBlock() { return &curFn.getRegion().front(); } + mlir::Block *getCurFunctionEntryBlock() { + // We currently assume this isn't called for a global initializer. + auto fn = mlir::cast(curFn); + return &fn.getRegion().front(); + } /// Sanitizers enabled for this function. clang::SanitizerSet sanOpts; diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 8eb48f6d0fb46..2bd2729f0b0fb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -730,7 +730,6 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, // since this is the job for its original source. bool isDefinitionAvailableExternally = astContext.GetGVALinkageForVariable(vd) == GVA_AvailableExternally; - assert(!cir::MissingFeatures::needsGlobalCtorDtor()); // It is useless to emit the definition for an available_externally variable // which can't be marked as const. @@ -743,6 +742,10 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, return; mlir::Attribute init; + bool needsGlobalCtor = false; + bool needsGlobalDtor = + !isDefinitionAvailableExternally && + vd->needsDestruction(astContext) == QualType::DK_cxx_destructor; const VarDecl *initDecl; const Expr *initExpr = vd->getAnyInitializer(initDecl); @@ -777,8 +780,8 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, if (initDecl->hasFlexibleArrayInit(astContext)) errorNYI(vd->getSourceRange(), "flexible array initializer"); init = builder.getZeroInitAttr(convertType(qt)); - if (astContext.GetGVALinkageForVariable(vd) != GVA_AvailableExternally) - errorNYI(vd->getSourceRange(), "global constructor"); + if (!isDefinitionAvailableExternally) + needsGlobalCtor = true; } else { errorNYI(vd->getSourceRange(), "static initializer"); } @@ -787,8 +790,7 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, // We don't need an initializer, so remove the entry for the delayed // initializer position (just in case this entry was delayed) if we // also don't need to register a destructor. 
- if (vd->needsDestruction(astContext) == QualType::DK_cxx_destructor) - errorNYI(vd->getSourceRange(), "delayed destructor"); + assert(!cir::MissingFeatures::deferredCXXGlobalInit()); } } @@ -827,6 +829,9 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, if (emitter) emitter->finalize(gv); + assert(!cir::MissingFeatures::opGlobalConstant()); + assert(!cir::MissingFeatures::opGlobalSection()); + // Set CIR's linkage type as appropriate. cir::GlobalLinkageKind linkage = getCIRLinkageVarDefinition(vd, /*IsConstant=*/false); @@ -844,6 +849,10 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, assert(!cir::MissingFeatures::opGlobalThreadLocal()); maybeSetTrivialComdat(*vd, gv); + + // Emit the initializer function if necessary. + if (needsGlobalCtor || needsGlobalDtor) + emitCXXGlobalVarDeclInitFunc(vd, gv, needsGlobalCtor); } void CIRGenModule::emitGlobalDefinition(clang::GlobalDecl gd, diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 006111d19d65f..2c4c6dd14e2ff 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -426,6 +426,13 @@ class CIRGenModule : public CIRGenTypeCache { void emitGlobalVarDefinition(const clang::VarDecl *vd, bool isTentative = false); + /// Emit the function that initializes the specified global + void emitCXXGlobalVarDeclInit(const VarDecl *varDecl, cir::GlobalOp addr, + bool performInit); + + void emitCXXGlobalVarDeclInitFunc(const VarDecl *vd, cir::GlobalOp addr, + bool performInit); + void emitGlobalOpenACCDecl(const clang::OpenACCConstructDecl *cd); // C++ related functions. diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index c1f27ec8ba858..3ebf460f7d34c 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -18,6 +18,7 @@ add_clang_library(clangCIR CIRGenCXXABI.cpp CIRGenBuiltin.cpp CIRGenDecl.cpp + CIRGenDeclCXX.cpp CIRGenDeclOpenACC.cpp CIRGenException.cpp CIRGenExpr.cpp diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index bd6d6e3a6ed09..0f309e42bcd4c 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1711,6 +1711,11 @@ CIRToLLVMGlobalOpLowering::matchAndRewriteRegionInitializedGlobal( mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite( cir::GlobalOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { + // If this global requires non-trivial initialization or destruction, + // that needs to be moved to runtime handlers during LoweringPrepare. + if (!op.getCtorRegion().empty() || !op.getDtorRegion().empty()) + return op.emitError() << "GlobalOp ctor and dtor regions should be removed " + "in LoweringPrepare"; std::optional init = op.getInitialValue(); diff --git a/clang/test/CIR/CodeGen/global-init.cpp b/clang/test/CIR/CodeGen/global-init.cpp new file mode 100644 index 0000000000000..102affc5563ac --- /dev/null +++ b/clang/test/CIR/CodeGen/global-init.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR + +// Note: The CIR generated from this test isn't ready for lowering to LLVM yet. +// That will require changes to LoweringPrepare. 
+ +struct NeedsCtor { + NeedsCtor(); +}; + +NeedsCtor needsCtor; + +// CIR: cir.func private @_ZN9NeedsCtorC1Ev(!cir.ptr) +// CIR: cir.global external @needsCtor = ctor : !rec_NeedsCtor { +// CIR: %[[THIS:.*]] = cir.get_global @needsCtor : !cir.ptr +// CIR: cir.call @_ZN9NeedsCtorC1Ev(%[[THIS]]) : (!cir.ptr) -> () +// CIR: } From b6dfa3d47db74e72a566e0605fb573a8fcea1234 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Tue, 30 Sep 2025 14:24:55 -0700 Subject: [PATCH 316/878] [HLSL][NFC] Add helper struct to simplify dealing with resource binding attributes (#161254) Add new `ResourceBindingAttrs` struct that holds resource binding attributes `HLSLResourceBindingAttr` and `HLSLVkBindingAttr` and provides helper methods to simplify dealing with resource bindings. This code is placed in the AST library to be shared between Sema and CodeGen. This change has been made in preparation for a third binding attribute coming soon to represent `[[vk::counter_binding()]]`. This new attribute and more helper member functions will be added to `ResourceBindingAttrs` and will be used in both Sema and in CodeGen to implement resource counter initialization. --- clang/include/clang/AST/HLSLResource.h | 75 ++++++++++++++++++++ clang/lib/CodeGen/CGHLSLRuntime.cpp | 95 +++++++++----------------- clang/lib/CodeGen/CGHLSLRuntime.h | 6 +- clang/lib/Sema/SemaHLSL.cpp | 31 +++------ 4 files changed, 118 insertions(+), 89 deletions(-) create mode 100644 clang/include/clang/AST/HLSLResource.h diff --git a/clang/include/clang/AST/HLSLResource.h b/clang/include/clang/AST/HLSLResource.h new file mode 100644 index 0000000000000..e3ee0b136cec3 --- /dev/null +++ b/clang/include/clang/AST/HLSLResource.h @@ -0,0 +1,75 @@ +//===- HLSLResource.h - Routines for HLSL resources and bindings ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides shared routines to help analyze HLSL resources and +// their bindings during Sema and CodeGen. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_HLSLRESOURCE_H +#define LLVM_CLANG_AST_HLSLRESOURCE_H + +#include "clang/AST/ASTContext.h" +#include "clang/AST/Attrs.inc" +#include "clang/AST/DeclBase.h" +#include "clang/Basic/TargetInfo.h" + +namespace clang { + +class HLSLResourceBindingAttr; +class HLSLVkBindingAttr; + +namespace hlsl { + +struct ResourceBindingAttrs { + HLSLResourceBindingAttr *RegBinding; + HLSLVkBindingAttr *VkBinding; + + ResourceBindingAttrs(const Decl *D) { + RegBinding = D->getAttr(); + bool IsSpirv = D->getASTContext().getTargetInfo().getTriple().isSPIRV(); + VkBinding = IsSpirv ?
D->getAttr() : nullptr; + } + + bool hasBinding() const { return RegBinding || VkBinding; } + bool isExplicit() const { + return (RegBinding && RegBinding->hasRegisterSlot()) || VkBinding; + } + + unsigned getSlot() const { + assert(isExplicit() && "no explicit binding"); + if (VkBinding) + return VkBinding->getBinding(); + if (RegBinding && RegBinding->hasRegisterSlot()) + return RegBinding->getSlotNumber(); + llvm_unreachable("no explicit binding"); + } + + unsigned getSpace() const { + if (VkBinding) + return VkBinding->getSet(); + if (RegBinding) + return RegBinding->getSpaceNumber(); + return 0; + } + + bool hasImplicitOrderID() const { + return RegBinding && RegBinding->hasImplicitBindingOrderID(); + } + + unsigned getImplicitOrderID() const { + assert(hasImplicitOrderID()); + return RegBinding->getImplicitBindingOrderID(); + } +}; + +} // namespace hlsl + +} // namespace clang + +#endif // LLVM_CLANG_AST_HLSLRESOURCE_H diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index cf018c8c7de2a..ede1780592bf5 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -21,6 +21,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/Attrs.inc" #include "clang/AST/Decl.h" +#include "clang/AST/HLSLResource.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" #include "clang/Basic/TargetOptions.h" @@ -131,35 +132,24 @@ static CXXMethodDecl *lookupMethod(CXXRecordDecl *Record, StringRef Name, static CXXMethodDecl *lookupResourceInitMethodAndSetupArgs( CodeGenModule &CGM, CXXRecordDecl *ResourceDecl, llvm::Value *Range, - llvm::Value *Index, StringRef Name, HLSLResourceBindingAttr *RBA, - HLSLVkBindingAttr *VkBinding, CallArgList &Args) { - assert((VkBinding || RBA) && "at least one a binding attribute expected"); + llvm::Value *Index, StringRef Name, ResourceBindingAttrs &Binding, + CallArgList &Args) { + assert(Binding.hasBinding() && "at least one binding attribute expected"); ASTContext &AST = CGM.getContext(); - std::optional RegisterSlot; - uint32_t SpaceNo = 0; - if (VkBinding) { - RegisterSlot = VkBinding->getBinding(); - SpaceNo = VkBinding->getSet(); - } else { - if (RBA->hasRegisterSlot()) - RegisterSlot = RBA->getSlotNumber(); - SpaceNo = RBA->getSpaceNumber(); - } - CXXMethodDecl *CreateMethod = nullptr; Value *NameStr = buildNameForResource(Name, CGM); - Value *Space = llvm::ConstantInt::get(CGM.IntTy, SpaceNo); + Value *Space = llvm::ConstantInt::get(CGM.IntTy, Binding.getSpace()); - if (RegisterSlot.has_value()) { + if (Binding.isExplicit()) { // explicit binding - auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, RegisterSlot.value()); + auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, Binding.getSlot()); Args.add(RValue::get(RegSlot), AST.UnsignedIntTy); CreateMethod = lookupMethod(ResourceDecl, "__createFromBinding", SC_Static); } else { // implicit binding auto *OrderID = - llvm::ConstantInt::get(CGM.IntTy, RBA->getImplicitBindingOrderID()); + llvm::ConstantInt::get(CGM.IntTy, Binding.getImplicitOrderID()); Args.add(RValue::get(OrderID), AST.UnsignedIntTy); CreateMethod = lookupMethod(ResourceDecl, "__createFromImplicitBinding", SC_Static); @@ -194,8 +184,8 @@ static std::optional initializeLocalResourceArray( CodeGenFunction &CGF, CXXRecordDecl *ResourceDecl, const ConstantArrayType *ArrayTy, AggValueSlot &ValueSlot, llvm::Value *Range, llvm::Value *StartIndex, StringRef ResourceName, - HLSLResourceBindingAttr *RBA, HLSLVkBindingAttr *VkBinding, - ArrayRef PrevGEPIndices, 
SourceLocation ArraySubsExprLoc) { + ResourceBindingAttrs &Binding, ArrayRef PrevGEPIndices, + SourceLocation ArraySubsExprLoc) { ASTContext &AST = CGF.getContext(); llvm::IntegerType *IntTy = CGF.CGM.IntTy; @@ -220,7 +210,7 @@ static std::optional initializeLocalResourceArray( } std::optional MaybeIndex = initializeLocalResourceArray( CGF, ResourceDecl, SubArrayTy, ValueSlot, Range, Index, ResourceName, - RBA, VkBinding, GEPIndices, ArraySubsExprLoc); + Binding, GEPIndices, ArraySubsExprLoc); if (!MaybeIndex) return std::nullopt; Index = *MaybeIndex; @@ -244,8 +234,7 @@ static std::optional initializeLocalResourceArray( CallArgList Args; CXXMethodDecl *CreateMethod = lookupResourceInitMethodAndSetupArgs( - CGF.CGM, ResourceDecl, Range, Index, ResourceName, RBA, VkBinding, - Args); + CGF.CGM, ResourceDecl, Range, Index, ResourceName, Binding, Args); if (!CreateMethod) // This can happen if someone creates an array of structs that looks like @@ -439,14 +428,7 @@ void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) { emitBufferGlobalsAndMetadata(BufDecl, BufGV); // Initialize cbuffer from binding (implicit or explicit) - if (HLSLVkBindingAttr *VkBinding = BufDecl->getAttr()) { - initializeBufferFromBinding(BufDecl, BufGV, VkBinding); - } else { - HLSLResourceBindingAttr *RBA = BufDecl->getAttr(); - assert(RBA && - "cbuffer/tbuffer should always have resource binding attribute"); - initializeBufferFromBinding(BufDecl, BufGV, RBA); - } + initializeBufferFromBinding(BufDecl, BufGV); } void CGHLSLRuntime::addRootSignature( @@ -810,44 +792,29 @@ static void initializeBuffer(CodeGenModule &CGM, llvm::GlobalVariable *GV, } void CGHLSLRuntime::initializeBufferFromBinding(const HLSLBufferDecl *BufDecl, - llvm::GlobalVariable *GV, - HLSLVkBindingAttr *VkBinding) { - assert(VkBinding && "expect a nonnull binding attribute"); - auto *Index = llvm::ConstantInt::get(CGM.IntTy, 0); - auto *RangeSize = llvm::ConstantInt::get(CGM.IntTy, 1); - auto *Set = llvm::ConstantInt::get(CGM.IntTy, VkBinding->getSet()); - auto *Binding = llvm::ConstantInt::get(CGM.IntTy, VkBinding->getBinding()); - Value *Name = buildNameForResource(BufDecl->getName(), CGM); - llvm::Intrinsic::ID IntrinsicID = - CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic(); - - SmallVector Args{Set, Binding, RangeSize, Index, Name}; - initializeBuffer(CGM, GV, IntrinsicID, Args); -} + llvm::GlobalVariable *GV) { + ResourceBindingAttrs Binding(BufDecl); + assert(Binding.hasBinding() && + "cbuffer/tbuffer should always have resource binding attribute"); -void CGHLSLRuntime::initializeBufferFromBinding(const HLSLBufferDecl *BufDecl, - llvm::GlobalVariable *GV, - HLSLResourceBindingAttr *RBA) { - assert(RBA && "expect a nonnull binding attribute"); auto *Index = llvm::ConstantInt::get(CGM.IntTy, 0); auto *RangeSize = llvm::ConstantInt::get(CGM.IntTy, 1); - auto *Space = llvm::ConstantInt::get(CGM.IntTy, RBA->getSpaceNumber()); + auto *Space = llvm::ConstantInt::get(CGM.IntTy, Binding.getSpace()); Value *Name = buildNameForResource(BufDecl->getName(), CGM); - llvm::Intrinsic::ID IntrinsicID = - RBA->hasRegisterSlot() - ? 
CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic() - : CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic(); - // buffer with explicit binding - if (RBA->hasRegisterSlot()) { - auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, RBA->getSlotNumber()); + if (Binding.isExplicit()) { + llvm::Intrinsic::ID IntrinsicID = + CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic(); + auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, Binding.getSlot()); SmallVector Args{Space, RegSlot, RangeSize, Index, Name}; initializeBuffer(CGM, GV, IntrinsicID, Args); } else { // buffer with implicit binding + llvm::Intrinsic::ID IntrinsicID = + CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic(); auto *OrderID = - llvm::ConstantInt::get(CGM.IntTy, RBA->getImplicitBindingOrderID()); + llvm::ConstantInt::get(CGM.IntTy, Binding.getImplicitOrderID()); SmallVector Args{OrderID, Space, RangeSize, Index, Name}; initializeBuffer(CGM, GV, IntrinsicID, Args); } @@ -960,9 +927,9 @@ std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( // Find binding info for the resource array. For implicit binding // an HLSLResourceBindingAttr should have been added by SemaHLSL. - HLSLVkBindingAttr *VkBinding = ArrayDecl->getAttr(); - HLSLResourceBindingAttr *RBA = ArrayDecl->getAttr(); - assert((VkBinding || RBA) && "resource array must have a binding attribute"); + ResourceBindingAttrs Binding(ArrayDecl); + assert((Binding.hasBinding()) && + "resource array must have a binding attribute"); // Find the individual resource type. QualType ResultTy = ArraySubsExpr->getType(); @@ -992,7 +959,7 @@ std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( CallArgList Args; CXXMethodDecl *CreateMethod = lookupResourceInitMethodAndSetupArgs( CGF.CGM, ResourceTy->getAsCXXRecordDecl(), Range, Index, - ArrayDecl->getName(), RBA, VkBinding, Args); + ArrayDecl->getName(), Binding, Args); if (!CreateMethod) // This can happen if someone creates an array of structs that looks like @@ -1009,8 +976,8 @@ std::optional CGHLSLRuntime::emitResourceArraySubscriptExpr( cast(ResultTy.getTypePtr()); std::optional EndIndex = initializeLocalResourceArray( CGF, ResourceTy->getAsCXXRecordDecl(), ArrayTy, ValueSlot, Range, Index, - ArrayDecl->getName(), RBA, VkBinding, - {llvm::ConstantInt::get(CGM.IntTy, 0)}, ArraySubsExpr->getExprLoc()); + ArrayDecl->getName(), Binding, {llvm::ConstantInt::get(CGM.IntTy, 0)}, + ArraySubsExpr->getExprLoc()); if (!EndIndex) return std::nullopt; } diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 9c0e6056fd4ee..7c6c2850fd4d4 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -200,11 +200,7 @@ class CGHLSLRuntime { void emitBufferGlobalsAndMetadata(const HLSLBufferDecl *BufDecl, llvm::GlobalVariable *BufGV); void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl, - llvm::GlobalVariable *GV, - HLSLVkBindingAttr *VkBinding); - void initializeBufferFromBinding(const HLSLBufferDecl *BufDecl, - llvm::GlobalVariable *GV, - HLSLResourceBindingAttr *RBA); + llvm::GlobalVariable *GV); llvm::Triple::ArchType getArch(); llvm::DenseMap LayoutTypes; diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 940d510b4cc02..129b03c07c0bd 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -19,6 +19,7 @@ #include "clang/AST/DeclarationName.h" #include "clang/AST/DynamicRecursiveASTVisitor.h" #include "clang/AST/Expr.h" +#include "clang/AST/HLSLResource.h" #include 
"clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/Builtins.h" @@ -52,6 +53,7 @@ #include using namespace clang; +using namespace clang::hlsl; using RegisterType = HLSLResourceBindingAttr::RegisterType; static CXXRecordDecl *createHostLayoutStruct(Sema &S, @@ -3799,19 +3801,8 @@ bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) { uint64_t UIntTySize = AST.getTypeSize(AST.UnsignedIntTy); uint64_t IntTySize = AST.getTypeSize(AST.IntTy); - // Gather resource binding information from attributes. - HLSLResourceBindingAttr *RBA = VD->getAttr(); - HLSLVkBindingAttr *VkBinding = VD->getAttr(); - std::optional RegisterSlot; - uint32_t SpaceNo = 0; - if (VkBinding) { - RegisterSlot = VkBinding->getBinding(); - SpaceNo = VkBinding->getSet(); - } else if (RBA) { - if (RBA->hasRegisterSlot()) - RegisterSlot = RBA->getSlotNumber(); - SpaceNo = RBA->getSpaceNumber(); - } + // Gather resource binding attributes. + ResourceBindingAttrs Binding(VD); // Find correct initialization method and create its arguments. QualType ResourceTy = VD->getType(); @@ -3819,21 +3810,21 @@ bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) { CXXMethodDecl *CreateMethod = nullptr; llvm::SmallVector Args; - if (RegisterSlot.has_value()) { + if (Binding.isExplicit()) { // The resource has explicit binding. CreateMethod = lookupMethod(SemaRef, ResourceDecl, "__createFromBinding", VD->getLocation()); - IntegerLiteral *RegSlot = IntegerLiteral::Create( - AST, llvm::APInt(UIntTySize, RegisterSlot.value()), AST.UnsignedIntTy, - SourceLocation()); + IntegerLiteral *RegSlot = + IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, Binding.getSlot()), + AST.UnsignedIntTy, SourceLocation()); Args.push_back(RegSlot); } else { // The resource has implicit binding. CreateMethod = lookupMethod(SemaRef, ResourceDecl, "__createFromImplicitBinding", VD->getLocation()); - uint32_t OrderID = (RBA && RBA->hasImplicitBindingOrderID()) - ? RBA->getImplicitBindingOrderID() + uint32_t OrderID = (Binding.hasImplicitOrderID()) + ? Binding.getImplicitOrderID() : getNextImplicitBindingOrderID(); IntegerLiteral *OrderId = IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, OrderID), @@ -3848,7 +3839,7 @@ bool SemaHLSL::initGlobalResourceDecl(VarDecl *VD) { return false; IntegerLiteral *Space = - IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, SpaceNo), + IntegerLiteral::Create(AST, llvm::APInt(UIntTySize, Binding.getSpace()), AST.UnsignedIntTy, SourceLocation()); Args.push_back(Space); From f57b60ad2a72b64b75bc5422f67b46c625debbc6 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 30 Sep 2025 14:50:59 -0700 Subject: [PATCH 317/878] [flang] Add missing #include for MSVC (#161437) I moved a function to Evaluate/tools.cpp in an attempt to dodge some MSVC compiler issue but didn't add an include directive for Evaluate/tools.h to Evaluate/constant.cpp. --- flang/include/flang/Evaluate/tools.h | 2 +- flang/lib/Evaluate/constant.cpp | 5 +++-- flang/lib/Evaluate/tools.cpp | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index f9d74db1df03b..d8d0956369e40 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1522,7 +1522,7 @@ bool IsVarSubexpressionOf( std::optional> GetConvertInput(const Expr &x); // How many ancestors does have a derived type have? 
-std::optional DerivedTypeDepth(const semantics::Scope &); +std::optional CountDerivedTypeAncestors(const semantics::Scope &); } // namespace Fortran::evaluate diff --git a/flang/lib/Evaluate/constant.cpp b/flang/lib/Evaluate/constant.cpp index f57dd825a7a7c..7fe000892ac1a 100644 --- a/flang/lib/Evaluate/constant.cpp +++ b/flang/lib/Evaluate/constant.cpp @@ -9,6 +9,7 @@ #include "flang/Evaluate/constant.h" #include "flang/Evaluate/expression.h" #include "flang/Evaluate/shape.h" +#include "flang/Evaluate/tools.h" #include "flang/Evaluate/type.h" #include @@ -392,8 +393,8 @@ std::size_t Constant::CopyFrom(const Constant &source, bool ComponentCompare::operator()(SymbolRef x, SymbolRef y) const { if (&x->owner() != &y->owner()) { // Not components of the same derived type; put ancestors' components first. - if (auto xDepth{DerivedTypeDepth(x->owner())}) { - if (auto yDepth{DerivedTypeDepth(y->owner())}) { + if (auto xDepth{CountDerivedTypeAncestors(x->owner())}) { + if (auto yDepth{CountDerivedTypeAncestors(y->owner())}) { if (*xDepth != *yDepth) { return *xDepth < *yDepth; } diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 6d0da63ead07a..3cfad03648aee 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1950,7 +1950,7 @@ bool IsVarSubexpressionOf( return VariableFinder{sub}(super); } -std::optional DerivedTypeDepth(const semantics::Scope &scope) { +std::optional CountDerivedTypeAncestors(const semantics::Scope &scope) { if (scope.IsDerivedType()) { for (auto iter{scope.cbegin()}; iter != scope.cend(); ++iter) { const Symbol &symbol{*iter->second}; @@ -1962,7 +1962,7 @@ std::optional DerivedTypeDepth(const semantics::Scope &scope) { parent = derived->typeSymbol().scope(); } if (parent) { - if (auto parentDepth{DerivedTypeDepth(*parent)}) { + if (auto parentDepth{CountDerivedTypeAncestors(*parent)}) { return 1 + *parentDepth; } } From 96a1e559ccc472a9f8444c889bcfba3aee8c274d Mon Sep 17 00:00:00 2001 From: CatherineMoore Date: Tue, 30 Sep 2025 17:55:57 -0400 Subject: [PATCH 318/878] [OpenMP] Update 6.1 implementation status. (#161449) @jhuber6: Please review --- clang/docs/OpenMPSupport.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 68ca7bedddb06..cf89e31aa93ef 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -580,9 +580,12 @@ implementation. 
| need_device_addr modifier for adjust_args clause | :part:`partial` | :none:`unclaimed` | Parsing/Sema: https://github.com/llvm/llvm-project/pull/143442 | | | | | https://github.com/llvm/llvm-project/pull/149586 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Prescriptive num_threads | :part:`In Progress` | :none:`unclaimed` | ro-i | +| Prescriptive num_threads | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/160659 | +| | | | https://github.com/llvm/llvm-project/pull/146403 | +| | | | https://github.com/llvm/llvm-project/pull/146404 | +| | | | https://github.com/llvm/llvm-project/pull/146405 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Message and severity clauses | :part:`In Progress` | :none:`unclaimed` | ro-i | +| Message and severity clauses | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/146093 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Local clause on declare target | :part:`In Progress` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ From 739425b1342d53d0314b4970b91ff7d334428105 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 30 Sep 2025 15:02:03 -0700 Subject: [PATCH 319/878] [NFC][LLVM] Use ListSeparator in AsmWriter (#161422) Use `ListSeparator` instead of manual code when generating comma separated lists. Also replace `FieldSeparator` with `ListSeparator` as they both provide identical functionality. 
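For reference, a minimal sketch of the idiom being rolled out (printList is a made-up example function; ListSeparator itself comes from llvm/ADT/StringExtras.h and prints nothing on first use and its separator, ", " by default, on every later use):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/StringExtras.h"
    #include "llvm/Support/raw_ostream.h"

    // Emits "1, 2, 3" for {1, 2, 3} with no first-element special case,
    // replacing the manual bool/index checks this patch removes.
    void printList(llvm::raw_ostream &OS, llvm::ArrayRef<int> Vals) {
      llvm::ListSeparator LS; // defaults to ", "
      for (int V : Vals)
        OS << LS << V;
    }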
--- llvm/lib/IR/AsmWriter.cpp | 237 +++++++++++++++----------------------- 1 file changed, 96 insertions(+), 141 deletions(-) diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 1a518305cffbe..54b92c9d35915 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -516,19 +516,15 @@ static void PrintShuffleMask(raw_ostream &Out, Type *Ty, ArrayRef Mask) { if (isa(Ty)) Out << "vscale x "; Out << Mask.size() << " x i32> "; - bool FirstElt = true; if (all_of(Mask, [](int Elt) { return Elt == 0; })) { Out << "zeroinitializer"; } else if (all_of(Mask, [](int Elt) { return Elt == PoisonMaskElem; })) { Out << "poison"; } else { Out << "<"; + ListSeparator LS; for (int Elt : Mask) { - if (FirstElt) - FirstElt = false; - else - Out << ", "; - Out << "i32 "; + Out << LS << "i32 "; if (Elt == PoisonMaskElem) Out << "poison"; else @@ -1700,14 +1696,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, if (const ConstantArray *CA = dyn_cast(CV)) { Type *ETy = CA->getType()->getElementType(); Out << '['; - WriterCtx.TypePrinter->print(ETy, Out); - Out << ' '; - WriteAsOperandInternal(Out, CA->getOperand(0), WriterCtx); - for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) { - Out << ", "; + ListSeparator LS; + for (const Value *Op : CA->operands()) { + Out << LS; WriterCtx.TypePrinter->print(ETy, Out); Out << ' '; - WriteAsOperandInternal(Out, CA->getOperand(i), WriterCtx); + WriteAsOperandInternal(Out, Op, WriterCtx); } Out << ']'; return; @@ -1725,11 +1719,9 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, Type *ETy = CA->getType()->getElementType(); Out << '['; - WriterCtx.TypePrinter->print(ETy, Out); - Out << ' '; - WriteAsOperandInternal(Out, CA->getElementAsConstant(0), WriterCtx); - for (uint64_t i = 1, e = CA->getNumElements(); i != e; ++i) { - Out << ", "; + ListSeparator LS; + for (uint64_t i = 0, e = CA->getNumElements(); i != e; ++i) { + Out << LS; WriterCtx.TypePrinter->print(ETy, Out); Out << ' '; WriteAsOperandInternal(Out, CA->getElementAsConstant(i), WriterCtx); @@ -1742,24 +1734,17 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, if (CS->getType()->isPacked()) Out << '<'; Out << '{'; - unsigned N = CS->getNumOperands(); - if (N) { - Out << ' '; - WriterCtx.TypePrinter->print(CS->getOperand(0)->getType(), Out); + if (CS->getNumOperands() != 0) { Out << ' '; - - WriteAsOperandInternal(Out, CS->getOperand(0), WriterCtx); - - for (unsigned i = 1; i < N; i++) { - Out << ", "; - WriterCtx.TypePrinter->print(CS->getOperand(i)->getType(), Out); + ListSeparator LS; + for (const Value *Op : CS->operands()) { + Out << LS; + WriterCtx.TypePrinter->print(Op->getType(), Out); Out << ' '; - - WriteAsOperandInternal(Out, CS->getOperand(i), WriterCtx); + WriteAsOperandInternal(Out, Op, WriterCtx); } Out << ' '; } - Out << '}'; if (CS->getType()->isPacked()) Out << '>'; @@ -1787,11 +1772,9 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } Out << '<'; - WriterCtx.TypePrinter->print(ETy, Out); - Out << ' '; - WriteAsOperandInternal(Out, CV->getAggregateElement(0U), WriterCtx); - for (unsigned i = 1, e = CVVTy->getNumElements(); i != e; ++i) { - Out << ", "; + ListSeparator LS; + for (unsigned i = 0, e = CVVTy->getNumElements(); i != e; ++i) { + Out << LS; WriterCtx.TypePrinter->print(ETy, Out); Out << ' '; WriteAsOperandInternal(Out, CV->getAggregateElement(i), WriterCtx); @@ -1848,13 +1831,12 @@ static void WriteConstantInternal(raw_ostream &Out, const 
Constant *CV, Out << ", "; } - for (User::const_op_iterator OI = CE->op_begin(); OI != CE->op_end(); - ++OI) { - WriterCtx.TypePrinter->print((*OI)->getType(), Out); + ListSeparator LS; + for (const Value *Op : CE->operands()) { + Out << LS; + WriterCtx.TypePrinter->print(Op->getType(), Out); Out << ' '; - WriteAsOperandInternal(Out, *OI, WriterCtx); - if (OI+1 != CE->op_end()) - Out << ", "; + WriteAsOperandInternal(Out, Op, WriterCtx); } if (CE->isCast()) { @@ -1875,11 +1857,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, static void writeMDTuple(raw_ostream &Out, const MDTuple *Node, AsmWriterContext &WriterCtx) { Out << "!{"; - for (unsigned mi = 0, me = Node->getNumOperands(); mi != me; ++mi) { - const Metadata *MD = Node->getOperand(mi); - if (!MD) + ListSeparator LS; + for (const Metadata *MD : Node->operands()) { + Out << LS; + if (!MD) { Out << "null"; - else if (auto *MDV = dyn_cast(MD)) { + } else if (auto *MDV = dyn_cast(MD)) { Value *V = MDV->getValue(); WriterCtx.TypePrinter->print(V->getType(), Out); Out << ' '; @@ -1888,8 +1871,6 @@ static void writeMDTuple(raw_ostream &Out, const MDTuple *Node, WriteAsOperandInternal(Out, MD, WriterCtx); WriterCtx.onWriteMetadataAsOperand(MD); } - if (mi + 1 != me) - Out << ", "; } Out << "}"; @@ -1897,24 +1878,9 @@ static void writeMDTuple(raw_ostream &Out, const MDTuple *Node, namespace { -struct FieldSeparator { - bool Skip = true; - const char *Sep; - - FieldSeparator(const char *Sep = ", ") : Sep(Sep) {} -}; - -raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { - if (FS.Skip) { - FS.Skip = false; - return OS; - } - return OS << FS.Sep; -} - struct MDFieldPrinter { raw_ostream &Out; - FieldSeparator FS; + ListSeparator FS; AsmWriterContext &WriterCtx; explicit MDFieldPrinter(raw_ostream &Out) @@ -2051,7 +2017,7 @@ void MDFieldPrinter::printDIFlags(StringRef Name, DINode::DIFlags Flags) { SmallVector SplitFlags; auto Extra = DINode::splitFlags(Flags, SplitFlags); - FieldSeparator FlagsFS(" | "); + ListSeparator FlagsFS(" | "); for (auto F : SplitFlags) { auto StringF = DINode::getFlagString(F); assert(!StringF.empty() && "Expected valid flag"); @@ -2075,7 +2041,7 @@ void MDFieldPrinter::printDISPFlags(StringRef Name, SmallVector SplitFlags; auto Extra = DISubprogram::splitFlags(Flags, SplitFlags); - FieldSeparator FlagsFS(" | "); + ListSeparator FlagsFS(" | "); for (auto F : SplitFlags) { auto StringF = DISubprogram::getFlagString(F); assert(!StringF.empty() && "Expected valid flag"); @@ -2124,7 +2090,7 @@ static void writeGenericDINode(raw_ostream &Out, const GenericDINode *N, Printer.printString("header", N->getHeader()); if (N->getNumDwarfOperands()) { Out << Printer.FS << "operands: {"; - FieldSeparator IFS; + ListSeparator IFS; for (auto &I : N->dwarf_operands()) { Out << IFS; writeMetadataAsOperand(Out, I, WriterCtx); @@ -2638,7 +2604,7 @@ static void writeDILabel(raw_ostream &Out, const DILabel *N, static void writeDIExpression(raw_ostream &Out, const DIExpression *N, AsmWriterContext &WriterCtx) { Out << "!DIExpression("; - FieldSeparator FS; + ListSeparator FS; if (N->isValid()) { for (const DIExpression::ExprOperand &Op : N->expr_ops()) { auto OpStr = dwarf::OperationEncodingString(Op.getOp()); @@ -2666,7 +2632,7 @@ static void writeDIArgList(raw_ostream &Out, const DIArgList *N, assert(FromValue && "Unexpected DIArgList metadata outside of value argument"); Out << "!DIArgList("; - FieldSeparator FS; + ListSeparator FS; MDFieldPrinter Printer(Out, WriterCtx); for (Metadata *Arg : 
N->getArgs()) { Out << FS; @@ -3073,15 +3039,11 @@ void AssemblyWriter::writeOperandBundles(const CallBase *Call) { Out << " [ "; - bool FirstBundle = true; + ListSeparator LS; for (unsigned i = 0, e = Call->getNumOperandBundles(); i != e; ++i) { OperandBundleUse BU = Call->getOperandBundleAt(i); - if (!FirstBundle) - Out << ", "; - FirstBundle = false; - - Out << '"'; + Out << LS << '"'; printEscapedString(BU.getTagName(), Out); Out << '"'; @@ -3229,7 +3191,7 @@ void AssemblyWriter::printModuleSummaryIndex() { Out << "path: \""; printEscapedString(ModPair.first, Out); Out << "\", hash: ("; - FieldSeparator FS; + ListSeparator FS; for (auto Hash : ModPair.second) Out << FS << Hash; Out << "))\n"; @@ -3347,7 +3309,7 @@ void AssemblyWriter::printTypeIdSummary(const TypeIdSummary &TIS) { printTypeTestResolution(TIS.TTRes); if (!TIS.WPDRes.empty()) { Out << ", wpdResolutions: ("; - FieldSeparator FS; + ListSeparator FS; for (auto &WPDRes : TIS.WPDRes) { Out << FS; Out << "(offset: " << WPDRes.first << ", "; @@ -3362,7 +3324,7 @@ void AssemblyWriter::printTypeIdSummary(const TypeIdSummary &TIS) { void AssemblyWriter::printTypeIdCompatibleVtableSummary( const TypeIdCompatibleVtableInfo &TI) { Out << ", summary: ("; - FieldSeparator FS; + ListSeparator FS; for (auto &P : TI) { Out << FS; Out << "(offset: " << P.AddressPointOffset << ", "; @@ -3374,7 +3336,7 @@ void AssemblyWriter::printTypeIdCompatibleVtableSummary( void AssemblyWriter::printArgs(const std::vector &Args) { Out << "args: ("; - FieldSeparator FS; + ListSeparator FS; for (auto arg : Args) { Out << FS; Out << arg; @@ -3391,7 +3353,7 @@ void AssemblyWriter::printWPDRes(const WholeProgramDevirtResolution &WPDRes) { if (!WPDRes.ResByArg.empty()) { Out << ", resByArg: ("; - FieldSeparator FS; + ListSeparator FS; for (auto &ResByArg : WPDRes.ResByArg) { Out << FS; printArgs(ResByArg.first); @@ -3451,7 +3413,7 @@ void AssemblyWriter::printGlobalVarSummary(const GlobalVarSummary *GS) { if (!VTableFuncs.empty()) { Out << ", vTableFuncs: ("; - FieldSeparator FS; + ListSeparator FS; for (auto &P : VTableFuncs) { Out << FS; Out << "(virtFunc: ^" << Machine.getGUIDSlot(P.FuncVI.getGUID()) @@ -3528,7 +3490,7 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { if (!FS->calls().empty()) { Out << ", calls: ("; - FieldSeparator IFS; + ListSeparator IFS; for (auto &Call : FS->calls()) { Out << IFS; Out << "(callee: ^" << Machine.getGUIDSlot(Call.first.getGUID()); @@ -3566,22 +3528,22 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { if (!FS->allocs().empty()) { Out << ", allocs: ("; - FieldSeparator AFS; + ListSeparator AFS; for (auto &AI : FS->allocs()) { Out << AFS; Out << "(versions: ("; - FieldSeparator VFS; + ListSeparator VFS; for (auto V : AI.Versions) { Out << VFS; Out << AllocTypeName(V); } Out << "), memProf: ("; - FieldSeparator MIBFS; + ListSeparator MIBFS; for (auto &MIB : AI.MIBs) { Out << MIBFS; Out << "(type: " << AllocTypeName((uint8_t)MIB.AllocType); Out << ", stackIds: ("; - FieldSeparator SIDFS; + ListSeparator SIDFS; for (auto Id : MIB.StackIdIndices) { Out << SIDFS; Out << TheIndex->getStackIdAtIndex(Id); @@ -3595,7 +3557,7 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { if (!FS->callsites().empty()) { Out << ", callsites: ("; - FieldSeparator SNFS; + ListSeparator SNFS; for (auto &CI : FS->callsites()) { Out << SNFS; if (CI.Callee) @@ -3603,13 +3565,13 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { else Out << "(callee: null"; Out 
<< ", clones: ("; - FieldSeparator VFS; + ListSeparator VFS; for (auto V : CI.Clones) { Out << VFS; Out << V; } Out << "), stackIds: ("; - FieldSeparator SIDFS; + ListSeparator SIDFS; for (auto Id : CI.StackIdIndices) { Out << SIDFS; Out << TheIndex->getStackIdAtIndex(Id); @@ -3625,7 +3587,7 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { if (!FS->paramAccesses().empty()) { Out << ", params: ("; - FieldSeparator IFS; + ListSeparator IFS; for (auto &PS : FS->paramAccesses()) { Out << IFS; Out << "(param: " << PS.ParamNo; @@ -3633,7 +3595,7 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { PrintRange(PS.Use); if (!PS.Calls.empty()) { Out << ", calls: ("; - FieldSeparator IFS; + ListSeparator IFS; for (auto &Call : PS.Calls) { Out << IFS; Out << "(callee: ^" << Machine.getGUIDSlot(Call.Callee.getGUID()); @@ -3653,11 +3615,11 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) { void AssemblyWriter::printTypeIdInfo( const FunctionSummary::TypeIdInfo &TIDInfo) { Out << ", typeIdInfo: ("; - FieldSeparator TIDFS; + ListSeparator TIDFS; if (!TIDInfo.TypeTests.empty()) { Out << TIDFS; Out << "typeTests: ("; - FieldSeparator FS; + ListSeparator FS; for (auto &GUID : TIDInfo.TypeTests) { auto TidIter = TheIndex->typeIds().equal_range(GUID); if (TidIter.first == TidIter.second) { @@ -3706,7 +3668,7 @@ void AssemblyWriter::printVFuncId(const FunctionSummary::VFuncId VFId) { return; } // Print all type id that correspond to this GUID. - FieldSeparator FS; + ListSeparator FS; for (const auto &[GUID, TypeIdPair] : make_range(TidIter)) { Out << FS; Out << "vFuncId: ("; @@ -3721,7 +3683,7 @@ void AssemblyWriter::printVFuncId(const FunctionSummary::VFuncId VFId) { void AssemblyWriter::printNonConstVCalls( const std::vector &VCallList, const char *Tag) { Out << Tag << ": ("; - FieldSeparator FS; + ListSeparator FS; for (auto &VFuncId : VCallList) { Out << FS; printVFuncId(VFuncId); @@ -3733,7 +3695,7 @@ void AssemblyWriter::printConstVCalls( const std::vector &VCallList, const char *Tag) { Out << Tag << ": ("; - FieldSeparator FS; + ListSeparator FS; for (auto &ConstVCall : VCallList) { Out << FS; Out << "("; @@ -3774,7 +3736,7 @@ void AssemblyWriter::printSummary(const GlobalValueSummary &Summary) { auto RefList = Summary.refs(); if (!RefList.empty()) { Out << ", refs: ("; - FieldSeparator FS; + ListSeparator FS; for (auto &Ref : RefList) { Out << FS; if (Ref.isReadOnly()) @@ -3797,7 +3759,7 @@ void AssemblyWriter::printSummaryInfo(unsigned Slot, const ValueInfo &VI) { Out << "guid: " << VI.getGUID(); if (!VI.getSummaryList().empty()) { Out << ", summaries: ("; - FieldSeparator FS; + ListSeparator FS; for (auto &Summary : VI.getSummaryList()) { Out << FS; printSummary(*Summary); @@ -3835,13 +3797,11 @@ void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) { Out << '!'; printMetadataIdentifier(NMD->getName(), Out); Out << " = !{"; - for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) { - if (i) - Out << ", "; - + ListSeparator LS; + for (const MDNode *Op : NMD->operands()) { + Out << LS; // Write DIExpressions inline. // FIXME: Ban DIExpressions in NamedMDNodes, they will serve no purpose. - MDNode *Op = NMD->getOperand(i); if (auto *Expr = dyn_cast(Op)) { writeDIExpression(Out, Expr, AsmWriterContext::getEmpty()); continue; @@ -4192,11 +4152,10 @@ void AssemblyWriter::printFunction(const Function *F) { // Loop over the arguments, printing them... 
if (F->isDeclaration() && !IsForDebug) { // We're only interested in the type here - don't print argument names. + ListSeparator LS; for (unsigned I = 0, E = FT->getNumParams(); I != E; ++I) { - // Insert commas as we go... the first arg doesn't get a comma - if (I) - Out << ", "; - // Output type... + Out << LS; + // Output type. TypePrinter.print(FT->getParamType(I), Out); AttributeSet ArgAttrs = Attrs.getParamAttrs(I); @@ -4207,10 +4166,9 @@ void AssemblyWriter::printFunction(const Function *F) { } } else { // The arguments are meaningful here, print them in detail. + ListSeparator LS; for (const Argument &Arg : F->args()) { - // Insert commas as we go... the first arg doesn't get a comma - if (Arg.getArgNo() != 0) - Out << ", "; + Out << LS; printArgument(&Arg, Attrs.getParamAttrs(Arg.getArgNo())); } } @@ -4332,16 +4290,14 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { // Output predecessors for the block. Out.PadToColumn(50); Out << ";"; - const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB); - - if (PI == PE) { + if (pred_empty(BB)) { Out << " No predecessors!"; } else { Out << " preds = "; - writeOperand(*PI, false); - for (++PI; PI != PE; ++PI) { - Out << ", "; - writeOperand(*PI, false); + ListSeparator LS; + for (const BasicBlock *Pred : predecessors(BB)) { + Out << LS; + writeOperand(Pred, false); } } } @@ -4520,9 +4476,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(Operand, true); Out << ", ["; + ListSeparator LS; for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i) { - if (i != 1) - Out << ", "; + Out << LS; writeOperand(I.getOperand(i), true); } Out << ']'; @@ -4531,9 +4487,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) { TypePrinter.print(I.getType(), Out); Out << ' '; + ListSeparator LS; for (unsigned op = 0, Eop = PN->getNumIncomingValues(); op < Eop; ++op) { - if (op) Out << ", "; - Out << "[ "; + Out << LS << "[ "; writeOperand(PN->getIncomingValue(op), false); Out << ", "; writeOperand(PN->getIncomingBlock(op), false); Out << " ]"; } @@ -4570,12 +4526,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << " within "; writeOperand(CatchSwitch->getParentPad(), /*PrintType=*/false); Out << " ["; - unsigned Op = 0; + ListSeparator LS; for (const BasicBlock *PadBB : CatchSwitch->handlers()) { - if (Op > 0) - Out << ", "; + Out << LS; writeOperand(PadBB, /*PrintType=*/true); - ++Op; } Out << "] unwind "; if (const BasicBlock *UnwindDest = CatchSwitch->getUnwindDest()) @@ -4586,10 +4540,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << " within "; writeOperand(FPI->getParentPad(), /*PrintType=*/false); Out << " ["; - for (unsigned Op = 0, NumOps = FPI->arg_size(); Op < NumOps; ++Op) { - if (Op > 0) - Out << ", "; - writeOperand(FPI->getArgOperand(Op), /*PrintType=*/true); + ListSeparator LS; + for (const Value *Op : FPI->arg_operands()) { + Out << LS; + writeOperand(Op, /*PrintType=*/true); } Out << ']'; } else if (isa(I) && !Operand) { @@ -4635,9 +4589,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ' '; writeOperand(Operand, false); Out << '('; + ListSeparator LS; for (unsigned op = 0, Eop = CI->arg_size(); op < Eop; ++op) { - if (op > 0) - Out << ", "; + Out << LS; writeParamOperand(CI->getArgOperand(op), PAL.getParamAttrs(op)); } @@ -4683,9 +4637,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ' '; writeOperand(Operand, false); Out << '('; + ListSeparator LS; for (unsigned op = 0, Eop = 
II->arg_size(); op < Eop; ++op) { - if (op) - Out << ", "; + Out << LS; writeParamOperand(II->getArgOperand(op), PAL.getParamAttrs(op)); } @@ -4723,9 +4677,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ' '; writeOperand(Operand, false); Out << '('; + ListSeparator ArgLS; for (unsigned op = 0, Eop = CBI->arg_size(); op < Eop; ++op) { - if (op) - Out << ", "; + Out << ArgLS; writeParamOperand(CBI->getArgOperand(op), PAL.getParamAttrs(op)); } @@ -4738,10 +4692,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << "\n to "; writeOperand(CBI->getDefaultDest(), true); Out << " ["; - for (unsigned i = 0, e = CBI->getNumIndirectDests(); i != e; ++i) { - if (i != 0) - Out << ", "; - writeOperand(CBI->getIndirectDest(i), true); + ListSeparator DestLS; + for (const BasicBlock *Dest : CBI->getIndirectDests()) { + Out << DestLS; + writeOperand(Dest, true); } Out << ']'; } else if (const AllocaInst *AI = dyn_cast(&I)) { @@ -4824,9 +4778,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } Out << ' '; - for (unsigned i = 0, E = I.getNumOperands(); i != E; ++i) { - if (i) Out << ", "; - writeOperand(I.getOperand(i), PrintAllTypes); + ListSeparator LS; + for (const Value *Op : I.operands()) { + Out << LS; + writeOperand(Op, PrintAllTypes); } } From 1c11f72344e8d2fdb29587572dd50db6b10fdd28 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 30 Sep 2025 15:19:53 -0700 Subject: [PATCH 320/878] [NFC] [IndVarSimplify] add overflowing tests (#159877) Also use UTC for test instead. --- .../IndVarSimplify/X86/overflow-intrinsics.ll | 268 ++++++++++++++++-- 1 file changed, 244 insertions(+), 24 deletions(-) diff --git a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll index 4a59e419369af..cb4e07ef3e26b 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/overflow-intrinsics.ll @@ -1,10 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=indvars < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define void @f_sadd(ptr %a) { -; CHECK-LABEL: @f_sadd( +; CHECK-LABEL: define void @f_sadd( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: br i1 false, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0:![0-9]+]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 16 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; entry: br label %for.body @@ -18,9 +37,6 @@ for.body: ; preds = %entry, %cont store i8 0, ptr %arrayidx, align 1 %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.04, i32 1) %1 = extractvalue { i32, i1 } %0, 1 -; CHECK: for.body: -; CHECK-NOT: @llvm.sadd.with.overflow 
-; CHECK: br i1 false, label %trap, label %cont, !nosanitize !0 br i1 %1, label %trap, label %cont, !nosanitize !{} trap: ; preds = %for.body @@ -33,8 +49,71 @@ cont: ; preds = %for.body br i1 %cmp, label %for.body, label %for.cond.cleanup } +define void @f_sadd_overflow(ptr %a) { +; CHECK-LABEL: define void @f_sadd_overflow( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 2147483645, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 2147483647 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: br i1 true, label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %cont + ret void + +for.body: ; preds = %entry, %cont + %i.04 = phi i32 [ 2147483645, %entry ], [ %2, %cont ] + %idxprom = sext i32 %i.04 to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + store i8 0, ptr %arrayidx, align 1 + %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.04, i32 1) + %1 = extractvalue { i32, i1 } %0, 1 + br i1 %1, label %trap, label %cont, !nosanitize !{} + +trap: ; preds = %for.body + tail call void @llvm.trap() #2, !nosanitize !{} + unreachable, !nosanitize !{} + +cont: ; preds = %for.body + %2 = extractvalue { i32, i1 } %0, 0 + %cmp = icmp sle i32 %2, 2147483647 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + define void @f_uadd(ptr %a) { -; CHECK-LABEL: @f_uadd( +; CHECK-LABEL: define void @f_uadd( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: br i1 false, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 16 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; entry: br label %for.body @@ -48,9 +127,6 @@ for.body: ; preds = %entry, %cont store i8 0, ptr %arrayidx, align 1 %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %i.04, i32 1) %1 = extractvalue { i32, i1 } %0, 1 -; CHECK: for.body: -; CHECK-NOT: @llvm.uadd.with.overflow -; CHECK: br i1 false, label %trap, label %cont, !nosanitize !0 br i1 %1, label %trap, label %cont, !nosanitize !{} trap: ; preds = %for.body @@ -63,8 +139,71 @@ cont: ; preds = %for.body br i1 %cmp, label %for.body, label %for.cond.cleanup } +define void @f_uadd_overflow(ptr %a) { 
+; CHECK-LABEL: define void @f_uadd_overflow( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -6, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: br i1 true, label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %cont + ret void + +for.body: ; preds = %entry, %cont + %i.04 = phi i32 [ 4294967290, %entry ], [ %2, %cont ] + %idxprom = sext i32 %i.04 to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + store i8 0, ptr %arrayidx, align 1 + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %i.04, i32 1) + %1 = extractvalue { i32, i1 } %0, 1 + br i1 %1, label %trap, label %cont, !nosanitize !{} + +trap: ; preds = %for.body + tail call void @llvm.trap(), !nosanitize !{} + unreachable, !nosanitize !{} + +cont: ; preds = %for.body + %2 = extractvalue { i32, i1 } %0, 0 + %cmp = icmp ule i32 %2, 4294967295 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + define void @f_ssub(ptr nocapture %a) { -; CHECK-LABEL: @f_ssub( +; CHECK-LABEL: define void @f_ssub( +; CHECK-SAME: ptr captures(none) [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 false, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[INDVARS_IV_NEXT]], -1 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; entry: br label %for.body @@ -78,9 +217,6 @@ for.body: ; preds = %entry, %cont store i8 0, ptr %arrayidx, align 1 %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %i.04, i32 1) %1 = extractvalue { i32, i1 } %0, 1 -; CHECK: for.body: -; CHECK-NOT: @llvm.ssub.with.overflow.i32 -; CHECK: br i1 false, label %trap, label %cont, !nosanitize !0 br i1 %1, label %trap, label %cont, !nosanitize !{} trap: ; preds = %for.body @@ -93,8 +229,76 @@ cont: ; preds = %for.body br i1 %cmp, label %for.body, label %for.cond.cleanup } +; It is theoretically possible to replace the `ssub.with.overflow` with a +; condition on the IV, but SCEV cannot represent non-unsigned-wrapping +; subtraction operations. 
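+; (Concretely: the IV here starts at -2147483642 and steps by -1, so the loop
+; traps on its seventh iteration, when it tries to compute -2147483648 - 1;
+; the overflow branch is reachable and the intrinsic has to be preserved.)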
+define void @f_ssub_overflow(ptr nocapture %a) { +; CHECK-LABEL: define void @f_ssub_overflow( +; CHECK-SAME: ptr captures(none) [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ -2147483642, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[TMP0]], i32 1) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1 +; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 true, label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %cont + ret void + +for.body: ; preds = %entry, %cont + %i.04 = phi i32 [ -2147483642, %entry ], [ %2, %cont ] + %idxprom = sext i32 %i.04 to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + store i8 0, ptr %arrayidx, align 1 + %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %i.04, i32 1) + %1 = extractvalue { i32, i1 } %0, 1 + br i1 %1, label %trap, label %cont, !nosanitize !{} + +trap: ; preds = %for.body + tail call void @llvm.trap(), !nosanitize !{} + unreachable, !nosanitize !{} + +cont: ; preds = %for.body + %2 = extractvalue { i32, i1 } %0, 0 + %cmp = icmp sge i32 %2, -2147483648 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + define void @f_usub(ptr nocapture %a) { -; CHECK-LABEL: @f_usub( +; CHECK-LABEL: define void @f_usub( +; CHECK-SAME: ptr captures(none) [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 false, label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i64 [[INDVARS_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; entry: br label %for.body @@ -109,9 +313,6 @@ for.body: ; preds = %entry, %cont %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %i.04, i32 1) %1 = extractvalue { i32, i1 } %0, 1 -; CHECK: for.body: -; CHECK-NOT: @llvm.usub.with.overflow.i32 -; CHECK: br i1 false, label %trap, label %cont, !nosanitize !0 br i1 %1, label %trap, label %cont, !nosanitize !{} trap: ; preds = %for.body @@ -124,8 +325,31 @@ cont: ; preds = %for.body br i1 %cmp, label %for.body, label %for.cond.cleanup } +; It is theoretically possible to replace the `usub.with.overflow` with a +; condition on the IV, but SCEV 
cannot represent non-unsigned-wrapping +; subtraction operations. define void @f_usub_overflow(ptr nocapture %a) { -; CHECK-LABEL: @f_usub_overflow( +; CHECK-LABEL: define void @f_usub_overflow( +; CHECK-SAME: ptr captures(none) [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[CONT:.*]] ], [ 15, %[[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[TMP0]], i32 1) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1 +; CHECK-NEXT: br i1 [[TMP2]], label %[[TRAP:.*]], label %[[CONT]], !nosanitize [[META0]] +; CHECK: [[TRAP]]: +; CHECK-NEXT: tail call void @llvm.trap(), !nosanitize [[META0]] +; CHECK-NEXT: unreachable, !nosanitize [[META0]] +; CHECK: [[CONT]]: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: br i1 true, label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] +; entry: br label %for.body @@ -139,13 +363,6 @@ for.body: ; preds = %entry, %cont store i8 0, ptr %arrayidx, align 1 %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %i.04, i32 1) %1 = extractvalue { i32, i1 } %0, 1 - -; It is theoretically possible to prove this, but SCEV cannot -; represent non-unsigned-wrapping subtraction operations. - -; CHECK: for.body: -; CHECK: [[COND:%[^ ]+]] = extractvalue { i32, i1 } %1, 1 -; CHECK-NEXT: br i1 [[COND]], label %trap, label %cont, !nosanitize !0 br i1 %1, label %trap, label %cont, !nosanitize !{} trap: ; preds = %for.body @@ -166,3 +383,6 @@ declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone declare { i32, i1 } @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone declare void @llvm.trap() #2 +;. +; CHECK: [[META0]] = !{} +;. 
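A note on what these tests pin down: in the non-overflowing variants, indvars widens the induction variable to i64 and SCEV can bound its range tightly enough to prove that the 32-bit add or subtract never wraps, so the extracted overflow bit constant-folds and the guard becomes `br i1 false`. In the `*_overflow` variants the starting value sits close enough to the type boundary that the wrap is reachable, so the check must survive, either as a rewritten exit condition on the IV (the add cases) or as the original intrinsic (the sub cases). A minimal C++ sketch of the underlying range argument; `stepCannotSignedWrap` is a hypothetical helper, not LLVM API:

    #include <cstdint>
    #include <limits>

    // An IV known to lie in [Lo, Hi] can take one more step without wrapping
    // signed 32-bit arithmetic iff both extremes stay in range. Computed in
    // int64_t so the check itself cannot overflow.
    static bool stepCannotSignedWrap(int64_t Lo, int64_t Hi, int64_t Step) {
      const int64_t Min = std::numeric_limits<int32_t>::min();
      const int64_t Max = std::numeric_limits<int32_t>::max();
      return Lo + Step >= Min && Hi + Step <= Max;
    }

    int main() {
      // f_sadd: IV ranges over [0, 15], step +1 -> overflow bit folds to false.
      bool SaddFolds = stepCannotSignedWrap(0, 15, 1);
      // f_sadd_overflow: IV reaches 2147483647, step +1 -> wrap is reachable.
      bool SaddOvReachable = !stepCannotSignedWrap(2147483645, 2147483647, 1);
      return (SaddFolds && SaddOvReachable) ? 0 : 1;
    }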
From b80b48d3e89eca606fde4443b603ba8fdd8f67c8 Mon Sep 17 00:00:00 2001
From: David Salinas
Date: Tue, 30 Sep 2025 18:23:11 -0400
Subject: [PATCH 321/878] Fix memory leak in Offloading API (#161430)

Fix for the failing Sanitizer buildbots from PR:
https://github.com/llvm/llvm-project/pull/143342
---
 llvm/lib/Object/OffloadBundle.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp
index 0dd378e65fd81..a6a9628acddcc 100644
--- a/llvm/lib/Object/OffloadBundle.cpp
+++ b/llvm/lib/Object/OffloadBundle.cpp
@@ -120,14 +120,15 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset,
   if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle)
     return errorCodeToError(object_error::parse_failed);
 
-  OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName);
+  std::unique_ptr<OffloadBundleFatBin> TheBundle(
+      new OffloadBundleFatBin(Buf, FileName));
 
   // Read the Bundle Entries
   Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset);
   if (Err)
     return Err;
 
-  return std::unique_ptr<OffloadBundleFatBin>(TheBundle);
+  return TheBundle;
 }
 
 Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) {

From 4e404d0e5135ae7d0d4e7cc4c9b04d456eb50a01 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Tue, 30 Sep 2025 16:31:04 -0700
Subject: [PATCH 322/878] [CodingStandard] Require Unix line endings for all
 files (#161228)

Require all files to use Unix line endings, formalizing an already
followed convention.
---
 llvm/docs/CodingStandards.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst
index dd275f292967b..65dd794103ac3 100644
--- a/llvm/docs/CodingStandards.rst
+++ b/llvm/docs/CodingStandards.rst
@@ -1790,6 +1790,12 @@ would help to avoid running into a "dangling else" situation.
     markAsIgnored(D);
   }
 
+Use Unix line endings for files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use Unix line endings for all files. CRLF line endings are allowed as an
+exception for test files that intend to test CRLF handling or when the file
+format requires it (like ``.bat`` or ``.rc`` files).
 
 See Also
 ========

From 69b0a479ac57b6c7ce2228e1944e8fc0997260ae Mon Sep 17 00:00:00 2001
From: Jon Chesterfield
Date: Wed, 1 Oct 2025 00:35:51 +0100
Subject: [PATCH 323/878] [AMDGPU] Precommit test for 160181

---
 ...e-lds-precise-allocate-to-module-struct.ll | 146 ++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
new file mode 100644
index 0000000000000..0de7f8f621a53
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
+
+; Regression test for issue 160181
+; One variable is chosen to be assigned at zero. Here, that's @both.
+; Other variables should then be allocated at fixed offsets from it, provided
+; they are allocated by all the other kernels that presently allocate the
+; variable at address zero.
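+; For example, with the globals below: kern_one (via func_one) uses @both,
+; @both_second and @one, while kern_two (via func_two) uses @both,
+; @both_second and @two. Only @both and @both_second are used by every
+; kernel that allocates the struct, so only they belong in the zero-address
+; module struct.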
+; The failure mode was in that second check - variables could be added to +; the module scope zero address struct even when some of the kernels allocating +; that struct do not need the additional variable. + +; With current llvm, all three of these integers are put in the module scope struct, when +; neither kern_one or kern_two access all three. + +@both = addrspace(3) global i32 poison +@both_second = addrspace(3) global i16 poison ; a second field in the module struct +@one = addrspace(3) global i32 poison +@two = addrspace(3) global i32 poison + + +;. +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +;. +define void @func_one() { +; CHECK-LABEL: define {{[^@]+}}@func_one() { +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1:![0-9]+]] +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18:![0-9]+]] +; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23:![0-9]+]] +; CHECK-NEXT: ret void +; + %val0 = load i32, ptr addrspace(3) @both + store i32 %val0, ptr addrspace(3) @one + store i16 10, ptr addrspace(3) @both_second + ret void +} + +define amdgpu_kernel void @kern_one() { +; CHECK-LABEL: define {{[^@]+}}@kern_one +; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META24:![0-9]+]] +; CHECK-NEXT: call void @func_one() +; CHECK-NEXT: ret void +; +entry: + call void @func_one() + ret void +} + +define void @func_two() { +; CHECK-LABEL: define {{[^@]+}}@func_two() { +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]] +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25:![0-9]+]] +; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]] +; CHECK-NEXT: ret void +; + %val0 = load i32, ptr addrspace(3) @both + store i32 %val0, ptr addrspace(3) @two + store i16 20, ptr addrspace(3) @both_second + ret void +} + +define amdgpu_kernel void @kern_two() { +; CHECK-LABEL: define {{[^@]+}}@kern_two +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META26:![0-9]+]], !noalias [[META27:![0-9]+]] +; CHECK-NEXT: call void @func_two() +; CHECK-NEXT: ret void +; +entry: + call void @func_two() + ret void +} + +; Unrelated to the bug at hand, but if a variable is only +; reachable from a single kernel, it gets allocated to a fixed +; address independent of the module scope struct. This kernel +; means the key variables miss that optimisation while @both +; remains the best candidate for address zero allocation. 
+define void @func_block_direct_allocation() { +; CHECK-LABEL: define {{[^@]+}}@func_block_direct_allocation() { +; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18]] +; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25]] +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[VAL1]], [[VAL2]] +; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]] +; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]] +; CHECK-NEXT: ret void +; + %val1 = load i32, ptr addrspace(3) @one + %val2 = load i32, ptr addrspace(3) @two + %sum = add i32 %val1, %val2 + store i32 %sum, ptr addrspace(3) @both + store i16 30, ptr addrspace(3) @both_second + ret void +} + +define amdgpu_kernel void @kern_block_direct_allocation() { +; CHECK-LABEL: define {{[^@]+}}@kern_block_direct_allocation +; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] +; CHECK-NEXT: call void @func_block_direct_allocation() +; CHECK-NEXT: call void @func_one() +; CHECK-NEXT: call void @func_two() +; CHECK-NEXT: ret void +; + call void @func_block_direct_allocation() + call void @func_one() + call void @func_two() + ret void +} +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +;. 
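+; (The 16 in "amdgpu-lds-size"="16" is the size of the module struct: three
+; i32 fields plus a trailing i16, padded out to the struct's 4-byte alignment.)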
+; CHECK: [[META0]] = !{i32 0, i32 1}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]], [[META14:![0-9]+]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]]}
+; CHECK: [[META3]] = distinct !{[[META3]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META3]]}
+; CHECK: [[META5]] = distinct !{[[META5]], [[META3]]}
+; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
+; CHECK: [[META7]] = distinct !{[[META7]]}
+; CHECK: [[META8]] = distinct !{[[META8]], [[META7]]}
+; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]}
+; CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]}
+; CHECK: [[META11]] = distinct !{[[META11]]}
+; CHECK: [[META12]] = distinct !{[[META12]], [[META11]]}
+; CHECK: [[META13]] = distinct !{[[META13]], [[META11]]}
+; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]}
+; CHECK: [[META15]] = distinct !{[[META15]]}
+; CHECK: [[META16]] = distinct !{[[META16]], [[META15]]}
+; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]}
+; CHECK: [[META18]] = !{[[META19:![0-9]+]], [[META2]], [[META5]], [[META20:![0-9]+]], [[META6]], [[META9]], [[META21:![0-9]+]], [[META10]], [[META13]], [[META22:![0-9]+]], [[META14]], [[META17]]}
+; CHECK: [[META19]] = distinct !{[[META19]], [[META3]]}
+; CHECK: [[META20]] = distinct !{[[META20]], [[META7]]}
+; CHECK: [[META21]] = distinct !{[[META21]], [[META11]]}
+; CHECK: [[META22]] = distinct !{[[META22]], [[META15]]}
+; CHECK: [[META23]] = !{[[META19]], [[META4]], [[META5]], [[META20]], [[META8]], [[META9]], [[META21]], [[META12]], [[META13]], [[META22]], [[META16]], [[META17]]}
+; CHECK: [[META24]] = !{[[META10]], [[META12]], [[META13]], [[META14]], [[META16]], [[META17]]}
+; CHECK: [[META25]] = !{[[META19]], [[META2]], [[META4]], [[META20]], [[META6]], [[META8]], [[META21]], [[META10]], [[META12]], [[META22]], [[META14]], [[META16]]}
+; CHECK: [[META26]] = !{[[META22]]}
+; CHECK: [[META27]] = !{[[META14]], [[META16]], [[META17]]}
+;.

From 0aa7da089aaab2b6e524c124701e645f2cdcab75 Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 30 Sep 2025 17:02:49 -0700
Subject: [PATCH 324/878] [llvm][mustache] Fix failing StandaloneIndentation
 test (#159192)

When rendering partials, we need to use an indentation stream, but when
part of the partial is an unescaped sequence, we must not indent it. To
address this, we build a common MustacheStream interface for all the
output streams to use. This allows us to further customize the
AddIndentationStream implementation and opt it out of indenting the
UnescapeSequence.
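In outline, the fix makes every rendering path write through one stream interface, so the stream that re-indents partial output can be told to stop re-indenting while a multi-line unescaped value is emitted. A minimal self-contained sketch of that mechanism; `IndentingStream` is an illustrative stand-in for the `MustacheOutputStream`/`AddIndentationStringStream` pair in the diff below, not the actual class:

    #include <iostream>
    #include <string>

    // Re-indents after every newline, except while indentation is suspended,
    // which is how multi-line unescaped ({{{...}}}) values stay verbatim.
    class IndentingStream {
      std::ostream &OS;
      std::string Indent;
      bool NeedsIndent = true;
      bool Suspended = false;

    public:
      IndentingStream(std::ostream &OS, unsigned Indentation)
          : OS(OS), Indent(Indentation, ' ') {}
      void suspend() { Suspended = true; }
      void resume() { Suspended = false; }
      void write(const std::string &S) {
        for (char C : S) {
          if (NeedsIndent && C != '\n') {
            OS << Indent;
            NeedsIndent = false;
          }
          OS << C;
          if (C == '\n' && !Suspended)
            NeedsIndent = true; // arm indentation for the next line
        }
      }
    };

    int main() {
      IndentingStream S(std::cout, 2);
      S.write("|\n");        // partial text: indented
      S.suspend();
      S.write("raw\nvalue"); // unescaped payload: inner newline not re-indented
      S.resume();
      S.write("\n|\n");      // partial text again: this newline re-arms indenting
      // Prints "  |", "  raw", "value", "  |" on successive lines.
    }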
--- llvm/lib/Support/Mustache.cpp | 152 ++++++--- llvm/unittests/Support/MustacheTest.cpp | 313 +++++++++--------- .../llvm-test-mustache-spec.cpp | 1 - 3 files changed, 263 insertions(+), 203 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 646d7a0ff9c0e..6275e5eccf03a 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -56,6 +56,33 @@ static Accessor splitMustacheString(StringRef Str) { namespace llvm::mustache { +class MustacheOutputStream : public raw_ostream { +public: + MustacheOutputStream() = default; + ~MustacheOutputStream() override = default; + + virtual void suspendIndentation() {} + virtual void resumeIndentation() {} + +private: + void anchor() override; +}; + +void MustacheOutputStream::anchor() {} + +class RawMustacheOutputStream : public MustacheOutputStream { +public: + RawMustacheOutputStream(raw_ostream &OS) : OS(OS) { SetUnbuffered(); } + +private: + raw_ostream &OS; + + void write_impl(const char *Ptr, size_t Size) override { + OS.write(Ptr, Size); + } + uint64_t current_pos() const override { return OS.tell(); } +}; + class Token { public: enum class Type { @@ -156,29 +183,31 @@ class ASTNode { void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; }; - void render(const llvm::json::Value &Data, llvm::raw_ostream &OS); + void render(const llvm::json::Value &Data, MustacheOutputStream &OS); private: - void renderLambdas(const llvm::json::Value &Contexts, llvm::raw_ostream &OS, - Lambda &L); + void renderLambdas(const llvm::json::Value &Contexts, + MustacheOutputStream &OS, Lambda &L); void renderSectionLambdas(const llvm::json::Value &Contexts, - llvm::raw_ostream &OS, SectionLambda &L); + MustacheOutputStream &OS, SectionLambda &L); - void renderPartial(const llvm::json::Value &Contexts, llvm::raw_ostream &OS, - ASTNode *Partial); + void renderPartial(const llvm::json::Value &Contexts, + MustacheOutputStream &OS, ASTNode *Partial); - void renderChild(const llvm::json::Value &Context, llvm::raw_ostream &OS); + void renderChild(const llvm::json::Value &Context, MustacheOutputStream &OS); const llvm::json::Value *findContext(); - void renderRoot(const json::Value &CurrentCtx, raw_ostream &OS); - void renderText(raw_ostream &OS); - void renderPartial(const json::Value &CurrentCtx, raw_ostream &OS); - void renderVariable(const json::Value &CurrentCtx, raw_ostream &OS); - void renderUnescapeVariable(const json::Value &CurrentCtx, raw_ostream &OS); - void renderSection(const json::Value &CurrentCtx, raw_ostream &OS); - void renderInvertSection(const json::Value &CurrentCtx, raw_ostream &OS); + void renderRoot(const json::Value &CurrentCtx, MustacheOutputStream &OS); + void renderText(MustacheOutputStream &OS); + void renderPartial(const json::Value &CurrentCtx, MustacheOutputStream &OS); + void renderVariable(const json::Value &CurrentCtx, MustacheOutputStream &OS); + void renderUnescapeVariable(const json::Value &CurrentCtx, + MustacheOutputStream &OS); + void renderSection(const json::Value &CurrentCtx, MustacheOutputStream &OS); + void renderInvertSection(const json::Value &CurrentCtx, + MustacheOutputStream &OS); MustacheContext &Ctx; Type Ty; @@ -455,7 +484,7 @@ static SmallVector tokenize(StringRef Template) { } // Custom stream to escape strings. 
-class EscapeStringStream : public raw_ostream { +class EscapeStringStream : public MustacheOutputStream { public: explicit EscapeStringStream(llvm::raw_ostream &WrappedStream, EscapeMap &Escape) @@ -497,15 +526,18 @@ class EscapeStringStream : public raw_ostream { }; // Custom stream to add indentation used to for rendering partials. -class AddIndentationStringStream : public raw_ostream { +class AddIndentationStringStream : public MustacheOutputStream { public: - explicit AddIndentationStringStream(llvm::raw_ostream &WrappedStream, + explicit AddIndentationStringStream(raw_ostream &WrappedStream, size_t Indentation) : Indentation(Indentation), WrappedStream(WrappedStream), - NeedsIndent(true) { + NeedsIndent(true), IsSuspended(false) { SetUnbuffered(); } + void suspendIndentation() override { IsSuspended = true; } + void resumeIndentation() override { IsSuspended = false; } + protected: void write_impl(const char *Ptr, size_t Size) override { llvm::StringRef Data(Ptr, Size); @@ -513,12 +545,15 @@ class AddIndentationStringStream : public raw_ostream { Indent.resize(Indentation, ' '); for (char C : Data) { + LLVM_DEBUG(dbgs() << "IndentationStream: NeedsIndent=" << NeedsIndent + << ", C='" << C << "', Indentation=" << Indentation + << "\n"); if (NeedsIndent && C != '\n') { WrappedStream << Indent; NeedsIndent = false; } WrappedStream << C; - if (C == '\n') + if (C == '\n' && !IsSuspended) NeedsIndent = true; } } @@ -527,8 +562,9 @@ class AddIndentationStringStream : public raw_ostream { private: size_t Indentation; - llvm::raw_ostream &WrappedStream; + raw_ostream &WrappedStream; bool NeedsIndent; + bool IsSuspended; }; class Parser { @@ -618,6 +654,7 @@ void Parser::parseMustache(ASTNode *Parent) { } } static void toMustacheString(const json::Value &Data, raw_ostream &OS) { + LLVM_DEBUG(dbgs() << "toMustacheString: kind=" << (int)Data.kind() << "\n"); switch (Data.kind()) { case json::Value::Null: return; @@ -630,6 +667,7 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { } case json::Value::String: { auto Str = *Data.getAsString(); + LLVM_DEBUG(dbgs() << " --> writing string: \"" << Str << "\"\n"); OS << Str.str(); return; } @@ -649,19 +687,24 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) { } } -void ASTNode::renderRoot(const json::Value &CurrentCtx, raw_ostream &OS) { +void ASTNode::renderRoot(const json::Value &CurrentCtx, + MustacheOutputStream &OS) { renderChild(CurrentCtx, OS); } -void ASTNode::renderText(raw_ostream &OS) { OS << Body; } +void ASTNode::renderText(MustacheOutputStream &OS) { OS << Body; } -void ASTNode::renderPartial(const json::Value &CurrentCtx, raw_ostream &OS) { +void ASTNode::renderPartial(const json::Value &CurrentCtx, + MustacheOutputStream &OS) { + LLVM_DEBUG(dbgs() << "renderPartial: Accessor=" << AccessorValue[0] + << ", Indentation=" << Indentation << "\n"); auto Partial = Ctx.Partials.find(AccessorValue[0]); if (Partial != Ctx.Partials.end()) renderPartial(CurrentCtx, OS, Partial->getValue().get()); } -void ASTNode::renderVariable(const json::Value &CurrentCtx, raw_ostream &OS) { +void ASTNode::renderVariable(const json::Value &CurrentCtx, + MustacheOutputStream &OS) { auto Lambda = Ctx.Lambdas.find(AccessorValue[0]); if (Lambda != Ctx.Lambdas.end()) { renderLambdas(CurrentCtx, OS, Lambda->getValue()); @@ -672,16 +715,22 @@ void ASTNode::renderVariable(const json::Value &CurrentCtx, raw_ostream &OS) { } void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx, - raw_ostream &OS) { + 
MustacheOutputStream &OS) { + LLVM_DEBUG(dbgs() << "renderUnescapeVariable: Accessor=" << AccessorValue[0] + << "\n"); auto Lambda = Ctx.Lambdas.find(AccessorValue[0]); if (Lambda != Ctx.Lambdas.end()) { renderLambdas(CurrentCtx, OS, Lambda->getValue()); } else if (const json::Value *ContextPtr = findContext()) { + LLVM_DEBUG(dbgs() << " --> Found context value, writing to stream.\n"); + OS.suspendIndentation(); toMustacheString(*ContextPtr, OS); + OS.resumeIndentation(); } } -void ASTNode::renderSection(const json::Value &CurrentCtx, raw_ostream &OS) { +void ASTNode::renderSection(const json::Value &CurrentCtx, + MustacheOutputStream &OS) { auto SectionLambda = Ctx.SectionLambdas.find(AccessorValue[0]); if (SectionLambda != Ctx.SectionLambdas.end()) { renderSectionLambdas(CurrentCtx, OS, SectionLambda->getValue()); @@ -701,7 +750,7 @@ void ASTNode::renderSection(const json::Value &CurrentCtx, raw_ostream &OS) { } void ASTNode::renderInvertSection(const json::Value &CurrentCtx, - raw_ostream &OS) { + MustacheOutputStream &OS) { bool IsLambda = Ctx.SectionLambdas.contains(AccessorValue[0]); const json::Value *ContextPtr = findContext(); if (isContextFalsey(ContextPtr) && !IsLambda) { @@ -709,40 +758,42 @@ void ASTNode::renderInvertSection(const json::Value &CurrentCtx, } } -void ASTNode::render(const json::Value &CurrentCtx, raw_ostream &OS) { +void ASTNode::render(const llvm::json::Value &Data, MustacheOutputStream &OS) { if (Ty != Root && Ty != Text && AccessorValue.empty()) return; // Set the parent context to the incoming context so that we // can walk up the context tree correctly in findContext(). - ParentContext = &CurrentCtx; + ParentContext = &Data; switch (Ty) { case Root: - renderRoot(CurrentCtx, OS); + renderRoot(Data, OS); return; case Text: renderText(OS); return; case Partial: - renderPartial(CurrentCtx, OS); + renderPartial(Data, OS); return; case Variable: - renderVariable(CurrentCtx, OS); + renderVariable(Data, OS); return; case UnescapeVariable: - renderUnescapeVariable(CurrentCtx, OS); + renderUnescapeVariable(Data, OS); return; case Section: - renderSection(CurrentCtx, OS); + renderSection(Data, OS); return; case InvertSection: - renderInvertSection(CurrentCtx, OS); + renderInvertSection(Data, OS); return; } llvm_unreachable("Invalid ASTNode type"); } const json::Value *ASTNode::findContext() { + LLVM_DEBUG(dbgs() << "findContext: AccessorValue[0]=" << AccessorValue[0] + << "\n"); // The mustache spec allows for dot notation to access nested values // a single dot refers to the current context. 
// We attempt to find the JSON context in the current node, if it is not @@ -757,12 +808,22 @@ const json::Value *ASTNode::findContext() { StringRef CurrentAccessor = AccessorValue[0]; ASTNode *CurrentParent = Parent; + LLVM_DEBUG(dbgs() << "findContext: ParentContext: "; + if (ParentContext) ParentContext->print(dbgs()); + else dbgs() << "nullptr"; dbgs() << "\n"); + while (!CurrentContext || !CurrentContext->get(CurrentAccessor)) { + LLVM_DEBUG(dbgs() << "findContext: climbing parent\n"); if (CurrentParent->Ty != Root) { CurrentContext = CurrentParent->ParentContext->getAsObject(); CurrentParent = CurrentParent->Parent; + LLVM_DEBUG(dbgs() << "findContext: new ParentContext: "; + if (CurrentParent->ParentContext) + CurrentParent->ParentContext->print(dbgs()); + else dbgs() << "nullptr"; dbgs() << "\n"); continue; } + LLVM_DEBUG(dbgs() << "findContext: reached root, not found\n"); return nullptr; } const json::Value *Context = nullptr; @@ -778,22 +839,28 @@ const json::Value *ASTNode::findContext() { Context = CurrentValue; } } + LLVM_DEBUG(dbgs() << "findContext: found value: "; + if (Context) Context->print(dbgs()); else dbgs() << "nullptr"; + dbgs() << "\n"); return Context; } -void ASTNode::renderChild(const json::Value &Contexts, llvm::raw_ostream &OS) { +void ASTNode::renderChild(const json::Value &Contexts, + MustacheOutputStream &OS) { for (AstPtr &Child : Children) Child->render(Contexts, OS); } -void ASTNode::renderPartial(const json::Value &Contexts, llvm::raw_ostream &OS, - ASTNode *Partial) { +void ASTNode::renderPartial(const json::Value &Contexts, + MustacheOutputStream &OS, ASTNode *Partial) { + LLVM_DEBUG(dbgs() << "renderPartial (helper): Indentation=" << Indentation + << "\n"); AddIndentationStringStream IS(OS, Indentation); Partial->render(Contexts, IS); } -void ASTNode::renderLambdas(const json::Value &Contexts, llvm::raw_ostream &OS, - Lambda &L) { +void ASTNode::renderLambdas(const json::Value &Contexts, + MustacheOutputStream &OS, Lambda &L) { json::Value LambdaResult = L(); std::string LambdaStr; raw_string_ostream Output(LambdaStr); @@ -810,7 +877,7 @@ void ASTNode::renderLambdas(const json::Value &Contexts, llvm::raw_ostream &OS, } void ASTNode::renderSectionLambdas(const json::Value &Contexts, - llvm::raw_ostream &OS, SectionLambda &L) { + MustacheOutputStream &OS, SectionLambda &L) { json::Value Return = L(RawBody); if (isFalsey(Return)) return; @@ -823,7 +890,8 @@ void ASTNode::renderSectionLambdas(const json::Value &Contexts, } void Template::render(const json::Value &Data, llvm::raw_ostream &OS) { - Tree->render(Data, OS); + RawMustacheOutputStream MOS(OS); + Tree->render(Data, MOS); } void Template::registerPartial(std::string Name, std::string Partial) { diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp index 83f6e9afd1e71..e2c4422f32fd1 100644 --- a/llvm/unittests/Support/MustacheTest.cpp +++ b/llvm/unittests/Support/MustacheTest.cpp @@ -22,7 +22,7 @@ using namespace llvm::json; TEST(MustacheInterpolation, NoInterpolation) { // Mustache-free templates should render as-is. Value D = {}; - auto T = Template("Hello from {Mustache}!\n"); + Template T("Hello from {Mustache}!\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -32,7 +32,7 @@ TEST(MustacheInterpolation, NoInterpolation) { TEST(MustacheInterpolation, BasicInterpolation) { // Unadorned tags should interpolate content into the template. 
Value D = Object{{"subject", "World"}}; - auto T = Template("Hello, {{subject}}!"); + Template T("Hello, {{subject}}!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -42,7 +42,7 @@ TEST(MustacheInterpolation, BasicInterpolation) { TEST(MustacheInterpolation, NoReinterpolation) { // Interpolated tag output should not be re-interpolated. Value D = Object{{"template", "{{planet}}"}, {"planet", "Earth"}}; - auto T = Template("{{template}}: {{planet}}"); + Template T("{{template}}: {{planet}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -54,7 +54,7 @@ TEST(MustacheInterpolation, HTMLEscaping) { Value D = Object{ {"forbidden", "& \" < >"}, }; - auto T = Template("These characters should be HTML escaped: {{forbidden}}\n"); + Template T("These characters should be HTML escaped: {{forbidden}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -67,8 +67,7 @@ TEST(MustacheInterpolation, Ampersand) { Value D = Object{ {"forbidden", "& \" < >"}, }; - auto T = - Template("These characters should not be HTML escaped: {{&forbidden}}\n"); + Template T("These characters should not be HTML escaped: {{&forbidden}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -78,7 +77,7 @@ TEST(MustacheInterpolation, Ampersand) { TEST(MustacheInterpolation, BasicIntegerInterpolation) { // Integers should interpolate seamlessly. Value D = Object{{"mph", 85}}; - auto T = Template("{{mph}} miles an hour!"); + Template T("{{mph}} miles an hour!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -88,7 +87,7 @@ TEST(MustacheInterpolation, BasicIntegerInterpolation) { TEST(MustacheInterpolation, AmpersandIntegerInterpolation) { // Integers should interpolate seamlessly. Value D = Object{{"mph", 85}}; - auto T = Template("{{&mph}} miles an hour!"); + Template T("{{&mph}} miles an hour!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -98,7 +97,7 @@ TEST(MustacheInterpolation, AmpersandIntegerInterpolation) { TEST(MustacheInterpolation, BasicDecimalInterpolation) { // Decimals should interpolate seamlessly with proper significance. Value D = Object{{"power", 1.21}}; - auto T = Template("{{power}} jiggawatts!"); + Template T("{{power}} jiggawatts!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -108,7 +107,7 @@ TEST(MustacheInterpolation, BasicDecimalInterpolation) { TEST(MustacheInterpolation, BasicNullInterpolation) { // Nulls should interpolate as the empty string. Value D = Object{{"cannot", nullptr}}; - auto T = Template("I ({{cannot}}) be seen!"); + Template T("I ({{cannot}}) be seen!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -118,7 +117,7 @@ TEST(MustacheInterpolation, BasicNullInterpolation) { TEST(MustacheInterpolation, AmpersandNullInterpolation) { // Nulls should interpolate as the empty string. Value D = Object{{"cannot", nullptr}}; - auto T = Template("I ({{&cannot}}) be seen!"); + Template T("I ({{&cannot}}) be seen!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -128,7 +127,7 @@ TEST(MustacheInterpolation, AmpersandNullInterpolation) { TEST(MustacheInterpolation, BasicContextMissInterpolation) { // Failed context lookups should default to empty strings. 
Value D = Object{}; - auto T = Template("I ({{cannot}}) be seen!"); + Template T("I ({{cannot}}) be seen!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -138,7 +137,7 @@ TEST(MustacheInterpolation, BasicContextMissInterpolation) { TEST(MustacheInterpolation, DottedNamesBasicInterpolation) { // Dotted names should be considered a form of shorthand for sections. Value D = Object{{"person", Object{{"name", "Joe"}}}}; - auto T = Template("{{person.name}} == {{#person}}{{name}}{{/person}}"); + Template T("{{person.name}} == {{#person}}{{name}}{{/person}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -148,7 +147,7 @@ TEST(MustacheInterpolation, DottedNamesBasicInterpolation) { TEST(MustacheInterpolation, DottedNamesAmpersandInterpolation) { // Dotted names should be considered a form of shorthand for sections. Value D = Object{{"person", Object{{"name", "Joe"}}}}; - auto T = Template("{{&person.name}} == {{#person}}{{&name}}{{/person}}"); + Template T("{{&person.name}} == {{#person}}{{&name}}{{/person}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -163,7 +162,7 @@ TEST(MustacheInterpolation, DottedNamesArbitraryDepth) { Object{{"c", Object{{"d", Object{{"e", Object{{"name", "Phil"}}}}}}}}}}}}; - auto T = Template("{{a.b.c.d.e.name}}"); + Template T("{{a.b.c.d.e.name}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -173,7 +172,7 @@ TEST(MustacheInterpolation, DottedNamesArbitraryDepth) { TEST(MustacheInterpolation, DottedNamesBrokenChains) { // Any falsey value prior to the last part of the name should yield ''. Value D = Object{{"a", Object{}}}; - auto T = Template("{{a.b.c}} == "); + Template T("{{a.b.c}} == "); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -184,7 +183,7 @@ TEST(MustacheInterpolation, DottedNamesBrokenChainResolution) { // Each part of a dotted name should resolve only against its parent. Value D = Object{{"a", Object{{"b", Object{}}}}, {"c", Object{{"name", "Jim"}}}}; - auto T = Template("{{a.b.c.name}} == "); + Template T("{{a.b.c.name}} == "); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -201,7 +200,7 @@ TEST(MustacheInterpolation, DottedNamesInitialResolution) { Object{{"d", Object{{"e", Object{{"name", "Phil"}}}}}}}}}}}, {"b", Object{{"c", Object{{"d", Object{{"e", Object{{"name", "Wrong"}}}}}}}}}}; - auto T = Template("{{#a}}{{b.c.d.e.name}}{{/a}}"); + Template T("{{#a}}{{b.c.d.e.name}}{{/a}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -212,7 +211,7 @@ TEST(MustacheInterpolation, DottedNamesContextPrecedence) { // Dotted names should be resolved against former resolutions. 
Value D = Object{{"a", Object{{"b", Object{}}}}, {"b", Object{{"c", "ERROR"}}}}; - auto T = Template("{{#a}}{{b.c}}{{/a}}"); + Template T("{{#a}}{{b.c}}{{/a}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -222,7 +221,7 @@ TEST(MustacheInterpolation, DottedNamesContextPrecedence) { TEST(MustacheInterpolation, DottedNamesAreNotSingleKeys) { // Dotted names shall not be parsed as single, atomic keys Value D = Object{{"a.b", "c"}}; - auto T = Template("{{a.b}}"); + Template T("{{a.b}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -232,7 +231,7 @@ TEST(MustacheInterpolation, DottedNamesAreNotSingleKeys) { TEST(MustacheInterpolation, DottedNamesNoMasking) { // Dotted Names in a given context are unavailable due to dot splitting Value D = Object{{"a.b", "c"}, {"a", Object{{"b", "d"}}}}; - auto T = Template("{{a.b}}"); + Template T("{{a.b}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -242,7 +241,7 @@ TEST(MustacheInterpolation, DottedNamesNoMasking) { TEST(MustacheInterpolation, ImplicitIteratorsBasicInterpolation) { // Unadorned tags should interpolate content into the template. Value D = "world"; - auto T = Template("Hello, {{.}}!\n"); + Template T("Hello, {{.}}!\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -252,7 +251,7 @@ TEST(MustacheInterpolation, ImplicitIteratorsBasicInterpolation) { TEST(MustacheInterpolation, ImplicitIteratorsAmersand) { // Basic interpolation should be HTML escaped. Value D = "& \" < >"; - auto T = Template("These characters should not be HTML escaped: {{&.}}\n"); + Template T("These characters should not be HTML escaped: {{&.}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -262,7 +261,7 @@ TEST(MustacheInterpolation, ImplicitIteratorsAmersand) { TEST(MustacheInterpolation, ImplicitIteratorsInteger) { // Integers should interpolate seamlessly. Value D = 85; - auto T = Template("{{.}} miles an hour!\n"); + Template T("{{.}} miles an hour!\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -272,7 +271,7 @@ TEST(MustacheInterpolation, ImplicitIteratorsInteger) { TEST(MustacheInterpolation, InterpolationSurroundingWhitespace) { // Interpolation should not alter surrounding whitespace. Value D = Object{{"string", "---"}}; - auto T = Template("| {{string}} |"); + Template T("| {{string}} |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -282,7 +281,7 @@ TEST(MustacheInterpolation, InterpolationSurroundingWhitespace) { TEST(MustacheInterpolation, AmersandSurroundingWhitespace) { // Interpolation should not alter surrounding whitespace. Value D = Object{{"string", "---"}}; - auto T = Template("| {{&string}} |"); + Template T("| {{&string}} |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -292,7 +291,7 @@ TEST(MustacheInterpolation, AmersandSurroundingWhitespace) { TEST(MustacheInterpolation, StandaloneInterpolationWithWhitespace) { // Standalone interpolation should not alter surrounding whitespace. Value D = Object{{"string", "---"}}; - auto T = Template(" {{string}}\n"); + Template T(" {{string}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -302,7 +301,7 @@ TEST(MustacheInterpolation, StandaloneInterpolationWithWhitespace) { TEST(MustacheInterpolation, StandaloneAmpersandWithWhitespace) { // Standalone interpolation should not alter surrounding whitespace. 
Value D = Object{{"string", "---"}}; - auto T = Template(" {{&string}}\n"); + Template T(" {{&string}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -312,7 +311,7 @@ TEST(MustacheInterpolation, StandaloneAmpersandWithWhitespace) { TEST(MustacheInterpolation, InterpolationWithPadding) { // Superfluous in-tag whitespace should be ignored. Value D = Object{{"string", "---"}}; - auto T = Template("|{{ string }}|"); + Template T("|{{ string }}|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -322,7 +321,7 @@ TEST(MustacheInterpolation, InterpolationWithPadding) { TEST(MustacheInterpolation, AmpersandWithPadding) { // Superfluous in-tag whitespace should be ignored. Value D = Object{{"string", "---"}}; - auto T = Template("|{{& string }}|"); + Template T("|{{& string }}|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -332,7 +331,7 @@ TEST(MustacheInterpolation, AmpersandWithPadding) { TEST(MustacheInterpolation, InterpolationWithPaddingAndNewlines) { // Superfluous in-tag whitespace should be ignored. Value D = Object{{"string", "---"}}; - auto T = Template("|{{ string \n\n\n }}|"); + Template T("|{{ string \n\n\n }}|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -341,7 +340,7 @@ TEST(MustacheInterpolation, InterpolationWithPaddingAndNewlines) { TEST(MustacheSections, Truthy) { Value D = Object{{"boolean", true}}; - auto T = Template("{{#boolean}}This should be rendered.{{/boolean}}"); + Template T("{{#boolean}}This should be rendered.{{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -350,7 +349,7 @@ TEST(MustacheSections, Truthy) { TEST(MustacheSections, Falsey) { Value D = Object{{"boolean", false}}; - auto T = Template("{{#boolean}}This should not be rendered.{{/boolean}}"); + Template T("{{#boolean}}This should not be rendered.{{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -360,7 +359,7 @@ TEST(MustacheSections, Falsey) { TEST(MustacheInterpolation, IsFalseyNull) { // Mustache-free templates should render as-is. Value D = Object{{"boolean", nullptr}}; - auto T = Template("Hello, {{#boolean}}World{{/boolean}}"); + Template T("Hello, {{#boolean}}World{{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -370,7 +369,7 @@ TEST(MustacheInterpolation, IsFalseyNull) { TEST(MustacheInterpolation, IsFalseyArray) { // Mustache-free templates should render as-is. Value D = Object{{"boolean", Array()}}; - auto T = Template("Hello, {{#boolean}}World{{/boolean}}"); + Template T("Hello, {{#boolean}}World{{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -380,7 +379,7 @@ TEST(MustacheInterpolation, IsFalseyArray) { TEST(MustacheInterpolation, IsFalseyObject) { // Mustache-free templates should render as-is. Value D = Object{{"boolean", Object{}}}; - auto T = Template("Hello, {{#boolean}}World{{/boolean}}"); + Template T("Hello, {{#boolean}}World{{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -390,7 +389,7 @@ TEST(MustacheInterpolation, IsFalseyObject) { TEST(MustacheInterpolation, DoubleRendering) { // Mustache-free templates should render as-is. 
Value D1 = Object{{"subject", "World"}}; - auto T = Template("Hello, {{subject}}!"); + Template T("Hello, {{subject}}!"); std::string Out1; raw_string_ostream OS1(Out1); T.render(D1, OS1); @@ -404,7 +403,7 @@ TEST(MustacheInterpolation, DoubleRendering) { TEST(MustacheSections, NullIsFalsey) { Value D = Object{{"null", nullptr}}; - auto T = Template("{{#null}}This should not be rendered.{{/null}}"); + Template T("{{#null}}This should not be rendered.{{/null}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -413,7 +412,7 @@ TEST(MustacheSections, NullIsFalsey) { TEST(MustacheSections, Context) { Value D = Object{{"context", Object{{"name", "Joe"}}}}; - auto T = Template("{{#context}}Hi {{name}}.{{/context}}"); + Template T("{{#context}}Hi {{name}}.{{/context}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -425,7 +424,7 @@ TEST(MustacheSections, ParentContexts) { {"b", "wrong"}, {"sec", Object{{"b", "bar"}}}, {"c", Object{{"d", "baz"}}}}; - auto T = Template("{{#sec}}{{a}}, {{b}}, {{c.d}}{{/sec}}"); + Template T("{{#sec}}{{a}}, {{b}}, {{c.d}}{{/sec}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -434,7 +433,7 @@ TEST(MustacheSections, ParentContexts) { TEST(MustacheSections, VariableTest) { Value D = Object{{"foo", "bar"}}; - auto T = Template("{{#foo}}{{.}} is {{foo}}{{/foo}}"); + Template T("{{#foo}}{{.}} is {{foo}}{{/foo}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -450,14 +449,14 @@ TEST(MustacheSections, ListContexts) { Array{Object{{"mname", "1"}, {"bottoms", Array{Object{{"bname", "x"}}, Object{{"bname", "y"}}}}}}}}}}}; - auto T = Template("{{#tops}}" - "{{#middles}}" - "{{tname.lower}}{{mname}}." - "{{#bottoms}}" - "{{tname.upper}}{{mname}}{{bname}}." - "{{/bottoms}}" - "{{/middles}}" - "{{/tops}}"); + Template T("{{#tops}}" + "{{#middles}}" + "{{tname.lower}}{{mname}}." + "{{#bottoms}}" + "{{tname.upper}}{{mname}}{{bname}}." 
+ "{{/bottoms}}" + "{{/middles}}" + "{{/tops}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -469,7 +468,7 @@ TEST(MustacheSections, DeeplyNestedContexts) { {"a", Object{{"one", 1}}}, {"b", Object{{"two", 2}}}, {"c", Object{{"three", 3}, {"d", Object{{"four", 4}, {"five", 5}}}}}}; - auto T = Template( + Template T( "{{#a}}\n{{one}}\n{{#b}}\n{{one}}{{two}}{{one}}\n{{#c}}\n{{one}}{{two}}{{" "three}}{{two}}{{one}}\n{{#d}}\n{{one}}{{two}}{{three}}{{four}}{{three}}{" "{two}}{{one}}\n{{#five}}\n{{one}}{{two}}{{three}}{{four}}{{five}}{{four}" @@ -490,7 +489,7 @@ TEST(MustacheSections, DeeplyNestedContexts) { TEST(MustacheSections, List) { Value D = Object{{"list", Array{Object{{"item", 1}}, Object{{"item", 2}}, Object{{"item", 3}}}}}; - auto T = Template("{{#list}}{{item}}{{/list}}"); + Template T("{{#list}}{{item}}{{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -499,7 +498,7 @@ TEST(MustacheSections, List) { TEST(MustacheSections, EmptyList) { Value D = Object{{"list", Array{}}}; - auto T = Template("{{#list}}Yay lists!{{/list}}"); + Template T("{{#list}}Yay lists!{{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -508,8 +507,8 @@ TEST(MustacheSections, EmptyList) { TEST(MustacheSections, Doubled) { Value D = Object{{"bool", true}, {"two", "second"}}; - auto T = Template("{{#bool}}\n* first\n{{/bool}}\n* " - "{{two}}\n{{#bool}}\n* third\n{{/bool}}\n"); + Template T("{{#bool}}\n* first\n{{/bool}}\n* " + "{{two}}\n{{#bool}}\n* third\n{{/bool}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -518,7 +517,7 @@ TEST(MustacheSections, Doubled) { TEST(MustacheSections, NestedTruthy) { Value D = Object{{"bool", true}}; - auto T = Template("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |"); + Template T("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -527,7 +526,7 @@ TEST(MustacheSections, NestedTruthy) { TEST(MustacheSections, NestedFalsey) { Value D = Object{{"bool", false}}; - auto T = Template("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |"); + Template T("| A {{#bool}}B {{#bool}}C{{/bool}} D{{/bool}} E |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -536,7 +535,7 @@ TEST(MustacheSections, NestedFalsey) { TEST(MustacheSections, ContextMisses) { Value D = Object{}; - auto T = Template("[{{#missing}}Found key 'missing'!{{/missing}}]"); + Template T("[{{#missing}}Found key 'missing'!{{/missing}}]"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -545,7 +544,7 @@ TEST(MustacheSections, ContextMisses) { TEST(MustacheSections, ImplicitIteratorString) { Value D = Object{{"list", Array{"a", "b", "c", "d", "e"}}}; - auto T = Template("{{#list}}({{.}}){{/list}}"); + Template T("{{#list}}({{.}}){{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -554,7 +553,7 @@ TEST(MustacheSections, ImplicitIteratorString) { TEST(MustacheSections, ImplicitIteratorInteger) { Value D = Object{{"list", Array{1, 2, 3, 4, 5}}}; - auto T = Template("{{#list}}({{.}}){{/list}}"); + Template T("{{#list}}({{.}}){{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -563,7 +562,7 @@ TEST(MustacheSections, ImplicitIteratorInteger) { TEST(MustacheSections, ImplicitIteratorArray) { Value D = Object{{"list", Array{Array{1, 2, 3}, Array{"a", "b", "c"}}}}; - auto T = Template("{{#list}}({{#.}}{{.}}{{/.}}){{/list}}"); + Template T("{{#list}}({{#.}}{{.}}{{/.}}){{/list}}"); 
std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -572,7 +571,7 @@ TEST(MustacheSections, ImplicitIteratorArray) { TEST(MustacheSections, ImplicitIteratorHTMLEscaping) { Value D = Object{{"list", Array{"&", "\"", "<", ">"}}}; - auto T = Template("{{#list}}({{.}}){{/list}}"); + Template T("{{#list}}({{.}}){{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -581,7 +580,7 @@ TEST(MustacheSections, ImplicitIteratorHTMLEscaping) { TEST(MustacheSections, ImplicitIteratorAmpersand) { Value D = Object{{"list", Array{"&", "\"", "<", ">"}}}; - auto T = Template("{{#list}}({{&.}}){{/list}}"); + Template T("{{#list}}({{&.}}){{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -590,7 +589,7 @@ TEST(MustacheSections, ImplicitIteratorAmpersand) { TEST(MustacheSections, ImplicitIteratorRootLevel) { Value D = Array{Object{{"value", "a"}}, Object{{"value", "b"}}}; - auto T = Template("{{#.}}({{value}}){{/.}}"); + Template T("{{#.}}({{value}}){{/.}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -599,7 +598,7 @@ TEST(MustacheSections, ImplicitIteratorRootLevel) { TEST(MustacheSections, DottedNamesTruthy) { Value D = Object{{"a", Object{{"b", Object{{"c", true}}}}}}; - auto T = Template("{{#a.b.c}}Here{{/a.b.c}} == Here"); + Template T("{{#a.b.c}}Here{{/a.b.c}} == Here"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -608,7 +607,7 @@ TEST(MustacheSections, DottedNamesTruthy) { TEST(MustacheSections, DottedNamesFalsey) { Value D = Object{{"a", Object{{"b", Object{{"c", false}}}}}}; - auto T = Template("{{#a.b.c}}Here{{/a.b.c}} == "); + Template T("{{#a.b.c}}Here{{/a.b.c}} == "); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -617,7 +616,7 @@ TEST(MustacheSections, DottedNamesFalsey) { TEST(MustacheSections, DottedNamesBrokenChains) { Value D = Object{{"a", Object{}}}; - auto T = Template("{{#a.b.c}}Here{{/a.b.c}} == "); + Template T("{{#a.b.c}}Here{{/a.b.c}} == "); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -626,7 +625,7 @@ TEST(MustacheSections, DottedNamesBrokenChains) { TEST(MustacheSections, SurroundingWhitespace) { Value D = Object{{"boolean", true}}; - auto T = Template(" | {{#boolean}}\t|\t{{/boolean}} | \n"); + Template T(" | {{#boolean}}\t|\t{{/boolean}} | \n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -635,8 +634,7 @@ TEST(MustacheSections, SurroundingWhitespace) { TEST(MustacheSections, InternalWhitespace) { Value D = Object{{"boolean", true}}; - auto T = Template( - " | {{#boolean}} {{! Important Whitespace }}\n {{/boolean}} | \n"); + Template T(" | {{#boolean}} {{! 
Important Whitespace }}\n {{/boolean}} | \n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -645,8 +643,7 @@ TEST(MustacheSections, InternalWhitespace) { TEST(MustacheSections, IndentedInlineSections) { Value D = Object{{"boolean", true}}; - auto T = - Template(" {{#boolean}}YES{{/boolean}}\n {{#boolean}}GOOD{{/boolean}}\n"); + Template T(" {{#boolean}}YES{{/boolean}}\n {{#boolean}}GOOD{{/boolean}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -655,7 +652,7 @@ TEST(MustacheSections, IndentedInlineSections) { TEST(MustacheSections, StandaloneLines) { Value D = Object{{"boolean", true}}; - auto T = Template("| This Is\n{{#boolean}}\n|\n{{/boolean}}\n| A Line\n"); + Template T("| This Is\n{{#boolean}}\n|\n{{/boolean}}\n| A Line\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -664,7 +661,7 @@ TEST(MustacheSections, StandaloneLines) { TEST(MustacheSections, IndentedStandaloneLines) { Value D = Object{{"boolean", true}}; - auto T = Template("| This Is\n {{#boolean}}\n|\n {{/boolean}}\n| A Line\n"); + Template T("| This Is\n {{#boolean}}\n|\n {{/boolean}}\n| A Line\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -673,7 +670,7 @@ TEST(MustacheSections, IndentedStandaloneLines) { TEST(MustacheSections, StandaloneLineEndings) { Value D = Object{{"boolean", true}}; - auto T = Template("|\r\n{{#boolean}}\r\n{{/boolean}}\r\n|"); + Template T("|\r\n{{#boolean}}\r\n{{/boolean}}\r\n|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -682,7 +679,7 @@ TEST(MustacheSections, StandaloneLineEndings) { TEST(MustacheSections, StandaloneWithoutPreviousLine) { Value D = Object{{"boolean", true}}; - auto T = Template(" {{#boolean}}\n#{{/boolean}}\n/"); + Template T(" {{#boolean}}\n#{{/boolean}}\n/"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -691,7 +688,7 @@ TEST(MustacheSections, StandaloneWithoutPreviousLine) { TEST(MustacheSections, StandaloneWithoutNewline) { Value D = Object{{"boolean", true}}; - auto T = Template("#{{#boolean}}\n/\n {{/boolean}}"); + Template T("#{{#boolean}}\n/\n {{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -700,7 +697,7 @@ TEST(MustacheSections, StandaloneWithoutNewline) { TEST(MustacheSections, Padding) { Value D = Object{{"boolean", true}}; - auto T = Template("|{{# boolean }}={{/ boolean }}|"); + Template T("|{{# boolean }}={{/ boolean }}|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -709,7 +706,7 @@ TEST(MustacheSections, Padding) { TEST(MustacheInvertedSections, Falsey) { Value D = Object{{"boolean", false}}; - auto T = Template("{{^boolean}}This should be rendered.{{/boolean}}"); + Template T("{{^boolean}}This should be rendered.{{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -718,7 +715,7 @@ TEST(MustacheInvertedSections, Falsey) { TEST(MustacheInvertedSections, Truthy) { Value D = Object{{"boolean", true}}; - auto T = Template("{{^boolean}}This should not be rendered.{{/boolean}}"); + Template T("{{^boolean}}This should not be rendered.{{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -727,7 +724,7 @@ TEST(MustacheInvertedSections, Truthy) { TEST(MustacheInvertedSections, NullIsFalsey) { Value D = Object{{"null", nullptr}}; - auto T = Template("{{^null}}This should be rendered.{{/null}}"); + Template T("{{^null}}This should be rendered.{{/null}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -736,7 +733,7 @@ 
TEST(MustacheInvertedSections, NullIsFalsey) { TEST(MustacheInvertedSections, Context) { Value D = Object{{"context", Object{{"name", "Joe"}}}}; - auto T = Template("{{^context}}Hi {{name}}.{{/context}}"); + Template T("{{^context}}Hi {{name}}.{{/context}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -746,7 +743,7 @@ TEST(MustacheInvertedSections, Context) { TEST(MustacheInvertedSections, List) { Value D = Object{ {"list", Array{Object{{"n", 1}}, Object{{"n", 2}}, Object{{"n", 3}}}}}; - auto T = Template("{{^list}}{{n}}{{/list}}"); + Template T("{{^list}}{{n}}{{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -755,7 +752,7 @@ TEST(MustacheInvertedSections, List) { TEST(MustacheInvertedSections, EmptyList) { Value D = Object{{"list", Array{}}}; - auto T = Template("{{^list}}Yay lists!{{/list}}"); + Template T("{{^list}}Yay lists!{{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -764,8 +761,8 @@ TEST(MustacheInvertedSections, EmptyList) { TEST(MustacheInvertedSections, Doubled) { Value D = Object{{"bool", false}, {"two", "second"}}; - auto T = Template("{{^bool}}\n* first\n{{/bool}}\n* " - "{{two}}\n{{^bool}}\n* third\n{{/bool}}\n"); + Template T("{{^bool}}\n* first\n{{/bool}}\n* " + "{{two}}\n{{^bool}}\n* third\n{{/bool}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -774,7 +771,7 @@ TEST(MustacheInvertedSections, Doubled) { TEST(MustacheInvertedSections, NestedFalsey) { Value D = Object{{"bool", false}}; - auto T = Template("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |"); + Template T("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -783,7 +780,7 @@ TEST(MustacheInvertedSections, NestedFalsey) { TEST(MustacheInvertedSections, NestedTruthy) { Value D = Object{{"bool", true}}; - auto T = Template("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |"); + Template T("| A {{^bool}}B {{^bool}}C{{/bool}} D{{/bool}} E |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -792,7 +789,7 @@ TEST(MustacheInvertedSections, NestedTruthy) { TEST(MustacheInvertedSections, ContextMisses) { Value D = Object{}; - auto T = Template("[{{^missing}}Cannot find key 'missing'!{{/missing}}]"); + Template T("[{{^missing}}Cannot find key 'missing'!{{/missing}}]"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -801,7 +798,7 @@ TEST(MustacheInvertedSections, ContextMisses) { TEST(MustacheInvertedSections, DottedNamesTruthy) { Value D = Object{{"a", Object{{"b", Object{{"c", true}}}}}}; - auto T = Template("{{^a.b.c}}Not Here{{/a.b.c}} == "); + Template T("{{^a.b.c}}Not Here{{/a.b.c}} == "); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -810,7 +807,7 @@ TEST(MustacheInvertedSections, DottedNamesTruthy) { TEST(MustacheInvertedSections, DottedNamesFalsey) { Value D = Object{{"a", Object{{"b", Object{{"c", false}}}}}}; - auto T = Template("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here"); + Template T("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -819,7 +816,7 @@ TEST(MustacheInvertedSections, DottedNamesFalsey) { TEST(MustacheInvertedSections, DottedNamesBrokenChains) { Value D = Object{{"a", Object{}}}; - auto T = Template("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here"); + Template T("{{^a.b.c}}Not Here{{/a.b.c}} == Not Here"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -828,7 +825,7 @@ TEST(MustacheInvertedSections, 
DottedNamesBrokenChains) { TEST(MustacheInvertedSections, SurroundingWhitespace) { Value D = Object{{"boolean", false}}; - auto T = Template(" | {{^boolean}}\t|\t{{/boolean}} | \n"); + Template T(" | {{^boolean}}\t|\t{{/boolean}} | \n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -837,8 +834,7 @@ TEST(MustacheInvertedSections, SurroundingWhitespace) { TEST(MustacheInvertedSections, InternalWhitespace) { Value D = Object{{"boolean", false}}; - auto T = Template( - " | {{^boolean}} {{! Important Whitespace }}\n {{/boolean}} | \n"); + Template T(" | {{^boolean}} {{! Important Whitespace }}\n {{/boolean}} | \n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -847,8 +843,7 @@ TEST(MustacheInvertedSections, InternalWhitespace) { TEST(MustacheInvertedSections, IndentedInlineSections) { Value D = Object{{"boolean", false}}; - auto T = - Template(" {{^boolean}}NO{{/boolean}}\n {{^boolean}}WAY{{/boolean}}\n"); + Template T(" {{^boolean}}NO{{/boolean}}\n {{^boolean}}WAY{{/boolean}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -857,7 +852,7 @@ TEST(MustacheInvertedSections, IndentedInlineSections) { TEST(MustacheInvertedSections, StandaloneLines) { Value D = Object{{"boolean", false}}; - auto T = Template("| This Is\n{{^boolean}}\n|\n{{/boolean}}\n| A Line\n"); + Template T("| This Is\n{{^boolean}}\n|\n{{/boolean}}\n| A Line\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -866,7 +861,7 @@ TEST(MustacheInvertedSections, StandaloneLines) { TEST(MustacheInvertedSections, StandaloneIndentedLines) { Value D = Object{{"boolean", false}}; - auto T = Template("| This Is\n {{^boolean}}\n|\n {{/boolean}}\n| A Line\n"); + Template T("| This Is\n {{^boolean}}\n|\n {{/boolean}}\n| A Line\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -875,7 +870,7 @@ TEST(MustacheInvertedSections, StandaloneIndentedLines) { TEST(MustacheInvertedSections, StandaloneLineEndings) { Value D = Object{{"boolean", false}}; - auto T = Template("|\r\n{{^boolean}}\r\n{{/boolean}}\r\n|"); + Template T("|\r\n{{^boolean}}\r\n{{/boolean}}\r\n|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -884,7 +879,7 @@ TEST(MustacheInvertedSections, StandaloneLineEndings) { TEST(MustacheInvertedSections, StandaloneWithoutPreviousLine) { Value D = Object{{"boolean", false}}; - auto T = Template(" {{^boolean}}\n^{{/boolean}}\n/"); + Template T(" {{^boolean}}\n^{{/boolean}}\n/"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -893,7 +888,7 @@ TEST(MustacheInvertedSections, StandaloneWithoutPreviousLine) { TEST(MustacheInvertedSections, StandaloneWithoutNewline) { Value D = Object{{"boolean", false}}; - auto T = Template("^{{^boolean}}\n/\n {{/boolean}}"); + Template T("^{{^boolean}}\n/\n {{/boolean}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -902,7 +897,7 @@ TEST(MustacheInvertedSections, StandaloneWithoutNewline) { TEST(MustacheInvertedSections, Padding) { Value D = Object{{"boolean", false}}; - auto T = Template("|{{^ boolean }}={{/ boolean }}|"); + Template T("|{{^ boolean }}={{/ boolean }}|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -911,7 +906,7 @@ TEST(MustacheInvertedSections, Padding) { TEST(MustachePartials, BasicBehavior) { Value D = Object{}; - auto T = Template("{{>text}}"); + Template T("{{>text}}"); T.registerPartial("text", "from partial"); std::string Out; raw_string_ostream OS(Out); @@ -921,7 +916,7 @@ TEST(MustachePartials, BasicBehavior) { 
TEST(MustachePartials, FailedLookup) { Value D = Object{}; - auto T = Template("{{>text}}"); + Template T("{{>text}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -930,7 +925,7 @@ TEST(MustachePartials, FailedLookup) { TEST(MustachePartials, Context) { Value D = Object{{"text", "content"}}; - auto T = Template("{{>partial}}"); + Template T("{{>partial}}"); T.registerPartial("partial", "*{{text}}*"); std::string Out; raw_string_ostream OS(Out); @@ -942,7 +937,7 @@ TEST(MustachePartials, Recursion) { Value D = Object{{"content", "X"}, {"nodes", Array{Object{{"content", "Y"}, {"nodes", Array{}}}}}}; - auto T = Template("{{>node}}"); + Template T("{{>node}}"); T.registerPartial("node", "{{content}}({{#nodes}}{{>node}}{{/nodes}})"); std::string Out; raw_string_ostream OS(Out); @@ -952,7 +947,7 @@ TEST(MustachePartials, Recursion) { TEST(MustachePartials, Nested) { Value D = Object{{"a", "hello"}, {"b", "world"}}; - auto T = Template("{{>outer}}"); + Template T("{{>outer}}"); T.registerPartial("outer", "*{{a}} {{>inner}}*"); T.registerPartial("inner", "{{b}}!"); std::string Out; @@ -963,7 +958,7 @@ TEST(MustachePartials, Nested) { TEST(MustachePartials, SurroundingWhitespace) { Value D = Object{}; - auto T = Template("| {{>partial}} |"); + Template T("| {{>partial}} |"); T.registerPartial("partial", "\t|\t"); std::string Out; raw_string_ostream OS(Out); @@ -973,7 +968,7 @@ TEST(MustachePartials, SurroundingWhitespace) { TEST(MustachePartials, InlineIndentation) { Value D = Object{{"data", "|"}}; - auto T = Template(" {{data}} {{> partial}}\n"); + Template T(" {{data}} {{> partial}}\n"); T.registerPartial("partial", "<\n<"); std::string Out; raw_string_ostream OS(Out); @@ -983,7 +978,7 @@ TEST(MustachePartials, InlineIndentation) { TEST(MustachePartials, PaddingWhitespace) { Value D = Object{{"boolean", true}}; - auto T = Template("|{{> partial }}|"); + Template T("|{{> partial }}|"); T.registerPartial("partial", "[]"); std::string Out; raw_string_ostream OS(Out); @@ -992,18 +987,18 @@ TEST(MustachePartials, PaddingWhitespace) { } TEST(MustachePartials, StandaloneIndentation) { - Value D = Object{{"content", "<\n->"}}; - auto T = Template("\\\n {{>partial}}\n/\n"); + mustache::Template T("\\\n {{>partial}}\n/\n"); T.registerPartial("partial", "|\n{{{content}}}\n|\n"); - std::string Out; - raw_string_ostream OS(Out); - T.render(D, OS); - EXPECT_EQ("\\\n |\n <\n ->\n |\n/\n", Out); + std::string O; + raw_string_ostream OS(O); + Value DataContext = Object{{"content", "<\n->"}}; + T.render(DataContext, OS); + EXPECT_EQ("\\\n |\n <\n->\n |\n/\n", OS.str()); } TEST(MustacheLambdas, BasicInterpolation) { Value D = Object{}; - auto T = Template("Hello, {{lambda}}!"); + Template T("Hello, {{lambda}}!"); Lambda L = []() -> llvm::json::Value { return "World"; }; T.registerLambda("lambda", L); std::string Out; @@ -1014,7 +1009,7 @@ TEST(MustacheLambdas, BasicInterpolation) { TEST(MustacheLambdas, InterpolationExpansion) { Value D = Object{{"planet", "World"}}; - auto T = Template("Hello, {{lambda}}!"); + Template T("Hello, {{lambda}}!"); Lambda L = []() -> llvm::json::Value { return "{{planet}}"; }; T.registerLambda("lambda", L); std::string Out; @@ -1025,7 +1020,7 @@ TEST(MustacheLambdas, InterpolationExpansion) { TEST(MustacheLambdas, BasicMultipleCalls) { Value D = Object{}; - auto T = Template("{{lambda}} == {{lambda}} == {{lambda}}"); + Template T("{{lambda}} == {{lambda}} == {{lambda}}"); int I = 0; Lambda L = [&I]() -> llvm::json::Value { I += 1; @@ -1040,7 +1035,7 @@ 
TEST(MustacheLambdas, BasicMultipleCalls) { TEST(MustacheLambdas, Escaping) { Value D = Object{}; - auto T = Template("<{{lambda}}{{&lambda}}"); + Template T("<{{lambda}}{{&lambda}}"); Lambda L = []() -> llvm::json::Value { return ">"; }; T.registerLambda("lambda", L); std::string Out; @@ -1051,7 +1046,7 @@ TEST(MustacheLambdas, Escaping) { TEST(MustacheLambdas, Sections) { Value D = Object{}; - auto T = Template("<{{#lambda}}{{x}}{{/lambda}}>"); + Template T("<{{#lambda}}{{x}}{{/lambda}}>"); SectionLambda L = [](StringRef Text) -> llvm::json::Value { if (Text == "{{x}}") { return "yes"; @@ -1069,7 +1064,7 @@ TEST(MustacheLambdas, SectionExpansion) { Value D = Object{ {"planet", "Earth"}, }; - auto T = Template("<{{#lambda}}-{{/lambda}}>"); + Template T("<{{#lambda}}-{{/lambda}}>"); SectionLambda L = [](StringRef Text) -> llvm::json::Value { SmallString<128> Result; Result += Text; @@ -1086,7 +1081,7 @@ TEST(MustacheLambdas, SectionExpansion) { TEST(MustacheLambdas, SectionsMultipleCalls) { Value D = Object{}; - auto T = Template("{{#lambda}}FILE{{/lambda}} != {{#lambda}}LINE{{/lambda}}"); + Template T("{{#lambda}}FILE{{/lambda}} != {{#lambda}}LINE{{/lambda}}"); SectionLambda L = [](StringRef Text) -> llvm::json::Value { SmallString<128> Result; Result += "__"; @@ -1103,7 +1098,7 @@ TEST(MustacheLambdas, SectionsMultipleCalls) { TEST(MustacheLambdas, InvertedSections) { Value D = Object{{"static", "static"}}; - auto T = Template("<{{^lambda}}{{static}}{{/lambda}}>"); + Template T("<{{^lambda}}{{static}}{{/lambda}}>"); SectionLambda L = [](StringRef Text) -> llvm::json::Value { return false; }; T.registerLambda("lambda", L); std::string Out; @@ -1115,7 +1110,7 @@ TEST(MustacheLambdas, InvertedSections) { TEST(MustacheComments, Inline) { // Comment blocks should be removed from the template. Value D = {}; - auto T = Template("12345{{! Comment Block! }}67890"); + Template T("12345{{! Comment Block! }}67890"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1125,8 +1120,7 @@ TEST(MustacheComments, Inline) { TEST(MustacheComments, Multiline) { // Multiline comments should be permitted. Value D = {}; - auto T = - Template("12345{{!\n This is a\n multi-line comment...\n}}67890\n"); + Template T("12345{{!\n This is a\n multi-line comment...\n}}67890\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1136,7 +1130,7 @@ TEST(MustacheComments, Multiline) { TEST(MustacheComments, Standalone) { // All standalone comment lines should be removed. Value D = {}; - auto T = Template("Begin.\n{{! Comment Block! }}\nEnd.\n"); + Template T("Begin.\n{{! Comment Block! }}\nEnd.\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1146,7 +1140,7 @@ TEST(MustacheComments, Standalone) { TEST(MustacheComments, IndentedStandalone) { // All standalone comment lines should be removed. Value D = {}; - auto T = Template("Begin.\n {{! Indented Comment Block! }}\nEnd.\n"); + Template T("Begin.\n {{! Indented Comment Block! }}\nEnd.\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1156,7 +1150,7 @@ TEST(MustacheComments, IndentedStandalone) { TEST(MustacheComments, StandaloneLineEndings) { // "\r\n" should be considered a newline for standalone tags. Value D = {}; - auto T = Template("|\r\n{{! Standalone Comment }}\r\n|"); + Template T("|\r\n{{! 
Standalone Comment }}\r\n|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1166,7 +1160,7 @@ TEST(MustacheComments, StandaloneLineEndings) { TEST(MustacheComments, StandaloneWithoutPreviousLine) { // Standalone tags should not require a newline to precede them. Value D = {}; - auto T = Template(" {{! I'm Still Standalone }}\n!"); + Template T(" {{! I'm Still Standalone }}\n!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1176,7 +1170,7 @@ TEST(MustacheComments, StandaloneWithoutPreviousLine) { TEST(MustacheComments, StandaloneWithoutNewline) { // Standalone tags should not require a newline to follow them. Value D = {}; - auto T = Template("!\n {{! I'm Still Standalone }}"); + Template T("!\n {{! I'm Still Standalone }}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1186,7 +1180,7 @@ TEST(MustacheComments, StandaloneWithoutNewline) { TEST(MustacheComments, MultilineStandalone) { // All standalone comment lines should be removed. Value D = {}; - auto T = Template("Begin.\n{{!\nSomething's going on here...\n}}\nEnd.\n"); + Template T("Begin.\n{{!\nSomething's going on here...\n}}\nEnd.\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1196,8 +1190,7 @@ TEST(MustacheComments, MultilineStandalone) { TEST(MustacheComments, IndentedMultilineStandalone) { // All standalone comment lines should be removed. Value D = {}; - auto T = - Template("Begin.\n {{!\n Something's going on here...\n }}\nEnd.\n"); + Template T("Begin.\n {{!\n Something's going on here...\n }}\nEnd.\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1207,7 +1200,7 @@ TEST(MustacheComments, IndentedMultilineStandalone) { TEST(MustacheComments, IndentedInline) { // Inline comments should not strip whitespace. Value D = {}; - auto T = Template(" 12 {{! 34 }}\n"); + Template T(" 12 {{! 34 }}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1217,7 +1210,7 @@ TEST(MustacheComments, IndentedInline) { TEST(MustacheComments, SurroundingWhitespace) { // Comment removal should preserve surrounding whitespace. Value D = {}; - auto T = Template("12345 {{! Comment Block! }} 67890"); + Template T("12345 {{! Comment Block! }} 67890"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1228,7 +1221,7 @@ TEST(MustacheComments, VariableNameCollision) { // Comments must never render, even if a variable with the same name exists. Value D = Object{ {"! comment", 1}, {"! comment ", 2}, {"!comment", 3}, {"comment", 4}}; - auto T = Template("comments never show: >{{! comment }}<"); + Template T("comments never show: >{{! comment }}<"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1241,7 +1234,7 @@ TEST(MustacheComments, VariableNameCollision) { // implemented, these assertions should be changed back to EXPECT_EQ. 
TEST(MustacheTripleMustache, Basic) { Value D = Object{{"subject", "World"}}; - auto T = Template("Hello, {{{subject}}}!"); + Template T("Hello, {{{subject}}}!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1250,7 +1243,7 @@ TEST(MustacheTripleMustache, Basic) { TEST(MustacheTripleMustache, IntegerInterpolation) { Value D = Object{{"mph", 85}}; - auto T = Template("{{{mph}}} miles an hour!"); + Template T("{{{mph}}} miles an hour!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1259,7 +1252,7 @@ TEST(MustacheTripleMustache, IntegerInterpolation) { TEST(MustacheTripleMustache, DecimalInterpolation) { Value D = Object{{"power", 1.21}}; - auto T = Template("{{{power}}} jiggawatts!"); + Template T("{{{power}}} jiggawatts!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1268,7 +1261,7 @@ TEST(MustacheTripleMustache, DecimalInterpolation) { TEST(MustacheTripleMustache, NullInterpolation) { Value D = Object{{"cannot", nullptr}}; - auto T = Template("I ({{{cannot}}}) be seen!"); + Template T("I ({{{cannot}}}) be seen!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1277,7 +1270,7 @@ TEST(MustacheTripleMustache, NullInterpolation) { TEST(MustacheTripleMustache, ContextMissInterpolation) { Value D = Object{}; - auto T = Template("I ({{{cannot}}}) be seen!"); + Template T("I ({{{cannot}}}) be seen!"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1286,7 +1279,7 @@ TEST(MustacheTripleMustache, ContextMissInterpolation) { TEST(MustacheTripleMustache, DottedNames) { Value D = Object{{"person", Object{{"name", "Joe"}}}}; - auto T = Template("{{{person.name}}}"); + Template T("{{{person.name}}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1295,7 +1288,7 @@ TEST(MustacheTripleMustache, DottedNames) { TEST(MustacheTripleMustache, ImplicitIterator) { Value D = Object{{"list", Array{"", ""}}}; - auto T = Template("{{#list}}({{{.}}}){{/list}}"); + Template T("{{#list}}({{{.}}}){{/list}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1304,7 +1297,7 @@ TEST(MustacheTripleMustache, ImplicitIterator) { TEST(MustacheTripleMustache, SurroundingWhitespace) { Value D = Object{{"string", "---"}}; - auto T = Template("| {{{string}}} |"); + Template T("| {{{string}}} |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1313,7 +1306,7 @@ TEST(MustacheTripleMustache, SurroundingWhitespace) { TEST(MustacheTripleMustache, Standalone) { Value D = Object{{"string", "---"}}; - auto T = Template(" {{{string}}}\n"); + Template T(" {{{string}}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1322,7 +1315,7 @@ TEST(MustacheTripleMustache, Standalone) { TEST(MustacheTripleMustache, WithPadding) { Value D = Object{{"string", "---"}}; - auto T = Template("|{{{ string }}}|"); + Template T("|{{{ string }}}|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1331,7 +1324,7 @@ TEST(MustacheTripleMustache, WithPadding) { TEST(MustacheDelimiters, PairBehavior) { Value D = Object{{"text", "Hey!"}}; - auto T = Template("{{=<% %>=}}(<%text%>)"); + Template T("{{=<% %>=}}(<%text%>)"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1340,7 +1333,7 @@ TEST(MustacheDelimiters, PairBehavior) { TEST(MustacheDelimiters, SpecialCharacters) { Value D = Object{{"text", "It worked!"}}; - auto T = Template("({{=[ ]=}}[text])"); + Template T("({{=[ ]=}}[text])"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ 
-1375,7 +1368,7 @@ TEST(MustacheDelimiters, InvertedSections) { TEST(MustacheDelimiters, PartialInheritence) { Value D = Object{{"value", "yes"}}; - auto T = Template("[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n"); + Template T("[ {{>include}} ]\n{{= | | =}}\n[ |>include| ]\n"); T.registerPartial("include", ".{{value}}."); std::string Out; raw_string_ostream OS(Out); @@ -1385,7 +1378,7 @@ TEST(MustacheDelimiters, PartialInheritence) { TEST(MustacheDelimiters, PostPartialBehavior) { Value D = Object{{"value", "yes"}}; - auto T = Template("[ {{>include}} ]\n[ .{{value}}. .|value|. ]\n"); + Template T("[ {{>include}} ]\n[ .{{value}}. .|value|. ]\n"); T.registerPartial("include", ".{{value}}. {{= | | =}} .|value|."); std::string Out; raw_string_ostream OS(Out); @@ -1395,7 +1388,7 @@ TEST(MustacheDelimiters, PostPartialBehavior) { TEST(MustacheDelimiters, SurroundingWhitespace) { Value D = Object{}; - auto T = Template("| {{=@ @=}} |"); + Template T("| {{=@ @=}} |"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1404,7 +1397,7 @@ TEST(MustacheDelimiters, SurroundingWhitespace) { TEST(MustacheDelimiters, OutlyingWhitespaceInline) { Value D = Object{}; - auto T = Template(" | {{=@ @=}}\n"); + Template T(" | {{=@ @=}}\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1413,7 +1406,7 @@ TEST(MustacheDelimiters, OutlyingWhitespaceInline) { TEST(MustacheDelimiters, StandaloneTag) { Value D = Object{}; - auto T = Template("Begin.\n{{=@ @=}}\nEnd.\n"); + Template T("Begin.\n{{=@ @=}}\nEnd.\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1422,7 +1415,7 @@ TEST(MustacheDelimiters, StandaloneTag) { TEST(MustacheDelimiters, IndentedStandaloneTag) { Value D = Object{}; - auto T = Template("Begin.\n {{=@ @=}}\nEnd.\n"); + Template T("Begin.\n {{=@ @=}}\nEnd.\n"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1431,7 +1424,7 @@ TEST(MustacheDelimiters, IndentedStandaloneTag) { TEST(MustacheDelimiters, StandaloneLineEndings) { Value D = Object{}; - auto T = Template("|\r\n{{= @ @ =}}\r\n|"); + Template T("|\r\n{{= @ @ =}}\r\n|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1440,7 +1433,7 @@ TEST(MustacheDelimiters, StandaloneLineEndings) { TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) { Value D = Object{}; - auto T = Template(" {{=@ @=}}\n="); + Template T(" {{=@ @=}}\n="); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1449,7 +1442,7 @@ TEST(MustacheDelimiters, StandaloneWithoutPreviousLine) { TEST(MustacheDelimiters, StandaloneWithoutNewline) { Value D = Object{}; - auto T = Template("=\n {{=@ @=}}"); + Template T("=\n {{=@ @=}}"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); @@ -1458,7 +1451,7 @@ TEST(MustacheDelimiters, StandaloneWithoutNewline) { TEST(MustacheDelimiters, PairwithPadding) { Value D = Object{}; - auto T = Template("|{{= @ @ =}}|"); + Template T("|{{= @ @ =}}|"); std::string Out; raw_string_ostream OS(Out); T.render(D, OS); diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp index bdcef376547fb..9007eb365a15f 100644 --- a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp +++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp @@ -112,7 +112,6 @@ static const StringMap> XFailTestNames = {{ "Section - Alternate Delimiters", "Section - Multiple Calls", }}, - {"partials.json", {"Standalone Indentation"}}, }}; struct TestData { From 
9ce0dae54e7d34ef4e0266069c0d3f1ae5968612 Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Tue, 30 Sep 2025 17:47:06 -0700
Subject: [PATCH 325/878] [llvm][mustache] Simplify debug logging (#159193)

The existing logging was inconsistent, and we logged too many things. This PR
introduces a more principled schema and eliminates many redundant log lines.

---
 llvm/lib/Support/Mustache.cpp | 82 ++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 6275e5eccf03a..178f9703071de 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -329,6 +329,36 @@ struct Tag {
   size_t StartPosition = StringRef::npos;
 };
 
+static const char *tagKindToString(Tag::Kind K) {
+  switch (K) {
+  case Tag::Kind::None:
+    return "None";
+  case Tag::Kind::Normal:
+    return "Normal";
+  case Tag::Kind::Triple:
+    return "Triple";
+  }
+  llvm_unreachable("Unknown Tag::Kind");
+}
+
+static const char *jsonKindToString(json::Value::Kind K) {
+  switch (K) {
+  case json::Value::Kind::Null:
+    return "JSON_KIND_NULL";
+  case json::Value::Kind::Boolean:
+    return "JSON_KIND_BOOLEAN";
+  case json::Value::Kind::Number:
+    return "JSON_KIND_NUMBER";
+  case json::Value::Kind::String:
+    return "JSON_KIND_STRING";
+  case json::Value::Kind::Array:
+    return "JSON_KIND_ARRAY";
+  case json::Value::Kind::Object:
+    return "JSON_KIND_OBJECT";
+  }
+  llvm_unreachable("Unknown json::Value::Kind");
+}
+
 static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
                        StringRef Close) {
   const StringLiteral TripleOpen("{{{");
@@ -373,11 +403,10 @@ static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
 
 static std::optional<std::pair<StringRef, StringRef>>
 processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
-  LLVM_DEBUG(dbgs() << " Found tag: \"" << T.FullMatch << "\", Content: \""
-                    << T.Content << "\"\n");
+  LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
+                    << ", Kind: " << tagKindToString(T.TagKind) << "\n");
   if (T.TagKind == Tag::Kind::Triple) {
     Tokens.emplace_back(T.FullMatch.str(), "&" + T.Content.str(), '&');
-    LLVM_DEBUG(dbgs() << " Created UnescapeVariable token.\n");
     return std::nullopt;
   }
   StringRef Interpolated = T.Content;
@@ -385,7 +414,6 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
   if (!Interpolated.trim().starts_with("=")) {
     char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
     Tokens.emplace_back(RawBody, Interpolated.str(), Front);
-    LLVM_DEBUG(dbgs() << " Created tag token of type '" << Front << "'\n");
     return std::nullopt;
   }
   Tokens.emplace_back(RawBody, Interpolated.str(), '=');
@@ -395,8 +423,8 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
   DelimSpec = DelimSpec.trim();
 
   std::pair<StringRef, StringRef> Ret = DelimSpec.split(' ');
-  LLVM_DEBUG(dbgs() << " Found Set Delimiter tag. NewOpen='" << Ret.first
-                    << "', NewClose='" << Ret.second << "'\n");
+  LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first
+                    << ", NewClose: " << Ret.second << "\n");
   return Ret;
 }
 
@@ -405,15 +433,15 @@ processTag(const Tag &T, SmallVectorImpl<Token> &Tokens) {
 // but we don't support that here. An unescape variable
 // is represented only by {{& variable}}.
 static SmallVector<Token> tokenize(StringRef Template) {
-  LLVM_DEBUG(dbgs() << "Tokenizing template: \"" << Template << "\"\n");
+  LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
   SmallVector<Token> Tokens;
   SmallString<8> Open("{{");
   SmallString<8> Close("}}");
   size_t Start = 0;
 
   while (Start < Template.size()) {
-    LLVM_DEBUG(dbgs() << "Loop start. Start=" << Start << ", Open='" << Open
-                      << "', Close='" << Close << "'\n");
+    LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open
+                      << "', Close:'" << Close << "'\n");
     Tag T = findNextTag(Template, Start, Open, Close);
 
     if (T.TagKind == Tag::Kind::None) {
@@ -428,7 +456,6 @@ static SmallVector<Token> tokenize(StringRef Template) {
     if (T.StartPosition > Start) {
       StringRef Text = Template.substr(Start, T.StartPosition - Start);
       Tokens.emplace_back(Text.str());
-      LLVM_DEBUG(dbgs() << " Created Text token: \"" << Text << "\"\n");
     }
 
     if (auto NewDelims = processTag(T, Tokens)) {
@@ -479,7 +506,7 @@ static SmallVector<Token> tokenize(StringRef Template) {
     if ((!HasTextBehind && !HasTextAhead) || (!HasTextBehind && Idx == LastIdx))
       stripTokenBefore(Tokens, Idx, CurrentToken, CurrentType);
   }
-  LLVM_DEBUG(dbgs() << "Tokenizing finished.\n");
   return Tokens;
 }
 
@@ -545,8 +571,8 @@ class AddIndentationStringStream : public MustacheOutputStream {
     Indent.resize(Indentation, ' ');
 
     for (char C : Data) {
-      LLVM_DEBUG(dbgs() << "IndentationStream: NeedsIndent=" << NeedsIndent
-                        << ", C='" << C << "', Indentation=" << Indentation
+      LLVM_DEBUG(dbgs() << "[Indentation Stream] NeedsIndent:" << NeedsIndent
+                        << ", C:'" << C << "', Indentation:" << Indentation
                         << "\n");
       if (NeedsIndent && C != '\n') {
         WrappedStream << Indent;
@@ -654,7 +680,9 @@ void Parser::parseMustache(ASTNode *Parent) {
   }
 }
 
 static void toMustacheString(const json::Value &Data, raw_ostream &OS) {
-  LLVM_DEBUG(dbgs() << "toMustacheString: kind=" << (int)Data.kind() << "\n");
+  LLVM_DEBUG(dbgs() << "[To Mustache String] Kind: "
+                    << jsonKindToString(Data.kind()) << ", Data: " << Data
+                    << "\n");
   switch (Data.kind()) {
   case json::Value::Null:
     return;
@@ -667,7 +695,6 @@ static void toMustacheString(const json::Value &Data, raw_ostream &OS) {
   }
   case json::Value::String: {
     auto Str = *Data.getAsString();
-    LLVM_DEBUG(dbgs() << " --> writing string: \"" << Str << "\"\n");
     OS << Str.str();
     return;
   }
@@ -696,8 +723,8 @@ void ASTNode::renderText(MustacheOutputStream &OS) { OS << Body; }
 
 void ASTNode::renderPartial(const json::Value &CurrentCtx,
                             MustacheOutputStream &OS) {
-  LLVM_DEBUG(dbgs() << "renderPartial: Accessor=" << AccessorValue[0]
-                    << ", Indentation=" << Indentation << "\n");
+  LLVM_DEBUG(dbgs() << "[Render Partial] Accessor:" << AccessorValue[0]
+                    << ", Indentation:" << Indentation << "\n");
   auto Partial = Ctx.Partials.find(AccessorValue[0]);
   if (Partial != Ctx.Partials.end())
     renderPartial(CurrentCtx, OS, Partial->getValue().get());
@@ -716,13 +743,12 @@ void ASTNode::renderVariable(const json::Value &CurrentCtx,
 
 void ASTNode::renderUnescapeVariable(const json::Value &CurrentCtx,
                                      MustacheOutputStream &OS) {
-  LLVM_DEBUG(dbgs() << "renderUnescapeVariable: Accessor=" << AccessorValue[0]
+  LLVM_DEBUG(dbgs() << "[Render UnescapeVariable] Accessor:" << AccessorValue[0]
                     << "\n");
   auto Lambda = Ctx.Lambdas.find(AccessorValue[0]);
   if (Lambda != Ctx.Lambdas.end()) {
     renderLambdas(CurrentCtx, OS, Lambda->getValue());
   } else if (const json::Value *ContextPtr = findContext()) {
-    LLVM_DEBUG(dbgs() << " --> Found context value, writing to stream.\n");
     OS.suspendIndentation();
     toMustacheString(*ContextPtr, OS);
     OS.resumeIndentation();
@@ -792,8 +818,6 @@ void ASTNode::render(const llvm::json::Value &Data, MustacheOutputStream &OS) {
 }
 
 const json::Value *ASTNode::findContext() {
-  LLVM_DEBUG(dbgs() << "findContext: AccessorValue[0]=" << AccessorValue[0]
-                    << "\n");
   // The mustache spec allows for dot notation to access nested values
   // a single dot refers to the current context.
   // We attempt to find the JSON context in the current node, if it is not
@@ -808,22 +832,12 @@ const json::Value *ASTNode::findContext() {
   StringRef CurrentAccessor = AccessorValue[0];
   ASTNode *CurrentParent = Parent;
-  LLVM_DEBUG(dbgs() << "findContext: ParentContext: ";
-             if (ParentContext) ParentContext->print(dbgs());
-             else dbgs() << "nullptr"; dbgs() << "\n");
-
   while (!CurrentContext || !CurrentContext->get(CurrentAccessor)) {
-    LLVM_DEBUG(dbgs() << "findContext: climbing parent\n");
     if (CurrentParent->Ty != Root) {
       CurrentContext = CurrentParent->ParentContext->getAsObject();
       CurrentParent = CurrentParent->Parent;
-      LLVM_DEBUG(dbgs() << "findContext: new ParentContext: ";
-                 if (CurrentParent->ParentContext)
-                   CurrentParent->ParentContext->print(dbgs());
-                 else dbgs() << "nullptr"; dbgs() << "\n");
       continue;
     }
-    LLVM_DEBUG(dbgs() << "findContext: reached root, not found\n");
     return nullptr;
   }
   const json::Value *Context = nullptr;
@@ -839,9 +853,6 @@ const json::Value *ASTNode::findContext() {
       Context = CurrentValue;
     }
   }
-  LLVM_DEBUG(dbgs() << "findContext: found value: ";
-             if (Context) Context->print(dbgs()); else dbgs() << "nullptr";
-             dbgs() << "\n");
   return Context;
 }
 
@@ -853,8 +864,7 @@ void ASTNode::renderChild(const json::Value &Contexts,
 
 void ASTNode::renderPartial(const json::Value &Contexts,
                             MustacheOutputStream &OS, ASTNode *Partial) {
-  LLVM_DEBUG(dbgs() << "renderPartial (helper): Indentation=" << Indentation
-                    << "\n");
+  LLVM_DEBUG(dbgs() << "[Render Partial Indentation] Indentation: " << Indentation << "\n");
   AddIndentationStringStream IS(OS, Indentation);
   Partial->render(Contexts, IS);
 }

From f29f1112f5cc467c0cdac05532770cdd15382c23 Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Wed, 1 Oct 2025 10:45:38 +1000
Subject: [PATCH 326/878] [JITLink][MachO] Use Triple::isArm64e consistently.
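
This folds an open-coded subarch comparison into the existing
Triple::isArm64e() helper. A rough sketch of what the helper guarantees
(the triple string below is illustrative, not taken from this patch):

  #include "llvm/TargetParser/Triple.h"
  #include <cassert>

  int main() {
    llvm::Triple T("arm64e-apple-macosx");
    // isArm64e() bundles the arch and subarch checks in one place, so
    // call sites cannot drift out of sync with the definition of arm64e.
    assert(T.isArm64e());
    assert(T.getSubArch() == llvm::Triple::AArch64SubArch_arm64e);
    return 0;
  }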
---
 llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 09ac0f19a7b07..f79478038c5cb 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -599,8 +599,7 @@ Expected<std::unique_ptr<LinkGraph>> createLinkGraphFromMachOObject_arm64(
 }
 
 static Error applyPACSigningToModInitPointers(LinkGraph &G) {
-  assert(G.getTargetTriple().getSubArch() == Triple::AArch64SubArch_arm64e &&
-         "PAC signing only valid for arm64e");
+  assert(G.getTargetTriple().isArm64e() && "PAC signing only valid for arm64e");
 
   if (auto *ModInitSec = G.findSectionByName("__DATA,__mod_init_func")) {
     for (auto *B : ModInitSec->blocks()) {

From d392563433316e310edacf35a40fb2f9aa477acc Mon Sep 17 00:00:00 2001
From: ronlieb
Date: Tue, 30 Sep 2025 21:16:08 -0400
Subject: [PATCH 327/878] Revert "Fix memory leak in Offloading API" (#161465)

Reverts llvm/llvm-project#161430

---
 llvm/lib/Object/OffloadBundle.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp
index a6a9628acddcc..0dd378e65fd81 100644
--- a/llvm/lib/Object/OffloadBundle.cpp
+++ b/llvm/lib/Object/OffloadBundle.cpp
@@ -120,15 +120,14 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset,
   if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle)
     return errorCodeToError(object_error::parse_failed);
 
-  std::unique_ptr<OffloadBundleFatBin> TheBundle(
-      new OffloadBundleFatBin(Buf, FileName));
+  OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName);
 
   // Read the Bundle Entries
   Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset);
   if (Err)
     return Err;
 
-  return TheBundle;
+  return std::unique_ptr<OffloadBundleFatBin>(TheBundle);
 }
 
 Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) {

From 3e1d4d4144cc9d28ccd85cf49d6fc836c38ffbaa Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 30 Sep 2025 19:14:30 -0700
Subject: [PATCH 328/878] [RISCV] Remove Zicntr from sifive-p450/p470/p670. (#161444)

These cores don't implement the `time` CSR. They require SBI to trap and
emulate it, which is allowed by RVA20U.
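
Zicntr is the extension under which rdcycle/rdtime/rdinstret read real
counter CSRs. A minimal sketch of the kind of access affected, assuming a
riscv64 target (illustrative only, not part of this patch):

  #include <cstdint>

  // On sifive-p450/p470/p670 this still returns a value, but only because
  // M-mode firmware (SBI) traps the access and emulates the `time` CSR;
  // there is no hardware CSR behind it, so the read is much slower than a
  // native CSR read.
  static inline uint64_t readTimeCsr() {
    uint64_t T;
    asm volatile("rdtime %0" : "=r"(T));
    return T;
  }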
--- clang/test/Driver/riscv-cpus.c | 3 - llvm/lib/Target/RISCV/RISCVProcessors.td | 81 +++++++++++++++++++++--- 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index cd92adc64a7d6..5d5fdd72baedb 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -462,7 +462,6 @@ // MCPU-SIFIVE-P450-SAME: "-target-feature" "+ziccif" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicclsm" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+ziccrse" -// MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicntr" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicsr" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zifencei" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zihintntl" @@ -492,7 +491,6 @@ // MCPU-SIFIVE-P470-SAME: "-target-feature" "+ziccif" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicclsm" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+ziccrse" -// MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicntr" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zicsr" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zifencei" // MCPU-SIFIVE-P470-SAME: "-target-feature" "+zihintntl" @@ -555,7 +553,6 @@ // MCPU-SIFIVE-P670-SAME: "-target-feature" "+ziccif" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicclsm" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+ziccrse" -// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicntr" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicsr" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zifencei" // MCPU-SIFIVE-P670-SAME: "-target-feature" "+zihintntl" diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 95f8a8789fa6c..17a794867be9e 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -347,16 +347,58 @@ defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll, TunePostRAScheduler]; def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, - !listconcat(RVA22U64Features, - [FeatureStdExtZifencei, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], SiFiveP400TuneFeatures>; def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model, - !listconcat(RVA22U64Features, - [FeatureStdExtV, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtV, FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureStdExtZvl128b, @@ -368,7 +410,7 @@ def SIFIVE_P470 : RISCVProcessorModel<"sifive-p470", SiFiveP400Model, FeatureVendorXSiFivecdiscarddlone, FeatureVendorXSiFivecflushdlone, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), 
+ FeatureUnalignedVectorMem], !listconcat(SiFiveP400TuneFeatures, [TuneNoSinkSplatOperands, TuneVXRMPipelineFlush])>; @@ -397,8 +439,29 @@ def SIFIVE_P550 : RISCVProcessorModel<"sifive-p550", SiFiveP500Model, } def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, - !listconcat(RVA22U64Features, - [FeatureStdExtV, + [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicsr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm, + FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtB, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt, + FeatureStdExtV, FeatureStdExtZifencei, FeatureStdExtZihintntl, FeatureStdExtZvl128b, @@ -408,7 +471,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, FeatureStdExtZvksc, FeatureStdExtZvksg, FeatureUnalignedScalarMem, - FeatureUnalignedVectorMem]), + FeatureUnalignedVectorMem], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, From 89ed5255b9ee88119e409a6d986eb1ad0e8f08e3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 30 Sep 2025 19:53:21 -0700 Subject: [PATCH 329/878] [RISCV] Rename BFloatVectors -> BF16Vectors in tablegen. NFC (#161469) Part of this rename is taken from #161158, but applies it more consistently to more variables. I think using BF16 makes it easier to not confuse BFloat and Float when reading. --- .../lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 18 +++++++++--------- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 2 +- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 4 ++-- llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 4 ++-- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td | 8 ++++---- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 4eb9a3be26fa6..d998316e4725c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -345,7 +345,7 @@ defset list AllVectors = { } } - defset list AllFloatAndBFloatVectors = { + defset list AllFloatAndBF16Vectors = { defset list AllFloatVectors = { defset list NoGroupFloatVectors = { defset list FractionalGroupFloatVectors = { @@ -382,16 +382,16 @@ defset list AllVectors = { } } - defset list AllBFloatVectors = { - defset list NoGroupBFloatVectors = { - defset list FractionalGroupBFloatVectors = { + defset list AllBF16Vectors = { + defset list NoGroupBF16Vectors = { + defset list FractionalGroupBF16Vectors = { def VBF16MF4: VTypeInfo; def VBF16MF2: VTypeInfo; } def VBF16M1: VTypeInfo; } - defset list GroupBFloatVectors = { + defset list GroupBF16Vectors = { def VBF16M2: GroupVTypeInfo; def VBF16M4: GroupVTypeInfo AllWidenableIntToFloatVectors = { def : VTypeInfoToWide; } -defset list AllWidenableBFloatToFloatVectors = { +defset list AllWidenableBF16ToFloatVectors = { def : VTypeInfoToWide; def : VTypeInfoToWide; def : VTypeInfoToWide; @@ -5870,7 +5870,7 @@ multiclass VPatConversionWF_VF { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; @@ -5977,7 +5977,7 @@ multiclass VPatConversionVF_WF_RTZ { - foreach fvtiToFWti = 
AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = !listconcat(GetVTypePredicates.Predicates, @@ -7154,7 +7154,7 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; // We can use vmerge.vvm to support vector-vector vfmerge. // NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses // int_riscv_vmerge. Support both for compatibility. -foreach vti = AllFloatAndBFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { let Predicates = GetVTypeMinimalPredicates.Predicates in defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM", vti.Vector, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index dc613614aa457..139ff9277bb91 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1388,7 +1388,7 @@ defm : VPatFPSetCCSDNode_VV_VF_FV; // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction -foreach fvti = AllFloatAndBFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 1511f1b55b996..cf904eab1dd39 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2426,7 +2426,7 @@ foreach vti = AllFloatVectors in { // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. 
Vector Floating-Point Merge Instruction -foreach fvti = AllFloatAndBFloatVectors in { +foreach fvti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), @@ -2770,7 +2770,7 @@ foreach vti = NoGroupFloatVectors in { } } -foreach vti = AllFloatAndBFloatVectors in { +foreach vti = AllFloatAndBF16Vectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 9835c033aea9c..b683e895c31c0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -560,7 +560,7 @@ multiclass VPseudoVNCVT_BF16_S { } multiclass VPatConversionS_BF16 { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVendorXAndesVBFHCvt] in @@ -572,7 +572,7 @@ multiclass VPatConversionS_BF16 { } multiclass VPatConversionBF16_S { - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVendorXAndesVBFHCvt] in diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index b546339ce99e2..557d8736eede3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -770,7 +770,7 @@ multiclass VPatVQMACCQOQ : VPatVMACC; multiclass VPatVFWMACC - : VPatVMACC; defset list VFNRCLIPInfoPairs = { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td index 6d8672b72a12d..0be9eab6870ec 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvfbf.td @@ -53,7 +53,7 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", "PseudoVFNCVTBF16_F_F", isSEWAware=1>; - foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in { + foreach fvtiToFWti = AllWidenableBF16ToFloatVectors in { defvar fvti = fvtiToFWti.Vti; defvar fwti = fvtiToFWti.Wti; let Predicates = [HasVInstructionsBF16Minimal] in @@ -91,9 +91,9 @@ let Predicates = [HasStdExtZvfbfminOrZvfofp8min] in { let Predicates = [HasStdExtZvfbfwma] in { defm : VPatTernaryW_VV_VX_RM<"int_riscv_vfwmaccbf16", "PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors, isSEWAware=1>; + AllWidenableBF16ToFloatVectors, isSEWAware=1>; defm : VPatWidenFPMulAccVL_VV_VF_RM; + AllWidenableBF16ToFloatVectors>; defm : VPatWidenFPMulAccSDNode_VV_VF_RM<"PseudoVFWMACCBF16", - AllWidenableBFloatToFloatVectors>; + AllWidenableBF16ToFloatVectors>; } From fd4e77cf333855a495a09277f951c0ccef557772 Mon Sep 17 00:00:00 2001 From: Shunsuke Watanabe Date: Wed, 1 Oct 2025 13:03:23 +0900 Subject: [PATCH 330/878] [flang][driver] Accelerate complex division when `-ffast-math` is specified (#159689) This patch accelerates complex division by passing `-complex-range=basic` to the frontend when the `-ffast-math` option is specified. This behavior is the same as `-fcomplex-arithmetic=basic`. A warning is issued if a different value is specified for `-fcomplex-arithmetic=`. The warning conditions will be unified with clang. 
--- clang/include/clang/Driver/CommonArgs.h | 5 ++ clang/lib/Driver/ToolChains/Clang.cpp | 90 +++++++--------------- clang/lib/Driver/ToolChains/CommonArgs.cpp | 48 ++++++++++++ clang/lib/Driver/ToolChains/Flang.cpp | 18 ++++- flang/docs/ComplexOperations.md | 4 +- flang/docs/FlangDriver.md | 3 + flang/test/Driver/complex-range.f90 | 77 ++++++++++++++++++ 7 files changed, 179 insertions(+), 66 deletions(-) diff --git a/clang/include/clang/Driver/CommonArgs.h b/clang/include/clang/Driver/CommonArgs.h index 40ae40665b040..23426c0a3e02e 100644 --- a/clang/include/clang/Driver/CommonArgs.h +++ b/clang/include/clang/Driver/CommonArgs.h @@ -304,6 +304,11 @@ std::string complexRangeKindToStr(LangOptions::ComplexRangeKind Range); // Render a frontend option corresponding to ComplexRangeKind. std::string renderComplexRangeOption(LangOptions::ComplexRangeKind Range); +// Set the complex range and output a warning as needed. +void setComplexRange(const Driver &D, StringRef NewOpt, + LangOptions::ComplexRangeKind NewRange, StringRef &LastOpt, + LangOptions::ComplexRangeKind &Range); + } // end namespace tools } // end namespace driver } // end namespace clang diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index adaa6b3005577..412a176006bc0 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2723,42 +2723,6 @@ static void CollectArgsForIntegratedAssembler(Compilation &C, } } -static void EmitComplexRangeDiag(const Driver &D, StringRef LastOpt, - LangOptions::ComplexRangeKind Range, - StringRef NewOpt, - LangOptions::ComplexRangeKind NewRange) { - // Do not emit a warning if NewOpt overrides LastOpt in the following cases. - // - // | LastOpt | NewOpt | - // |-----------------------|-----------------------| - // | -fcx-limited-range | -fno-cx-limited-range | - // | -fno-cx-limited-range | -fcx-limited-range | - // | -fcx-fortran-rules | -fno-cx-fortran-rules | - // | -fno-cx-fortran-rules | -fcx-fortran-rules | - // | -ffast-math | -fno-fast-math | - // | -ffp-model= | -ffast-math | - // | -ffp-model= | -fno-fast-math | - // | -ffp-model= | -ffp-model= | - // | -fcomplex-arithmetic= | -fcomplex-arithmetic= | - if (LastOpt == NewOpt || NewOpt.empty() || LastOpt.empty() || - (LastOpt == "-fcx-limited-range" && NewOpt == "-fno-cx-limited-range") || - (LastOpt == "-fno-cx-limited-range" && NewOpt == "-fcx-limited-range") || - (LastOpt == "-fcx-fortran-rules" && NewOpt == "-fno-cx-fortran-rules") || - (LastOpt == "-fno-cx-fortran-rules" && NewOpt == "-fcx-fortran-rules") || - (LastOpt == "-ffast-math" && NewOpt == "-fno-fast-math") || - (LastOpt.starts_with("-ffp-model=") && NewOpt == "-ffast-math") || - (LastOpt.starts_with("-ffp-model=") && NewOpt == "-fno-fast-math") || - (LastOpt.starts_with("-ffp-model=") && - NewOpt.starts_with("-ffp-model=")) || - (LastOpt.starts_with("-fcomplex-arithmetic=") && - NewOpt.starts_with("-fcomplex-arithmetic="))) - return; - - D.Diag(clang::diag::warn_drv_overriding_complex_range) - << LastOpt << NewOpt << complexRangeKindToStr(Range) - << complexRangeKindToStr(NewRange); -} - static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, bool OFastEnabled, const ArgList &Args, ArgStringList &CmdArgs, @@ -2815,27 +2779,19 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, std::string ComplexRangeStr; StringRef LastComplexRangeOption; - auto setComplexRange = [&](StringRef NewOption, - LangOptions::ComplexRangeKind NewRange) { - // Warn if 
user overrides the previously set complex number - // multiplication/division option. - if (Range != LangOptions::ComplexRangeKind::CX_None && Range != NewRange) - EmitComplexRangeDiag(D, LastComplexRangeOption, Range, NewOption, - NewRange); - LastComplexRangeOption = NewOption; - Range = NewRange; - }; - // Lambda to set fast-math options. This is also used by -ffp-model=fast auto applyFastMath = [&](bool Aggressive, StringRef CallerOption) { if (Aggressive) { HonorINFs = false; HonorNaNs = false; - setComplexRange(CallerOption, LangOptions::ComplexRangeKind::CX_Basic); + setComplexRange(D, CallerOption, LangOptions::ComplexRangeKind::CX_Basic, + LastComplexRangeOption, Range); } else { HonorINFs = true; HonorNaNs = true; - setComplexRange(CallerOption, LangOptions::ComplexRangeKind::CX_Promoted); + setComplexRange(D, CallerOption, + LangOptions::ComplexRangeKind::CX_Promoted, + LastComplexRangeOption, Range); } MathErrno = false; AssociativeMath = true; @@ -2887,18 +2843,24 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, default: continue; case options::OPT_fcx_limited_range: - setComplexRange(A->getSpelling(), - LangOptions::ComplexRangeKind::CX_Basic); + setComplexRange(D, A->getSpelling(), + LangOptions::ComplexRangeKind::CX_Basic, + LastComplexRangeOption, Range); break; case options::OPT_fno_cx_limited_range: - setComplexRange(A->getSpelling(), LangOptions::ComplexRangeKind::CX_Full); + setComplexRange(D, A->getSpelling(), + LangOptions::ComplexRangeKind::CX_Full, + LastComplexRangeOption, Range); break; case options::OPT_fcx_fortran_rules: - setComplexRange(A->getSpelling(), - LangOptions::ComplexRangeKind::CX_Improved); + setComplexRange(D, A->getSpelling(), + LangOptions::ComplexRangeKind::CX_Improved, + LastComplexRangeOption, Range); break; case options::OPT_fno_cx_fortran_rules: - setComplexRange(A->getSpelling(), LangOptions::ComplexRangeKind::CX_Full); + setComplexRange(D, A->getSpelling(), + LangOptions::ComplexRangeKind::CX_Full, + LastComplexRangeOption, Range); break; case options::OPT_fcomplex_arithmetic_EQ: { LangOptions::ComplexRangeKind RangeVal; @@ -2916,7 +2878,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, << A->getSpelling() << Val; break; } - setComplexRange(Args.MakeArgString(A->getSpelling() + Val), RangeVal); + setComplexRange(D, Args.MakeArgString(A->getSpelling() + Val), RangeVal, + LastComplexRangeOption, Range); break; } case options::OPT_ffp_model_EQ: { @@ -2956,8 +2919,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, FPModel = Val; FPContract = "on"; LastFpContractOverrideOption = "-ffp-model=precise"; - setComplexRange(Args.MakeArgString(A->getSpelling() + Val), - LangOptions::ComplexRangeKind::CX_Full); + setComplexRange(D, Args.MakeArgString(A->getSpelling() + Val), + LangOptions::ComplexRangeKind::CX_Full, + LastComplexRangeOption, Range); } else if (Val == "strict") { StrictFPModel = true; FPExceptionBehavior = "strict"; @@ -2966,8 +2930,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, LastFpContractOverrideOption = "-ffp-model=strict"; TrappingMath = true; RoundingFPMath = true; - setComplexRange(Args.MakeArgString(A->getSpelling() + Val), - LangOptions::ComplexRangeKind::CX_Full); + setComplexRange(D, Args.MakeArgString(A->getSpelling() + Val), + LangOptions::ComplexRangeKind::CX_Full, + LastComplexRangeOption, Range); } else D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Val; @@ -3174,8 
+3139,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, SignedZeros = true; restoreFPContractState(); if (Range != LangOptions::ComplexRangeKind::CX_Full) - setComplexRange(A->getSpelling(), - LangOptions::ComplexRangeKind::CX_None); + setComplexRange(D, A->getSpelling(), + LangOptions::ComplexRangeKind::CX_None, + LastComplexRangeOption, Range); else Range = LangOptions::ComplexRangeKind::CX_None; LastComplexRangeOption = ""; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index cce4f6487c0bd..49ee53f0ba3bf 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -3557,3 +3557,51 @@ tools::renderComplexRangeOption(LangOptionsBase::ComplexRangeKind Range) { return "-complex-range=" + ComplexRangeStr; return ComplexRangeStr; } + +static void emitComplexRangeDiag(const Driver &D, StringRef LastOpt, + LangOptions::ComplexRangeKind Range, + StringRef NewOpt, + LangOptions::ComplexRangeKind NewRange) { + // Do not emit a warning if NewOpt overrides LastOpt in the following cases. + // + // | LastOpt | NewOpt | + // |-----------------------|-----------------------| + // | -fcx-limited-range | -fno-cx-limited-range | + // | -fno-cx-limited-range | -fcx-limited-range | + // | -fcx-fortran-rules | -fno-cx-fortran-rules | + // | -fno-cx-fortran-rules | -fcx-fortran-rules | + // | -ffast-math | -fno-fast-math | + // | -ffp-model= | -ffast-math | + // | -ffp-model= | -fno-fast-math | + // | -ffp-model= | -ffp-model= | + // | -fcomplex-arithmetic= | -fcomplex-arithmetic= | + if (LastOpt == NewOpt || NewOpt.empty() || LastOpt.empty() || + (LastOpt == "-fcx-limited-range" && NewOpt == "-fno-cx-limited-range") || + (LastOpt == "-fno-cx-limited-range" && NewOpt == "-fcx-limited-range") || + (LastOpt == "-fcx-fortran-rules" && NewOpt == "-fno-cx-fortran-rules") || + (LastOpt == "-fno-cx-fortran-rules" && NewOpt == "-fcx-fortran-rules") || + (LastOpt == "-ffast-math" && NewOpt == "-fno-fast-math") || + (LastOpt.starts_with("-ffp-model=") && NewOpt == "-ffast-math") || + (LastOpt.starts_with("-ffp-model=") && NewOpt == "-fno-fast-math") || + (LastOpt.starts_with("-ffp-model=") && + NewOpt.starts_with("-ffp-model=")) || + (LastOpt.starts_with("-fcomplex-arithmetic=") && + NewOpt.starts_with("-fcomplex-arithmetic="))) + return; + + D.Diag(clang::diag::warn_drv_overriding_complex_range) + << LastOpt << NewOpt << complexRangeKindToStr(Range) + << complexRangeKindToStr(NewRange); +} + +void tools::setComplexRange(const Driver &D, StringRef NewOpt, + LangOptions::ComplexRangeKind NewRange, + StringRef &LastOpt, + LangOptions::ComplexRangeKind &Range) { + // Warn if user overrides the previously set complex number + // multiplication/division option. 
+ if (Range != LangOptions::ComplexRangeKind::CX_None && Range != NewRange) + emitComplexRangeDiag(D, LastOpt, Range, NewOpt, NewRange); + LastOpt = NewOpt; + Range = NewRange; +} diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index a5394813eeb97..426bc52f34859 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -693,6 +693,7 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args, bool AssociativeMath = false; bool ReciprocalMath = false; + StringRef LastComplexRangeOption; LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None; if (const Arg *A = Args.getLastArg(options::OPT_ffp_contract)) { @@ -720,17 +721,22 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args, continue; case options::OPT_fcomplex_arithmetic_EQ: { + LangOptions::ComplexRangeKind NewRange; StringRef Val = A->getValue(); if (Val == "full") - Range = LangOptions::ComplexRangeKind::CX_Full; + NewRange = LangOptions::ComplexRangeKind::CX_Full; else if (Val == "improved") - Range = LangOptions::ComplexRangeKind::CX_Improved; + NewRange = LangOptions::ComplexRangeKind::CX_Improved; else if (Val == "basic") - Range = LangOptions::ComplexRangeKind::CX_Basic; + NewRange = LangOptions::ComplexRangeKind::CX_Basic; else { D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Val; + break; } + + setComplexRange(D, Args.MakeArgString(A->getSpelling() + Val), NewRange, + LastComplexRangeOption, Range); break; } case options::OPT_fhonor_infinities: @@ -779,6 +785,9 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args, ApproxFunc = true; SignedZeros = false; FPContract = "fast"; + setComplexRange(D, A->getSpelling(), + LangOptions::ComplexRangeKind::CX_Basic, + LastComplexRangeOption, Range); break; case options::OPT_fno_fast_math: HonorINFs = true; @@ -792,6 +801,9 @@ static void addFloatingPointOptions(const Driver &D, const ArgList &Args, // --ffp-contract=off -fno-fast-math --> -ffp-contract=off if (FPContract == "fast") FPContract = ""; + setComplexRange(D, A->getSpelling(), + LangOptions::ComplexRangeKind::CX_None, + LastComplexRangeOption, Range); break; } diff --git a/flang/docs/ComplexOperations.md b/flang/docs/ComplexOperations.md index 3ebeea5e0a540..1b6ec527b446a 100644 --- a/flang/docs/ComplexOperations.md +++ b/flang/docs/ComplexOperations.md @@ -93,7 +93,9 @@ While [the same option in clang][2] allows specifying `promoted`, this is not implemented in Flang. Also, in the case of `improved`, clang does not handle NaN and infinite values, but Flang does. These behavioral differences arise because the transformation of complex division calculations depends on the implementation -of ComplexToStandard, which may change in the future. +of ComplexToStandard, which may change in the future. If you specify +`-ffast-math`, the lowering is the same as specifying +`-fcomplex-arithmetic=basic`. [1]: https://discourse.llvm.org/t/rfc-change-lowering-of-fortran-math-intrinsics/63971 [2]: https://clang.llvm.org/docs/UsersManual.html#cmdoption-fcomplex-arithmetic diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md index 2b7d9d4ae6908..3286171bb1499 100644 --- a/flang/docs/FlangDriver.md +++ b/flang/docs/FlangDriver.md @@ -573,6 +573,9 @@ documentation for more details.
These correspond to LLVM IR Fast Math attributes: https://llvm.org/docs/LangRef.html#fast-math-flags +In addition to the above, `-ffast-math` also enables +`-fcomplex-arithmetic=basic`. + When `-ffast-math` is specified, any linker steps generated by the compiler driver will also link to `crtfastmath.o`, which adds a static constructor that sets the FTZ/DAZ bits in MXCSR, affecting not only the current only the diff --git a/flang/test/Driver/complex-range.f90 b/flang/test/Driver/complex-range.f90 index e5a1ba9068ac9..575fa0437fd0d 100644 --- a/flang/test/Driver/complex-range.f90 +++ b/flang/test/Driver/complex-range.f90 @@ -15,6 +15,83 @@ ! RUN: not %flang -### -fcomplex-arithmetic=foo -c %s 2>&1 \ ! RUN: | FileCheck %s --check-prefix=ERR +! RUN: %flang -### -ffast-math -c %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=BASIC + +! RUN: %flang -### -fno-fast-math -c %s 2>&1 \ +! RUN: | FileCheck %s --check-prefix=RANGE + +! RUN: %flang -### -Werror -ffast-math -fno-fast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=RANGE %s + +! RUN: %flang -### -ffast-math -fcomplex-arithmetic=full -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=FULL,ARITH-FULL-OVERRIDING,FAST-OVERRIDDEN %s + +! RUN: %flang -### -ffast-math -fcomplex-arithmetic=improved -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=IMPRVD,ARITH-IMPROVED-OVERRIDING,FAST-OVERRIDDEN %s + +! RUN: %flang -### -Werror -ffast-math -fcomplex-arithmetic=basic -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC %s + +! RUN: %flang -### -Werror -fno-fast-math -ffast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC %s + +! RUN: %flang -### -Werror -fno-fast-math -fcomplex-arithmetic=full -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=FULL %s + +! RUN: %flang -### -Werror -fno-fast-math -fcomplex-arithmetic=improved -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=IMPRVD %s + +! RUN: %flang -### -Werror -fno-fast-math -fcomplex-arithmetic=basic -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC %s + +! RUN: %flang -### -fcomplex-arithmetic=full -ffast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC,FAST-OVERRIDING,ARITH-FULL-OVERRIDDEN %s + +! RUN: %flang -### -Werror -fcomplex-arithmetic=full -fno-fast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=RANGE %s + +! RUN: %flang -### -Werror -fcomplex-arithmetic=full -fcomplex-arithmetic=improved -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=IMPRVD %s + +! RUN: %flang -### -Werror -fcomplex-arithmetic=full -fcomplex-arithmetic=basic -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC %s + +! RUN: %flang -### -fcomplex-arithmetic=improved -ffast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC,FAST-OVERRIDING,ARITH-IMPROVED-OVERRIDDEN %s + +! RUN: %flang -### -fcomplex-arithmetic=improved -fno-fast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=RANGE,NOFAST-OVERRIDING,ARITH-IMPROVED-OVERRIDDEN %s + +! RUN: %flang -### -Werror -fcomplex-arithmetic=improved -fcomplex-arithmetic=full -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=FULL %s + +! RUN: %flang -### -Werror -fcomplex-arithmetic=improved -fcomplex-arithmetic=basic -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC %s + +! RUN: %flang -### -Werror -fcomplex-arithmetic=basic -ffast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=BASIC %s + +! RUN: %flang -### -fcomplex-arithmetic=basic -fno-fast-math -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=RANGE,NOFAST-OVERRIDING,ARITH-BASIC-OVERRIDDEN %s + +! 
RUN: %flang -### -Werror -fcomplex-arithmetic=basic -fcomplex-arithmetic=full -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=FULL %s + +! RUN: %flang -### -Werror -fcomplex-arithmetic=basic -fcomplex-arithmetic=improved -c %s 2>&1 \ +! RUN: | FileCheck --check-prefixes=IMPRVD %s + + +! FAST-OVERRIDING: warning: '-ffast-math' sets complex range to "basic" +! NOFAST-OVERRIDING: warning: '-fno-fast-math' sets complex range to "none" +! ARITH-FULL-OVERRIDING: warning: '-fcomplex-arithmetic=full' sets complex range to "full" +! ARITH-IMPROVED-OVERRIDING: warning: '-fcomplex-arithmetic=improved' sets complex range to "improved" + +! FAST-OVERRIDDEN: overriding the setting of "basic" that was implied by '-ffast-math' [-Woverriding-complex-range] +! ARITH-FULL-OVERRIDDEN: overriding the setting of "full" that was implied by '-fcomplex-arithmetic=full' [-Woverriding-complex-range] +! ARITH-IMPROVED-OVERRIDDEN: overriding the setting of "improved" that was implied by '-fcomplex-arithmetic=improved' [-Woverriding-complex-range] +! ARITH-BASIC-OVERRIDDEN: overriding the setting of "basic" that was implied by '-fcomplex-arithmetic=basic' [-Woverriding-complex-range] + ! RANGE-NOT: -complex-range= ! FULL: -complex-range=full ! IMPRVD: -complex-range=improved From 133406e3d9afb845426898154ecf532a75056d37 Mon Sep 17 00:00:00 2001 From: Un1q32 Date: Wed, 1 Oct 2025 00:05:27 -0400 Subject: [PATCH 331/878] Reserve R9 on armv6 iOS 2.x (#150835) The iOS 2.x ABI had R9 as a reserved register, 3.0 made it available, but support for the 2.x ABI was never added to LLVM. We only use the 2.x ABI on armv6 since before 3.0 armv6 was the only architecture supported by iOS. --- llvm/lib/Target/ARM/ARMSubtarget.cpp | 6 +++++- llvm/test/CodeGen/ARM/inline-asm-clobber.ll | 7 +++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 3329beab63ddf..58bc338b25856 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -225,7 +225,11 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { (isTargetDarwin() || DM == DenormalMode::getPreserveSign())) HasNEONForFP = true; - if (isRWPI()) + const ARM::ArchKind Arch = ARM::parseArch(TargetTriple.getArchName()); + if (isRWPI() || + (isTargetIOS() && + (Arch == ARM::ArchKind::ARMV6K || Arch == ARM::ArchKind::ARMV6) && + TargetTriple.isOSVersionLT(3, 0))) ReserveR9 = true; // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2 diff --git a/llvm/test/CodeGen/ARM/inline-asm-clobber.ll b/llvm/test/CodeGen/ARM/inline-asm-clobber.ll index 7b1331f3f1e84..f44ad2a896ad4 100644 --- a/llvm/test/CodeGen/ARM/inline-asm-clobber.ll +++ b/llvm/test/CodeGen/ARM/inline-asm-clobber.ll @@ -6,12 +6,19 @@ ; RUN: llc <%s -mtriple=arm-none-eabi --frame-pointer=all 2>&1 \ ; RUN: | FileCheck %s -check-prefix=NO_FP_ELIM +; RUN: llc <%s -mtriple=armv6-apple-ios2 2>&1 | FileCheck %s -check-prefix=IOS2 +; RUN: llc <%s -mtriple=armv6k-apple-ios2 2>&1 | FileCheck %s -check-prefix=IOS2 +; RUN: llc <%s -mtriple=armv6k-apple-ios3 2>&1 | FileCheck %s -check-prefix=IOS3 +; RUN: llc <%s -mtriple=armv7-apple-ios2 2>&1 | FileCheck %s -check-prefix=IOS3 + ; CHECK: warning: inline asm clobber list contains reserved registers: SP, PC ; CHECK: warning: inline asm clobber list contains reserved registers: R11 ; RWPI: warning: inline asm clobber list contains reserved registers: R9, SP, PC ; RWPI: warning: inline asm clobber list contains 
reserved registers: R11 ; NO_FP_ELIM: warning: inline asm clobber list contains reserved registers: R11, SP, PC ; NO_FP_ELIM: warning: inline asm clobber list contains reserved registers: R11 +; IOS2: warning: inline asm clobber list contains reserved registers: R9, SP, PC +; IOS3: warning: inline asm clobber list contains reserved registers: SP, PC define void @foo() nounwind { call void asm sideeffect "mov r7, #1", From 079d589f5a389d18b6277b31a61e471ec56b5b7e Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Tue, 30 Sep 2025 23:18:20 -0500 Subject: [PATCH 332/878] [HLSL][NFC] Add missing includes for standalone header compilation (#161473) HLSLResource.h added by #161254 builds in the context of a .cpp file (e.g. CGHLSLRuntime.cpp) but not when doing a header compilation, e.g.: ``` clang/include/clang/AST/Attrs.inc:12:45: error: unknown type name 'raw_ostream'; did you mean 'clang::raw_ostream'? 12 | static inline void DelimitAttributeArgument(raw_ostream& OS, bool& IsFirst) { ``` --- clang/include/clang/AST/HLSLResource.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/include/clang/AST/HLSLResource.h b/clang/include/clang/AST/HLSLResource.h index e3ee0b136cec3..9cdd81b2d8dab 100644 --- a/clang/include/clang/AST/HLSLResource.h +++ b/clang/include/clang/AST/HLSLResource.h @@ -15,9 +15,12 @@ #define LLVM_CLANG_AST_HLSLRESOURCE_H #include "clang/AST/ASTContext.h" +#include "clang/AST/Attr.h" #include "clang/AST/Attrs.inc" #include "clang/AST/DeclBase.h" #include "clang/Basic/TargetInfo.h" +#include "clang/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" namespace clang { From a414c22f32fa08ab91d9d6fe06d6949b526bedb3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 30 Sep 2025 21:44:08 -0700 Subject: [PATCH 333/878] [Support] Fix warnings This patch fixes: llvm/lib/Support/Mustache.cpp:332:20: error: unused function 'tagKindToString' [-Werror,-Wunused-function] llvm/lib/Support/Mustache.cpp:344:20: error: unused function 'jsonKindToString' [-Werror,-Wunused-function] --- llvm/lib/Support/Mustache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 178f9703071de..47860c00be610 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -329,7 +329,7 @@ struct Tag { size_t StartPosition = StringRef::npos; }; -static const char *tagKindToString(Tag::Kind K) { +[[maybe_unused]] static const char *tagKindToString(Tag::Kind K) { switch (K) { case Tag::Kind::None: return "None"; @@ -341,7 +341,7 @@ static const char *tagKindToString(Tag::Kind K) { llvm_unreachable("Unknown Tag::Kind"); } -static const char *jsonKindToString(json::Value::Kind K) { +[[maybe_unused]] static const char *jsonKindToString(json::Value::Kind K) { switch (K) { case json::Value::Kind::Null: return "JSON_KIND_NULL"; From 6e0d519b3937a2f96179e40f417bc5cc79f3adba Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 30 Sep 2025 21:55:14 -0700 Subject: [PATCH 334/878] [ADT] Consolidate uninitialized_copy in SmallVector (NFC) (#161043) This patch consolidates two implementations of uninitialized_copy into a single template function. 
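As a rough self-contained sketch of the consolidated shape (simplified from the patch, with the enclosing class stripped; in SmallVector proper this is a member of the trivially-copyable specialization, so memcpy on T is known to be valid):

```cpp
#include <cstring>
#include <memory>
#include <type_traits>

// Free-function rendering of the merged helper. Pointer ranges over the same
// element type take the memcpy path; everything else falls back to the
// standard algorithm, all in one template instead of two SFINAE overloads.
template <typename T, typename It1, typename It2>
void uninitialized_copy(It1 I, It1 E, It2 Dest) {
  if constexpr (std::is_pointer_v<It1> && std::is_pointer_v<It2> &&
                std::is_same_v<std::remove_const_t<std::remove_pointer_t<It1>>,
                               std::remove_pointer_t<It2>>) {
    // I and E are iterators and may be invalid for memcpy when equal,
    // hence the emptiness guard.
    if (I != E)
      std::memcpy(reinterpret_cast<void *>(Dest), I, (E - I) * sizeof(T));
  } else {
    // Arbitrary iterator types; just use the basic implementation.
    std::uninitialized_copy(I, E, Dest);
  }
}
```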
--- llvm/include/llvm/ADT/SmallVector.h | 33 +++++++++++++---------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index 77805f5c03c14..efae6f339f9de 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -502,25 +502,22 @@ class SmallVectorTemplateBase : public SmallVectorTemplateCommon { /// Copy the range [I, E) onto the uninitialized memory /// starting with "Dest", constructing elements into it as needed. - template + template static void uninitialized_copy(It1 I, It1 E, It2 Dest) { - // Arbitrary iterator types; just use the basic implementation. - std::uninitialized_copy(I, E, Dest); - } - - /// Copy the range [I, E) onto the uninitialized memory - /// starting with "Dest", constructing elements into it as needed. - template - static void uninitialized_copy( - T1 *I, T1 *E, T2 *Dest, - std::enable_if_t, T2>::value> * = - nullptr) { - // Use memcpy for PODs iterated by pointers (which includes SmallVector - // iterators): std::uninitialized_copy optimizes to memmove, but we can - // use memcpy here. Note that I and E are iterators and thus might be - // invalid for memcpy if they are equal. - if (I != E) - std::memcpy(reinterpret_cast(Dest), I, (E - I) * sizeof(T)); + if constexpr (std::is_pointer_v && std::is_pointer_v && + std::is_same_v< + std::remove_const_t>, + std::remove_pointer_t>) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + std::memcpy(reinterpret_cast(Dest), I, (E - I) * sizeof(T)); + } else { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } } /// Double the size of the allocated memory, guaranteeing space for at From d62776d03323e709abb8e4734e0ae36f76dba815 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 1 Oct 2025 14:20:10 +0800 Subject: [PATCH 335/878] [LVI] Handle constant value lattice in `getEdgeValueLocal` (#161410) Closes https://github.com/llvm/llvm-project/issues/161367. In https://github.com/llvm/llvm-project/pull/157614, we ignored cases where OpLatticeVal might be a constant or notconstant. Directly returning the result causes a type mismatch. I apologize for the oversight in the previous code review. This patch applies the cast op to constants. For notconstant value lattices, I'd leave it as a todo (it is similar to the constant case, except for trunc without nsw/nuw). 
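As a minimal illustration of the fix (a standalone sketch, not code from the patch): folding the user's cast opcode over the lattice constant yields a constant of the user's type, which is what `getEdgeValueLocal` must hand back.

```cpp
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  DataLayout DL("");
  // Suppose the branch condition pinned the i64 operand to a constant...
  Constant *C = ConstantInt::get(Type::getInt64Ty(Ctx), 0x1234567890ULL);
  // ...but the user is `trunc i64 %x to i16`. Returning C as-is would hand an
  // i64 lattice value to an i16 user (the mismatch in #161367); folding the
  // cast opcode produces a correctly typed constant instead.
  if (Constant *Folded = ConstantFoldCastOperand(Instruction::Trunc, C,
                                                 Type::getInt16Ty(Ctx), DL))
    Folded->print(outs()); // prints "i16 30864" (0x7890)
  return 0;
}
```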
--- llvm/lib/Analysis/LazyValueInfo.cpp | 28 ++++++++++------- .../CorrelatedValuePropagation/pr161367.ll | 31 +++++++++++++++++++ 2 files changed, 48 insertions(+), 11 deletions(-) create mode 100644 llvm/test/Transforms/CorrelatedValuePropagation/pr161367.ll diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 6fb28072afe46..0e5bc481383a0 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1632,19 +1632,25 @@ LazyValueInfoImpl::getEdgeValueLocal(Value *Val, BasicBlock *BBFrom, *getValueFromCondition(Usr->getOperand(0), Condition, isTrueDest, /*UseBlockValue*/ false); - if (!OpLatticeVal.isConstantRange()) - return OpLatticeVal; + if (OpLatticeVal.isConstantRange()) { + const unsigned ResultBitWidth = + Usr->getType()->getScalarSizeInBits(); + if (auto *Trunc = dyn_cast(Usr)) + return ValueLatticeElement::getRange( + OpLatticeVal.getConstantRange().truncate( + ResultBitWidth, Trunc->getNoWrapKind())); - const unsigned ResultBitWidth = - Usr->getType()->getScalarSizeInBits(); - if (auto *Trunc = dyn_cast(Usr)) return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().truncate( - ResultBitWidth, Trunc->getNoWrapKind())); - - return ValueLatticeElement::getRange( - OpLatticeVal.getConstantRange().castOp( - cast(Usr)->getOpcode(), ResultBitWidth)); + OpLatticeVal.getConstantRange().castOp( + cast(Usr)->getOpcode(), ResultBitWidth)); + } + if (OpLatticeVal.isConstant()) { + Constant *C = OpLatticeVal.getConstant(); + if (auto *CastC = ConstantFoldCastOperand( + cast(Usr)->getOpcode(), C, Usr->getType(), DL)) + return ValueLatticeElement::get(CastC); + } + return ValueLatticeElement::getOverdefined(); } else { // If one of Val's operand has an inferred value, we may be able to // infer the value of Val. diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/pr161367.ll b/llvm/test/Transforms/CorrelatedValuePropagation/pr161367.ll new file mode 100644 index 0000000000000..346eaeaec72c1 --- /dev/null +++ b/llvm/test/Transforms/CorrelatedValuePropagation/pr161367.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s + +; Make sure that we apply trunc to the edge value of %x. 
+@g = external global i8 + +define i16 @pr161367(i64 %x) { +; CHECK-LABEL: define i16 @pr161367( +; CHECK-SAME: i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[X]] to i16 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[X]], sub (i64 0, i64 ptrtoint (ptr @g to i64)) +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[ELSE:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RET:%.*]] = phi i16 [ trunc (i64 sub (i64 0, i64 ptrtoint (ptr @g to i64)) to i16), %[[ENTRY]] ], [ 0, %[[ELSE]] ] +; CHECK-NEXT: ret i16 [[RET]] +; +entry: + %trunc = trunc i64 %x to i16 + %exitcond = icmp eq i64 %x, sub (i64 0, i64 ptrtoint (ptr @g to i64)) + br i1 %exitcond, label %exit, label %else + +else: + br label %exit + +exit: + %ret = phi i16 [ %trunc, %entry ], [ 0, %else ] + ret i16 %ret +} From 63ca8483d0efc544c5b8c4484d36a64c3b3ff210 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 1 Oct 2025 08:58:47 +0200 Subject: [PATCH 336/878] [IR] Introduce !captures metadata (#160913) This introduces `!captures` metadata on stores, which looks like this: ``` store ptr %x, ptr %y, !captures !{!"address", !"read_provenance"} ``` The semantics are the same as replacing the store with a call like this: ``` call void @llvm.store(ptr captures(address, read_provenance) %x, ptr %y) ``` This metadata is intended for annotation by frontends -- it's not something we can feasibly infer at this point, as it would require analyzing uses of the pointer stored in memory. The motivating use case for this is Rust's `println!()` machinery, which involves storing a reference to the value inside a structure. This means that printing code (including conditional debugging code), can inhibit optimizations because the pointer escapes. With the new metadata we can annotate this as a read-only capture, which has less impact on optimizations. --- llvm/docs/LangRef.rst | 29 +++ llvm/include/llvm/IR/FixedMetadataKinds.def | 1 + llvm/include/llvm/IR/Metadata.h | 8 + llvm/lib/Analysis/CaptureTracking.cpp | 6 +- llvm/lib/IR/Metadata.cpp | 35 ++++ llvm/lib/IR/Verifier.cpp | 25 +++ llvm/lib/Transforms/Utils/Local.cpp | 6 + .../Transforms/FunctionAttrs/nocapture.ll | 68 +++++++ .../SimplifyCFG/hoist-with-metadata.ll | 171 ++++++++++++++++++ llvm/test/Verifier/captures-metadata.ll | 37 ++++ 10 files changed, 385 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Verifier/captures-metadata.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 8e863939781a2..22b58bf0f5735 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1489,6 +1489,8 @@ Currently, only the following parameter attributes are defined: function, returning a pointer to allocated storage disjoint from the storage for any other object accessible to the caller. +.. _captures_attr: + ``captures(...)`` This attribute restricts the ways in which the callee may capture the pointer. This is not a valid attribute for return values. This attribute @@ -7543,6 +7545,33 @@ The number of bytes known to be dereferenceable is specified by the integer value in the metadata node. This is analogous to the ''dereferenceable_or_null'' attribute on parameters and return values. +'``captures``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^ + +The ``!captures`` metadata can only be applied to ``store`` instructions with +a pointer-typed value operand. It restricts the capturing behavior of the store +value operand in the same way the ``captures(...)`` attribute would do on a +call. 
See the :ref:`pointer capture section ` for a detailed +discussion of capture semantics. + +The ``!captures`` metadata accepts a non-empty list of strings from the same +set as the :ref:`captures attribute `: +``!"address"``, ``!"address_is_null"``, ``!"provenance"`` and +``!"read_provenance"``. ``!"none"`` is not supported. + +For example ``store ptr %x, ptr %y, !captures !{!"address"}`` indicates that +the copy of pointer ``%x`` stored to location ``%y`` will only be used to +inspect its integral address value, and not dereferenced. Dereferencing the +pointer would result in undefined behavior. + +Similarly ``store ptr %x, ptr %y, !captures !{!"address", !"read_provenance"}`` +indicates that while reads through the stored pointer are allowed, writes would +result in undefined behavior. + +The ``!captures`` attribute makes no statement about other uses of ``%x``, or +uses of the stored-to memory location after it has been overwritten with a +different value. + .. _llvm.loop: '``llvm.loop``' diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def index d09cc15d65ff6..0603abcd6a4da 100644 --- a/llvm/include/llvm/IR/FixedMetadataKinds.def +++ b/llvm/include/llvm/IR/FixedMetadataKinds.def @@ -55,3 +55,4 @@ LLVM_FIXED_MD_KIND(MD_mmra, "mmra", 40) LLVM_FIXED_MD_KIND(MD_noalias_addrspace, "noalias.addrspace", 41) LLVM_FIXED_MD_KIND(MD_callee_type, "callee_type", 42) LLVM_FIXED_MD_KIND(MD_nofree, "nofree", 43) +LLVM_FIXED_MD_KIND(MD_captures, "captures", 44) diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index 990bdc618f240..85a7f8fd373c0 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -41,6 +41,7 @@ namespace llvm { +enum class CaptureComponents : uint8_t; class Module; class ModuleSlotTracker; class raw_ostream; @@ -1480,6 +1481,13 @@ class MDNode : public Metadata { LLVM_ABI static MDNode *getMergedCallsiteMetadata(MDNode *A, MDNode *B); LLVM_ABI static MDNode *getMergedCalleeTypeMetadata(const MDNode *A, const MDNode *B); + + /// Convert !captures metadata to CaptureComponents. MD may be nullptr. + LLVM_ABI static CaptureComponents toCaptureComponents(const MDNode *MD); + /// Convert CaptureComponents to !captures metadata. The return value may be + /// nullptr. + LLVM_ABI static MDNode *fromCaptureComponents(LLVMContext &Ctx, + CaptureComponents CC); }; /// Tuple of metadata. diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index a0fe7f9037e47..22229d9c26b3b 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -320,8 +320,12 @@ UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) { return CaptureComponents::None; case Instruction::Store: // Stored the pointer - conservatively assume it may be captured. + if (U.getOperandNo() == 0) + return MDNode::toCaptureComponents( + I->getMetadata(LLVMContext::MD_captures)); + // Volatile stores make the address observable. 
- if (U.getOperandNo() == 0 || cast(I)->isVolatile()) + if (cast(I)->isVolatile()) return CaptureComponents::All; return CaptureComponents::None; case Instruction::AtomicRMW: { diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 9cfb0ff4d689a..1add0c7930bc9 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -48,6 +48,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/ModRef.h" #include #include #include @@ -1435,6 +1436,40 @@ MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) { return B; } +CaptureComponents MDNode::toCaptureComponents(const MDNode *MD) { + if (!MD) + return CaptureComponents::All; + + CaptureComponents CC = CaptureComponents::None; + for (Metadata *Op : MD->operands()) { + CaptureComponents Component = + StringSwitch(cast(Op)->getString()) + .Case("address", CaptureComponents::Address) + .Case("address_is_null", CaptureComponents::AddressIsNull) + .Case("provenance", CaptureComponents::Provenance) + .Case("read_provenance", CaptureComponents::ReadProvenance); + CC |= Component; + } + return CC; +} + +MDNode *MDNode::fromCaptureComponents(LLVMContext &Ctx, CaptureComponents CC) { + assert(!capturesNothing(CC) && "Can't encode captures(none)"); + if (capturesAll(CC)) + return nullptr; + + SmallVector Components; + if (capturesAddressIsNullOnly(CC)) + Components.push_back(MDString::get(Ctx, "address_is_null")); + else if (capturesAddress(CC)) + Components.push_back(MDString::get(Ctx, "address")); + if (capturesReadProvenanceOnly(CC)) + Components.push_back(MDString::get(Ctx, "read_provenance")); + else if (capturesFullProvenance(CC)) + Components.push_back(MDString::get(Ctx, "provenance")); + return MDNode::get(Ctx, Components); +} + //===----------------------------------------------------------------------===// // NamedMDNode implementation. // diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8c03d6f809d50..6b3cd27b77a7a 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -542,6 +542,7 @@ class Verifier : public InstVisitor, VerifierSupport { void visitAliasScopeMetadata(const MDNode *MD); void visitAliasScopeListMetadata(const MDNode *MD); void visitAccessGroupMetadata(const MDNode *MD); + void visitCapturesMetadata(Instruction &I, const MDNode *Captures); template bool isValidMetadataArray(const MDTuple &N); #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N); @@ -5373,6 +5374,27 @@ void Verifier::visitAccessGroupMetadata(const MDNode *MD) { } } +void Verifier::visitCapturesMetadata(Instruction &I, const MDNode *Captures) { + static const char *ValidArgs[] = {"address_is_null", "address", + "read_provenance", "provenance"}; + + auto *SI = dyn_cast(&I); + Check(SI, "!captures metadata can only be applied to store instructions", &I); + Check(SI->getValueOperand()->getType()->isPointerTy(), + "!captures metadata can only be applied to store with value operand of " + "pointer type", + &I); + Check(Captures->getNumOperands() != 0, "!captures metadata cannot be empty", + &I); + + for (Metadata *Op : Captures->operands()) { + auto *Str = dyn_cast(Op); + Check(Str, "!captures metadata must be a list of strings", &I); + Check(is_contained(ValidArgs, Str->getString()), + "invalid entry in !captures metadata", &I, Str); + } +} + /// verifyInstruction - Verify that an instruction is well formed. 
/// void Verifier::visitInstruction(Instruction &I) { @@ -5600,6 +5622,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *Annotation = I.getMetadata(LLVMContext::MD_annotation)) visitAnnotationMetadata(Annotation); + if (MDNode *Captures = I.getMetadata(LLVMContext::MD_captures)) + visitCapturesMetadata(I, Captures); + if (MDNode *N = I.getDebugLoc().getAsMDNode()) { CheckDI(isa(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N, AreDebugLocsAllowed::Yes); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 123881e276584..21b2652d04120 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3025,6 +3025,12 @@ static void combineMetadata(Instruction *K, const Instruction *J, // Preserve !nosanitize if both K and J have it. K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_captures: + K->setMetadata( + Kind, MDNode::fromCaptureComponents( + K->getContext(), MDNode::toCaptureComponents(JMD) | + MDNode::toCaptureComponents(KMD))); + break; } } // Set !invariant.group from J if J has it. If both instructions have it diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 60a4214548a72..8113ba65fe422 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -1398,5 +1398,73 @@ define void @assume_nonnull(ptr %p) { ret void } +define void @captures_metadata_address_is_null(ptr %x, ptr %y) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; FNATTRS-LABEL: define void @captures_metadata_address_is_null +; FNATTRS-SAME: (ptr captures(address_is_null) [[X:%.*]], ptr writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) #[[ATTR17]] { +; FNATTRS-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META0:![0-9]+]] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; ATTRIBUTOR-LABEL: define void @captures_metadata_address_is_null +; ATTRIBUTOR-SAME: (ptr nofree writeonly [[X:%.*]], ptr nofree nonnull writeonly captures(none) [[Y:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META0:![0-9]+]] +; ATTRIBUTOR-NEXT: ret void +; + store ptr %x, ptr %y, !captures !{!"address_is_null"} + ret void +} + +define void @captures_metadata_address(ptr %x, ptr %y) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; FNATTRS-LABEL: define void @captures_metadata_address +; FNATTRS-SAME: (ptr captures(address) [[X:%.*]], ptr writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) #[[ATTR17]] { +; FNATTRS-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META1:![0-9]+]] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; ATTRIBUTOR-LABEL: define void @captures_metadata_address +; ATTRIBUTOR-SAME: (ptr nofree writeonly [[X:%.*]], ptr nofree nonnull writeonly captures(none) [[Y:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META1:![0-9]+]] +; ATTRIBUTOR-NEXT: ret void +; + store ptr %x, ptr %y, !captures !{!"address"} + ret void +} + +define void @captures_metadata_address_read_provenance(ptr %x, ptr %y) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: 
write) +; FNATTRS-LABEL: define void @captures_metadata_address_read_provenance +; FNATTRS-SAME: (ptr captures(address, read_provenance) [[X:%.*]], ptr writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) #[[ATTR17]] { +; FNATTRS-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META2:![0-9]+]] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; ATTRIBUTOR-LABEL: define void @captures_metadata_address_read_provenance +; ATTRIBUTOR-SAME: (ptr nofree writeonly [[X:%.*]], ptr nofree nonnull writeonly captures(none) [[Y:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META2:![0-9]+]] +; ATTRIBUTOR-NEXT: ret void +; + store ptr %x, ptr %y, !captures !{!"address", !"read_provenance"} + ret void +} + +define void @captures_metadata_provenance(ptr %x, ptr %y) { +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; FNATTRS-LABEL: define void @captures_metadata_provenance +; FNATTRS-SAME: (ptr captures(provenance) [[X:%.*]], ptr writeonly captures(none) initializes((0, 8)) [[Y:%.*]]) #[[ATTR17]] { +; FNATTRS-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META3:![0-9]+]] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) +; ATTRIBUTOR-LABEL: define void @captures_metadata_provenance +; ATTRIBUTOR-SAME: (ptr nofree writeonly [[X:%.*]], ptr nofree nonnull writeonly captures(none) [[Y:%.*]]) #[[ATTR13]] { +; ATTRIBUTOR-NEXT: store ptr [[X]], ptr [[Y]], align 8, !captures [[META3:![0-9]+]] +; ATTRIBUTOR-NEXT: ret void +; + store ptr %x, ptr %y, !captures !{!"provenance"} + ret void +} + declare ptr @llvm.launder.invariant.group.p0(ptr) declare ptr @llvm.strip.invariant.group.p0(ptr) diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll index d34ac2bb30040..85c8ed20210b8 100644 --- a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll +++ b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll @@ -424,6 +424,174 @@ join: ret ptr %phi } +define void @hoist_captures_same(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_same( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8, !captures [[META9:![0-9]+]] +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +else: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +out: + ret void +} + +define void @hoist_captures_different(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_different( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8, !captures [[META10:![0-9]+]] +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +else: + store ptr %x, ptr %y, !captures !{!"read_provenance"} + br label %out + +out: + ret void +} + +define void @hoist_captures_overlap(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_overlap( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8, !captures [[META10]] +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +else: + store ptr %x, ptr %y, !captures !{!"address", !"read_provenance"} + br 
label %out + +out: + ret void +} + +define void @hoist_captures_subsume1(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_subsume1( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8, !captures [[META9]] +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y, !captures !{!"address_is_null"} + br label %out + +else: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +out: + ret void +} + +define void @hoist_captures_subsume2(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_subsume2( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8, !captures [[META11:![0-9]+]] +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y, !captures !{!"provenance"} + br label %out + +else: + store ptr %x, ptr %y, !captures !{!"read_provenance"} + br label %out + +out: + ret void +} + +define void @hoist_captures_full_set(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_full_set( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8 +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +else: + store ptr %x, ptr %y, !captures !{!"provenance"} + br label %out + +out: + ret void +} + +define void @hoist_captures_only_one1(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_only_one1( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8 +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +else: + store ptr %x, ptr %y + br label %out + +out: + ret void +} + +define void @hoist_captures_only_one2(i1 %c, ptr %x, ptr %y) { +; CHECK-LABEL: @hoist_captures_only_one2( +; CHECK-NEXT: if: +; CHECK-NEXT: store ptr [[X:%.*]], ptr [[Y:%.*]], align 8 +; CHECK-NEXT: ret void +; +if: + br i1 %c, label %then, label %else + +then: + store ptr %x, ptr %y + br label %out + +else: + store ptr %x, ptr %y, !captures !{!"address"} + br label %out + +out: + ret void +} + !0 = !{ i8 0, i8 1 } !1 = !{ i8 3, i8 5 } !2 = !{} @@ -445,4 +613,7 @@ join: ; CHECK: [[META6]] = !{float 2.500000e+00} ; CHECK: [[META7]] = !{i32 5, i32 6} ; CHECK: [[META8]] = !{i32 4, i32 5} +; CHECK: [[META9]] = !{!"address"} +; CHECK: [[META10]] = !{!"address", !"read_provenance"} +; CHECK: [[META11]] = !{!"provenance"} ;. 
diff --git a/llvm/test/Verifier/captures-metadata.ll b/llvm/test/Verifier/captures-metadata.ll new file mode 100644 index 0000000000000..ae08ddd036f16 --- /dev/null +++ b/llvm/test/Verifier/captures-metadata.ll @@ -0,0 +1,37 @@ +; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s + +; CHECK: !captures metadata can only be applied to store instructions +define void @wrong_instr_type(ptr %x) { + load ptr, ptr %x, !captures !{!"address"} + ret void +} + +; CHECK: captures metadata can only be applied to store with value operand of pointer type +define void @wrong_op_type(i32 %x, ptr %y) { + store i32 %x, ptr %y, !captures !{!"address"} + ret void +} + +; CHECK: !captures metadata cannot be empty +define void @empty(ptr %x, ptr %y) { + store ptr %x, ptr %y, !captures !{} + ret void +} + +; CHECK: !captures metadata must be a list of strings +define void @not_string(ptr %x, ptr %y) { + store ptr %x, ptr %y, !captures !{!{}} + ret void +} + +; CHECK: invalid entry in !captures metadata +define void @invalid_str(ptr %x, ptr %y) { + store ptr %x, ptr %y, !captures !{!"foo"} + ret void +} + +; CHECK: invalid entry in !captures metadata +define void @invalid_none(ptr %x, ptr %y) { + store ptr %x, ptr %y, !captures !{!"none"} + ret void +} From 1098a5cefd764eb58e8530e821eaa5d5a6c42310 Mon Sep 17 00:00:00 2001 From: quic_hchandel Date: Wed, 1 Oct 2025 12:43:32 +0530 Subject: [PATCH 337/878] [RISCV] Add commutative support for Qualcomm uC Xqcics extension (#161328) This is a follow-up to #160653 doing similar changes for Xqcics. --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 10 ++ llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 4 +- llvm/test/CodeGen/RISCV/xqcics.ll | 124 ++++++++++++++++++++ 3 files changed, 136 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 70b6c7ea35f82..1e6b04f8a4281 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3793,6 +3793,11 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, return false; // Operands 1 and 2 are commutable, if we switch the opcode. 
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: @@ -4018,6 +4023,11 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, OpIdx2); } + case RISCV::QC_SELECTIEQ: + case RISCV::QC_SELECTINE: + case RISCV::QC_SELECTIIEQ: + case RISCV::QC_SELECTIINE: + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); case RISCV::QC_MVEQ: case RISCV::QC_MVNE: case RISCV::QC_MVLT: diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index ff4a0406799b1..540786851e2d5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -524,7 +524,7 @@ class QCIRVInstRI funct1, DAGOperand InTyImm11, let Inst{30-20} = imm11; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTIICC funct3, string opcodestr> : RVInstR4<0b00, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2), @@ -537,7 +537,7 @@ class QCISELECTIICC funct3, string opcodestr> let rs2 = simm1; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCommutable = 1 in class QCISELECTICC funct3, string opcodestr> : RVInstR4<0b01, funct3, OPC_CUSTOM_2, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm2), diff --git a/llvm/test/CodeGen/RISCV/xqcics.ll b/llvm/test/CodeGen/RISCV/xqcics.ll index 5b7ca9e7fedb8..60fc98c5de663 100644 --- a/llvm/test/CodeGen/RISCV/xqcics.ll +++ b/llvm/test/CodeGen/RISCV/xqcics.ll @@ -690,3 +690,127 @@ entry: ret i32 %sel } +define i32 @select_cc_example_eq1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a1, a0, .LBB21_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB21_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eq1: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectieq a0, a1, a2, 11 +; RV32IXQCICS-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_eq1: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.selectieq a0, a1, a2, 11 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_eq1: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.line a2, a1, a0, 11 +; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp eq i32 %b, %a + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ne1(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ne1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bne a1, a0, .LBB22_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a2, 11 +; RV32I-NEXT: .LBB22_2: # %entry +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ne1: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectine a0, a1, a2, 11 +; RV32IXQCICS-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ne1: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.selectine a0, a1, a2, 11 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ne1: +; 
RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.lieq a2, a1, a0, 11 +; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp ne i32 %b, %a + %sel = select i1 %cmp, i32 %x, i32 11 + ret i32 %sel +} + + +define i32 @select_cc_example_eq2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_eq2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: beq a1, a0, .LBB23_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB23_2: +; RV32I-NEXT: li a0, 15 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_eq2: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectiieq a0, a1, 15, 11 +; RV32IXQCICS-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_eq2: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.selectiieq a0, a1, 15, 11 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_eq2: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.selectiieq a0, a1, 15, 11 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp eq i32 %b, %a + %sel = select i1 %cmp, i32 15, i32 11 + ret i32 %sel +} + +define i32 @select_cc_example_ne2(i32 %a, i32 %b, i32 %x, i32 %y) { +; RV32I-LABEL: select_cc_example_ne2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: bne a1, a0, .LBB24_2 +; RV32I-NEXT: # %bb.1: # %entry +; RV32I-NEXT: li a0, 11 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: li a0, 15 +; RV32I-NEXT: ret +; +; RV32IXQCICS-LABEL: select_cc_example_ne2: +; RV32IXQCICS: # %bb.0: # %entry +; RV32IXQCICS-NEXT: qc.selectiine a0, a1, 15, 11 +; RV32IXQCICS-NEXT: ret +; +; RV32IXQCICM-LABEL: select_cc_example_ne2: +; RV32IXQCICM: # %bb.0: # %entry +; RV32IXQCICM-NEXT: qc.selectiine a0, a1, 15, 11 +; RV32IXQCICM-NEXT: ret +; +; RV32IXQCI-LABEL: select_cc_example_ne2: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: qc.selectiine a0, a1, 15, 11 +; RV32IXQCI-NEXT: ret +entry: + %cmp = icmp ne i32 %b, %a + %sel = select i1 %cmp, i32 15, i32 11 + ret i32 %sel +} From 8c5c37582e909232dc5810c4809c2bd1a22fbe1b Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 1 Oct 2025 09:21:17 +0200 Subject: [PATCH 338/878] [flang] add helper to create descriptor with new base address (#161347) There is currently no helper to create a descriptor for a copy of a Fortran entity based on the descriptor of the original entity and the base address of the copy (most places that are doing this currently are also doing allocation of the copy at the same time or using the runtime). Add a helper for this with a unit test. --- .../flang/Optimizer/Builder/FIRBuilder.h | 9 ++++ flang/lib/Optimizer/Builder/FIRBuilder.cpp | 22 +++++++++ .../Optimizer/Builder/FIRBuilderTest.cpp | 47 +++++++++++++++++++ 3 files changed, 78 insertions(+) diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index 4b3087ed45788..d3af3bafbf279 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -959,6 +959,15 @@ mlir::Value genLifetimeStart(mlir::OpBuilder &builder, mlir::Location loc, void genLifetimeEnd(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value mem); +/// Given a fir.box or fir.class \p box describing an entity and a raw address +/// \p newAddr for an entity with the same Fortran properties (rank, dynamic +/// type, length parameters and bounds) and attributes (POINTER or ALLOCATABLE), +/// create a box for \p newAddr with the same type as \p box. 
This assumes \p
+/// newAddr is for contiguous storage (\p box does not have to be contiguous).
+mlir::Value getDescriptorWithNewBaseAddress(fir::FirOpBuilder &builder,
+                                            mlir::Location loc, mlir::Value box,
+                                            mlir::Value newAddr);
+
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index b6501fd530992..5e6e20861fd85 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -1974,3 +1974,25 @@ void fir::factory::genLifetimeEnd(mlir::OpBuilder &builder, mlir::Location loc,
                                   mlir::Value cast) {
   mlir::LLVM::LifetimeEndOp::create(builder, loc, cast);
 }
+
+mlir::Value fir::factory::getDescriptorWithNewBaseAddress(
+    fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value box,
+    mlir::Value newAddr) {
+  auto boxType = llvm::dyn_cast<fir::BaseBoxType>(box.getType());
+  assert(boxType &&
+         "expected a box type input in getDescriptorWithNewBaseAddress");
+  if (boxType.isAssumedRank())
+    TODO(loc, "changing descriptor base address for an assumed rank entity");
+  llvm::SmallVector<mlir::Value> lbounds;
+  fir::factory::genDimInfoFromBox(builder, loc, box, &lbounds,
+                                  /*extents=*/nullptr, /*strides=*/nullptr);
+  fir::BoxValue inputBoxValue(box, lbounds, /*explicitParams=*/{});
+  fir::ExtendedValue openedInput =
+      fir::factory::readBoxValue(builder, loc, inputBoxValue);
+  mlir::Value shape = fir::isArray(openedInput)
+                          ? builder.createShape(loc, openedInput)
+                          : mlir::Value{};
+  mlir::Value typeMold = fir::isPolymorphicType(boxType) ? box : mlir::Value{};
+  return builder.createBox(loc, boxType, newAddr, shape, /*slice=*/{},
+                           fir::getTypeParams(openedInput), typeMold);
+}
diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
index e3e364720af67..fffd4ab5446ca 100644
--- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
+++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp
@@ -644,3 +644,50 @@ TEST_F(FIRBuilderTest, genArithIntegerOverflow) {
   auto op4_ioff = op4_iofi.getOverflowAttr().getValue();
   EXPECT_EQ(op4_ioff, nsw);
 }
+
+TEST_F(FIRBuilderTest, getDescriptorWithNewBaseAddress) {
+  auto builder = getBuilder();
+  auto loc = builder.getUnknownLoc();
+
+  // Build an input fir.box for a 1-D array of i64 with constant extent 10.
+  auto i64Ty = builder.getI64Type();
+  auto seqTy = fir::SequenceType::get({10}, i64Ty);
+  auto refArrTy = fir::ReferenceType::get(seqTy);
+  auto ptrTy = fir::PointerType::get(seqTy);
+  auto boxTy = fir::BoxType::get(ptrTy);
+  // Create an undef box descriptor value (descriptor contents are unspecified).
+  mlir::Value inputBox = fir::UndefOp::create(builder, loc, boxTy);
+
+  // New base address (same element type and properties).
+  mlir::Value addr2 = fir::UndefOp::create(builder, loc, refArrTy);
+
+  mlir::Value newBox = fir::factory::getDescriptorWithNewBaseAddress(
+      builder, loc, inputBox, addr2);
+
+  // The returned descriptor must have the same type as the input box.
+  EXPECT_EQ(newBox.getType(), inputBox.getType());
+
+  // It must be constructed by an embox using the new base address.
+  ASSERT_TRUE(llvm::isa_and_nonnull<fir::EmboxOp>(newBox.getDefiningOp()));
+  auto embox = llvm::dyn_cast<fir::EmboxOp>(newBox.getDefiningOp());
+  EXPECT_EQ(embox.getMemref(), addr2);
+
+  // The shape should be derived from the input box; expect a fir.shape with one
+  // extent that comes from a fir.box_dims reading from the original input box.
+  mlir::Value shape = embox.getShape();
+  ASSERT_TRUE(shape);
+  ASSERT_TRUE(llvm::isa_and_nonnull<fir::ShapeShiftOp>(shape.getDefiningOp()));
+  auto shapeOp = llvm::dyn_cast<fir::ShapeShiftOp>(shape.getDefiningOp());
+  ASSERT_EQ(shapeOp.getExtents().size(), 1u);
+  mlir::Value extent0 = shapeOp.getExtents()[0];
+  ASSERT_TRUE(llvm::isa_and_nonnull<fir::BoxDimsOp>(extent0.getDefiningOp()));
+  auto dimOp = llvm::dyn_cast<fir::BoxDimsOp>(extent0.getDefiningOp());
+  EXPECT_EQ(dimOp.getVal(), inputBox);
+
+  // Also verify the origin comes from a BoxDims on the same input box.
+  ASSERT_EQ(shapeOp.getOrigins().size(), 1u);
+  mlir::Value origin0 = shapeOp.getOrigins()[0];
+  ASSERT_TRUE(llvm::isa_and_nonnull<fir::BoxDimsOp>(origin0.getDefiningOp()));
+  auto lbOp = llvm::dyn_cast<fir::BoxDimsOp>(origin0.getDefiningOp());
+  EXPECT_EQ(lbOp.getVal(), inputBox);
+}

From 332b4deb0dfe9f4d11325513d4122e69024beea9 Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Wed, 1 Oct 2025 08:37:15 +0100
Subject: [PATCH 339/878] [lldb][IRExecutionUnit] Return error on failure to resolve function address (#161363)

Starting with https://github.com/llvm/llvm-project/pull/148877 we
started encoding the module ID of the function DIE we are currently
parsing into its `AsmLabel` in the AST. When the JIT asks LLDB to
resolve our special mangled name, we would locate the module and
resolve the function/symbol we found in it.

If we are debugging with a `SymbolFileDWARFDebugMap`, the module ID we
encode is that of the `.o` file that is tracked by the debug-map. To
resolve the address of the DIE in that `.o` file, we have to ask
`SymbolFileDWARFDebugMap::LinkOSOAddress` to turn the address of the
`.o` DIE into a real address in the linked executable. This will only
work if the `.o` address was actually tracked by the debug-map.
However, if the function definition appears in multiple `.o` files
(which is the case for functions defined in headers), the linker will
most likely de-duplicate that definition. So most `.o` files' definition
DIEs for that function won't have a contribution in the debug-map, and
thus we fail to resolve the address. When debugging Clang on Darwin,
e.g., you'd see:

```
(lldb) expr CXXDecl->getName()
error: Couldn't look up symbols:
  $__lldb_func::0x1:0x4000d000002359da:_ZNK5clang9NamedDecl7getNameEv
Hint: The expression tried to call a function that is not present in the
target, perhaps because it was optimized out by the compiler.
```

unless you were stopped in the `.o` file whose definition of `getName`
made it into the final executable.

The fix here is to error out if we fail to resolve the address, causing
us to fall back on the old flow, which did a lookup by mangled name that
the `SymbolFileDWARFDebugMap` handles correctly.

An alternative fix would be to encode the `SymbolFileDWARFDebugMap`'s
module-id and implement `SymbolFileDWARFDebugMap::ResolveFunctionCallLabel`
by doing a mangled name lookup. The proposed approach doesn't stop us
from implementing that, so we could choose to do it in a follow-up.
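To make the failure mode concrete: the new test reduces it to a method
defined in a header that two translation units include, so the linker
keeps only one object file's copy of the definition (a sketch of the
added test files; see the diff below):

```cpp
// common.h -- the definition lives in a header, so every TU that
// includes it emits a copy of Foo::foo() into its .o's debug-info.
struct Foo {
  int foo() { return 15; }
};

// lib1.cpp (lib2.cpp is analogous) -- forces a definition of
// Foo::foo() into this TU; after linking, the debug-map tracks only
// one of the two copies.
void lib1_func(Foo *) { Foo{}.foo(); }
```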
rdar://161393045
---
 lldb/source/Expression/IRExecutionUnit.cpp    |  7 ++++-
 .../function-call-from-object-file/Makefile   |  3 ++
 .../TestFunctionCallFromObjectFile.py         | 29 +++++++++++++++++++
 .../function-call-from-object-file/common.h   |  8 +++++
 .../function-call-from-object-file/lib1.cpp   |  8 +++++
 .../function-call-from-object-file/lib2.cpp   |  6 ++++
 .../function-call-from-object-file/main.cpp   | 10 +++++++
 7 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 lldb/test/API/lang/cpp/function-call-from-object-file/Makefile
 create mode 100644 lldb/test/API/lang/cpp/function-call-from-object-file/TestFunctionCallFromObjectFile.py
 create mode 100644 lldb/test/API/lang/cpp/function-call-from-object-file/common.h
 create mode 100644 lldb/test/API/lang/cpp/function-call-from-object-file/lib1.cpp
 create mode 100644 lldb/test/API/lang/cpp/function-call-from-object-file/lib2.cpp
 create mode 100644 lldb/test/API/lang/cpp/function-call-from-object-file/main.cpp

diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp
index 25d4a87b89ef2..60b9de0d21b2e 100644
--- a/lldb/source/Expression/IRExecutionUnit.cpp
+++ b/lldb/source/Expression/IRExecutionUnit.cpp
@@ -751,7 +751,12 @@ ResolveFunctionCallLabel(FunctionCallLabel &label,
   sc_list.Append(*sc_or_err);
 
   LoadAddressResolver resolver(*sc.target_sp, symbol_was_missing_weak);
-  return resolver.Resolve(sc_list).value_or(LLDB_INVALID_ADDRESS);
+  lldb::addr_t resolved_addr =
+      resolver.Resolve(sc_list).value_or(LLDB_INVALID_ADDRESS);
+  if (resolved_addr == LLDB_INVALID_ADDRESS)
+    return llvm::createStringError("couldn't resolve address for function");
+
+  return resolved_addr;
 }
 
 lldb::addr_t
diff --git a/lldb/test/API/lang/cpp/function-call-from-object-file/Makefile b/lldb/test/API/lang/cpp/function-call-from-object-file/Makefile
new file mode 100644
index 0000000000000..285bbfbbca4fe
--- /dev/null
+++ b/lldb/test/API/lang/cpp/function-call-from-object-file/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp lib1.cpp lib2.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/lang/cpp/function-call-from-object-file/TestFunctionCallFromObjectFile.py b/lldb/test/API/lang/cpp/function-call-from-object-file/TestFunctionCallFromObjectFile.py
new file mode 100644
index 0000000000000..f0a7aef182a67
--- /dev/null
+++ b/lldb/test/API/lang/cpp/function-call-from-object-file/TestFunctionCallFromObjectFile.py
@@ -0,0 +1,29 @@
+"""
+Tests that we can call functions that have definitions in multiple
+CUs in the debug-info (which is the case for functions defined in headers).
+The linker will most likely de-duplicate the function definitions when linking
+the final executable. On Darwin, this will create a debug-map that LLDB will use
+to fix up object file addresses to addresses in the linked executable. However,
+if we parsed the DIE from the object file whose function definition got stripped
+by the linker, LLDB needs to ensure it can still resolve the function symbol it
+got for it.
+""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestFunctionCallFromObjectFile(TestBase): + def test_lib1(self): + self.build() + lldbutil.run_to_name_breakpoint(self, "lib1_func") + + self.expect_expr("Foo{}.foo()", result_type="int", result_value="15") + + def test_lib2(self): + self.build() + lldbutil.run_to_name_breakpoint(self, "lib2_func") + + self.expect_expr("Foo{}.foo()", result_type="int", result_value="15") diff --git a/lldb/test/API/lang/cpp/function-call-from-object-file/common.h b/lldb/test/API/lang/cpp/function-call-from-object-file/common.h new file mode 100644 index 0000000000000..76e23be6b97a6 --- /dev/null +++ b/lldb/test/API/lang/cpp/function-call-from-object-file/common.h @@ -0,0 +1,8 @@ +#ifndef COMMON_H_IN +#define COMMON_H_IN + +struct Foo { + int foo() { return 15; } +}; + +#endif // COMMON_H_IN diff --git a/lldb/test/API/lang/cpp/function-call-from-object-file/lib1.cpp b/lldb/test/API/lang/cpp/function-call-from-object-file/lib1.cpp new file mode 100644 index 0000000000000..b97bcc1b712b6 --- /dev/null +++ b/lldb/test/API/lang/cpp/function-call-from-object-file/lib1.cpp @@ -0,0 +1,8 @@ +#include "common.h" + +// Parameter "Foo*" forces LLDB to parse "Foo" from the object +// file that it is stopped in. +void lib1_func(Foo *) { + // Force definition into lib1.o debug-info. + Foo{}.foo(); +} diff --git a/lldb/test/API/lang/cpp/function-call-from-object-file/lib2.cpp b/lldb/test/API/lang/cpp/function-call-from-object-file/lib2.cpp new file mode 100644 index 0000000000000..2f9d81a8bdf4c --- /dev/null +++ b/lldb/test/API/lang/cpp/function-call-from-object-file/lib2.cpp @@ -0,0 +1,6 @@ +#include "common.h" + +void lib2_func(Foo *) { + // Force definition into lib2.o debug-info. 
+ Foo{}.foo(); +} diff --git a/lldb/test/API/lang/cpp/function-call-from-object-file/main.cpp b/lldb/test/API/lang/cpp/function-call-from-object-file/main.cpp new file mode 100644 index 0000000000000..61ca798daf1df --- /dev/null +++ b/lldb/test/API/lang/cpp/function-call-from-object-file/main.cpp @@ -0,0 +1,10 @@ +struct Foo; + +extern void lib1_func(Foo *); +extern void lib2_func(Foo *); + +int main() { + lib1_func(nullptr); + lib2_func(nullptr); + return 0; +} From 3c0f7b184d265281dfcd4fab73348bc0e72c9902 Mon Sep 17 00:00:00 2001 From: Hendrik_Klug <43926224+Jimmy2027@users.noreply.github.com> Date: Wed, 1 Oct 2025 09:40:47 +0200 Subject: [PATCH 340/878] [mlir][transform] Add PromoteTensorOp (#158318) Transform op to request a tensor value to live in a specific memory space after bufferization Co-authored-by: Nicolas Vasilache Co-authored-by: Alex Zinenko --- .../Linalg/TransformOps/LinalgTransformOps.td | 49 +++++++- .../TransformOps/LinalgTransformOps.cpp | 116 ++++++++++++++---- .../mlir/dialects/transform/structured.py | 6 - .../Transform/test-promote-tensors.mlir | 104 ++++++++++++++++ 4 files changed, 239 insertions(+), 36 deletions(-) create mode 100644 mlir/test/Dialect/Transform/test-promote-tensors.mlir diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index 8f3232f01544f..0d6ebc087e2f3 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -17,6 +17,7 @@ include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td" include "mlir/Dialect/Transform/IR/TransformTypes.td" include "mlir/Dialect/SCF/IR/DeviceMappingInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/IR/OpBase.td" include "mlir/IR/RegionKindInterface.td" @@ -236,11 +237,51 @@ def BufferizeToAllocationOp : Op, - OpBuilder<(ins "Value":$target, "int64_t":$memorySpace)> - ]; +//===----------------------------------------------------------------------===// +// PromoteTensorOp +//===----------------------------------------------------------------------===// + +def PromoteTensorOp : Op, + DeclareOpInterfaceMethods, + SameOperandsAndResultType]> { + let summary = "Request a tensor value to live in a specific memory space " + "after bufferization"; + let description = [{ + Requests that a tensor value lives in a specific memory space for its + lifetime. This is achieved by allocating a new tensor in the desired + memory space with `bufferization.alloc_tensor` and optionally materializing + the source value into that allocation with + `bufferization.materialize_in_destination`. All uses of the original value + are then redirected to the promoted value. + + The generated code for promoting tensor value %0 resembles the following: + + %1 = bufferization.alloc_tensor() + { memory_space = memory_space } + // Note: the materialization is omitted if %0 is never read and is only + // written into (i.e., it behaves as a result tensor). + %2 = bufferization.materialize_in_destination %0 in %1 + // ... + + + Deallocation is not handled by this transform. + + Return modes: + - Produces a silenceable failure if the given handle does not point to + tensor-typed values. + - Succeeds otherwise and returns a handle to the promoted value(s), i.e., + the result of materialization if present and the allocation otherwise. 
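+
+    For example, promoting the first input of a matmul to memory space 1
+    (a sketch mirroring the accompanying tests; %mm is assumed to be a
+    handle to a linalg.matmul obtained with transform.structured.match):
+
+      %op0 = transform.get_operand %mm[0]
+          : (!transform.any_op) -> !transform.any_value
+      transform.structured.promote_tensor to 1 %op0 : !transform.any_value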
+ }]; + + let arguments = (ins TransformValueHandleTypeInterface:$tensor, + OptionalAttr:$memory_space); + let results = (outs TransformValueHandleTypeInterface:$promoted); + + let assemblyFormat = + "(`to` $memory_space^)? $tensor attr-dict `:` type($tensor)"; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 3f0b0bacd9756..dd9b4c2490ef4 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -42,6 +42,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/DebugLog.h" #include "llvm/Support/LogicalResult.h" @@ -273,32 +274,6 @@ void transform::ApplyFoldPackUnpackIntoEmptyPatternsOp::populatePatterns( // BufferizeToAllocationOp //===----------------------------------------------------------------------===// -void transform::BufferizeToAllocationOp::build(OpBuilder &b, - OperationState &result, - Value target, - Attribute memorySpace) { - SmallVector resultTypes; - resultTypes.push_back(b.getType()); - resultTypes.push_back(b.getType()); - return build(b, result, - /*resultTypes=*/resultTypes, - /*target=*/target, - /*memory_space=*/memorySpace); -} - -void transform::BufferizeToAllocationOp::build(OpBuilder &b, - OperationState &result, - Value target, - int64_t memorySpace) { - SmallVector resultTypes; - resultTypes.push_back(b.getType()); - resultTypes.push_back(b.getType()); - return build(b, result, - /*resultTypes=*/resultTypes, - /*target=*/target, - /*memory_space=*/b.getI64IntegerAttr(memorySpace)); -} - namespace { class NewOpsListener : public RewriterBase::ForwardingListener { public: @@ -408,6 +383,95 @@ LogicalResult transform::BufferizeToAllocationOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// PromoteTensorOp +//===----------------------------------------------------------------------===// + +/// Return true if the operand may be read from by its owner. This is currently +/// very conservative and only looks inside linalg operations to prevent +/// unintentional data loss. +static bool mayBeRead(OpOperand &operand) { + auto linalgOp = dyn_cast(operand.getOwner()); + + // Be conservative about ops we cannot analyze deeper. + if (!linalgOp) + return true; + + // Look inside linalg ops. + Value blockArgument = linalgOp.getMatchingBlockArgument(&operand); + return !blockArgument.use_empty(); +} + +/// Return true if the value may be read through any of its uses. +static bool mayBeRead(Value value) { + // If the value has a reference semantics, it + // may be read through any alias... 
+  if (!isa<TensorType>(value.getType()))
+    return true;
+  return llvm::any_of(value.getUses(),
+                      static_cast<bool (*)(OpOperand &)>(mayBeRead));
+}
+
+DiagnosedSilenceableFailure
+transform::PromoteTensorOp::apply(transform::TransformRewriter &rewriter,
+                                  transform::TransformResults &results,
+                                  transform::TransformState &state) {
+  SmallVector<Value> promoted;
+  for (Value tensor : state.getPayloadValues(getTensor())) {
+    auto type = dyn_cast<RankedTensorType>(tensor.getType());
+    if (!type) {
+      return emitSilenceableError() << "non-tensor type: " << tensor;
+    }
+
+    Operation *definingOp = tensor.getDefiningOp();
+    if (definingOp)
+      rewriter.setInsertionPointAfter(definingOp);
+    else
+      rewriter.setInsertionPointToStart(cast<BlockArgument>(tensor).getOwner());
+
+    // Check this before we emit operations using this value.
+    bool needsMaterialization = mayBeRead(tensor);
+
+    SmallVector<Value> dynamicDims;
+    llvm::SmallPtrSet<Operation *, 4> preservedOps;
+    for (auto [pos, dim] : llvm::enumerate(type.getShape())) {
+      if (!ShapedType::isDynamic(dim))
+        continue;
+      Value cst = rewriter.create<arith::ConstantIndexOp>(tensor.getLoc(), pos);
+      auto dimOp = rewriter.create<tensor::DimOp>(tensor.getLoc(), tensor, cst);
+      preservedOps.insert(dimOp);
+      dynamicDims.push_back(dimOp);
+    }
+    auto allocation = rewriter.create<bufferization::AllocTensorOp>(
+        tensor.getLoc(), type, dynamicDims);
+    // Set memory space if provided.
+    if (getMemorySpaceAttr())
+      allocation.setMemorySpaceAttr(getMemorySpaceAttr());
+    Value allocated = allocation;
+
+    // Only insert a materialization (typically bufferizes to a copy) when the
+    // value may be read from.
+    if (needsMaterialization) {
+      auto copy = rewriter.create<bufferization::MaterializeInDestinationOp>(
+          tensor.getLoc(), tensor, allocated);
+      preservedOps.insert(copy);
+      promoted.push_back(copy.getResult());
+    } else {
+      promoted.push_back(allocated);
+    }
+    rewriter.replaceAllUsesExcept(tensor, promoted.back(), preservedOps);
+  }
+  results.setValues(cast<OpResult>(getPromoted()), promoted);
+  return DiagnosedSilenceableFailure::success();
+}
+
+void transform::PromoteTensorOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  transform::onlyReadsHandle(getTensorMutable(), effects);
+  transform::producesHandle(getOperation()->getOpResults(), effects);
+  transform::modifiesPayload(effects);
+}
+
 //===----------------------------------------------------------------------===//
 // DecomposeOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py
index bf40cc532065d..e3bacb5777d9f 100644
--- a/mlir/python/mlir/dialects/transform/structured.py
+++ b/mlir/python/mlir/dialects/transform/structured.py
@@ -44,18 +44,12 @@ def __init__(
         loc=None,
         ip=None,
     ):
-        # No other types are allowed, so hard-code those here.
- allocated_buffer_type = transform.AnyValueType.get() - new_ops_type = transform.AnyOpType.get() - if isinstance(memory_space, int): memory_space = str(memory_space) if isinstance(memory_space, str): memory_space = Attribute.parse(memory_space) super().__init__( - allocated_buffer_type, - new_ops_type, target, memory_space=memory_space, memcpy_op=memcpy_op, diff --git a/mlir/test/Dialect/Transform/test-promote-tensors.mlir b/mlir/test/Dialect/Transform/test-promote-tensors.mlir new file mode 100644 index 0000000000000..bc9a05af64156 --- /dev/null +++ b/mlir/test/Dialect/Transform/test-promote-tensors.mlir @@ -0,0 +1,104 @@ +// RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s + +// CHECK-LABEL: @promote_in0 +// CHECK-SAME: (%[[ARG0:.+]]: tensor, %{{.*}}, %{{.*}}) +// CHECK: %[[C0:.+]] = arith.constant 0 +// CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] +// CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor(%[[DIM]]) {memory_space = 1 : i64} +// CHECK: %[[MAT:.+]] = bufferization.materialize_in_destination %[[ARG0]] in %[[ALLOC]] +// CHECK: linalg.matmul ins(%[[MAT]], %{{.*}} +func.func @promote_in0(%arg0: tensor, %arg1: tensor<42x?xf32>, %arg2: tensor) -> tensor { + %0 = linalg.matmul ins(%arg0, %arg1: tensor, tensor<42x?xf32>) + outs(%arg2: tensor) -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %mm = transform.structured.match ops{["linalg.matmul"]} in %root + : (!transform.any_op) -> !transform.any_op + %op0 = transform.get_operand %mm[0] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %op0 : !transform.any_value + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @promote_out +// CHECK-SAME: (%{{.*}}: tensor, %{{.*}}: tensor, %[[ARG2:.+]]: tensor) +func.func @promote_out(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: %[[C0:.+]] = arith.constant 0 + // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG2]], %[[C0]] + // CHECK: %[[C1:.+]] = arith.constant 1 + // CHECK: %[[DIM1:.+]] = tensor.dim %[[ARG2]], %[[C1]] + // CHECK: %[[ALLOC:.+]] = bufferization.alloc_tensor(%[[DIM0]], %[[DIM1]]) {memory_space = 1 : i64} + // CHECK-NOT: materialize_in_destination + // CHECK: linalg.add {{.*}} outs(%[[ALLOC]] + %0 = linalg.add ins(%arg0, %arg1 : tensor, tensor) + outs(%arg2 : tensor) -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %la = transform.structured.match ops{["linalg.add"]} in %root + : (!transform.any_op) -> !transform.any_op + %init = transform.get_operand %la[2] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %init : !transform.any_value + + transform.yield + } +} + +// ----- + +// CHECK-LABEL: @promote_in0_out_bufferize +// CHECK-SAME: (%[[ARG0:.+]]: tensor, %{{.*}}: tensor<42x?xf32>, %[[ARG2:.+]]: tensor) +func.func @promote_in0_out_bufferize(%arg0: tensor, %arg1: tensor<42x?xf32>, %arg2: tensor) -> tensor { + // CHECK: %[[IN1:.+]] = bufferization.to_buffer %arg1 : tensor<42x?xf32> to memref<42x?xf32, strided<[?, ?], offset: ?>> + // CHECK: %[[IN0:.+]] = bufferization.to_buffer %arg0 : tensor to memref> + // CHECK: %{{.+}} = bufferization.to_buffer %arg0 : tensor to memref> + // CHECK: %{{.+}} = bufferization.to_buffer %arg2 : tensor to memref> + // CHECK: %{{.+}} = bufferization.to_buffer %arg2 : tensor to memref> + // CHECK: 
%[[C0:.+]] = arith.constant 0 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %[[C0]] : memref> + // CHECK: %[[C1:.+]] = arith.constant 1 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %[[C1]] : memref> + // CHECK: %[[ALLOC_OUT:.+]] = memref.alloc(%{{.+}}, %{{.+}}) {alignment = 64 : i64} : memref + // CHECK: %{{.+}} = arith.constant 0 : index + // CHECK: %{{.+}} = memref.dim %{{.+}}, %{{.+}} : memref> + // CHECK: %[[ALLOC_IN:.+]] = memref.alloc(%{{.+}}) {alignment = 64 : i64} : memref + // CHECK: memref.copy %[[IN0]], %[[ALLOC_IN]] : memref> to memref + // CHECK: linalg.add ins(%[[ALLOC_IN]], %[[IN1]] : memref, memref<42x?xf32, strided<[?, ?], offset: ?>>) outs(%[[ALLOC_OUT]] : memref) + %0 = linalg.add ins(%arg0, %arg1: tensor, tensor<42x?xf32>) + outs(%arg2: tensor) -> tensor + return %0 : tensor +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + %la = transform.structured.match ops{["linalg.add"]} in %root + : (!transform.any_op) -> !transform.any_op + %op0 = transform.get_operand %la[0] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %op0 : !transform.any_value + + %init = transform.get_operand %la[2] + : (!transform.any_op) -> !transform.any_value + transform.structured.promote_tensor to 1 %init : !transform.any_value + + %func = transform.structured.match ops{["func.func"]} in %root + : (!transform.any_op) -> !transform.any_op + + %bufferized = transform.bufferization.one_shot_bufferize %func + : (!transform.any_op) -> !transform.any_op + + transform.yield + } +} + + + From f2f0963f303be3b4b274a24aca70b51f10ea5112 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 1 Oct 2025 09:33:51 +0200 Subject: [PATCH 341/878] [MemorySanitizer] Generate check lines for some vararg tests (NFC) Use UTC_ARGS: --disable to skip the tests with many arguments. --- .../LoongArch/vararg-loongarch64.ll | 79 ++++++-- .../MemorySanitizer/Mips/vararg-mips64.ll | 87 +++++++-- .../MemorySanitizer/Mips/vararg-mips64el.ll | 86 +++++++-- .../MemorySanitizer/PowerPC/vararg-ppc64.ll | 174 +++++++++++++---- .../MemorySanitizer/PowerPC/vararg-ppc64le.ll | 175 ++++++++++++++---- 5 files changed, 484 insertions(+), 117 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll b/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll index e6d3a4b2994ad..4d4fc1bdd7bde 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" @@ -9,12 +10,36 @@ declare void @llvm.va_start(ptr) #2 declare void @llvm.va_end(ptr) #2 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 define i32 @foo(i32 %guard, ...) { -; CHECK-LABEL: @foo -; CHECK: [[TMP1:%.*]] = load {{.*}} @__msan_va_arg_overflow_size_tls -; CHECK: [[TMP3:%.*]] = alloca {{.*}} [[TMP1]] -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 [[TMP1]], i1 false) -; CHECK: [[TMP4:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP4]], i1 false) +; CHECK-LABEL: define i32 @foo( +; CHECK-SAME: i32 [[GUARD:%.*]], ...) 
{ +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 0, i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP3]], i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[VL:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP15]], ptr align 8 [[TMP2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VL]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VL]]) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 0 ; %vl = alloca ptr, align 8 call void @llvm.lifetime.start.p0(ptr %vl) @@ -27,11 +52,22 @@ define i32 @foo(i32 %guard, ...) { ;; Save the incoming shadow value from the arguments in the __msan_va_arg_tls ;; array. 
define i32 @bar() { -; CHECK-LABEL: @bar -; CHECK: store i32 0, ptr @__msan_va_arg_tls, align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls +; CHECK-LABEL: define i32 @bar() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] ; %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 @@ -40,15 +76,28 @@ define i32 @bar() { ;; Check multiple fixed arguments. declare i32 @foo2(i32 %g1, i32 %g2, ...) define i32 @bar2() { -; CHECK-LABEL: @bar2 -; CHECK: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls +; CHECK-LABEL: define i32 @bar2() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] ; %1 = call i32 (i32, i32, ...) 
@foo2(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 } +; UTC_ARGS: --disable + ;; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ;; passed to a variadic function. declare i64 @sum(i64 %n, ...) diff --git a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll index 69a74a37a1f04..9f3f10e51b272 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll @@ -1,9 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck %s target datalayout = "E-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128" target triple = "mips64--linux" define i32 @foo(i32 %guard, ...) { +; CHECK-LABEL: define i32 @foo( +; CHECK-SAME: i32 [[GUARD:%.*]], ...) { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 0, i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP3]], i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[VL:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 549755813888 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 549755813888 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 549755813888 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP15]], ptr align 8 [[TMP2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VL]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VL]]) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 0 +; %vl = alloca ptr, align 8 call void @llvm.lifetime.start.p0(ptr %vl) call void @llvm.va_start(ptr %vl) @@ -12,23 +44,29 @@ define i32 @foo(i32 %guard, ...) { ret i32 0 } -; First, check allocation of the save area. 
- -; CHECK-LABEL: @foo -; CHECK: [[A:%.*]] = load {{.*}} @__msan_va_arg_overflow_size_tls -; CHECK: [[C:%.*]] = alloca {{.*}} [[A]] - -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 [[C]], i8 0, i64 [[A]], i1 false) - -; CHECK: [[D:%.*]] = call i64 @llvm.umin.i64(i64 [[A]], i64 800) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[C]], ptr align 8 @__msan_va_arg_tls, i64 [[D]], i1 false) - declare void @llvm.lifetime.start.p0(ptr nocapture) #1 declare void @llvm.va_start(ptr) #2 declare void @llvm.va_end(ptr) #2 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 define i32 @bar() { +; CHECK-LABEL: define i32 @bar() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 } @@ -36,23 +74,32 @@ define i32 @bar() { ; Save the incoming shadow value from the arguments in the __msan_va_arg_tls ; array. The first argument is stored at position 4, since it's right ; justified. -; CHECK-LABEL: @bar -; CHECK: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls ; Check multiple fixed arguments. declare i32 @foo2(i32 %g1, i32 %g2, ...) 
define i32 @bar2() { +; CHECK-LABEL: define i32 @bar2() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 } -; CHECK-LABEL: @bar2 -; CHECK: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls + +; UTC_ARGS: --disable ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. diff --git a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll index b19da8e9ff14b..41fb975dcf285 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll @@ -1,9 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:m-i8:8:32-i16:16:32-i64:64-n32:64-S128" target triple = "mips64el--linux" define i32 @foo(i32 %guard, ...) { +; CHECK-LABEL: define i32 @foo( +; CHECK-SAME: i32 [[GUARD:%.*]], ...) 
{ +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 0, i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP3]], i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[VL:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 549755813888 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 549755813888 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 549755813888 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP15]], ptr align 8 [[TMP2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VL]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VL]]) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 0 +; %vl = alloca ptr, align 8 call void @llvm.lifetime.start.p0(ptr %vl) call void @llvm.va_start(ptr %vl) @@ -12,46 +44,60 @@ define i32 @foo(i32 %guard, ...) { ret i32 0 } -; First, check allocation of the save area. 
- -; CHECK-LABEL: @foo -; CHECK: [[A:%.*]] = load {{.*}} @__msan_va_arg_overflow_size_tls -; CHECK: [[C:%.*]] = alloca {{.*}} [[A]] - -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 [[C]], i8 0, i64 [[A]], i1 false) - -; CHECK: [[D:%.*]] = call i64 @llvm.umin.i64(i64 [[A]], i64 800) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[C]], ptr align 8 @__msan_va_arg_tls, i64 [[D]], i1 false) - declare void @llvm.lifetime.start.p0(ptr nocapture) #1 declare void @llvm.va_start(ptr) #2 declare void @llvm.va_end(ptr) #2 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 define i32 @bar() { +; CHECK-LABEL: define i32 @bar() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 } ; Save the incoming shadow value from the arguments in the __msan_va_arg_tls ; array. -; CHECK-LABEL: @bar -; CHECK: store i32 0, ptr @__msan_va_arg_tls, align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls ; Check multiple fixed arguments. declare i32 @foo2(i32 %g1, i32 %g2, ...) define i32 @bar2() { +; CHECK-LABEL: define i32 @bar2() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, i32, ...) 
@foo2(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 } -; CHECK-LABEL: @bar2 -; CHECK: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls +; UTC_ARGS: --disable ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll index 9351067969050..19b07e16fb46f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll @@ -1,9 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64--linux" define i32 @foo(i32 %guard, ...) { +; CHECK-LABEL: define i32 @foo( +; CHECK-SAME: i32 [[GUARD:%.*]], ...) { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 0, i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP3]], i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[VL:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -246290604621825 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 17592186044416 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 8796093022208 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -246290604621825 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 17592186044416 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 8796093022208 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP13]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], -246290604621825 +; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 17592186044416 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 8796093022208 +; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP21]], ptr align 8 [[TMP2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VL]]) +; CHECK-NEXT: call 
void @llvm.lifetime.end.p0(ptr [[VL]]) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 0 +; %vl = alloca ptr, align 8 call void @llvm.lifetime.start.p0(ptr %vl) call void @llvm.va_start(ptr %vl) @@ -12,23 +50,29 @@ define i32 @foo(i32 %guard, ...) { ret i32 0 } -; First, check allocation of the save area. - -; CHECK-LABEL: @foo -; CHECK: [[A:%.*]] = load {{.*}} @__msan_va_arg_overflow_size_tls -; CHECK: [[C:%.*]] = alloca {{.*}} [[A]] - -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 [[C]], i8 0, i64 [[A]], i1 false) - -; CHECK: [[D:%.*]] = call i64 @llvm.umin.i64(i64 [[A]], i64 800) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[C]], ptr align 8 @__msan_va_arg_tls, i64 [[D]], i1 false) - declare void @llvm.lifetime.start.p0(ptr nocapture) #1 declare void @llvm.va_start(ptr) #2 declare void @llvm.va_end(ptr) #2 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 define i32 @bar() { +; CHECK-LABEL: define i32 @bar() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 } @@ -36,14 +80,22 @@ define i32 @bar() { ; Save the incoming shadow value from the arguments in the __msan_va_arg_tls ; array. The first argument is stored at position 4, since it's right ; justified. -; CHECK-LABEL: @bar -; CHECK: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls ; Check vector argument. 
define i32 @bar2() { +; CHECK-LABEL: define i32 @bar2() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, <2 x i64> ) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, <2 x i64> ) ret i32 %1 } @@ -51,50 +103,110 @@ define i32 @bar2() { ; The vector is at offset 16 of parameter save area, but __msan_va_arg_tls ; corresponds to offset 8+ of parameter save area - so the offset from ; __msan_va_arg_tls is actually misaligned. -; CHECK-LABEL: @bar2 -; CHECK: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls ; Check i64 array. define i32 @bar4() { +; CHECK-LABEL: define i32 @bar4() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i64] [i64 1, i64 2]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, [2 x i64] [i64 1, i64 2]) ret i32 %1 } -; CHECK-LABEL: @bar4 -; CHECK: store [2 x i64] zeroinitializer, ptr @__msan_va_arg_tls, align 8 -; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls ; Check i128 array. define i32 @bar5() { +; CHECK-LABEL: define i32 @bar5() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i128] [i128 1, i128 2]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) 
@foo(i32 0, [2 x i128] [i128 1, i128 2]) ret i32 %1 } -; CHECK-LABEL: @bar5 -; CHECK: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store {{.*}} 40, {{.*}} @__msan_va_arg_overflow_size_tls ; Check 8-aligned byval. define i32 @bar6(ptr %arg) { +; CHECK-LABEL: define i32 @bar6( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -246290604621825 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 16, i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8796093022208 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_va_arg_tls, ptr align 8 [[TMP11]], i64 16, i1 false) +; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([2 x i64]) align 8 [[ARG]]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP12]] +; %1 = call i32 (i32, ...) @foo(i32 0, ptr byval([2 x i64]) align 8 %arg) ret i32 %1 } -; CHECK-LABEL: @bar6 -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_va_arg_tls, ptr align 8 {{.*}}, i64 16, i1 false) -; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls ; Check 16-aligned byval. 
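; Before the 16-aligned case, a minimal sketch of the 8-aligned byval handling
; checked above, assuming the shadow-mapping constants from those checks (the
; function name is illustrative): the application pointer is translated to its
; shadow address, the pointee's shadow (16 bytes for [2 x i64]) is copied into
; the vararg shadow area, and the consumed size is recorded.

@__msan_va_arg_tls = external thread_local global [100 x i64]
@__msan_va_arg_overflow_size_tls = external thread_local global i64

define void @byval_shadow_sketch(ptr %arg) {
  %app = ptrtoint ptr %arg to i64
  %masked = and i64 %app, -246290604621825    ; app-to-shadow translation
  %xored = xor i64 %masked, 17592186044416
  %biased = add i64 %xored, 8796093022208
  %shadow = inttoptr i64 %biased to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_va_arg_tls, ptr align 8 %shadow, i64 16, i1 false)
  store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
  ret void
}

declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)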
define i32 @bar7(ptr %arg) { +; CHECK-LABEL: define i32 @bar7( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -246290604621825 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 32, i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8796093022208 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), ptr align 8 [[TMP11]], i64 32, i1 false) +; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([4 x i64]) align 16 [[ARG]]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP12]] +; %1 = call i32 (i32, ...) @foo(i32 0, ptr byval([4 x i64]) align 16 %arg) ret i32 %1 } -; CHECK-LABEL: @bar7 -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), ptr align 8 {{.*}}, i64 32, i1 false) -; CHECK: store {{.*}} 40, {{.*}} @__msan_va_arg_overflow_size_tls +; UTC_ARGS: --disable ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll index 4151f3b223b3a..1fe63850860e8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll @@ -1,9 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le--linux" define i32 @foo(i32 %guard, ...) { +; CHECK-LABEL: define i32 @foo( +; CHECK-SAME: i32 [[GUARD:%.*]], ...) 
{ +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 0, i64 [[TMP1]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP3]], i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[VL:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -246290604621825 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 17592186044416 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 8796093022208 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -246290604621825 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 17592186044416 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 8796093022208 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP13]], i8 0, i64 8, i1 false) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VL]]) +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[VL]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = and i64 [[TMP17]], -246290604621825 +; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 17592186044416 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 8796093022208 +; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP21]], ptr align 8 [[TMP2]], i64 [[TMP1]], i1 false) +; CHECK-NEXT: call void @llvm.va_end.p0(ptr [[VL]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[VL]]) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 0 +; %vl = alloca ptr, align 8 call void @llvm.lifetime.start.p0(ptr %vl) call void @llvm.va_start(ptr %vl) @@ -12,37 +50,51 @@ define i32 @foo(i32 %guard, ...) { ret i32 0 } -; First, check allocation of the save area. 
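; A minimal sketch of the callee-side prologue the autogenerated checks encode
; (the function name is illustrative): the callee snapshots the caller's vararg
; shadow into a private buffer, clamping the copy to the 800-byte size of
; __msan_va_arg_tls so an overstated overflow size cannot read out of bounds.

@__msan_va_arg_tls = external thread_local global [100 x i64]
@__msan_va_arg_overflow_size_tls = external thread_local global i64

define void @vararg_prologue_sketch() {
  %size = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
  %copy = alloca i8, i64 %size, align 8
  call void @llvm.memset.p0.i64(ptr align 8 %copy, i8 0, i64 %size, i1 false)
  %n = call i64 @llvm.umin.i64(i64 %size, i64 800)  ; 800 = sizeof([100 x i64])
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %copy, ptr align 8 @__msan_va_arg_tls, i64 %n, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
declare i64 @llvm.umin.i64(i64, i64)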
- -; CHECK-LABEL: @foo -; CHECK: [[A:%.*]] = load {{.*}} @__msan_va_arg_overflow_size_tls -; CHECK: [[C:%.*]] = alloca {{.*}} [[A]] - -; CHECK: call void @llvm.memset.p0.i64(ptr align 8 [[C]], i8 0, i64 [[A]], i1 false) - -; CHECK: [[D:%.*]] = call i64 @llvm.umin.i64(i64 [[A]], i64 800) -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[C]], ptr align 8 @__msan_va_arg_tls, i64 [[D]], i1 false) - declare void @llvm.lifetime.start.p0(ptr nocapture) #1 declare void @llvm.va_start(ptr) #2 declare void @llvm.va_end(ptr) #2 declare void @llvm.lifetime.end.p0(ptr nocapture) #1 define i32 @bar() { +; CHECK-LABEL: define i32 @bar() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) ret i32 %1 } ; Save the incoming shadow value from the arguments in the __msan_va_arg_tls ; array. -; CHECK-LABEL: @bar -; CHECK: store i32 0, ptr @__msan_va_arg_tls, align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls ; Check vector argument. define i32 @bar2() { +; CHECK-LABEL: define i32 @bar2() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, <2 x i64> ) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) 
@foo(i32 0, <2 x i64> ) ret i32 %1 } @@ -50,49 +102,110 @@ define i32 @bar2() { ; The vector is at offset 16 of parameter save area, but __msan_va_arg_tls ; corresponds to offset 8+ of parameter save area - so the offset from ; __msan_va_arg_tls is actually misaligned. -; CHECK-LABEL: @bar2 -; CHECK: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store {{.*}} 24, {{.*}} @__msan_va_arg_overflow_size_tls ; Check i64 array. define i32 @bar4() { +; CHECK-LABEL: define i32 @bar4() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr @__msan_va_arg_tls, align 8 +; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i64] [i64 1, i64 2]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, [2 x i64] [i64 1, i64 2]) ret i32 %1 } -; CHECK-LABEL: @bar4 -; CHECK: store [2 x i64] zeroinitializer, ptr @__msan_va_arg_tls, align 8 -; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls ; Check i128 array. define i32 @bar5() { +; CHECK-LABEL: define i32 @bar5() { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i128] [i128 1, i128 2]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP2]] +; %1 = call i32 (i32, ...) @foo(i32 0, [2 x i128] [i128 1, i128 2]) ret i32 %1 } -; CHECK-LABEL: @bar5 -; CHECK: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK: store {{.*}} 40, {{.*}} @__msan_va_arg_overflow_size_tls ; Check 8-aligned byval. 
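; Before the byval cases, a short worked note on the sizes recorded above,
; assuming the ppc64le layout these checks encode: [2 x i128] needs 16-byte
; alignment, so it lands at offset 16 of the parameter save area, which is
; offset 8 of __msan_va_arg_tls (the TLS array mirrors the save area from
; offset 8 onward); the recorded overflow size is therefore 8 + 32 = 40 bytes,
; while the unpadded [2 x i64] case starts at offset 0 and records 16.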
define i32 @bar6(ptr %arg) { +; CHECK-LABEL: define i32 @bar6( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -246290604621825 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 16, i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8796093022208 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_va_arg_tls, ptr align 8 [[TMP11]], i64 16, i1 false) +; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([2 x i64]) align 8 [[ARG]]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP12]] +; %1 = call i32 (i32, ...) @foo(i32 0, ptr byval([2 x i64]) align 8 %arg) ret i32 %1 } -; CHECK-LABEL: @bar6 -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_va_arg_tls, ptr align 8 {{.*}}, i64 16, i1 false) -; CHECK: store {{.*}} 16, {{.*}} @__msan_va_arg_overflow_size_tls ; Check 16-aligned byval. define i32 @bar7(ptr %arg) { +; CHECK-LABEL: define i32 @bar7( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -246290604621825 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 32, i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8796093022208 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), ptr align 8 [[TMP11]], i64 32, i1 false) +; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 (i32, ...) 
@foo(i32 0, ptr byval([4 x i64]) align 16 [[ARG]]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP12]] +; %1 = call i32 (i32, ...) @foo(i32 0, ptr byval([4 x i64]) align 16 %arg) ret i32 %1 } -; CHECK-LABEL: @bar7 -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), ptr align 8 {{.*}}, i64 32, i1 false) -; CHECK: store {{.*}} 40, {{.*}} @__msan_va_arg_overflow_size_tls + +; UTC_ARGS: --disable ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. From b0de7a6a53d7f3153ba5f6a74e379a96857c8692 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 1 Oct 2025 09:05:37 +0100 Subject: [PATCH 342/878] [flang][debug] Change type*N to type(kind=N). (#161432) It was discussed in https://github.com/llvm/llvm-project/pull/161361. --- .../lib/Optimizer/Transforms/DebugTypeGenerator.cpp | 7 ++++--- flang/test/Integration/debug-complex-1.f90 | 4 ++-- flang/test/Integration/debug-local-var-2.f90 | 6 +++--- flang/test/Transforms/debug-complex-1.fir | 4 ++-- flang/test/Transforms/debug-derived-type-1.fir | 6 +++--- flang/test/Transforms/debug-fn-info.fir | 6 +++--- flang/test/Transforms/debug-local-var.fir | 6 +++--- flang/test/Transforms/debug-ref-type.fir | 2 +- flang/test/Transforms/debug-tuple-type.fir | 2 +- flang/test/Transforms/debug-vector-type.fir | 12 ++++++------ 10 files changed, 28 insertions(+), 27 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp index a7e47239036ba..00fdb5a4516bd 100644 --- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp +++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp @@ -682,10 +682,11 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertPointerLikeType( static mlir::StringAttr getBasicTypeName(mlir::MLIRContext *context, llvm::StringRef baseName, unsigned bitSize) { - std::string name(baseName.str()); + std::ostringstream oss; + oss << baseName.str(); if (bitSize != 32) - name += "*" + std::to_string(bitSize / 8); - return mlir::StringAttr::get(context, name); + oss << "(kind=" << (bitSize / 8) << ")"; + return mlir::StringAttr::get(context, oss.str()); } mlir::LLVM::DITypeAttr diff --git a/flang/test/Integration/debug-complex-1.f90 b/flang/test/Integration/debug-complex-1.f90 index 48ea0295eb250..1d70140a202d7 100644 --- a/flang/test/Integration/debug-complex-1.f90 +++ b/flang/test/Integration/debug-complex-1.f90 @@ -17,8 +17,8 @@ function fn1(a, b) result (c) end program ! CHECK-DAG: ![[C4:.*]] = !DIBasicType(name: "complex", size: 64, encoding: DW_ATE_complex_float) -! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex*8", size: 128, encoding: DW_ATE_complex_float) -! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex*16", size: 256, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex(kind=8)", size: 128, encoding: DW_ATE_complex_float) +! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex(kind=16)", size: 256, encoding: DW_ATE_complex_float) ! CHECK-DAG: !DILocalVariable(name: "c4"{{.*}}type: ![[C4]]) ! CHECK-DAG: !DILocalVariable(name: "c8"{{.*}}type: ![[C8]]) ! 
CHECK-DAG: !DILocalVariable(name: "r"{{.*}}type: ![[C16]]) diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90 index 93659a56c7275..e95263e6841ad 100644 --- a/flang/test/Integration/debug-local-var-2.f90 +++ b/flang/test/Integration/debug-local-var-2.f90 @@ -40,11 +40,11 @@ program mn ! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "MN", {{.*}}) ! BOTH-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) -! BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer*8", size: 64, encoding: DW_ATE_signed) -! BOTH-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical*1", size: 8, encoding: DW_ATE_boolean) +! BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer(kind=8)", size: 64, encoding: DW_ATE_signed) +! BOTH-DAG: ![[TYL8:.*]] = !DIBasicType(name: "logical(kind=1)", size: 8, encoding: DW_ATE_boolean) ! BOTH-DAG: ![[TYL32:.*]] = !DIBasicType(name: "logical", size: 32, encoding: DW_ATE_boolean) ! BOTH-DAG: ![[TYR32:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float) -! BOTH-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real*8", size: 64, encoding: DW_ATE_float) +! BOTH-DAG: ![[TYR64:.*]] = !DIBasicType(name: "real(kind=8)", size: 64, encoding: DW_ATE_float) ! BOTH-DAG: ![[I4]] = !DILocalVariable(name: "i4", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI32]]) ! BOTH-DAG: ![[I8]] = !DILocalVariable(name: "i8", scope: ![[MAIN]], file: !{{.*}}, line: [[@LINE+6]], type: ![[TYI64]]) diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir index 7a288fec69be3..6e2c6c5bdb354 100644 --- a/flang/test/Transforms/debug-complex-1.fir +++ b/flang/test/Transforms/debug-complex-1.fir @@ -26,9 +26,9 @@ module { #loc3 = loc("./simple.f90":8:1) #loc4 = loc("./simple.f90":11:1) -// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[CMPX4:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX8]], #[[CMPX4]]> // CHECK-DAG: #[[TY2:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX16]], #[[CMPX4]]> diff --git a/flang/test/Transforms/debug-derived-type-1.fir b/flang/test/Transforms/debug-derived-type-1.fir index 672b6cf2819d2..22832b67742c8 100644 --- a/flang/test/Transforms/debug-derived-type-1.fir +++ b/flang/test/Transforms/debug-derived-type-1.fir @@ -45,12 +45,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d // CHECK-DAG: #[[INT_TY:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[INT8_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[INT8_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[REAL4_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[CMX8_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[CMX_ARR:.*]] = #llvm.di_composite_type -// CHECK-DAG: #[[LOG_TY:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[REAL8_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[LOG_TY:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL8_TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[STR_TY:.*]] = #llvm.di_string_type // CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}name = "m_employee"{{.*}}> // CHECK-DAG: #[[MOD1:.*]] = #llvm.di_module<{{.*}}name = "t1"{{.*}}> diff --git a/flang/test/Transforms/debug-fn-info.fir b/flang/test/Transforms/debug-fn-info.fir index d82cef1acc423..e42beb1f748f1 100644 --- a/flang/test/Transforms/debug-fn-info.fir +++ 
b/flang/test/Transforms/debug-fn-info.fir @@ -64,10 +64,10 @@ module { #loc4 = loc("test2.f90":53:22) -// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type // CHECK: #[[TY0:.*]] = #llvm.di_subroutine_type diff --git a/flang/test/Transforms/debug-local-var.fir b/flang/test/Transforms/debug-local-var.fir index 466f79c6ed879..d39017e6dd62a 100644 --- a/flang/test/Transforms/debug-local-var.fir +++ b/flang/test/Transforms/debug-local-var.fir @@ -71,10 +71,10 @@ module { #loc15 = loc("test.f90":21:24) #loc16 = loc("test.f90":22:5) -// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[INT8:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[INT4:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type -// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[REAL8:.*]] = #llvm.di_basic_type +// CHECK-DAG: #[[LOG1:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[REAL4:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[LOG4:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[MAIN:.*]] = #llvm.di_subprogram<{{.*}}name = "mn"{{.*}}> diff --git a/flang/test/Transforms/debug-ref-type.fir b/flang/test/Transforms/debug-ref-type.fir index 2164a40c7c111..daffa293ba2e3 100644 --- a/flang/test/Transforms/debug-ref-type.fir +++ b/flang/test/Transforms/debug-ref-type.fir @@ -5,6 +5,6 @@ module { } #loc1 = loc("test.f90":5:1) -// CHECK: #[[INT8_TY:.*]] = #llvm.di_basic_type +// CHECK: #[[INT8_TY:.*]] = #llvm.di_basic_type // CHECK: #[[REF_TY:.*]] = #llvm.di_derived_type // CHECK: #llvm.di_subroutine_type<{{.*}}types = #[[REF_TY]], #[[INT8_TY]]> diff --git a/flang/test/Transforms/debug-tuple-type.fir b/flang/test/Transforms/debug-tuple-type.fir index b865d492b6696..73a07333b3aef 100644 --- a/flang/test/Transforms/debug-tuple-type.fir +++ b/flang/test/Transforms/debug-tuple-type.fir @@ -5,7 +5,7 @@ module { func.func private @_FortranAioOutputDerivedType(!fir.ref>) } -// CHECK: #[[F64:.*]] = #llvm.di_basic_type +// CHECK: #[[F64:.*]] = #llvm.di_basic_type // CHECK: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}> // CHECK: #[[DTY1:.*]] = #llvm.di_derived_type // CHECK: #[[DTY2:.*]] = #llvm.di_derived_type diff --git a/flang/test/Transforms/debug-vector-type.fir b/flang/test/Transforms/debug-vector-type.fir index cfb97ea46ba61..9e41d90f407b9 100644 --- a/flang/test/Transforms/debug-vector-type.fir +++ b/flang/test/Transforms/debug-vector-type.fir @@ -2,22 +2,22 @@ module { func.func private @foo1(%arg0: !fir.vector<20:bf16>) -// CHECK-DAG: #[[F16:.*]] = #llvm.di_basic_type -// CHECK-DAG: #llvm.di_composite_type> +// CHECK-DAG: #[[F16:.*]] = #llvm.di_basic_type +// CHECK-DAG: #llvm.di_composite_type> func.func private @foo2(%arg0: !fir.vector<30:f32>) // CHECK-DAG: #[[F32:.*]] = #llvm.di_basic_type // CHECK-DAG: #llvm.di_composite_type> func.func private @foo3(%arg0: !fir.vector<10:f64>) -// CHECK-DAG: #[[F64:.*]] = #llvm.di_basic_type -// CHECK-DAG: #llvm.di_composite_type> +// CHECK-DAG: #[[F64:.*]] = #llvm.di_basic_type +// CHECK-DAG: #llvm.di_composite_type> func.func private @foo4(%arg0: !fir.vector<5:i32>) // CHECK-DAG: #[[I32:.*]] = #llvm.di_basic_type // CHECK-DAG: #llvm.di_composite_type> func.func private 
@foo5(%arg0: !fir.vector<2:i64>) -// CHECK-DAG: #[[I64:.*]] = #llvm.di_basic_type -// CHECK-DAG: #llvm.di_composite_type> +// CHECK-DAG: #[[I64:.*]] = #llvm.di_basic_type +// CHECK-DAG: #llvm.di_composite_type> } From 5baf1c14640cf580f4e65375a3f9b49420dad9c6 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 1 Oct 2025 10:11:41 +0200 Subject: [PATCH 343/878] [MemorySanitizer] Generate test checks for kmsan test (NFC) --- .../MemorySanitizer/msan_kernel_basic.ll | 813 ++++++++++++------ 1 file changed, 538 insertions(+), 275 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll index 4b7a910af08bf..a7209de32380a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; KMSAN instrumentation tests ; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck %s -check-prefixes=CHECK @@ -6,309 +7,495 @@ target triple = "x86_64-unknown-linux-gnu" ; Check the instrumentation prologue. define void @Empty() nounwind uwtable sanitize_memory { +; CHECK-LABEL: define void @Empty( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; entry: ret void } -; CHECK-LABEL: @Empty -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; %param_shadow: -; CHECK: getelementptr {{.*}} i32 0, i32 0 -; %retval_shadow: -; CHECK: getelementptr {{.*}} i32 0, i32 1 -; %va_arg_shadow: -; CHECK: getelementptr {{.*}} i32 0, i32 2 -; %va_arg_origin: -; CHECK: getelementptr {{.*}} i32 0, i32 3 -; %va_arg_overflow_size: -; CHECK: getelementptr {{.*}} i32 0, i32 4 -; %param_origin: -; CHECK: getelementptr {{.*}} i32 0, i32 5 -; %retval_origin: -; CHECK: getelementptr {{.*}} i32 0, i32 6 - ; Check instrumentation of stores - define void @Store1(ptr nocapture %p, i8 %x) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define void @Store1( +; CHECK-SAME: ptr captures(none) [[P:%.*]], i8 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = 
getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8 +; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[_MSARG1]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8 +; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8:[0-9]+]] +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[P]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1 +; CHECK-NEXT: store i8 [[TMP7]], ptr [[TMP14]], align 1 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]]) +; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: store i8 [[X]], ptr [[P]], align 1 +; CHECK-NEXT: ret void +; entry: store i8 %x, ptr %p ret void } -; CHECK-LABEL: @Store1 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: [[BASE:%[0-9]+]] = ptrtoint {{.*}} [[PARAM_SHADOW]] -; CHECK: [[SHADOW_PTR:%[a-z0-9_]+]] = inttoptr {{.*}} [[BASE]] -; CHECK: [[SHADOW:%[a-z0-9]+]] = load i64, ptr [[SHADOW_PTR]] -; CHECK: [[BASE2:%[0-9]+]] = ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: 
icmp ne i64 [[SHADOW]] -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_metadata_ptr_for_store_1(ptr %p) -; CHECK: store i8 -; If the new shadow is non-zero, jump to __msan_chain_origin() -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_chain_origin -; Storing origin here: -; CHECK: store i32 -; CHECK: br label -; CHECK: {{^[0-9]+}}: -; CHECK: store i8 -; CHECK: ret void - define void @Store2(ptr nocapture %p, i16 %x) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define void @Store2( +; CHECK-SAME: ptr captures(none) [[P:%.*]], i16 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8 +; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[_MSARG1]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8 +; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_2(ptr [[P]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1 +; CHECK-NEXT: store i16 [[TMP7]], ptr [[TMP14]], align 2 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 
@__msan_chain_origin(i32 [[TMP10]]) +; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 2 +; CHECK-NEXT: ret void +; entry: store i16 %x, ptr %p ret void } -; CHECK-LABEL: @Store2 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_metadata_ptr_for_store_2(ptr %p) -; CHECK: store i16 -; If the new shadow is non-zero, jump to __msan_chain_origin() -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_chain_origin -; Storing origin here: -; CHECK: store i32 -; CHECK: br label -; CHECK: {{^[0-9]+}}: -; CHECK: store i16 -; CHECK: ret void - - define void @Store4(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define void @Store4( +; CHECK-SAME: ptr captures(none) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8 +; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[_MSARG1]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8 +; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB12]] +; 
CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_4(ptr [[P]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i32 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB18:.*]], !prof [[PROF1]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]]) +; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 4 +; CHECK-NEXT: ret void +; entry: store i32 %x, ptr %p ret void } -; CHECK-LABEL: @Store4 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i32 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_metadata_ptr_for_store_4(ptr %p) -; CHECK: store i32 -; If the new shadow is non-zero, jump to __msan_chain_origin() -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_chain_origin -; Storing origin here: -; CHECK: store i32 -; CHECK: br label -; CHECK: {{^[0-9]+}}: -; CHECK: store i32 -; CHECK: ret void - define void @Store8(ptr nocapture %p, i64 %x) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define void @Store8( +; CHECK-SAME: ptr captures(none) [[P:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8 +; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[_MSARG1]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr 
[[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8 +; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_8(ptr [[P]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1 +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB21:.*]], !prof [[PROF1]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]]) +; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP18]], [[TMP19]] +; CHECK-NEXT: store i64 [[TMP20]], ptr [[TMP15]], align 8 +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: store i64 [[X]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; entry: store i64 %x, ptr %p ret void } -; CHECK-LABEL: @Store8 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_metadata_ptr_for_store_8(ptr %p) -; CHECK: store i64 -; If the new shadow is non-zero, jump to __msan_chain_origin() -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_chain_origin -; Storing origin here: -; CHECK: store i64 -; CHECK: br label -; CHECK: {{^[0-9]+}}: -; CHECK: store i64 -; CHECK: ret void - define void @Store16(ptr nocapture %p, i128 %x) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define void @Store16( +; CHECK-SAME: ptr captures(none) [[P:%.*]], i128 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], 
i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8 +; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = load i128, ptr [[_MSARG1]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8 +; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_n(ptr [[P]], i64 16) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1 +; CHECK-NEXT: store i128 [[TMP7]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB22:.*]], !prof [[PROF1]] +; CHECK: [[BB16]]: +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]]) +; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 32 +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP18]], [[TMP19]] +; CHECK-NEXT: store i64 [[TMP20]], ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[TMP15]], i32 1 +; CHECK-NEXT: store i64 [[TMP20]], ptr [[TMP21]], align 8 +; CHECK-NEXT: br label %[[BB22]] +; CHECK: [[BB22]]: +; CHECK-NEXT: store i128 [[X]], ptr [[P]], align 8 +; CHECK-NEXT: ret void +; entry: store i128 %x, ptr %p ret void } -; CHECK-LABEL: @Store16 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_metadata_ptr_for_store_n(ptr %p, i64 16) -; CHECK: store i128 -; If the new shadow is non-zero, jump to __msan_chain_origin() -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; CHECK: @__msan_chain_origin -; Storing origin here: -; CHECK: store i64 -; CHECK: br label -; CHECK: {{^[0-9]+}}: -; CHECK: store i128 -; CHECK: ret void - - ; Check instrumentation of loads define i8 @Load1(ptr nocapture %p) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define i8 @Load1( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 
}, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB6]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[P]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_1(ptr [[P]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 1 +; CHECK-NEXT: [[_MSLD:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: store i8 [[_MSLD]], ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: ret i8 [[TMP7]] +; entry: %0 = load i8, ptr %p ret i8 %0 } -; CHECK-LABEL: @Load1 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; Load the value from %p. This is done before accessing the shadow -; to ease atomic handling. -; CHECK: load i8 -; CHECK: @__msan_metadata_ptr_for_load_1(ptr %p) -; Load the shadow and origin. 
-; CHECK: load i8 -; CHECK: load i32 - - define i16 @Load2(ptr nocapture %p) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define i16 @Load2( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB6]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[P]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_2(ptr [[P]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 1 +; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP9]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: store i16 [[_MSLD]], ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: ret i16 [[TMP7]] +; entry: %0 = load i16, ptr %p ret i16 %0 } -; CHECK-LABEL: @Load2 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; Load the value from %p. This is done before accessing the shadow -; to ease atomic handling. -; CHECK: load i16 -; CHECK: @__msan_metadata_ptr_for_load_2(ptr %p) -; Load the shadow and origin. 
-; CHECK: load i16 -; CHECK: load i32 - - define i32 @Load4(ptr nocapture %p) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define i32 @Load4( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB6]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_4(ptr [[P]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 1 +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[_MSLD]], ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: ret i32 [[TMP7]] +; entry: %0 = load i32, ptr %p ret i32 %0 } -; CHECK-LABEL: @Load4 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; Load the value from %p. This is done before accessing the shadow -; to ease atomic handling. -; CHECK: load i32 -; CHECK: @__msan_metadata_ptr_for_load_4(ptr %p) -; Load the shadow and origin. 
-; CHECK: load i32 -; CHECK: load i32 - define i64 @Load8(ptr nocapture %p) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define i64 @Load8( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB6]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_8(ptr [[P]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 1 +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: ret i64 [[TMP7]] +; entry: %0 = load i64, ptr %p ret i64 %0 } -; CHECK-LABEL: @Load8 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; Load the value from %p. This is done before accessing the shadow -; to ease atomic handling. -; CHECK: load i64 -; CHECK: @__msan_metadata_ptr_for_load_8(ptr %p) -; Load the shadow and origin. 
-; CHECK: load i64 -; CHECK: load i32 - define i128 @Load16(ptr nocapture %p) nounwind uwtable sanitize_memory { +; CHECK-LABEL: define i128 @Load16( +; CHECK-SAME: ptr captures(none) [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]] +; CHECK-NEXT: br label %[[BB6]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = load i128, ptr [[P]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_n(ptr [[P]], i64 16) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 1 +; CHECK-NEXT: [[_MSLD:%.*]] = load i128, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 8 +; CHECK-NEXT: store i128 [[_MSLD]], ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: store i32 [[TMP11]], ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: ret i128 [[TMP7]] +; entry: %0 = load i128, ptr %p ret i128 %0 } -; CHECK-LABEL: @Load16 -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: ptrtoint {{.*}} [[PARAM_SHADOW]] -; Load the shadow of %p and check it -; CHECK: load i64 -; CHECK: icmp -; CHECK: br i1 -; CHECK: {{^[0-9]+}}: -; Load the value from %p. This is done before accessing the shadow -; to ease atomic handling. -; CHECK: load i128 -; CHECK: @__msan_metadata_ptr_for_load_n(ptr %p, i64 16) -; Load the shadow and origin. 
-; CHECK: load i128 -; CHECK: load i32 - - ; Test kernel-specific va_list instrumentation %struct.__va_list_tag = type { i32, i32, ptr, ptr } @@ -319,6 +506,78 @@ declare dso_local i32 @VAListFn(ptr, ptr) local_unnamed_addr ; Function Attrs: nounwind uwtable define dso_local i32 @VarArgFn(ptr %fmt, ...) local_unnamed_addr sanitize_memory #0 { +; CHECK-LABEL: define dso_local i32 @VarArgFn( +; CHECK-SAME: ptr [[FMT:%.*]], ...) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VA_ARG_OVERFLOW_SIZE]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 48, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = alloca i8, i64 [[TMP6]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP7]], i8 0, i64 [[TMP6]], i1 false) +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP6]], i64 800) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP7]], ptr align 8 [[VA_ARG_SHADOW]], i64 [[TMP8]], i1 false) +; CHECK-NEXT: [[TMP9:%.*]] = alloca i8, i64 [[TMP6]], align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP9]], ptr align 8 [[VA_ARG_ORIGIN]], i64 [[TMP8]], i1 false) +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[ARGS:%.*]] = alloca [1 x %struct.__va_list_tag], align 16 +; CHECK-NEXT: call void @__msan_poison_alloca(ptr [[ARGS]], i64 24, ptr @[[GLOB0:[0-9]+]]) +; CHECK-NEXT: [[TMP10:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[ARGS]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 1 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 0, i64 24, i1 false) +; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[ARGS]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 16 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] 
to ptr +; CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { ptr, ptr } [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { ptr, ptr } [[TMP17]], 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP18]], ptr align 16 [[TMP7]], i64 48, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP19]], ptr align 16 [[TMP9]], i64 48, i1 false) +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[ARGS]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr +; CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[TMP23]]) +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { ptr, ptr } [[TMP24]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { ptr, ptr } [[TMP24]], 1 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP25]], ptr align 16 [[TMP27]], i64 [[TMP5]], i1 false) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP9]], i32 48 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP26]], ptr align 16 [[TMP28]], i64 [[TMP5]], i1 false) +; CHECK-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP29]] to ptr +; CHECK-NEXT: store i64 [[TMP2]], ptr [[_MSARG1]], align 8 +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 +; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP30]] to ptr +; CHECK-NEXT: store i32 [[TMP4]], ptr [[_MSARG_O2]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP31]], 8 +; CHECK-NEXT: [[_MSARG3:%.*]] = inttoptr i64 [[TMP32]] to ptr +; CHECK-NEXT: store i64 0, ptr [[_MSARG3]], align 8 +; CHECK-NEXT: store i32 0, ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @VAListFn(ptr [[FMT]], ptr nonnull [[ARGS]]) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: call void @llvm.va_end.p0(ptr nonnull [[ARGS]]) +; CHECK-NEXT: store i32 [[_MSRET]], ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: store i32 [[TMP33]], ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: ret i32 [[CALL]] +; entry: %args = alloca [1 x %struct.__va_list_tag], align 16 call void @llvm.va_start(ptr nonnull %args) @@ -330,52 +589,56 @@ entry: ; Kernel is built without SSE support. attributes #0 = { "target-features"="+fxsr,+x87,-sse" } -; CHECK-LABEL: @VarArgFn -; CHECK: @__msan_get_context_state() -; CHECK: [[VA_ARG_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 2 -; CHECK: [[VA_ARG_ORIGIN:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 3 -; CHECK: [[VA_ARG_OVERFLOW_SIZE:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 4 -; CHECK: [[OSIZE:%[0-9]+]] = load i64, ptr [[VA_ARG_OVERFLOW_SIZE]] ; Register save area is 48 bytes for non-SSE builds. 
-; CHECK: [[SIZE:%[0-9]+]] = add i64 48, [[OSIZE]] -; CHECK: [[SHADOWS:%[0-9]+]] = alloca i8, i64 [[SIZE]] -; CHECK: call void @llvm.memset{{.*}}(ptr align 8 [[SHADOWS]], i8 0, i64 [[SIZE]], i1 false) -; CHECK: [[COPYSZ:%[0-9]+]] = call i64 @llvm.umin.i64(i64 [[SIZE]], i64 800) -; CHECK: call void @llvm.memcpy{{.*}}(ptr align 8 [[SHADOWS]], ptr align 8 [[VA_ARG_SHADOW]], i64 [[COPYSZ]] -; CHECK: [[ORIGINS:%[0-9]+]] = alloca i8, i64 [[SIZE]] -; CHECK: call void @llvm.memcpy{{.*}}(ptr align 8 [[ORIGINS]], ptr align 8 [[VA_ARG_ORIGIN]], i64 [[COPYSZ]] -; CHECK: call i32 @VAListFn ; Function Attrs: nounwind uwtable define dso_local void @VarArgCaller() local_unnamed_addr sanitize_memory { +; CHECK-LABEL: define dso_local void @VarArgCaller( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_get_context_state() +; CHECK-NEXT: [[PARAM_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 0 +; CHECK-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[VA_ARG_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 2 +; CHECK-NEXT: [[VA_ARG_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 3 +; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 +; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 +; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: store i64 0, ptr [[_MSARG]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 8 +; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: store i32 0, ptr [[_MSARG1]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VA_ARG_SHADOW]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 0 +; CHECK-NEXT: [[_MSARG_VA_S:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[VA_ARG_ORIGIN]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[_MSARG_VA_O:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[VA_ARG_SHADOW]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8 +; CHECK-NEXT: [[_MSARG_VA_S2:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VA_ARG_ORIGIN]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 8 +; CHECK-NEXT: [[_MSARG_VA_O3:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store i32 0, ptr [[_MSARG_VA_S2]], align 8 +; CHECK-NEXT: store i32 0, ptr [[_MSARG_VA_O3]], align 8 +; CHECK-NEXT: store i64 0, ptr [[VA_ARG_OVERFLOW_SIZE]], align 8 +; CHECK-NEXT: store i32 0, ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (ptr, ...) 
@VarArgFn(ptr @.str, i32 123) +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr [[RETVAL_SHADOW]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[RETVAL_ORIGIN]], align 4 +; CHECK-NEXT: ret void +; entry: %call = tail call i32 (ptr, ...) @VarArgFn(ptr @.str, i32 123) ret void } -; CHECK-LABEL: @VarArgCaller - -; CHECK: entry: -; CHECK: @__msan_get_context_state() -; CHECK: [[PARAM_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 0 -; CHECK: [[VA_ARG_SHADOW:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 2 -; CHECK: [[VA_ARG_OVERFLOW_SIZE:%[a-z0-9_]+]] = getelementptr {{.*}} i32 0, i32 4 - -; CHECK: [[PARAM_SI:%[_a-z0-9]+]] = ptrtoint {{.*}} [[PARAM_SHADOW]] -; CHECK: [[ARG1_S:%[_a-z0-9]+]] = inttoptr i64 [[PARAM_SI]] to ptr -; First argument is initialized -; CHECK: store i64 0, ptr [[ARG1_S]] - -; Dangling cast of va_arg_shadow[0], unused because the first argument is fixed. -; CHECK: [[VA_CAST0:%[_a-z0-9]+]] = ptrtoint {{.*}} [[VA_ARG_SHADOW]] to i64 - -; CHECK: [[VA_CAST1:%[_a-z0-9]+]] = ptrtoint {{.*}} [[VA_ARG_SHADOW]] to i64 -; CHECK: [[ARG1_SI:%[_a-z0-9]+]] = add i64 [[VA_CAST1]], 8 -; CHECK: [[PARG1_S:%[_a-z0-9]+]] = inttoptr i64 [[ARG1_SI]] to ptr - -; Shadow for 123 is 0. -; CHECK: store i32 0, ptr [[ARG1_S]] - -; CHECK: store i64 0, ptr [[VA_ARG_OVERFLOW_SIZE]] -; CHECK: call i32 (ptr, ...) @VarArgFn({{.*}} @.str{{.*}} i32 123) +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. From 69586331e868cef99fbcea74a061bd44f57c1904 Mon Sep 17 00:00:00 2001 From: mikael-nilsson-arm Date: Wed, 1 Oct 2025 10:13:42 +0200 Subject: [PATCH 344/878] [InstCombine] Opt phi(freeze(undef), C) -> phi(C, C) (#161181) Try to choose a value for freeze that enables the PHI to be replaced with its input constants if they are equal. --- .../InstCombine/InstructionCombining.cpp | 31 +- .../Transforms/InstCombine/in-freeze-phi.ll | 274 ++++++++++++++++++ 2 files changed, 303 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/in-freeze-phi.ll diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 8fbaf68dfcc43..ff063f929347f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -5169,6 +5169,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // - or: pick -1 // - select's condition: if the true value is constant, choose it by making // the condition true. + // - phi: pick the common constant across operands // - default: pick 0 // // Note that this transform is intentionally done here rather than @@ -5179,9 +5180,32 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) { // TODO: This could use getBinopAbsorber() / getBinopIdentity() to avoid // duplicating logic for binops at least. auto getUndefReplacement = [&](Type *Ty) { - Value *BestValue = nullptr; + auto pickCommonConstantFromPHI = [](PHINode &PN) -> Value * { + // phi(freeze(undef), C, C). Choose C for freeze so the PHI can be + // removed. 
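+  // For example: %f = freeze i32 undef feeding
+  //   %phi = phi i32 [ %f, %bb0 ], [ 42, %bb1 ], [ 42, %bb2 ]
+  // Picking 42 as the freeze value lets the PHI fold to the constant 42
+  // (illustrative block names; see in-freeze-phi.ll below).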
+ Constant *BestValue = nullptr;
+ for (Value *V : PN.incoming_values()) {
+ if (match(V, m_Freeze(m_Undef())))
+ continue;
+
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return nullptr;
+
+ if (!isGuaranteedNotToBeUndefOrPoison(C))
+ return nullptr;
+
+ if (BestValue && BestValue != C)
+ return nullptr;
+
+ BestValue = C;
+ }
+ return BestValue;
+ };
+
 Value *NullValue = Constant::getNullValue(Ty);
- for (const auto *U : I.users()) {
+ Value *BestValue = nullptr;
+ for (auto *U : I.users()) {
 Value *V = NullValue;
 if (match(U, m_Or(m_Value(), m_Value())))
 V = ConstantInt::getAllOnesValue(Ty);
@@ -5190,6 +5214,9 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
 else if (match(U, m_c_Select(m_Specific(&I), m_Value(V)))) {
 if (!isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT))
 V = NullValue;
+ } else if (auto *PHI = dyn_cast<PHINode>(U)) {
+ if (Value *MaybeV = pickCommonConstantFromPHI(*PHI))
+ V = MaybeV;
 }
 
 if (!BestValue)
diff --git a/llvm/test/Transforms/InstCombine/in-freeze-phi.ll b/llvm/test/Transforms/InstCombine/in-freeze-phi.ll
new file mode 100644
index 0000000000000..917d81b499c49
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/in-freeze-phi.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+define i32 @phi_freeze_same_consts(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define i32 @phi_freeze_same_consts(
+; CHECK-SAME: i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[C0]], label %[[BB_FREEZE:.*]], label %[[BB_OTHER:.*]]
+; CHECK: [[BB_FREEZE]]:
+; CHECK-NEXT: br label %[[FINAL:.*]]
+; CHECK: [[BB_OTHER]]:
+; CHECK-NEXT: br i1 [[C1]], label %[[CA:.*]], label %[[CB:.*]]
+; CHECK: [[CA]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[CB]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[FINAL]]:
+; CHECK-NEXT: ret i32 42
+;
+entry:
+ br i1 %c0, label %bb_freeze, label %bb_other
+
+bb_freeze:
+ %f = freeze i32 undef
+ br label %final
+
+bb_other:
+ br i1 %c1, label %cA, label %cB
+cA:
+ br label %final
+cB:
+ br label %final
+
+final:
+ %phi = phi i32 [ %f, %bb_freeze ], [ 42, %cA ], [ 42, %cB ]
+ ret i32 %phi
+}
+
+define i32 @phi_freeze_mixed_consts(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define i32 @phi_freeze_mixed_consts(
+; CHECK-SAME: i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[C0]], label %[[BB_FREEZE:.*]], label %[[BB_OTHER:.*]]
+; CHECK: [[BB_FREEZE]]:
+; CHECK-NEXT: br label %[[FINAL:.*]]
+; CHECK: [[BB_OTHER]]:
+; CHECK-NEXT: br i1 [[C1]], label %[[CA:.*]], label %[[CB:.*]]
+; CHECK: [[CA]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[CB]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[FINAL]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB_FREEZE]] ], [ 42, %[[CA]] ], [ 7, %[[CB]] ]
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ br i1 %c0, label %bb_freeze, label %bb_other
+
+bb_freeze:
+ %f = freeze i32 undef
+ br label %final
+
+bb_other:
+ br i1 %c1, label %cA, label %cB
+cA:
+ br label %final
+cB:
+ br label %final
+
+final:
+ %phi = phi i32 [ %f, %bb_freeze ], [ 42, %cA ], [ 7, %cB ]
+ ret i32 %phi
+}
+
+define i32 @phi_freeze_with_nonconst_incoming(i32 %x, i1 %c0, i1 %c1) {
+; CHECK-LABEL: define i32 @phi_freeze_with_nonconst_incoming(
+; CHECK-SAME: i32 [[X:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[C0]], label %[[BB_FREEZE:.*]], label %[[BB_OTHER:.*]]
+; CHECK: [[BB_FREEZE]]:
+; CHECK-NEXT: br label %[[FINAL:.*]]
+; CHECK: 
[[BB_OTHER]]:
+; CHECK-NEXT: br i1 [[C1]], label %[[CA:.*]], label %[[CB:.*]]
+; CHECK: [[CA]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[CB]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[FINAL]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB_FREEZE]] ], [ [[X]], %[[CA]] ], [ 13, %[[CB]] ]
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ br i1 %c0, label %bb_freeze, label %bb_other
+
+bb_freeze:
+ %f = freeze i32 undef
+ br label %final
+
+bb_other:
+ br i1 %c1, label %cA, label %cB
+cA:
+ br label %final
+cB:
+ br label %final
+
+final:
+ %phi = phi i32 [ %f, %bb_freeze ], [ %x, %cA ], [ 13, %cB ]
+ ret i32 %phi
+}
+
+define <4 x i8> @phi_freeze_vector(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define <4 x i8> @phi_freeze_vector(
+; CHECK-SAME: i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[C0]], label %[[BB_FREEZE:.*]], label %[[BB_OTHER:.*]]
+; CHECK: [[BB_FREEZE]]:
+; CHECK-NEXT: br label %[[FINAL:.*]]
+; CHECK: [[BB_OTHER]]:
+; CHECK-NEXT: br i1 [[C1]], label %[[CA:.*]], label %[[CB:.*]]
+; CHECK: [[CA]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[CB]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[FINAL]]:
+; CHECK-NEXT: ret <4 x i8> splat (i8 9)
+;
+entry:
+ br i1 %c0, label %bb_freeze, label %bb_other
+
+bb_freeze:
+ %f = freeze <4 x i8> undef
+ br label %final
+
+bb_other:
+ br i1 %c1, label %cA, label %cB
+
+cA:
+ br label %final
+
+cB:
+ br label %final
+
+final:
+ %phi = phi <4 x i8> [ %f, %bb_freeze ],
+ [<i8 9, i8 9, i8 9, i8 9>, %cA ],
+ [<i8 9, i8 9, i8 9, i8 9>, %cB ]
+ ret <4 x i8> %phi
+}
+
+define i32 @multi_use_one_folds_one_not_zero(i1 %c0, i1 %c1, i1 %c2) {
+; CHECK-LABEL: define i32 @multi_use_one_folds_one_not_zero(
+; CHECK-SAME: i1 [[C0:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[C0]], label %[[BB_OTHER3:.*]], label %[[CC1:.*]]
+; CHECK: [[BB_OTHER3]]:
+; CHECK-NEXT: br label %[[MID:.*]]
+; CHECK: [[CC1]]:
+; CHECK-NEXT: br i1 [[C1]], label %[[CA:.*]], label %[[CB:.*]]
+; CHECK: [[CA]]:
+; CHECK-NEXT: br label %[[MID]]
+; CHECK: [[CB]]:
+; CHECK-NEXT: br label %[[MID]]
+; CHECK: [[MID]]:
+; CHECK-NEXT: [[PHI_FOLD:%.*]] = phi i32 [ 0, %[[BB_OTHER3]] ], [ 1, %[[CA]] ], [ 1, %[[CB]] ]
+; CHECK-NEXT: br i1 [[C2]], label %[[BB_FREEZE2:.*]], label %[[CD:.*]]
+; CHECK: [[BB_FREEZE2]]:
+; CHECK-NEXT: br label %[[FINAL:.*]]
+; CHECK: [[BB_OTHER2:.*:]]
+; CHECK-NEXT: br i1 true, label %[[CA]], label %[[CB]]
+; CHECK: [[CC:.*:]]
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[CD]]:
+; CHECK-NEXT: br label %[[FINAL]]
+; CHECK: [[FINAL]]:
+; CHECK-NEXT: ret i32 [[PHI_FOLD]]
+;
+entry:
+ %f = freeze i32 undef
+ br i1 %c0, label %bb_freeze, label %bb_other
+bb_freeze:
+ br label %mid
+bb_other:
+ br i1 %c1, label %cA, label %cB
+cA:
+ br label %mid
+cB:
+ br label %mid
+mid:
+ %phi_no_fold = phi i32 [ %f, %bb_freeze ], [ 1, %cA ], [ 1, %cB ]
+ br i1 %c2, label %bb_freeze2, label %cD
+bb_freeze2:
+ br label %final
+bb_other2:
+ br i1 %c1, label %cA, label %cB
+cC:
+ br label %final
+cD:
+ br label %final
+final:
+ %phi_fold = phi i32 [ %f, %bb_freeze2 ], [ 0, %cC ], [ 0, %cD ]
+ %a = add i32 %phi_fold, %phi_no_fold
+ ret i32 %a
+}
+
+define i32 @phi_freeze_poison(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define i32 @phi_freeze_poison(
+; CHECK-SAME: i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[C0]], label %[[BB_FREEZE:.*]], label %[[BB_OTHER:.*]]
+; CHECK: [[BB_FREEZE]]:
+; CHECK-NEXT: br label %[[FINAL:.*]]
+; CHECK: [[BB_OTHER]]:
+; CHECK-NEXT: br i1 [[C1]], label %[[CA:.*]], label %[[CB:.*]]
+; CHECK: [[CA]]: 
+; CHECK-NEXT: br label %[[FINAL]] +; CHECK: [[CB]]: +; CHECK-NEXT: br label %[[FINAL]] +; CHECK: [[FINAL]]: +; CHECK-NEXT: ret i32 0 +; +entry: + br i1 %c0, label %bb_freeze, label %bb_other + +bb_freeze: + %f = freeze i32 undef + br label %final + +bb_other: + br i1 %c1, label %cA, label %cB +cA: + br label %final +cB: + br label %final + +final: + %phi = phi i32 [ %f, %bb_freeze ], [ poison, %cA ], [ poison, %cB ] + ret i32 %phi +} + +define <2 x i32> @phi_freeze_poison_vec(i1 %c0, i1 %c1) { +; CHECK-LABEL: define <2 x i32> @phi_freeze_poison_vec( +; CHECK-SAME: i1 [[C0:%.*]], i1 [[C1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[C0]], label %[[BB_FREEZE:.*]], label %[[BB_OTHER:.*]] +; CHECK: [[BB_FREEZE]]: +; CHECK-NEXT: br label %[[FINAL:.*]] +; CHECK: [[BB_OTHER]]: +; CHECK-NEXT: br i1 [[C1]], label %[[CA:.*]], label %[[CB:.*]] +; CHECK: [[CA]]: +; CHECK-NEXT: br label %[[FINAL]] +; CHECK: [[CB]]: +; CHECK-NEXT: br label %[[FINAL]] +; CHECK: [[FINAL]]: +; CHECK-NEXT: [[PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB_FREEZE]] ], [ , %[[CA]] ], [ , %[[CB]] ] +; CHECK-NEXT: ret <2 x i32> [[PHI]] +; +entry: + br i1 %c0, label %bb_freeze, label %bb_other + +bb_freeze: + %f = freeze <2 x i32> undef + br label %final + +bb_other: + br i1 %c1, label %cA, label %cB +cA: + br label %final +cB: + br label %final + +final: + %phi = phi <2 x i32> [ %f, %bb_freeze ], [ , %cA ], [ , %cB ] + ret <2 x i32> %phi +} From 6c032fa60037cb995db87b956d172480ffb54723 Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Wed, 1 Oct 2025 11:21:19 +0300 Subject: [PATCH 345/878] [libc][math] Refactor exp10m1f16 implementation to header-only in src/__support/math folder. (#161119) Part of #147386 in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- libc/shared/math.h | 1 + libc/shared/math/exp10m1f16.h | 29 +++ libc/src/__support/math/CMakeLists.txt | 17 ++ libc/src/__support/math/exp10m1f16.h | 185 ++++++++++++++++++ libc/src/math/generic/CMakeLists.txt | 13 +- libc/src/math/generic/exp10m1f16.cpp | 158 +-------------- libc/test/shared/CMakeLists.txt | 1 + libc/test/shared/shared_math_test.cpp | 1 + .../llvm-project-overlay/libc/BUILD.bazel | 18 +- 9 files changed, 254 insertions(+), 169 deletions(-) create mode 100644 libc/shared/math/exp10m1f16.h create mode 100644 libc/src/__support/math/exp10m1f16.h diff --git a/libc/shared/math.h b/libc/shared/math.h index cccd6a375930e..4b2a0d8c712ad 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -46,6 +46,7 @@ #include "math/exp10f.h" #include "math/exp10f16.h" #include "math/exp10m1f.h" +#include "math/exp10m1f16.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp10m1f16.h b/libc/shared/math/exp10m1f16.h new file mode 100644 index 0000000000000..5f18f2986207e --- /dev/null +++ b/libc/shared/math/exp10m1f16.h @@ -0,0 +1,29 @@ +//===-- Shared exp10m1f16 function ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10M1F16_H +#define LLVM_LIBC_SHARED_MATH_EXP10M1F16_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include "shared/libc_common.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/math/exp10m1f16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10m1f16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_EXP10M1F16_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 84c1b15498672..98f9bb42f91f4 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -498,6 +498,23 @@ add_header_library( libc.src.__support.macros.optimization ) +add_header_library( + exp10m1f16 + HDRS + exp10m1f16.h + DEPENDS + .exp10f16_utils + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features +) + add_header_library( erff HDRS diff --git a/libc/src/__support/math/exp10m1f16.h b/libc/src/__support/math/exp10m1f16.h new file mode 100644 index 0000000000000..6367a857fa98a --- /dev/null +++ b/libc/src/__support/math/exp10m1f16.h @@ -0,0 +1,185 @@ +//===-- Implementation header for exp10m1f16 --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "exp10f16_utils.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/cpu_features.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE static constexpr float16 exp10m1f16(float16 x) {
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.5c4p-4, exp10m1f16(x) = 0x1.bacp-3 (RZ)
+ {0x2d71U, 0x32ebU, 1U, 0U, 0U},
+ // x = -0x1.5ep-13, exp10m1f16(x) = -0x1.92cp-12 (RZ)
+ {0x8978U, 0x8e4bU, 0U, 1U, 0U},
+ // x = -0x1.e2p-10, exp10m1f16(x) = -0x1.14cp-8 (RZ)
+ {0x9788U, 0x9c53U, 0U, 1U, 0U},
+ }};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+ constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
+#else
+ constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
+#endif
+
+ constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
+ EXP10M1F16_EXCEPTS_HI = {{
+ // (input, RZ output, RU offset, RD offset, RN offset)
+ // x = 0x1.8f4p-2, exp10m1f16(x) = 0x1.744p+0 (RZ)
+ {0x363dU, 0x3dd1U, 1U, 0U, 0U},
+ // x = 0x1.95cp-2, exp10m1f16(x) = 0x1.7d8p+0 (RZ)
+ {0x3657U, 0x3df6U, 1U, 0U, 0U},
+ // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
+ {0x3741U, 0x3f5cU, 1U, 0U, 1U},
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+ // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
+ {0x4030U, 0x57b1U, 1U, 0U, 1U},
+ // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)
+ {0x406eU, 0x5917U, 1U, 0U, 1U},
+ // x = 0x1.2f4p+2, exp10m1f16(x) = 0x1.ab8p+15 (RZ)
+ {0x44bdU, 0x7aaeU, 1U, 0U, 1U},
+#endif
+ }};
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ using FPBits = fputil::FPBits<float16>;
+ FPBits x_bits(x);
+
+ uint16_t x_u = x_bits.uintval();
+ uint16_t x_abs = x_u & 0x7fffU;
+
+ // When |x| <= 2^(-3), or |x| >= 11 * log10(2), or x is NaN.
+ if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x429fU)) {
+ // exp10m1(NaN) = NaN
+ if (x_bits.is_nan()) {
+ if (x_bits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+
+ return x;
+ }
+
+ // When x >= 16 * log10(2).
+ if (x_u >= 0x44d1U && x_bits.is_pos()) {
+ // exp10m1(+inf) = +inf
+ if (x_bits.is_inf())
+ return FPBits::inf().get_val();
+
+ switch (fputil::quick_get_round()) {
+ case FE_TONEAREST:
+ case FE_UPWARD:
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
+ return FPBits::inf().get_val();
+ default:
+ return FPBits::max_normal().get_val();
+ }
+ }
+
+ // When x < -11 * log10(2).
+ if (x_u > 0xc29fU) {
+ // exp10m1(-inf) = -1
+ if (x_bits.is_inf())
+ return FPBits::one(Sign::NEG).get_val();
+
+ // When x >= -0x1.ce4p+1, round(10^x - 1, HP, RN) = -0x1.ffcp-1. 
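+ // (0xc339U is the half-precision bit pattern of -0x1.ce4p+1; for negative
+ // x, a smaller unsigned bit pattern means x is closer to zero.)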
+ if (x_u <= 0xc339U) {
+ return fputil::round_result_slightly_down(
+ fputil::cast<float16>(-0x1.ffcp-1));
+ }
+
+ // When x < -0x1.ce4p+1, round(10^x - 1, HP, RN) = -1.
+ switch (fputil::quick_get_round()) {
+ case FE_TONEAREST:
+ case FE_DOWNWARD:
+ return FPBits::one(Sign::NEG).get_val();
+ default:
+ return fputil::cast<float16>(-0x1.ffcp-1);
+ }
+ }
+
+ // When |x| <= 2^(-3).
+ if (x_abs <= 0x3000U) {
+ if (LIBC_UNLIKELY(x_abs == 0))
+ return x;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u);
+ LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ float xf = x;
+ // Degree-5 minimax polynomial generated by Sollya with the following
+ // commands:
+ // > display = hexadecimal;
+ // > P = fpminimax((10^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]);
+ // > x * P;
+ return fputil::cast<float16>(
+ xf * fputil::polyeval(xf, 0x1.26bb1cp+1f, 0x1.5351c8p+1f,
+ 0x1.04704p+1f, 0x1.2ce084p+0f, 0x1.14a6bep-1f));
+ }
+ }
+
+ // When x is 1, 2, or 3. These are hard-to-round cases with exact results.
+ // 10^4 - 1 = 9'999 is not exactly representable as a float16, but luckily the
+ // polynomial approximation gives the correct result for x = 4 in all
+ // rounding modes.
+ if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
+ switch (x_u) {
+ case 0x3c00U: // x = 1.0f16
+ return fputil::cast<float16>(9.0);
+ case 0x4000U: // x = 2.0f16
+ return fputil::cast<float16>(99.0);
+ case 0x4200U: // x = 3.0f16
+ return fputil::cast<float16>(999.0);
+ }
+ }
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+ if (auto r = EXP10M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+ return r.value();
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+ // exp10(x) = exp2((hi + mid) * log2(10)) * exp10(lo)
+ auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
+ // exp10m1(x) = exp2((hi + mid) * log2(10)) * exp10(lo) - 1
+ return fputil::cast<float16>(
+ fputil::multiply_add(exp2_hi_mid, exp10_lo, -1.0f));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10M1F16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 8074a3925626c..99c1b08326d53 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1603,18 +1603,7 @@ add_entrypoint_object(
 HDRS
 ../exp10m1f16.h
 DEPENDS
- libc.hdr.errno_macros
- libc.hdr.fenv_macros
- libc.src.__support.FPUtil.cast
- libc.src.__support.FPUtil.except_value_utils
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.FPUtil.polyeval
- libc.src.__support.FPUtil.rounding_mode
- libc.src.__support.macros.optimization
- libc.src.__support.macros.properties.cpu_features
- libc.src.__support.math.exp10f16_utils
+ libc.src.__support.math.exp10m1f16
 )
 
 add_entrypoint_object(
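With the body now header-only, the same implementation is reachable through the shared umbrella header; a minimal usage sketch, not part of this patch, assuming a target where LIBC_TYPES_HAS_FLOAT16 is defined and libc/shared is on the include path (`demo` is an illustrative name):

  #include "shared/math.h"  // pulls in shared/math/exp10m1f16.h added above

  float16 demo(float16 x) {
    // Header-only evaluation of 10^x - 1, matching the libm entrypoint
    // exercised by shared_math_test.cpp below.
    return LIBC_NAMESPACE::shared::exp10m1f16(x);
  }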
"src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" -#include "src/__support/macros/properties/cpu_features.h" -#include "src/__support/math/exp10f16_utils.h" +#include "src/__support/math/exp10m1f16.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr fputil::ExceptValues EXP10M1F16_EXCEPTS_LO = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.5c4p-4, exp10m1f16(x) = 0x1.bacp-3 (RZ) - {0x2d71U, 0x32ebU, 1U, 0U, 0U}, - // x = -0x1.5ep-13, exp10m1f16(x) = -0x1.92cp-12 (RZ) - {0x8978U, 0x8e4bU, 0U, 1U, 0U}, - // x = -0x1.e2p-10, exp10m1f16(x) = -0x1.14cp-8 (RZ) - {0x9788U, 0x9c53U, 0U, 1U, 0U}, -}}; - -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT -static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3; -#else -static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6; -#endif - -static constexpr fputil::ExceptValues - EXP10M1F16_EXCEPTS_HI = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.8f4p-2, exp10m1f16(x) = 0x1.744p+0 (RZ) - {0x363dU, 0x3dd1U, 1U, 0U, 0U}, - // x = 0x1.95cp-2, exp10m1f16(x) = 0x1.7d8p+0 (RZ) - {0x3657U, 0x3df6U, 1U, 0U, 0U}, - // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ) - {0x3741U, 0x3f5cU, 1U, 0U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ) - {0x4030U, 0x57b1U, 1U, 0U, 1U}, - // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ) - {0x406eU, 0x5917U, 1U, 0U, 1U}, - // x = 0x1.2f4p+2, exp10m1f16(x) = 0x1.ab8p+15 (RZ) - {0x44bdU, 0x7aaeU, 1U, 0U, 1U}, -#endif - }}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - LLVM_LIBC_FUNCTION(float16, exp10m1f16, (float16 x)) { - using FPBits = fputil::FPBits; - FPBits x_bits(x); - - uint16_t x_u = x_bits.uintval(); - uint16_t x_abs = x_u & 0x7fffU; - - // When |x| <= 2^(-3), or |x| >= 11 * log10(2), or x is NaN. - if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x429fU)) { - // exp10m1(NaN) = NaN - if (x_bits.is_nan()) { - if (x_bits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // When x >= 16 * log10(2). - if (x_u >= 0x44d1U && x_bits.is_pos()) { - // exp10m1(+inf) = +inf - if (x_bits.is_inf()) - return FPBits::inf().get_val(); - - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); - return FPBits::inf().get_val(); - default: - return FPBits::max_normal().get_val(); - } - } - - // When x < -11 * log10(2). - if (x_u > 0xc29fU) { - // exp10m1(-inf) = -1 - if (x_bits.is_inf()) - return FPBits::one(Sign::NEG).get_val(); - - // When x >= -0x1.ce4p+1, round(10^x - 1, HP, RN) = -0x1.ffcp-1. - if (x_u <= 0xc339U) { - return fputil::round_result_slightly_down( - fputil::cast(-0x1.ffcp-1)); - } - - // When x < -0x1.ce4p+1, round(10^x - 1, HP, RN) = -1. - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_DOWNWARD: - return FPBits::one(Sign::NEG).get_val(); - default: - return fputil::cast(-0x1.ffcp-1); - } - } - - // When |x| <= 2^(-3). 
- if (x_abs <= 0x3000U) {
- if (LIBC_UNLIKELY(x_abs == 0))
- return x;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u);
- LIBC_UNLIKELY(r.has_value()))
- return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
- float xf = x;
- // Degree-5 minimax polynomial generated by Sollya with the following
- // commands:
- // > display = hexadecimal;
- // > P = fpminimax((10^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]);
- // > x * P;
- return fputil::cast<float16>(
- xf * fputil::polyeval(xf, 0x1.26bb1cp+1f, 0x1.5351c8p+1f,
- 0x1.04704p+1f, 0x1.2ce084p+0f, 0x1.14a6bep-1f));
- }
- }
-
- // When x is 1, 2, or 3. These are hard-to-round cases with exact results.
- // 10^4 - 1 = 9'999 is not exactly representable as a float16, but luckily the
- // polynomial approximation gives the correct result for x = 4 in all
- // rounding modes.
- if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) {
- switch (x_u) {
- case 0x3c00U: // x = 1.0f16
- return fputil::cast<float16>(9.0);
- case 0x4000U: // x = 2.0f16
- return fputil::cast<float16>(99.0);
- case 0x4200U: // x = 3.0f16
- return fputil::cast<float16>(999.0);
- }
- }
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
- if (auto r = EXP10M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
- return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
- // exp10(x) = exp2((hi + mid) * log2(10)) * exp10(lo)
- auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x);
- // exp10m1(x) = exp2((hi + mid) * log2(10)) * exp10(lo) - 1
- return fputil::cast<float16>(
- fputil::multiply_add(exp2_hi_mid, exp10_lo, -1.0f));
+ return math::exp10m1f16(x);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index 13a0aae5d4c67..ea4634cbe7f9f 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -37,6 +37,7 @@ add_fp_unittest(
 libc.src.__support.math.cospif16
 libc.src.__support.math.dsqrtl
 libc.src.__support.math.exp10m1f
+ libc.src.__support.math.exp10m1f16
 libc.src.__support.math.erff
 libc.src.__support.math.exp
 libc.src.__support.math.exp10
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 25bf5ad8ae411..17221932927b0 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -27,6 +27,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
 EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::coshf16(0.0f16));
 EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::cospif16(0.0f16));
 EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp10f16(0.0f16));
+ EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp10m1f16(0.0f16));
 
 EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::expf16(0.0f16));
 
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 8d9e80393bf20..e57d9dea036dd 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2673,6 +2673,22 @@ libc_support_library(
 ],
 )
 
+libc_support_library(
+ name = "__support_math_exp10m1f16",
+ hdrs = ["src/__support/math/exp10m1f16.h"],
+ deps = [
+ ":__support_fputil_except_value_utils",
+ ":__support_fputil_fenv_impl",
+ ":__support_fputil_fp_bits",
+ ":__support_fputil_multiply_add",
+ ":__support_fputil_polyeval",
+ ":__support_fputil_rounding_mode",
+ ":__support_macros_optimization",
+ ":__support_math_exp10f16_utils",
+ ":errno",
+ ],
+)
+
 libc_support_library(
 name = "__support_math_erff",
 hdrs = 
["src/__support/math/erff.h"], @@ -3622,7 +3638,7 @@ libc_math_function( libc_math_function( name = "exp10m1f16", additional_deps = [ - ":__support_math_exp10f16_utils", + ":__support_math_exp10m1f16", ], ) From 57b1b254297ed2d2fd9560e6a5eef0c44918223a Mon Sep 17 00:00:00 2001 From: Henrich Lauko Date: Wed, 1 Oct 2025 10:44:05 +0200 Subject: [PATCH 346/878] [CIR] Refactor cir.cast to use uniform assembly form w/o parens, commas (#161431) This mirrors incubator changes from https://github.com/llvm/clangir/pull/1922 --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 28 +++---- clang/lib/CIR/CodeGen/CIRGenRecordLayout.h | 2 +- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 2 +- .../CIR/CodeGen/aapcs-volatile-bitfields.c | 10 +-- clang/test/CIR/CodeGen/array-ctor.cpp | 6 +- clang/test/CIR/CodeGen/array-dtor.cpp | 8 +- clang/test/CIR/CodeGen/array.cpp | 32 ++++---- clang/test/CIR/CodeGen/assign-operator.cpp | 6 +- clang/test/CIR/CodeGen/basic.c | 2 +- clang/test/CIR/CodeGen/basic.cpp | 2 +- clang/test/CIR/CodeGen/binassign.c | 2 +- clang/test/CIR/CodeGen/binop.c | 6 +- clang/test/CIR/CodeGen/binop.cpp | 12 +-- clang/test/CIR/CodeGen/builtin_bit.cpp | 38 ++++----- clang/test/CIR/CodeGen/builtin_call.cpp | 8 +- clang/test/CIR/CodeGen/builtin_printf.cpp | 4 +- clang/test/CIR/CodeGen/cast.cpp | 30 +++---- clang/test/CIR/CodeGen/cmp.cpp | 4 +- clang/test/CIR/CodeGen/comma.c | 2 +- clang/test/CIR/CodeGen/complex-cast.cpp | 80 +++++++++---------- .../CodeGen/complex-compound-assignment.cpp | 28 +++---- clang/test/CIR/CodeGen/complex-mul-div.cpp | 24 +++--- clang/test/CIR/CodeGen/complex-unary.cpp | 24 +++--- clang/test/CIR/CodeGen/complex.cpp | 38 ++++----- clang/test/CIR/CodeGen/cxx-default-init.cpp | 6 +- clang/test/CIR/CodeGen/delegating-ctor.cpp | 12 +-- clang/test/CIR/CodeGen/delete.cpp | 4 +- clang/test/CIR/CodeGen/destructors.cpp | 4 +- .../CIR/CodeGen/finegrain-bitfield-access.cpp | 16 ++-- clang/test/CIR/CodeGen/if.cpp | 6 +- clang/test/CIR/CodeGen/int-to-bool.cpp | 8 +- clang/test/CIR/CodeGen/loop.cpp | 12 +-- clang/test/CIR/CodeGen/new.cpp | 16 ++-- clang/test/CIR/CodeGen/no-prototype.c | 6 +- clang/test/CIR/CodeGen/opaque.c | 4 +- clang/test/CIR/CodeGen/opaque.cpp | 6 +- clang/test/CIR/CodeGen/pointers.cpp | 2 +- clang/test/CIR/CodeGen/ternary.cpp | 2 +- clang/test/CIR/CodeGen/unary.cpp | 32 ++++---- clang/test/CIR/CodeGen/union.c | 10 +-- clang/test/CIR/CodeGen/var_arg.c | 12 +-- .../CIR/CodeGen/variable-decomposition.cpp | 2 +- clang/test/CIR/CodeGen/vbase.cpp | 10 +-- clang/test/CIR/CodeGen/vector-ext.cpp | 8 +- clang/test/CIR/CodeGen/vector.cpp | 8 +- clang/test/CIR/CodeGen/vtt.cpp | 38 ++++----- clang/test/CIR/CodeGenOpenACC/combined-copy.c | 2 +- .../combined-firstprivate-clause.cpp | 74 ++++++++--------- .../combined-private-clause.cpp | 2 +- .../combined-reduction-clause-default-ops.cpp | 18 ++--- .../combined-reduction-clause-float.cpp | 18 ++--- .../combined-reduction-clause-inline-ops.cpp | 36 ++++----- .../combined-reduction-clause-int.cpp | 18 ++--- .../combined-reduction-clause-outline-ops.cpp | 36 ++++----- clang/test/CIR/CodeGenOpenACC/combined.cpp | 6 +- .../compute-firstprivate-clause.c | 36 ++++----- .../compute-firstprivate-clause.cpp | 74 ++++++++--------- .../CodeGenOpenACC/compute-private-clause.cpp | 2 +- .../compute-reduction-clause-default-ops.c | 18 ++--- .../compute-reduction-clause-default-ops.cpp | 18 ++--- .../compute-reduction-clause-float.c | 18 ++--- .../compute-reduction-clause-float.cpp | 18 ++--- .../compute-reduction-clause-inline-ops.cpp | 36 
++++----- .../compute-reduction-clause-int.c | 18 ++--- .../compute-reduction-clause-int.cpp | 18 ++--- .../compute-reduction-clause-outline-ops.cpp | 36 ++++----- .../compute-reduction-clause-unsigned-int.c | 18 ++--- clang/test/CIR/CodeGenOpenACC/data.c | 4 +- clang/test/CIR/CodeGenOpenACC/host_data.c | 4 +- clang/test/CIR/CodeGenOpenACC/init.c | 6 +- clang/test/CIR/CodeGenOpenACC/kernels.c | 10 +-- .../CodeGenOpenACC/loop-private-clause.cpp | 2 +- .../loop-reduction-clause-default-ops.cpp | 18 ++--- .../loop-reduction-clause-float.cpp | 18 ++--- .../loop-reduction-clause-inline-ops.cpp | 36 ++++----- .../loop-reduction-clause-int.cpp | 18 ++--- .../loop-reduction-clause-outline-ops.cpp | 36 ++++----- clang/test/CIR/CodeGenOpenACC/parallel.c | 10 +-- .../private-clause-array-recipes-CtorDtor.cpp | 38 ++++----- .../private-clause-array-recipes-NoOps.cpp | 10 +-- ...-clause-pointer-array-recipes-CtorDtor.cpp | 12 +-- clang/test/CIR/CodeGenOpenACC/serial.c | 10 +-- clang/test/CIR/CodeGenOpenACC/set.c | 4 +- clang/test/CIR/CodeGenOpenACC/shutdown.c | 6 +- clang/test/CIR/CodeGenOpenACC/wait.c | 6 +- clang/test/CIR/IR/alloca.cir | 4 +- clang/test/CIR/IR/binassign.cir | 4 +- clang/test/CIR/IR/cast.cir | 8 +- clang/test/CIR/IR/cmp.cir | 48 +++++------ clang/test/CIR/IR/vtable-addrpt.cir | 2 +- clang/test/CIR/IR/vtt-addrpoint.cir | 2 +- clang/test/CIR/Lowering/cast.cir | 34 ++++---- clang/test/CIR/Lowering/if.cir | 8 +- clang/test/CIR/Lowering/vtt-addrpoint.cir | 2 +- clang/test/CIR/Transforms/canonicalize.cir | 28 +++---- clang/test/CIR/Transforms/if.cir | 8 +- clang/test/CIR/Transforms/switch.cir | 8 +- 97 files changed, 773 insertions(+), 773 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index e1be08c1bbbbd..f857cf82a5192 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -209,9 +209,10 @@ def CIR_CastOp : CIR_Op<"cast", [ Example: ```mlir - %4 = cir.cast(int_to_bool, %3 : i32), !cir.bool + %4 = cir.cast int_to_bool %3 : i32 -> !cir.bool ... - %x = cir.cast(array_to_ptrdecay, %0 : !cir.ptr>), !cir.ptr + %x = cir.cast array_to_ptrdecay %0 + : !cir.ptr> -> !cir.ptr ``` }]; @@ -219,8 +220,7 @@ def CIR_CastOp : CIR_Op<"cast", [ let results = (outs CIR_AnyType:$result); let assemblyFormat = [{ - `(` $kind `,` $src `:` type($src) `)` - `,` type($result) attr-dict + $kind $src `:` type($src) `->` type($result) attr-dict }]; // The input and output types should match the cast kind. 
@@ -1176,7 +1176,7 @@ def CIR_GotoOp : CIR_Op<"goto", [Terminator]> {
 ```mlir
 cir.scope { // REGION #1
 %2 = cir.load %0 : !cir.ptr<!s32i>, !s32i
- %3 = cir.cast(int_to_bool, %2 : !s32i), !cir.bool
+ %3 = cir.cast int_to_bool %2 : !s32i -> !cir.bool
 cir.if %3 {
 cir.goto "label"
 }
@@ -3994,9 +3994,9 @@ def CIR_VAStartOp : CIR_Op<"va_start"> {
 
 ```mlir
 // %args : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>
- %p = cir.cast(array_to_ptrdecay, %args
- : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>),
- !cir.ptr<!rec___va_list_tag>
+ %p = cir.cast array_to_ptrdecay %args
+ : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>
+ -> !cir.ptr<!rec___va_list_tag>
 %count = cir.load %0 : !cir.ptr<!s32i>, !s32i
 cir.va_start %p %count : !cir.ptr<!rec___va_list_tag>, !s32i
 ```
@@ -4033,9 +4033,9 @@ def CIR_VAEndOp : CIR_Op<"va_end"> {
 Example:
 ```mlir
 // %args : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>
- %p = cir.cast(array_to_ptrdecay, %args
- : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>),
- !cir.ptr<!rec___va_list_tag>
+ %p = cir.cast array_to_ptrdecay %args
+ : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>
+ -> !cir.ptr<!rec___va_list_tag>
 cir.va_end %p : !cir.ptr<!rec___va_list_tag>
 ```
 }];
@@ -4068,9 +4068,9 @@ def CIR_VAArgOp : CIR_Op<"va_arg"> {
 Example:
 ```mlir
 // %args : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>
- %p = cir.cast(array_to_ptrdecay, %args
- : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>),
- !cir.ptr<!rec___va_list_tag>
+ %p = cir.cast array_to_ptrdecay %args
+ : !cir.ptr<!cir.array<!rec___va_list_tag x 1>>
+ -> !cir.ptr<!rec___va_list_tag>
 cir.va.start %p : !cir.ptr<!rec___va_list_tag>
 
 // Fetch an `int` from the vararg list.
diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
index 914ef16c2a5ee..bf0ddc5875059 100644
--- a/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
+++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayout.h
@@ -57,7 +57,7 @@ namespace clang::CIRGen {
 /// cir.func @store_field() {
 /// %0 = cir.alloca !rec_S, !cir.ptr<!rec_S>, ["s"] {alignment = 4 : i64}
 /// %1 = cir.const #cir.int<2> : !s32i
-/// %2 = cir.cast(integral, %1 : !s32i), !u32i
+/// %2 = cir.cast integral %1 : !s32i -> !u32i
 /// %3 = cir.get_member %0[3] {name = "more_bits"} : !cir.ptr<!rec_S> ->
 /// !cir.ptr<!u16i>
 /// %4 = cir.set_bitfield(#bfi_more_bits, %3 :
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 0f309e42bcd4c..22f069d9cead0 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2425,7 +2425,7 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter,
 // For instance, this CIR code:
 //
 // cir.func @foo(%arg0: !s32i) -> !s32i {
-// %4 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool
+// %4 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool
 // cir.if %4 {
 // %5 = cir.const #cir.int<1> : !s32i
 // cir.return %5 : !s32i
diff --git a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c
index 00378f725d76a..92eae6aab6800 100644
--- a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c
+++ b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c
@@ -86,7 +86,7 @@ int check_load(st1 *s1) {
 // CIR: [[LOAD:%.*]] = cir.load align(8) {{.*}} : !cir.ptr>, !cir.ptr
 // CIR: [[MEMBER:%.*]] = cir.get_member [[LOAD]][0] {name = "b"} : !cir.ptr -> !cir.ptr
 // CIR: [[BITFI:%.*]] = cir.get_bitfield align(4) (#bfi_b, [[MEMBER]] {is_volatile} : !cir.ptr) -> !u32i
-// CIR: [[CAST:%.*]] = cir.cast(integral, [[BITFI]] : !u32i), !s32i
+// CIR: [[CAST:%.*]] = cir.cast integral [[BITFI]] : !u32i -> !s32i
 // CIR: cir.store [[CAST]], [[RETVAL:%.*]] : !s32i, !cir.ptr
 // CIR: [[RET:%.*]] = cir.load [[RETVAL]] : !cir.ptr, !s32i
 // CIR: cir.return [[RET]] : !s32i
@@ -118,7 +118,7 @@ int check_load_exception(st3 *s3) {
 // CIR: [[LOAD:%.*]] = cir.load align(8) {{.*}} : !cir.ptr>, !cir.ptr
 // CIR: [[MEMBER:%.*]] = cir.get_member [[LOAD]][2] {name = "b"} : !cir.ptr -> !cir.ptr
 // CIR: [[BITFI:%.*]] = cir.get_bitfield 
align(4) (#bfi_b1, [[MEMBER]] {is_volatile} : !cir.ptr) -> !u32i -// CIR: [[CAST:%.*]] = cir.cast(integral, [[BITFI]] : !u32i), !s32i +// CIR: [[CAST:%.*]] = cir.cast integral [[BITFI]] : !u32i -> !s32i // CIR: cir.store [[CAST]], [[RETVAL:%.*]] : !s32i, !cir.ptr // CIR: [[RET:%.*]] = cir.load [[RETVAL]] : !cir.ptr, !s32i // CIR: cir.return [[RET]] : !s32i @@ -180,7 +180,7 @@ void check_store(st2 *s2) { // CIR: cir.func dso_local @check_store // CIR: [[CONST:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: [[CAST:%.*]] = cir.cast(integral, [[CONST]] : !s32i), !s16i +// CIR: [[CAST:%.*]] = cir.cast integral [[CONST]] : !s32i -> !s16i // CIR: [[LOAD:%.*]] = cir.load align(8) {{.*}} : !cir.ptr>, !cir.ptr // CIR: [[MEMBER:%.*]] = cir.get_member [[LOAD]][0] {name = "a"} : !cir.ptr -> !cir.ptr // CIR: [[SETBF:%.*]] = cir.set_bitfield align(8) (#bfi_a, [[MEMBER]] : !cir.ptr, [[CAST]] : !s16i) {is_volatile} -> !s16i @@ -211,7 +211,7 @@ void check_store_exception(st3 *s3) { // CIR: cir.func dso_local @check_store_exception // CIR: [[CONST:%.*]] = cir.const #cir.int<2> : !s32i -// CIR: [[CAST:%.*]] = cir.cast(integral, [[CONST]] : !s32i), !u32i +// CIR: [[CAST:%.*]] = cir.cast integral [[CONST]] : !s32i -> !u32i // CIR: [[LOAD:%.*]] = cir.load align(8) {{.*}} : !cir.ptr>, !cir.ptr // CIR: [[MEMBER:%.*]] = cir.get_member [[LOAD]][2] {name = "b"} : !cir.ptr -> !cir.ptr // CIR: [[SETBF:%.*]] = cir.set_bitfield align(4) (#bfi_b1, [[MEMBER]] : !cir.ptr, [[CAST]] : !u32i) {is_volatile} -> !u32i @@ -263,7 +263,7 @@ void check_store_second_member (st4 *s4) { // CIR: cir.func dso_local @check_store_second_member // CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i -// CIR: [[CAST:%.*]] = cir.cast(integral, [[ONE]] : !s32i), !u64i +// CIR: [[CAST:%.*]] = cir.cast integral [[ONE]] : !s32i -> !u64i // CIR: [[LOAD:%.*]] = cir.load align(8) {{.*}} : !cir.ptr>, !cir.ptr // CIR: [[MEMBER:%.*]] = cir.get_member [[LOAD]][2] {name = "b"} : !cir.ptr -> !cir.ptr // CIR: cir.set_bitfield align(8) (#bfi_b2, [[MEMBER]] : !cir.ptr, [[CAST]] : !u64i) {is_volatile} -> !u64i diff --git a/clang/test/CIR/CodeGen/array-ctor.cpp b/clang/test/CIR/CodeGen/array-ctor.cpp index bad4868ed8c34..5583d9d56954e 100644 --- a/clang/test/CIR/CodeGen/array-ctor.cpp +++ b/clang/test/CIR/CodeGen/array-ctor.cpp @@ -27,7 +27,7 @@ void foo() { // CIR: cir.func dso_local @_Z3foov() // CIR: %[[ARRAY:.*]] = cir.alloca !cir.array, !cir.ptr>, ["s", init] // CIR: %[[CONST42:.*]] = cir.const #cir.int<42> : !u64i -// CIR: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARRAY]] : !cir.ptr>), !cir.ptr +// CIR: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARRAY]] : !cir.ptr> -> !cir.ptr // CIR: %[[END_PTR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[CONST42]] : !u64i), !cir.ptr // CIR: %[[ITER:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] // CIR: cir.store %[[DECAY]], %[[ITER]] : !cir.ptr, !cir.ptr> @@ -111,7 +111,7 @@ void multi_dimensional() { // CIR-BEFORE-LPP: cir.func{{.*}} @_Z17multi_dimensionalv() // CIR-BEFORE-LPP: %[[S:.*]] = cir.alloca !cir.array x 3>, !cir.ptr x 3>>, ["s", init] -// CIR-BEFORE-LPP: %[[FLAT:.*]] = cir.cast(bitcast, %[[S]] : !cir.ptr x 3>>), !cir.ptr> +// CIR-BEFORE-LPP: %[[FLAT:.*]] = cir.cast bitcast %[[S]] : !cir.ptr x 3>> -> !cir.ptr> // CIR-BEFORE-LPP: cir.array.ctor %[[FLAT]] : !cir.ptr> { // CIR-BEFORE-LPP: ^bb0(%[[ARG:.*]]: !cir.ptr): // CIR-BEFORE-LPP: cir.call @_ZN1SC1Ev(%[[ARG]]) : (!cir.ptr) -> () @@ -122,7 +122,7 @@ void multi_dimensional() { // CIR: cir.func{{.*}} @_Z17multi_dimensionalv() // CIR: %[[S:.*]] = 
cir.alloca !cir.array x 3>, !cir.ptr x 3>>, ["s", init] // CIR: %[[CONST15:.*]] = cir.const #cir.int<15> : !u64i -// CIR: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, {{.*}} : !cir.ptr>), !cir.ptr +// CIR: %[[DECAY:.*]] = cir.cast array_to_ptrdecay {{.*}} : !cir.ptr> -> !cir.ptr // CIR: %[[END_PTR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[CONST15]] : !u64i), !cir.ptr // CIR: %[[ITER:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] // CIR: cir.store %[[DECAY]], %[[ITER]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGen/array-dtor.cpp b/clang/test/CIR/CodeGen/array-dtor.cpp index 36db265a6dfed..e969d50842a03 100644 --- a/clang/test/CIR/CodeGen/array-dtor.cpp +++ b/clang/test/CIR/CodeGen/array-dtor.cpp @@ -26,7 +26,7 @@ void test_cleanup_array() { // CIR: cir.func{{.*}} @_Z18test_cleanup_arrayv() // CIR: %[[S:.*]] = cir.alloca !cir.array, !cir.ptr>, ["s"] // CIR: %[[CONST41:.*]] = cir.const #cir.int<41> : !u64i -// CIR: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[S]] : !cir.ptr>), !cir.ptr +// CIR: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[S]] : !cir.ptr> -> !cir.ptr // CIR: %[[END_PTR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[CONST41]] : !u64i), !cir.ptr // CIR: %[[ITER:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] // CIR: cir.store %[[END_PTR]], %[[ITER]] : !cir.ptr, !cir.ptr> @@ -109,7 +109,7 @@ void multi_dimensional() { // CIR-BEFORE-LPP: cir.func{{.*}} @_Z17multi_dimensionalv() // CIR-BEFORE-LPP: %[[S:.*]] = cir.alloca !cir.array x 3>, !cir.ptr x 3>>, ["s"] -// CIR-BEFORE-LPP: %[[FLAT:.*]] = cir.cast(bitcast, %[[S]] : !cir.ptr x 3>>), !cir.ptr> +// CIR-BEFORE-LPP: %[[FLAT:.*]] = cir.cast bitcast %[[S]] : !cir.ptr x 3>> -> !cir.ptr> // CIR-BEFORE-LPP: cir.array.dtor %[[FLAT]] : !cir.ptr> { // CIR-BEFORE-LPP: ^bb0(%[[ARG:.*]]: !cir.ptr): // CIR-BEFORE-LPP: cir.call @_ZN1SD1Ev(%[[ARG]]) nothrow : (!cir.ptr) -> () @@ -119,9 +119,9 @@ void multi_dimensional() { // CIR: cir.func{{.*}} @_Z17multi_dimensionalv() // CIR: %[[S:.*]] = cir.alloca !cir.array x 3>, !cir.ptr x 3>>, ["s"] -// CIR: %[[FLAT:.*]] = cir.cast(bitcast, %[[S]] : !cir.ptr x 3>>), !cir.ptr> +// CIR: %[[FLAT:.*]] = cir.cast bitcast %[[S]] : !cir.ptr x 3>> -> !cir.ptr> // CIR: %[[CONST14:.*]] = cir.const #cir.int<14> : !u64i -// CIR: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[FLAT]] : !cir.ptr>), !cir.ptr +// CIR: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[FLAT]] : !cir.ptr> -> !cir.ptr // CIR: %[[END_PTR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[CONST14]] : !u64i), !cir.ptr // CIR: %[[ITER:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] // CIR: cir.store %[[END_PTR]], %[[ITER]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp index 5dac10491d036..3333634a256dc 100644 --- a/clang/test/CIR/CodeGen/array.cpp +++ b/clang/test/CIR/CodeGen/array.cpp @@ -113,12 +113,12 @@ void func() { // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] // CIR: %[[INIT_2:.*]] = cir.alloca !s32i, !cir.ptr, ["e2", init] // CIR: %[[IDX:.*]] = cir.const #cir.int<0> : !s32i -// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr // CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ELE_PTR]] : !cir.ptr, !s32i // CIR" cir.store %[[TMP]], %[[INIT]] : !s32i, !cir.ptr // CIR: %[[IDX:.*]] = cir.const #cir.int<1> : !s32i -// CIR: 
%[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr // CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ELE_PTR]] : !cir.ptr, !s32i // CIR" cir.store %[[TMP]], %[[INIT_2]] : !s32i, !cir.ptr @@ -152,7 +152,7 @@ void func2() { // CIR: %[[ARR2:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr", init] // CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp", init] -// CIR: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %[[ARR2]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR2]] : !cir.ptr> -> !cir.ptr // CIR: %[[FIVE:.*]] = cir.const #cir.int<5> : !s32i // CIR: cir.store{{.*}} %[[FIVE]], %[[ARR_0]] : !s32i, !cir.ptr // CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i @@ -209,7 +209,7 @@ void func3() { // CIR: %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr", init] // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr, ["idx", init] // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] -// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr // CIR: %[[V0:.*]] = cir.const #cir.int<5> : !s32i // CIR: cir.store{{.*}} %[[V0]], %[[ARR_PTR]] : !s32i, !cir.ptr // CIR: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i @@ -219,7 +219,7 @@ void func3() { // CIR: %[[IDX_V:.*]] = cir.const #cir.int<1> : !s32i // CIR: cir.store{{.*}} %[[IDX_V]], %[[IDX]] : !s32i, !cir.ptr // CIR: %[[TMP_IDX:.*]] = cir.load{{.*}} %[[IDX]] : !cir.ptr, !s32i -// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr // CIR: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr, %[[TMP_IDX]] : !s32i), !cir.ptr // CIR: %[[ELE_TMP:.*]] = cir.load{{.*}} %[[ELE_PTR]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[ELE_TMP]], %[[INIT]] : !s32i, !cir.ptr @@ -258,20 +258,20 @@ void func4() { // CIR: %[[ARR:.*]] = cir.alloca !cir.array x 2>, !cir.ptr x 2>>, ["arr", init] // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, ["e", init] -// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr x 2>>), !cir.ptr> -// CIR: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_PTR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr x 2>> -> !cir.ptr> +// CIR: %[[ARR_0_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_PTR]] : !cir.ptr> -> !cir.ptr // CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i // CIR: cir.store{{.*}} %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr // CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i // CIR: %[[ARR_1:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr>, %[[OFFSET]] : !s64i), !cir.ptr> -// CIR: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_1_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_1]] : !cir.ptr> -> !cir.ptr // CIR: %[[V_1_0:.*]] = cir.const #cir.int<6> : !s32i // CIR: cir.store{{.*}} %[[V_1_0]], %[[ARR_1_PTR]] : !s32i, !cir.ptr // CIR: %[[IDX:.*]] = cir.const #cir.int<0> : !s32i // CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s32i -// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr x 2>>), !cir.ptr> +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr x 2>> -> !cir.ptr> // 
CIR: %[[ARR_1:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr>, %[[IDX_1]] : !s32i), !cir.ptr> -// CIR: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_1_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_1]] : !cir.ptr> -> !cir.ptr // CIR: %[[ELE_0:.*]] = cir.ptr_stride(%[[ARR_1_PTR]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[ELE_0]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP]], %[[INIT]] : !s32i, !cir.ptr @@ -306,8 +306,8 @@ void func5() { // CIR: %[[ARR:.*]] = cir.alloca !cir.array x 2>, !cir.ptr x 2>>, ["arr", init] // CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["arrayinit.temp", init] -// CIR: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %0 : !cir.ptr x 2>>), !cir.ptr> -// CIR: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_0]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %0 : !cir.ptr x 2>> -> !cir.ptr> +// CIR: %[[ARR_0_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_0]] : !cir.ptr> -> !cir.ptr // CIR: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i // CIR: cir.store{{.*}} %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr // CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i @@ -364,7 +364,7 @@ void func6() { // CIR: %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr", init] // CIR: %[[V:.*]] = cir.const #cir.int<4> : !s32i // CIR: cir.store{{.*}} %[[V]], %[[VAR]] : !s32i, !cir.ptr -// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[VAR]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP]], %[[ARR_PTR]] : !s32i, !cir.ptr // CIR: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i @@ -396,7 +396,7 @@ void func7() { // CIR: %[[ARR:.*]] = cir.alloca !cir.array x 1>, !cir.ptr x 1>>, ["arr", init] // CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, ["arrayinit.temp", init] -// CIR: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr x 1>>), !cir.ptr> +// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr x 1>> -> !cir.ptr> // CIR: cir.store{{.*}} %[[ARR_0]], %[[ARR_PTR]] : !cir.ptr>, !cir.ptr>> // CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i // CIR: %[[ARR_END:.*]] = cir.ptr_stride(%[[ARR_0]] : !cir.ptr>, %[[ONE]] : !s64i), !cir.ptr> @@ -497,7 +497,7 @@ void func9(int arr[10][5]) { // CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s32i // CIR: %[[TMP_1:.*]] = cir.load{{.*}} %[[ARR]] : !cir.ptr>>, !cir.ptr> // CIR: %[[ARR_1:.*]] = cir.ptr_stride(%[[TMP_1]] : !cir.ptr>, %[[IDX_1]] : !s32i), !cir.ptr> -// CIR: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_1_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR_1]] : !cir.ptr> -> !cir.ptr // CIR: %[[ARR_1_2:.*]] = cir.ptr_stride(%[[ARR_1_PTR]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr // CIR: %[[TMP_2:.*]] = cir.load{{.*}} %[[ARR_1_2]] : !cir.ptr, !s32i // CIR: cir.store{{.*}} %[[TMP_2]], %[[INIT]] : !s32i, !cir.ptr @@ -581,7 +581,7 @@ void array_with_complex_elements() { } // CIR: %[[ARR_ADDR:.*]] = cir.alloca !cir.array x 2>, !cir.ptr x 2>>, ["arr", init] -// CIR: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ADDR]] : !cir.ptr x 2>>), !cir.ptr> +// CIR: %[[ARR_0:.*]] = cir.cast array_to_ptrdecay %[[ARR_ADDR]] : !cir.ptr x 2>> -> !cir.ptr> // CIR: %[[CONST_COMPLEX_0:.*]] = cir.const #cir.const_complex<#cir.fp<1.100000e+00> : !cir.float, #cir.fp<2.200000e+00> : !cir.float> : 
!cir.complex // CIR: cir.store{{.*}} %[[CONST_COMPLEX_0]], %[[ARR_0]] : !cir.complex, !cir.ptr> // CIR: %[[IDX_1:.*]] = cir.const #cir.int<1> : !s64i diff --git a/clang/test/CIR/CodeGen/assign-operator.cpp b/clang/test/CIR/CodeGen/assign-operator.cpp index 3e509f59368b6..1089d4b6e69f8 100644 --- a/clang/test/CIR/CodeGen/assign-operator.cpp +++ b/clang/test/CIR/CodeGen/assign-operator.cpp @@ -17,7 +17,7 @@ void a() { // CIR: cir.func{{.*}} @_Z1av() // CIR: %[[A_ADDR:.*]] = cir.alloca !rec_x, !cir.ptr, ["a"] // CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i -// CIR: %[[ONE_CAST:.*]] = cir.cast(integral, %[[ONE]] : !u32i), !s32i +// CIR: %[[ONE_CAST:.*]] = cir.cast integral %[[ONE]] : !u32i -> !s32i // CIR: %[[RET:.*]] = cir.call @_ZN1xaSEi(%[[A_ADDR]], %[[ONE_CAST]]) : (!cir.ptr, !s32i) -> !s32i // LLVM: define{{.*}} @_Z1av() @@ -75,10 +75,10 @@ void copy_c(C &c1, C &c2) { // CIR: %[[A_MEMBER_2:.*]] = cir.get_member %[[ARG1_LOAD]][0] {name = "a"} // CIR: %[[C_A:.*]] = cir.call @_ZN1AaSERKS_(%[[A_MEMBER]], %[[A_MEMBER_2]]) // CIR: %[[B_MEMBER:.*]] = cir.get_member %[[THIS]][1] {name = "b"} -// CIR: %[[B_VOID_PTR:.*]] = cir.cast(bitcast, %[[B_MEMBER]] : !cir.ptr>), !cir.ptr +// CIR: %[[B_VOID_PTR:.*]] = cir.cast bitcast %[[B_MEMBER]] : !cir.ptr> -> !cir.ptr // CIR: %[[RET_LOAD:.*]] = cir.load %[[ARG1_ADDR]] // CIR: %[[B_MEMBER_2:.*]] = cir.get_member %[[RET_LOAD]][1] {name = "b"} -// CIR: %[[B_VOID_PTR_2:.*]] = cir.cast(bitcast, %[[B_MEMBER_2]] : !cir.ptr>), !cir.ptr +// CIR: %[[B_VOID_PTR_2:.*]] = cir.cast bitcast %[[B_MEMBER_2]] : !cir.ptr> -> !cir.ptr // CIR: %[[SIZE:.*]] = cir.const #cir.int<64> : !u64i // CIR: %[[COUNT:.*]] = cir.call @memcpy(%[[B_VOID_PTR]], %[[B_VOID_PTR_2]], %[[SIZE]]) // CIR: cir.store %[[THIS]], %[[RET_ADDR]] diff --git a/clang/test/CIR/CodeGen/basic.c b/clang/test/CIR/CodeGen/basic.c index 2c3c5b0f22a5c..9268615bc9fb0 100644 --- a/clang/test/CIR/CodeGen/basic.c +++ b/clang/test/CIR/CodeGen/basic.c @@ -296,7 +296,7 @@ size_type max_size(void) { // CIR: %0 = cir.alloca !u64i, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CIR: %1 = cir.const #cir.int<0> : !s32i // CIR: %2 = cir.unary(not, %1) : !s32i, !s32i -// CIR: %3 = cir.cast(integral, %2 : !s32i), !u64i +// CIR: %3 = cir.cast integral %2 : !s32i -> !u64i // CIR: %4 = cir.const #cir.int<8> : !u64i // CIR: %5 = cir.binop(div, %3, %4) : !u64i diff --git a/clang/test/CIR/CodeGen/basic.cpp b/clang/test/CIR/CodeGen/basic.cpp index fe6dd938f0faf..af8de6fff047a 100644 --- a/clang/test/CIR/CodeGen/basic.cpp +++ b/clang/test/CIR/CodeGen/basic.cpp @@ -124,7 +124,7 @@ size_type max_size() { // CHECK: %0 = cir.alloca !u64i, !cir.ptr, ["__retval"] {alignment = 8 : i64} // CHECK: %1 = cir.const #cir.int<0> : !s32i // CHECK: %2 = cir.unary(not, %1) : !s32i, !s32i -// CHECK: %3 = cir.cast(integral, %2 : !s32i), !u64i +// CHECK: %3 = cir.cast integral %2 : !s32i -> !u64i // CHECK: %4 = cir.const #cir.int<8> : !u64i // CHECK: %5 = cir.binop(div, %3, %4) : !u64i // CHECK: cir.store{{.*}} %5, %0 : !u64i, !cir.ptr diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c index 541b50a664c0e..65bea4df7d837 100644 --- a/clang/test/CIR/CodeGen/binassign.c +++ b/clang/test/CIR/CodeGen/binassign.c @@ -25,7 +25,7 @@ void binary_assign(void) { // CIR: %[[TRUE:.*]] = cir.const #true // CIR: cir.store{{.*}} %[[TRUE]], %[[B]] : !cir.bool, !cir.ptr // CIR: %[[CHAR_INI_INIT:.*]] = cir.const #cir.int<65> : !s32i -// CIR: %[[CHAR_VAL:.*]] = cir.cast(integral, %[[CHAR_INI_INIT]] : !s32i), !s8i +// CIR: %[[CHAR_VAL:.*]] 
= cir.cast integral %[[CHAR_INI_INIT]] : !s32i -> !s8i // CIR: cir.store{{.*}} %[[CHAR_VAL]], %[[C]] : !s8i, !cir.ptr // CIR: %[[FLOAT_VAL:.*]] = cir.const #cir.fp<3.140000e+00> : !cir.float // CIR: cir.store{{.*}} %[[FLOAT_VAL]], %[[F]] : !cir.float, !cir.ptr diff --git a/clang/test/CIR/CodeGen/binop.c b/clang/test/CIR/CodeGen/binop.c index 280fd29b067f9..4427e4b605297 100644 --- a/clang/test/CIR/CodeGen/binop.c +++ b/clang/test/CIR/CodeGen/binop.c @@ -5,9 +5,9 @@ void conditionalResultIimplicitCast(int a, int b, float f) { // Should implicit cast back to int. int x = a && b; // CHECK: %[[#INT:]] = cir.ternary - // CHECK: %{{.+}} = cir.cast(bool_to_int, %[[#INT]] : !cir.bool), !s32i + // CHECK: %{{.+}} = cir.cast bool_to_int %[[#INT]] : !cir.bool -> !s32i float y = f && f; // CHECK: %[[#BOOL:]] = cir.ternary - // CHECK: %[[#INT:]] = cir.cast(bool_to_int, %[[#BOOL]] : !cir.bool), !s32i - // CHECK: %{{.+}} = cir.cast(int_to_float, %[[#INT]] : !s32i), !cir.float + // CHECK: %[[#INT:]] = cir.cast bool_to_int %[[#BOOL]] : !cir.bool -> !s32i + // CHECK: %{{.+}} = cir.cast int_to_float %[[#INT]] : !s32i -> !cir.float } diff --git a/clang/test/CIR/CodeGen/binop.cpp b/clang/test/CIR/CodeGen/binop.cpp index 847e81755939f..c1a432dbc2c32 100644 --- a/clang/test/CIR/CodeGen/binop.cpp +++ b/clang/test/CIR/CodeGen/binop.cpp @@ -337,13 +337,13 @@ void zext_shift_example(int a, unsigned char b) { // CIR: %[[A1:.*]] = cir.load{{.*}} %[[A_PTR]] : !cir.ptr, !s32i // CIR: %[[B1:.*]] = cir.load{{.*}} %[[B_PTR]] : !cir.ptr, !u8i -// CIR: %[[B1_EXT:.*]] = cir.cast(integral, %[[B1]] : !u8i), !s32i +// CIR: %[[B1_EXT:.*]] = cir.cast integral %[[B1]] : !u8i -> !s32i // CIR: %[[ASHR:.*]] = cir.shift(right, %[[A1]] : !s32i, %[[B1_EXT]] : !s32i) -> !s32i // CIR: cir.store{{.*}} %[[ASHR]], %[[X_PTR]] : !s32i, !cir.ptr // CIR: %[[A2:.*]] = cir.load{{.*}} %[[A_PTR]] : !cir.ptr, !s32i // CIR: %[[B2:.*]] = cir.load{{.*}} %[[B_PTR]] : !cir.ptr, !u8i -// CIR: %[[B2_EXT:.*]] = cir.cast(integral, %[[B2]] : !u8i), !s32i +// CIR: %[[B2_EXT:.*]] = cir.cast integral %[[B2]] : !u8i -> !s32i // CIR: %[[SHL:.*]] = cir.shift(left, %[[A2]] : !s32i, %[[B2_EXT]] : !s32i) -> !s32i // CIR: cir.store{{.*}} %[[SHL]], %[[X_PTR]] : !s32i, !cir.ptr @@ -409,13 +409,13 @@ void sext_shift_example(int a, signed char b) { // CIR: %[[A1:.*]] = cir.load{{.*}} %[[A_PTR]] : !cir.ptr, !s32i // CIR: %[[B1:.*]] = cir.load{{.*}} %[[B_PTR]] : !cir.ptr, !s8i -// CIR: %[[B1_EXT:.*]] = cir.cast(integral, %[[B1]] : !s8i), !s32i +// CIR: %[[B1_EXT:.*]] = cir.cast integral %[[B1]] : !s8i -> !s32i // CIR: %[[ASHR:.*]] = cir.shift(right, %[[A1]] : !s32i, %[[B1_EXT]] : !s32i) -> !s32i // CIR: cir.store{{.*}} %[[ASHR]], %[[X_PTR]] : !s32i, !cir.ptr // CIR: %[[A2:.*]] = cir.load{{.*}} %[[A_PTR]] : !cir.ptr, !s32i // CIR: %[[B2:.*]] = cir.load{{.*}} %[[B_PTR]] : !cir.ptr, !s8i -// CIR: %[[B2_EXT:.*]] = cir.cast(integral, %[[B2]] : !s8i), !s32i +// CIR: %[[B2_EXT:.*]] = cir.cast integral %[[B2]] : !s8i -> !s32i // CIR: %[[SHL:.*]] = cir.shift(left, %[[A2]] : !s32i, %[[B2_EXT]] : !s32i) -> !s32i // CIR: cir.store{{.*}} %[[SHL]], %[[X_PTR]] : !s32i, !cir.ptr @@ -481,13 +481,13 @@ void long_shift_example(long long a, short b) { // CIR: %[[A1:.*]] = cir.load{{.*}} %[[A_PTR]] : !cir.ptr, !s64i // CIR: %[[B1:.*]] = cir.load{{.*}} %[[B_PTR]] : !cir.ptr, !s16i -// CIR: %[[B1_EXT:.*]] = cir.cast(integral, %[[B1]] : !s16i), !s32i +// CIR: %[[B1_EXT:.*]] = cir.cast integral %[[B1]] : !s16i -> !s32i // CIR: %[[ASHR:.*]] = cir.shift(right, %[[A1]] : !s64i, %[[B1_EXT]] : !s32i) -> 
!s64i // CIR: cir.store{{.*}} %[[ASHR]], %[[X_PTR]] : !s64i, !cir.ptr // CIR: %[[A2:.*]] = cir.load{{.*}} %[[A_PTR]] : !cir.ptr, !s64i // CIR: %[[B2:.*]] = cir.load{{.*}} %[[B_PTR]] : !cir.ptr, !s16i -// CIR: %[[B2_EXT:.*]] = cir.cast(integral, %[[B2]] : !s16i), !s32i +// CIR: %[[B2_EXT:.*]] = cir.cast integral %[[B2]] : !s16i -> !s32i // CIR: %[[SHL:.*]] = cir.shift(left, %[[A2]] : !s64i, %[[B2_EXT]] : !s32i) -> !s64i // CIR: cir.store{{.*}} %[[SHL]], %[[X_PTR]] : !s64i, !cir.ptr diff --git a/clang/test/CIR/CodeGen/builtin_bit.cpp b/clang/test/CIR/CodeGen/builtin_bit.cpp index 8b9a187e799ed..32a53d883a170 100644 --- a/clang/test/CIR/CodeGen/builtin_bit.cpp +++ b/clang/test/CIR/CodeGen/builtin_bit.cpp @@ -34,7 +34,7 @@ int test_builtin_clrsbl(long x) { // CIR-LABEL: _Z19test_builtin_clrsbll // CIR: [[TMP:%.+]] = cir.clrsb %{{.+}} : !s64i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !s64i), !s32i +// CIR: {{%.+}} = cir.cast integral [[TMP]] : !s64i -> !s32i // LLVM-LABEL: _Z19test_builtin_clrsbll // LLVM: %[[X:.+]] = load i64, ptr %{{.+}}, align 8 @@ -58,7 +58,7 @@ int test_builtin_clrsbll(long long x) { // CIR-LABEL: _Z20test_builtin_clrsbllx // CIR: [[TMP:%.+]] = cir.clrsb %{{.+}} : !s64i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !s64i), !s32i +// CIR: {{%.+}} = cir.cast integral [[TMP]] : !s64i -> !s32i // LLVM-LABEL: _Z20test_builtin_clrsbllx // LLVM: %[[X:.+]] = load i64, ptr %{{.+}}, align 8 @@ -82,7 +82,7 @@ int test_builtin_ctzs(unsigned short x) { // CIR-LABEL: _Z17test_builtin_ctzst // CIR: [[TMP:%.+]] = cir.ctz %{{.+}} poison_zero : !u16i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u16i), !s32i +// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u16i -> !s32i // LLVM-LABEL: _Z17test_builtin_ctzst // LLVM: %{{.+}} = call i16 @llvm.cttz.i16(i16 %{{.+}}, i1 true) @@ -96,7 +96,7 @@ int test_builtin_ctz(unsigned x) { // CIR-LABEL: _Z16test_builtin_ctzj // CIR: [[TMP:%.+]] = cir.ctz %{{.+}} poison_zero : !u32i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u32i), !s32i +// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u32i -> !s32i // LLVM-LABEL: _Z16test_builtin_ctzj // LLVM: %{{.+}} = call i32 @llvm.cttz.i32(i32 %{{.+}}, i1 true) @@ -110,7 +110,7 @@ int test_builtin_ctzl(unsigned long x) { // CIR-LABEL: _Z17test_builtin_ctzlm // CIR: [[TMP:%.+]] = cir.ctz %{{.+}} poison_zero : !u64i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i +// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i // LLVM-LABEL: _Z17test_builtin_ctzlm // LLVM: %{{.+}} = call i64 @llvm.cttz.i64(i64 %{{.+}}, i1 true) @@ -124,7 +124,7 @@ int test_builtin_ctzll(unsigned long long x) { // CIR-LABEL: _Z18test_builtin_ctzlly // CIR: [[TMP:%.+]] = cir.ctz %{{.+}} poison_zero : !u64i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i +// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i // LLVM-LABEL: _Z18test_builtin_ctzlly // LLVM: %{{.+}} = call i64 @llvm.cttz.i64(i64 %{{.+}}, i1 true) @@ -138,7 +138,7 @@ int test_builtin_ctzg(unsigned x) { // CIR-LABEL: _Z17test_builtin_ctzgj // CIR: [[TMP:%.+]] = cir.ctz %{{.+}} poison_zero : !u32i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u32i), !s32i +// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u32i -> !s32i // LLVM-LABEL: _Z17test_builtin_ctzgj // LLVM: %{{.+}} = call i32 @llvm.cttz.i32(i32 %{{.+}}, i1 true) @@ -152,7 +152,7 @@ int test_builtin_clzs(unsigned short x) { // CIR-LABEL: _Z17test_builtin_clzst // CIR: [[TMP:%.+]] = cir.clz %{{.+}} poison_zero : !u16i -// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u16i), !s32i 
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u16i -> !s32i
 // LLVM-LABEL: _Z17test_builtin_clzst
 // LLVM: %{{.+}} = call i16 @llvm.ctlz.i16(i16 %{{.+}}, i1 true)
@@ -166,7 +166,7 @@ int test_builtin_clz(unsigned x) {
 // CIR-LABEL: _Z16test_builtin_clzj
 // CIR: [[TMP:%.+]] = cir.clz %{{.+}} poison_zero : !u32i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u32i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u32i -> !s32i
 // LLVM-LABEL: _Z16test_builtin_clzj
 // LLVM: %{{.+}} = call i32 @llvm.ctlz.i32(i32 %{{.+}}, i1 true)
@@ -180,7 +180,7 @@ int test_builtin_clzl(unsigned long x) {
 // CIR-LABEL: _Z17test_builtin_clzlm
 // CIR: [[TMP:%.+]] = cir.clz %{{.+}} poison_zero : !u64i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i
 // LLVM-LABEL: _Z17test_builtin_clzlm
 // LLVM: %{{.+}} = call i64 @llvm.ctlz.i64(i64 %{{.+}}, i1 true)
@@ -194,7 +194,7 @@ int test_builtin_clzll(unsigned long long x) {
 // CIR-LABEL: _Z18test_builtin_clzlly
 // CIR: [[TMP:%.+]] = cir.clz %{{.+}} poison_zero : !u64i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i
 // LLVM-LABEL: _Z18test_builtin_clzlly
 // LLVM: %{{.+}} = call i64 @llvm.ctlz.i64(i64 %{{.+}}, i1 true)
@@ -208,7 +208,7 @@ int test_builtin_clzg(unsigned x) {
 // CIR-LABEL: _Z17test_builtin_clzgj
 // CIR: [[TMP:%.+]] = cir.clz %{{.+}} poison_zero : !u32i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u32i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u32i -> !s32i
 // LLVM-LABEL: _Z17test_builtin_clzgj
 // LLVM: %{{.+}} = call i32 @llvm.ctlz.i32(i32 %{{.+}}, i1 true)
@@ -294,7 +294,7 @@ int test_builtin_parity(unsigned x) {
 // CIR-LABEL: _Z19test_builtin_parityj
 // CIR: [[TMP:%.+]] = cir.parity %{{.+}} : !u32i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u32i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u32i -> !s32i
 // LLVM-LABEL: _Z19test_builtin_parityj
 // LLVM: %[[X:.+]] = load i32, ptr %{{.+}}, align 4
@@ -312,7 +312,7 @@ int test_builtin_parityl(unsigned long x) {
 // CIR-LABEL: _Z20test_builtin_paritylm
 // CIR: [[TMP:%.+]] = cir.parity %{{.+}} : !u64i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i
 // LLVM-LABEL: _Z20test_builtin_paritylm
 // LLVM: %[[X:.+]] = load i64, ptr %{{.+}}, align 8
@@ -330,7 +330,7 @@ int test_builtin_parityll(unsigned long long x) {
 // CIR-LABEL: _Z21test_builtin_paritylly
 // CIR: [[TMP:%.+]] = cir.parity %{{.+}} : !u64i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i
 // LLVM-LABEL: _Z21test_builtin_paritylly
 // LLVM: %[[X:.+]] = load i64, ptr %{{.+}}, align 8
@@ -348,7 +348,7 @@ int test_builtin_popcount(unsigned x) {
 // CIR-LABEL: _Z21test_builtin_popcountj
 // CIR: [[TMP:%.+]] = cir.popcount %{{.+}} : !u32i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u32i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u32i -> !s32i
 // LLVM-LABEL: _Z21test_builtin_popcountj
 // LLVM: %{{.+}} = call i32 @llvm.ctpop.i32(i32 %{{.+}})
@@ -362,7 +362,7 @@ int test_builtin_popcountl(unsigned long x) {
 // CIR-LABEL: _Z22test_builtin_popcountlm
 // CIR: [[TMP:%.+]] = cir.popcount %{{.+}} : !u64i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i
 // LLVM-LABEL: _Z22test_builtin_popcountlm
 // LLVM: %{{.+}} = call i64 @llvm.ctpop.i64(i64 %{{.+}})
@@ -376,7 +376,7 @@ int test_builtin_popcountll(unsigned long long x) {
 // CIR-LABEL: _Z23test_builtin_popcountlly
 // CIR: [[TMP:%.+]] = cir.popcount %{{.+}} : !u64i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u64i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u64i -> !s32i
 // LLVM-LABEL: _Z23test_builtin_popcountlly
 // LLVM: %{{.+}} = call i64 @llvm.ctpop.i64(i64 %{{.+}})
@@ -390,7 +390,7 @@ int test_builtin_popcountg(unsigned x) {
 // CIR-LABEL: _Z22test_builtin_popcountgj
 // CIR: [[TMP:%.+]] = cir.popcount %{{.+}} : !u32i
-// CIR: {{%.+}} = cir.cast(integral, [[TMP]] : !u32i), !s32i
+// CIR: {{%.+}} = cir.cast integral [[TMP]] : !u32i -> !s32i
 // LLVM-LABEL: _Z22test_builtin_popcountgj
 // LLVM: %{{.+}} = call i32 @llvm.ctpop.i32(i32 %{{.+}})
diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp
index 853d894a3311b..a30df97250d19 100644
--- a/clang/test/CIR/CodeGen/builtin_call.cpp
+++ b/clang/test/CIR/CodeGen/builtin_call.cpp
@@ -165,9 +165,9 @@ void expect(int x, int y) {
 // CIR-LABEL: cir.func{{.*}} @_Z6expectii
 // CIR: %[[X:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i
-// CIR-NEXT: %[[X_LONG:.+]] = cir.cast(integral, %[[X]] : !s32i), !s64i
+// CIR-NEXT: %[[X_LONG:.+]] = cir.cast integral %[[X]] : !s32i -> !s64i
 // CIR-NEXT: %[[Y:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i
-// CIR-NEXT: %[[Y_LONG:.+]] = cir.cast(integral, %[[Y]] : !s32i), !s64i
+// CIR-NEXT: %[[Y_LONG:.+]] = cir.cast integral %[[Y]] : !s32i -> !s64i
 // CIR-NEXT: %{{.+}} = cir.expect(%[[X_LONG]], %[[Y_LONG]]) : !s64i
 // CIR: }
@@ -185,9 +185,9 @@ void expect_prob(int x, int y) {
 // CIR-LABEL: cir.func{{.*}} @_Z11expect_probii
 // CIR: %[[X:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i
-// CIR-NEXT: %[[X_LONG:.+]] = cir.cast(integral, %[[X]] : !s32i), !s64i
+// CIR-NEXT: %[[X_LONG:.+]] = cir.cast integral %[[X]] : !s32i -> !s64i
 // CIR-NEXT: %[[Y:.+]] = cir.load align(4) %{{.+}} : !cir.ptr, !s32i
-// CIR-NEXT: %[[Y_LONG:.+]] = cir.cast(integral, %[[Y]] : !s32i), !s64i
+// CIR-NEXT: %[[Y_LONG:.+]] = cir.cast integral %[[Y]] : !s32i -> !s64i
 // CIR-NEXT: %{{.+}} = cir.expect(%[[X_LONG]], %[[Y_LONG]], 2.500000e-01) : !s64i
 // CIR: }
diff --git a/clang/test/CIR/CodeGen/builtin_printf.cpp b/clang/test/CIR/CodeGen/builtin_printf.cpp
index 80875c349bfcf..898984a6c12d3 100644
--- a/clang/test/CIR/CodeGen/builtin_printf.cpp
+++ b/clang/test/CIR/CodeGen/builtin_printf.cpp
@@ -28,11 +28,11 @@ void func(char const * const str, int i) {
 // CIR: %[[null_ptr:.+]] = cir.const #cir.ptr : !cir.ptr
 // CIR: %[[printf_result1:.+]] = cir.call @printf(%[[null_ptr]]) nothrow : (!cir.ptr) -> !s32i
 // CIR: %[[str_fmt_global:.+]] = cir.get_global @".str" : !cir.ptr>
-// CIR: %[[str_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[str_fmt_global]] : !cir.ptr>), !cir.ptr
+// CIR: %[[str_fmt_ptr:.+]] = cir.cast array_to_ptrdecay %[[str_fmt_global]] : !cir.ptr> -> !cir.ptr
 // CIR: %[[str_val:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr>, !cir.ptr
 // CIR: %[[printf_result2:.+]] = cir.call @printf(%[[str_fmt_ptr]], %[[str_val]]) nothrow : (!cir.ptr, !cir.ptr) -> !s32i
 // CIR: %[[full_fmt_global:.+]] = cir.get_global @".str.1" : !cir.ptr>
-// CIR: %[[full_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[full_fmt_global]] : !cir.ptr>), !cir.ptr
+// CIR: %[[full_fmt_ptr:.+]] = cir.cast array_to_ptrdecay %[[full_fmt_global]] : !cir.ptr> -> !cir.ptr
 // CIR: %[[str_val2:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr>, !cir.ptr
 // CIR: %[[i_val:.+]] = cir.load{{.*}} %[[i_ptr]] : !cir.ptr, !s32i
 // CIR: %[[printf_result3:.+]] = cir.call @printf(%[[full_fmt_ptr]], %[[str_val2]], %[[i_val]]) nothrow : (!cir.ptr, !cir.ptr, !s32i) -> !s32i
diff --git a/clang/test/CIR/CodeGen/cast.cpp b/clang/test/CIR/CodeGen/cast.cpp
index caf6de7c7d485..7afa955cf3bcf 100644
--- a/clang/test/CIR/CodeGen/cast.cpp
+++ b/clang/test/CIR/CodeGen/cast.cpp
@@ -12,7 +12,7 @@ unsigned char cxxstaticcast_0(unsigned int x) {
 // CIR: %[[RV:[0-9]+]] = cir.alloca !u8i, !cir.ptr, ["__retval"] {alignment = 1 : i64}
 // CIR: cir.store %arg0, %[[XPTR]] : !u32i, !cir.ptr
 // CIR: %[[XVAL:[0-9]+]] = cir.load{{.*}} %[[XPTR]] : !cir.ptr, !u32i
-// CIR: %[[CASTED:[0-9]+]] = cir.cast(integral, %[[XVAL]] : !u32i), !u8i
+// CIR: %[[CASTED:[0-9]+]] = cir.cast integral %[[XVAL]] : !u32i -> !u8i
 // CIR: cir.store %[[CASTED]], %[[RV]] : !u8i, !cir.ptr
 // CIR: %[[R:[0-9]+]] = cir.load{{.*}} %1 : !cir.ptr, !u8i
 // CIR: cir.return %[[R]] : !u8i
@@ -30,55 +30,55 @@ int cStyleCasts_0(unsigned x1, int x2, float x3, short x4, double x5) {
 // LLVM: define{{.*}} i32 @_Z13cStyleCasts_0jifsd
   char a = (char)x1; // truncate
-  // CIR: %{{[0-9]+}} = cir.cast(integral, %{{[0-9]+}} : !u32i), !s8i
+  // CIR: %{{[0-9]+}} = cir.cast integral %{{[0-9]+}} : !u32i -> !s8i
   // LLVM: %{{[0-9]+}} = trunc i32 %{{[0-9]+}} to i8
   short b = (short)x2; // truncate with sign
-  // CIR: %{{[0-9]+}} = cir.cast(integral, %{{[0-9]+}} : !s32i), !s16i
+  // CIR: %{{[0-9]+}} = cir.cast integral %{{[0-9]+}} : !s32i -> !s16i
   // LLVM: %{{[0-9]+}} = trunc i32 %{{[0-9]+}} to i16
   long long c = (long long)x1; // zero extend
-  // CIR: %{{[0-9]+}} = cir.cast(integral, %{{[0-9]+}} : !u32i), !s64i
+  // CIR: %{{[0-9]+}} = cir.cast integral %{{[0-9]+}} : !u32i -> !s64i
   // LLVM: %{{[0-9]+}} = zext i32 %{{[0-9]+}} to i64
   long long d = (long long)x2; // sign extend
-  // CIR: %{{[0-9]+}} = cir.cast(integral, %{{[0-9]+}} : !s32i), !s64i
+  // CIR: %{{[0-9]+}} = cir.cast integral %{{[0-9]+}} : !s32i -> !s64i
   // LLVM: %{{[0-9]+}} = sext i32 %{{[0-9]+}} to i64
   unsigned ui = (unsigned)x2; // sign drop
-  // CIR: %{{[0-9]+}} = cir.cast(integral, %{{[0-9]+}} : !s32i), !u32i
+  // CIR: %{{[0-9]+}} = cir.cast integral %{{[0-9]+}} : !s32i -> !u32i
   int si = (int)x1; // sign add
-  // CIR: %{{[0-9]+}} = cir.cast(integral, %{{[0-9]+}} : !u32i), !s32i
+  // CIR: %{{[0-9]+}} = cir.cast integral %{{[0-9]+}} : !u32i -> !s32i
   bool ib;
   int bi = (int)ib; // bool to int
-  // CIR: %{{[0-9]+}} = cir.cast(bool_to_int, %{{[0-9]+}} : !cir.bool), !s32i
+  // CIR: %{{[0-9]+}} = cir.cast bool_to_int %{{[0-9]+}} : !cir.bool -> !s32i
  // LLVM: %{{[0-9]+}} = zext i1 %{{[0-9]+}} to i32
   bool b2 = x2; // int to bool
-  // CIR: %{{[0-9]+}} = cir.cast(int_to_bool, %{{[0-9]+}} : !s32i), !cir.bool
+  // CIR: %{{[0-9]+}} = cir.cast int_to_bool %{{[0-9]+}} : !s32i -> !cir.bool
   // LLVM: %[[INTTOBOOL:[0-9]+]] = icmp ne i32 %{{[0-9]+}}, 0
   // LLVM: zext i1 %[[INTTOBOOL]] to i8
   void *p;
   bool b3 = p; // ptr to bool
-  // CIR: %{{[0-9]+}} = cir.cast(ptr_to_bool, %{{[0-9]+}} : !cir.ptr), !cir.bool
+  // CIR: %{{[0-9]+}} = cir.cast ptr_to_bool %{{[0-9]+}} : !cir.ptr -> !cir.bool
   // LLVM: %[[PTRTOBOOL:[0-9]+]] = icmp ne ptr %{{[0-9]+}}, null
   // LLVM: zext i1 %[[PTRTOBOOL]] to i8
   float f;
   bool b4 = f; // float to bool
-  // CIR: %{{[0-9]+}} = cir.cast(float_to_bool, %{{[0-9]+}} : !cir.float), !cir.bool
+  // CIR: %{{[0-9]+}} = cir.cast float_to_bool %{{[0-9]+}} : !cir.float -> !cir.bool
   // LLVM: %{{[0-9]+}} = fcmp une float %{{[0-9]+}}, 0.000000e+00
   // LLVM: %{{[0-9]+}} = zext i1 %{{[0-9]+}} to i8
   double d2 = f; // float to double
-  // CIR: %{{[0-9]+}} = cir.cast(floating, %{{[0-9]+}} : !cir.float), !cir.double
+  // CIR: %{{[0-9]+}} = cir.cast floating %{{[0-9]+}} : !cir.float -> !cir.double
   // LLVM: %{{[0-9]+}} = fpext float %{{[0-9]+}} to double
   f = d2; // double to float
-  // CIR: %{{[0-9]+}} = cir.cast(floating, %{{[0-9]+}} : !cir.double), !cir.float
+  // CIR: %{{[0-9]+}} = cir.cast floating %{{[0-9]+}} : !cir.double -> !cir.float
   // LLVM: %{{[0-9]+}} = fptrunc double %{{[0-9]+}} to float
   return 0;
@@ -93,7 +93,7 @@ bool cptr(void *d) {
 // CIR: %[[DPTR:[0-9]+]] = cir.alloca !cir.ptr, !cir.ptr>, ["d", init] {alignment = 8 : i64}
 // CIR: %[[DVAL:[0-9]+]] = cir.load{{.*}} %[[DPTR]] : !cir.ptr>, !cir.ptr
-// CIR: %{{[0-9]+}} = cir.cast(ptr_to_bool, %[[DVAL]] : !cir.ptr), !cir.bool
+// CIR: %{{[0-9]+}} = cir.cast ptr_to_bool %[[DVAL]] : !cir.ptr -> !cir.bool
 // LLVM-LABEL: define{{.*}} i1 @_Z4cptrPv(ptr %0)
 // LLVM: %[[ARG_STORAGE:.*]] = alloca ptr, i64 1
@@ -127,7 +127,7 @@ void bitcast() {
 }
 // CIR: %[[D_VEC:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr>, !cir.vector<2 x !cir.double>
-// CIR: %[[I_VEC:.*]] = cir.cast(bitcast, %[[D_VEC]] : !cir.vector<2 x !cir.double>), !cir.vector<4 x !s32i>
+// CIR: %[[I_VEC:.*]] = cir.cast bitcast %[[D_VEC]] : !cir.vector<2 x !cir.double> -> !cir.vector<4 x !s32i>
 // LLVM: %[[D_VEC:.*]] = load <2 x double>, ptr {{.*}}, align 16
 // LLVM: %[[I_VEC:.*]] = bitcast <2 x double> %[[D_VEC]] to <4 x i32>
diff --git a/clang/test/CIR/CodeGen/cmp.cpp b/clang/test/CIR/CodeGen/cmp.cpp
index 75c8cda0c3603..7e32d16e88d57 100644
--- a/clang/test/CIR/CodeGen/cmp.cpp
+++ b/clang/test/CIR/CodeGen/cmp.cpp
@@ -407,9 +407,9 @@ void bool_cmp(bool a, bool b) {
 // CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init]
 // CIR: %[[A1:.*]] = cir.load{{.*}} %[[A_PTR]] : !cir.ptr, !cir.bool
-// CIR: %[[A1_INT:.*]] = cir.cast(bool_to_int, %[[A1]] : !cir.bool), !s32i
+// CIR: %[[A1_INT:.*]] = cir.cast bool_to_int %[[A1]] : !cir.bool -> !s32i
 // CIR: %[[B1:.*]] = cir.load{{.*}} %[[B_PTR]] : !cir.ptr, !cir.bool
-// CIR: %[[B1_INT:.*]] = cir.cast(bool_to_int, %[[B1]] : !cir.bool), !s32i
+// CIR: %[[B1_INT:.*]] = cir.cast bool_to_int %[[B1]] : !cir.bool -> !s32i
 // CIR: %{{.*}} = cir.cmp(gt, %[[A1_INT]], %[[B1_INT]]) : !s32i, !cir.bool
 // CIR: cir.store{{.*}} {{.*}}, %[[X_PTR]] : !cir.bool, !cir.ptr
diff --git a/clang/test/CIR/CodeGen/comma.c b/clang/test/CIR/CodeGen/comma.c
index a1479b85d3f04..cc26a3f200664 100644
--- a/clang/test/CIR/CodeGen/comma.c
+++ b/clang/test/CIR/CodeGen/comma.c
@@ -24,7 +24,7 @@ void comma(void) {
 // CIR: %[[TRUE:.*]] = cir.const #true
 // CIR: cir.store{{.*}} %[[TRUE]], %[[B]] : !cir.bool, !cir.ptr
 // CIR: %[[CHAR_INI_INIT:.*]] = cir.const #cir.int<65> : !s32i
-// CIR: %[[CHAR_VAL:.*]] = cir.cast(integral, %[[CHAR_INI_INIT]] : !s32i), !s8i
+// CIR: %[[CHAR_VAL:.*]] = cir.cast integral %[[CHAR_INI_INIT]] : !s32i -> !s8i
 // CIR: cir.store{{.*}} %[[CHAR_VAL]], %[[C]] : !s8i, !cir.ptr
 // CIR: %[[FLOAT_VAL:.*]] = cir.const #cir.fp<3.140000e+00> : !cir.float
 // CIR: cir.store{{.*}} %[[FLOAT_VAL]], %[[F]] : !cir.float, !cir.ptr
diff --git a/clang/test/CIR/CodeGen/complex-cast.cpp b/clang/test/CIR/CodeGen/complex-cast.cpp
index a8f51cd627f9d..5dc08eb414a5b 100644
--- a/clang/test/CIR/CodeGen/complex-cast.cpp
+++ b/clang/test/CIR/CodeGen/complex-cast.cpp
@@ -20,7 +20,7 @@ void scalar_to_complex() {
   ci = sd;
 }
-// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast(float_to_complex, %{{.*}} : !cir.double), !cir.complex
+// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast float_to_complex %{{.*}} : !cir.double -> !cir.complex
 // CIR-AFTER: %[[REAL:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !cir.double
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.double
@@ -35,7 +35,7 @@ void scalar_to_complex() {
 // OGCG: store double %[[REAL]], ptr {{.*}}, align 8
 // OGCG: store double 0.000000e+00, ptr getelementptr inbounds nuw ({ double, double }, ptr @cd, i32 0, i32 1), align 8
-// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast(int_to_complex, %{{.*}} : !s32i), !cir.complex
+// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast int_to_complex %{{.*}} : !s32i -> !cir.complex
 // CIR-AFTER: %[[REAL:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.int<0> : !s32i
@@ -50,11 +50,11 @@ void scalar_to_complex() {
 // OGCG: store i32 %[[REAL]], ptr {{.*}}, align 4
 // OGCG: store i32 0, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr @ci, i32 0, i32 1), align 4
-// CIR-BEFORE: %[[INT_TO_FP:.*]] = cir.cast(int_to_float, %{{.*}} : !s32i), !cir.double
-// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast(float_to_complex, %[[INT_TO_FP]] : !cir.double), !cir.complex
+// CIR-BEFORE: %[[INT_TO_FP:.*]] = cir.cast int_to_float %{{.*}} : !s32i -> !cir.double
+// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast float_to_complex %[[INT_TO_FP]] : !cir.double -> !cir.complex
 // CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
-// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast(int_to_float, %[[TMP]] : !s32i), !cir.double
+// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast int_to_float %[[TMP]] : !s32i -> !cir.double
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.double
 // CIR-AFTER-NEXT: %{{.*}} = cir.complex.create %[[REAL]], %[[IMAG]] : !cir.double -> !cir.complex
@@ -69,11 +69,11 @@ void scalar_to_complex() {
 // OGCG: store double %[[REAL]], ptr {{.*}}, align 8
 // OGCG: store double 0.000000e+00, ptr getelementptr inbounds nuw ({ double, double }, ptr {{.*}}, i32 0, i32 1), align 8
-// CIR-BEFORE: %[[FP_TO_INT:.*]] = cir.cast(float_to_int, %{{.*}} : !cir.double), !s32i
-// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast(int_to_complex, %[[FP_TO_INT]] : !s32i), !cir.complex
+// CIR-BEFORE: %[[FP_TO_INT:.*]] = cir.cast float_to_int %{{.*}} : !cir.double -> !s32i
+// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast int_to_complex %[[FP_TO_INT]] : !s32i -> !cir.complex
 // CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !cir.double
-// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast(float_to_int, %[[TMP]] : !cir.double), !s32i
+// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast float_to_int %[[TMP]] : !cir.double -> !s32i
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.int<0> : !s32i
 // CIR-AFTER-NEXT: %{{.*}} = cir.complex.create %[[REAL]], %[[IMAG]] : !s32i -> !cir.complex
@@ -95,7 +95,7 @@ void scalar_to_complex_explicit() {
   ci = (int _Complex)sd;
 }
-// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast(float_to_complex, %{{.*}} : !cir.double), !cir.complex
+// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast float_to_complex %{{.*}} : !cir.double -> !cir.complex
 // CIR-AFTER: %[[REAL:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !cir.double
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.double
@@ -110,7 +110,7 @@ void scalar_to_complex_explicit() {
 // OGCG: store double %[[REAL]], ptr {{.*}}, align 8
 // OGCG: store double 0.000000e+00, ptr getelementptr inbounds nuw ({ double, double }, ptr @cd, i32 0, i32 1), align 8
-// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast(int_to_complex, %{{.*}} : !s32i), !cir.complex
+// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast int_to_complex %{{.*}} : !s32i -> !cir.complex
 // CIR-AFTER: %[[REAL:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.int<0> : !s32i
@@ -125,11 +125,11 @@ void scalar_to_complex_explicit() {
 // OGCG: store i32 %[[REAL]], ptr {{.*}}, align 4
 // OGCG: store i32 0, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr @ci, i32 0, i32 1), align 4
-// CIR-BEFORE: %[[INT_TO_FP:.*]] = cir.cast(int_to_float, %{{.*}} : !s32i), !cir.double
-// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast(float_to_complex, %[[INT_TO_FP]] : !cir.double), !cir.complex
+// CIR-BEFORE: %[[INT_TO_FP:.*]] = cir.cast int_to_float %{{.*}} : !s32i -> !cir.double
+// CIR-BEFORE: %[[FP_TO_COMPLEX:.*]] = cir.cast float_to_complex %[[INT_TO_FP]] : !cir.double -> !cir.complex
 // CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !s32i
-// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast(int_to_float, %[[TMP]] : !s32i), !cir.double
+// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast int_to_float %[[TMP]] : !s32i -> !cir.double
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.double
 // CIR-AFTER-NEXT: %{{.*}} = cir.complex.create %[[REAL]], %[[IMAG]] : !cir.double -> !cir.complex
@@ -144,11 +144,11 @@ void scalar_to_complex_explicit() {
 // OGCG: store double %[[REAL]], ptr {{.*}}, align 8
 // OGCG: store double 0.000000e+00, ptr getelementptr inbounds nuw ({ double, double }, ptr {{.*}}, i32 0, i32 1), align 8
-// CIR-BEFORE: %[[FP_TO_INT:.*]] = cir.cast(float_to_int, %{{.*}} : !cir.double), !s32i
-// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast(int_to_complex, %[[FP_TO_INT]] : !s32i), !cir.complex
+// CIR-BEFORE: %[[FP_TO_INT:.*]] = cir.cast float_to_int %{{.*}} : !cir.double -> !s32i
+// CIR-BEFORE: %[[INT_TO_COMPLEX:.*]] = cir.cast int_to_complex %[[FP_TO_INT]] : !s32i -> !cir.complex
 // CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr, !cir.double
-// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast(float_to_int, %[[TMP]] : !cir.double), !s32i
+// CIR-AFTER-NEXT: %[[REAL:.*]] = cir.cast float_to_int %[[TMP]] : !cir.double -> !s32i
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.const #cir.int<0> : !s32i
 // CIR-AFTER-NEXT: %{{.*}} = cir.complex.create %[[REAL]], %[[IMAG]] : !s32i -> !cir.complex
@@ -170,7 +170,7 @@ void complex_to_scalar() {
   si = (int)cd;
 }
-// CIR-BEFORE: %[[FP_TO_COMPLEX_REAL:.*]] = cir.cast(float_complex_to_real, %{{.*}} : !cir.complex), !cir.double
+// CIR-BEFORE: %[[FP_TO_COMPLEX_REAL:.*]] = cir.cast float_complex_to_real %{{.*}} : !cir.complex -> !cir.double
 // CIR-AFTER: %{{.*}} = cir.complex.real %{{.*}} : !cir.complex -> !cir.double
@@ -180,7 +180,7 @@ void complex_to_scalar() {
 // OGCG: %[[REAL:.*]] = load double, ptr {{.*}}, align 8
 // OGCG: store double %[[REAL]], ptr {{.*}}, align 8
-// CIR-BEFORE: %[[INT_COMPLEX_TO_REAL:.*]] = cir.cast(int_complex_to_real, %{{.*}} : !cir.complex), !s32i
+// CIR-BEFORE: %[[INT_COMPLEX_TO_REAL:.*]] = cir.cast int_complex_to_real %{{.*}} : !cir.complex -> !s32i
 // CIR-AFTER: %{{.*}} = cir.complex.real %{{.*}} : !cir.complex -> !s32i
@@ -190,11 +190,11 @@ void complex_to_scalar() {
 // OGCG: %[[REAL:.*]] = load i32, ptr {{.*}}, align 4
 // OGCG: store i32 %[[REAL]], ptr {{.*}}, align 4
-// CIR-BEFORE: %[[INT_COMPLEX_TO_REAL:.*]] = cir.cast(int_complex_to_real, %{{.*}} : !cir.complex), !s32i
-// CIR-BEFORE: %[[INT_TO_FP:.*]] = cir.cast(int_to_float, %[[INT_COMPLEX_TO_REAL]] : !s32i), !cir.double
+// CIR-BEFORE: %[[INT_COMPLEX_TO_REAL:.*]] = cir.cast int_complex_to_real %{{.*}} : !cir.complex -> !s32i
+// CIR-BEFORE: %[[INT_TO_FP:.*]] = cir.cast int_to_float %[[INT_COMPLEX_TO_REAL]] : !s32i -> !cir.double
 // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %{{.*}} : !cir.complex -> !s32i
-// CIR-AFTER-NEXT: %{{.*}} = cir.cast(int_to_float, %[[REAL]] : !s32i), !cir.double
+// CIR-AFTER-NEXT: %{{.*}} = cir.cast int_to_float %[[REAL]] : !s32i -> !cir.double
 // LLVM: %[[REAL:.*]] = extractvalue { i32, i32 } %{{.+}}, 0
 // LLVM-NEXT: %[[REAL_TO_DOUBLE:.*]] = sitofp i32 %[[REAL]] to double
@@ -204,11 +204,11 @@ void complex_to_scalar() {
 // OGCG: %[[INT_TO_FP:.*]] = sitofp i32 %[[REAL]] to double
 // OGCG: store double %[[INT_TO_FP]], ptr {{.*}}, align 8
-// CIR-BEFORE: %[[FP_TO_COMPLEX_REAL:.*]] = cir.cast(float_complex_to_real, %{{.*}} : !cir.complex), !cir.double
-// CIR-BEFORE: %[[FP_TO_INT:.*]] = cir.cast(float_to_int, %[[FP_TO_COMPLEX_REAL]] : !cir.double), !s32i
+// CIR-BEFORE: %[[FP_TO_COMPLEX_REAL:.*]] = cir.cast float_complex_to_real %{{.*}} : !cir.complex -> !cir.double
+// CIR-BEFORE: %[[FP_TO_INT:.*]] = cir.cast float_to_int %[[FP_TO_COMPLEX_REAL]] : !cir.double -> !s32i
 // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %{{.*}} : !cir.complex -> !cir.double
-// CIR-AFTER-NEXT: %{{.*}} = cir.cast(float_to_int, %[[REAL]] : !cir.double), !s32i
+// CIR-AFTER-NEXT: %{{.*}} = cir.cast float_to_int %[[REAL]] : !cir.double -> !s32i
 // LLVM: %[[REAL:.*]] = extractvalue { double, double } %{{.+}}, 0
 // LLVM-NEXT: %[[REAL_TO_INT:.*]] = fptosi double %[[REAL]] to i32
@@ -223,12 +223,12 @@ void complex_to_bool() {
   b = (bool)ci;
 }
-// CIR-BEFORE: %[[FP_COMPLEX_TO_BOOL:.*]] = cir.cast(float_complex_to_bool, %{{.*}} : !cir.complex), !cir.bool
+// CIR-BEFORE: %[[FP_COMPLEX_TO_BOOL:.*]] = cir.cast float_complex_to_bool %{{.*}} : !cir.complex -> !cir.bool
 // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %{{.*}} : !cir.complex -> !cir.double
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.complex.imag %{{.*}} : !cir.complex -> !cir.double
-// CIR-AFTER-NEXT: %[[REAL_TO_BOOL:.*]] = cir.cast(float_to_bool, %[[REAL]] : !cir.double), !cir.bool
-// CIR-AFTER-NEXT: %[[IMAG_TO_BOOL:.*]] = cir.cast(float_to_bool, %[[IMAG]] : !cir.double), !cir.bool
+// CIR-AFTER-NEXT: %[[REAL_TO_BOOL:.*]] = cir.cast float_to_bool %[[REAL]] : !cir.double -> !cir.bool
+// CIR-AFTER-NEXT: %[[IMAG_TO_BOOL:.*]] = cir.cast float_to_bool %[[IMAG]] : !cir.double -> !cir.bool
 // CIR-AFTER-NEXT: %[[CONST_TRUE:.*]] = cir.const #true
 // CIR-AFTER-NEXT: %{{.*}} = cir.select if %[[REAL_TO_BOOL]] then %[[CONST_TRUE]] else %[[IMAG_TO_BOOL]] : (!cir.bool, !cir.bool, !cir.bool) -> !cir.bool
@@ -248,12 +248,12 @@ void complex_to_bool() {
 // OGCG: %[[BOOL_TO_INT:.*]] = zext i1 %[[COMPLEX_TO_BOOL]] to i8
 // OGCG: store i8 %[[BOOL_TO_INT]], ptr {{.*}}, align 1
-// CIR-BEFORE: %[[INT_COMPLEX_TO_BOOL:.*]] = cir.cast(int_complex_to_bool, %{{.*}} : !cir.complex), !cir.bool
+// CIR-BEFORE: %[[INT_COMPLEX_TO_BOOL:.*]] = cir.cast int_complex_to_bool %{{.*}} : !cir.complex -> !cir.bool
 // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %{{.*}} : !cir.complex -> !s32i
 // CIR-AFTER-NEXT: %[[IMAG:.*]] = cir.complex.imag %{{.*}} : !cir.complex -> !s32i
-// CIR-AFTER-NEXT: %[[REAL_TO_BOOL:.*]] = cir.cast(int_to_bool, %[[REAL]] : !s32i), !cir.bool
-// CIR-AFTER-NEXT: %[[IMAG_TO_BOOL:.*]] = cir.cast(int_to_bool, %[[IMAG]] : !s32i), !cir.bool
+// CIR-AFTER-NEXT: %[[REAL_TO_BOOL:.*]] = cir.cast int_to_bool %[[REAL]] : !s32i -> !cir.bool
+// CIR-AFTER-NEXT: %[[IMAG_TO_BOOL:.*]] = cir.cast int_to_bool %[[IMAG]] : !s32i -> !cir.bool
 // CIR-AFTER-NEXT: %[[CONST_TRUE:.*]] = cir.const #true
 // CIR-AFTER-NEXT: %{{.+}} = cir.select if %[[REAL_TO_BOOL]] then %[[CONST_TRUE]] else %[[IMAG_TO_BOOL]] : (!cir.bool, !cir.bool, !cir.bool) -> !cir.bool
@@ -279,12 +279,12 @@ void complex_to_complex_cast() {
 }
 // CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr>, !cir.complex
-// CIR-BEFORE: %[[FP_COMPLEX:.*]] = cir.cast(float_complex, %[[TMP]] : !cir.complex), !cir.complex
+// CIR-BEFORE: %[[FP_COMPLEX:.*]] = cir.cast float_complex %[[TMP]] : !cir.complex -> !cir.complex
 // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %{{.*}} : !cir.complex -> !cir.float
 // CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %{{.*}} : !cir.complex -> !cir.float
-// CIR-AFTER: %[[REAL_FP_CAST:.*]] = cir.cast(floating, %[[REAL]] : !cir.float), !cir.double
-// CIR-AFTER: %[[IMAG_FP_CAST:.*]] = cir.cast(floating, %[[IMAG]] : !cir.float), !cir.double
+// CIR-AFTER: %[[REAL_FP_CAST:.*]] = cir.cast floating %[[REAL]] : !cir.float -> !cir.double
+// CIR-AFTER: %[[IMAG_FP_CAST:.*]] = cir.cast floating %[[IMAG]] : !cir.float -> !cir.double
 // CIR-AFTER: %{{.*}} = cir.complex.create %[[REAL_FP_CAST]], %[[IMAG_FP_CAST]] : !cir.double -> !cir.complex
 // LLVM: %[[REAL:.*]] = extractvalue { float, float } %{{.*}}, 0
@@ -303,12 +303,12 @@ void complex_to_complex_cast() {
 // OGCG: store double %[[IMAG_FP_CAST]], ptr getelementptr inbounds nuw ({ double, double }, ptr {{.*}}, i32 0, i32 1), align 8
 // CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %{{.*}} : !cir.ptr>, !cir.complex
-// CIR-BEFORE: %[[INT_COMPLEX:.*]] = cir.cast(int_complex, %[[TMP]] : !cir.complex), !cir.complex
+// CIR-BEFORE: %[[INT_COMPLEX:.*]] = cir.cast int_complex %[[TMP]] : !cir.complex -> !cir.complex
 // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %{{.*}} : !cir.complex -> !s16i
 // CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %{{.*}} : !cir.complex -> !s16i
-// CIR-AFTER: %[[REAL_INT_CAST:.*]] = cir.cast(integral, %[[REAL]] : !s16i), !s32i
-// CIR-AFTER: %[[IMAG_INT_CAST:.*]] = cir.cast(integral, %[[IMAG]] : !s16i), !s32i
+// CIR-AFTER: %[[REAL_INT_CAST:.*]] = cir.cast integral %[[REAL]] : !s16i -> !s32i
+// CIR-AFTER: %[[IMAG_INT_CAST:.*]] = cir.cast integral %[[IMAG]] : !s16i -> !s32i
 // CIR-AFTER: %{{.*}} = cir.complex.create %[[REAL_INT_CAST]], %[[IMAG_INT_CAST]] : !s32i -> !cir.complex
 // LLVM: %[[REAL:.*]] = extractvalue { i16, i16 } %{{.*}}, 0
@@ -336,9 +336,9 @@ void lvalue_to_rvalue_bitcast() {
   double _Complex b = __builtin_bit_cast(double _Complex, a);
 }
-// CIR-BEFORE: %{{.*}} = cir.cast(bitcast, %{{.*}} : !cir.ptr), !cir.ptr>
+// CIR-BEFORE: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.ptr -> !cir.ptr>
-// CIR-AFTER: %{{.*}} = cir.cast(bitcast, %{{.*}} : !cir.ptr), !cir.ptr>
+// CIR-AFTER: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.ptr -> !cir.ptr>
 // LLVM: %[[PTR_ADDR:.*]] = alloca %struct.CX, i64 1, align 8
 // LLVM: %[[COMPLEX_ADDR:.*]] = alloca { double, double }, i64 1, align 8
@@ -361,9 +361,9 @@ void lvalue_bitcast() {
   (double _Complex &)a = {};
 }
-// CIR-BEFORE: %{{.*}} = cir.cast(bitcast, %{{.*}} : !cir.ptr), !cir.ptr>
+// CIR-BEFORE: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.ptr -> !cir.ptr>
-// CIR-AFTER: %{{.*}} = cir.cast(bitcast, %{{.*}} : !cir.ptr), !cir.ptr>
+// CIR-AFTER: %{{.*}} = cir.cast bitcast %{{.*}} : !cir.ptr -> !cir.ptr>
 // LLVM: %[[A_ADDR:.*]] = alloca %struct.CX, i64 1, align 8
 // LLVM: store { double, double } zeroinitializer, ptr %[[A_ADDR]], align 8
diff --git a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
index 9909985e7819c..a5070f51fad63 100644
--- a/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
+++ b/clang/test/CIR/CodeGen/complex-compound-assignment.cpp
@@ -154,20 +154,20 @@ void foo3() {
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
 // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16
 // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16
-// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float
-// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float
+// CIR: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float
+// CIR: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float
 // CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
 // CIR: %[[B_REAL:.*]] = cir.complex.real %[[TMP_B]] : !cir.complex -> !cir.f16
 // CIR: %[[B_IMAG:.*]] = cir.complex.imag %[[TMP_B]] : !cir.complex -> !cir.f16
-// CIR: %[[B_REAL_F32:.*]] = cir.cast(floating, %[[B_REAL]] : !cir.f16), !cir.float
-// CIR: %[[B_IMAG_F32:.*]] = cir.cast(floating, %[[B_IMAG]] : !cir.f16), !cir.float
+// CIR: %[[B_REAL_F32:.*]] = cir.cast floating %[[B_REAL]] : !cir.f16 -> !cir.float
+// CIR: %[[B_IMAG_F32:.*]] = cir.cast floating %[[B_IMAG]] : !cir.f16 -> !cir.float
 // CIR: %[[B_COMPLEX_F32:.*]] = cir.complex.create %[[B_REAL_F32]], %[[B_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR: %[[ADD_A_B:.*]] = cir.complex.add %[[B_COMPLEX_F32]], %[[A_COMPLEX_F32]] : !cir.complex
 // CIR: %[[ADD_REAL:.*]] = cir.complex.real %[[ADD_A_B]] : !cir.complex -> !cir.float
 // CIR: %[[ADD_IMAG:.*]] = cir.complex.imag %[[ADD_A_B]] : !cir.complex -> !cir.float
-// CIR: %[[ADD_REAL_F16:.*]] = cir.cast(floating, %[[ADD_REAL]] : !cir.float), !cir.f16
-// CIR: %[[ADD_IMAG_F16:.*]] = cir.cast(floating, %[[ADD_IMAG]] : !cir.float), !cir.f16
+// CIR: %[[ADD_REAL_F16:.*]] = cir.cast floating %[[ADD_REAL]] : !cir.float -> !cir.f16
+// CIR: %[[ADD_IMAG_F16:.*]] = cir.cast floating %[[ADD_IMAG]] : !cir.float -> !cir.f16
 // CIR: %[[RESULT:.*]] = cir.complex.create %[[ADD_REAL_F16]], %[[ADD_IMAG_F16]] : !cir.f16 -> !cir.complex
 // CIR: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
@@ -712,14 +712,14 @@ void foo13() {
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
 // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16
 // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16
-// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float
-// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float
+// CIR: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float
+// CIR: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float
 // CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
 // CIR: %[[B_REAL:.*]] = cir.complex.real %[[TMP_B]] : !cir.complex -> !cir.f16
 // CIR: %[[B_IMAG:.*]] = cir.complex.imag %[[TMP_B]] : !cir.complex -> !cir.f16
-// CIR: %[[B_REAL_F32:.*]] = cir.cast(floating, %[[B_REAL]] : !cir.f16), !cir.float
-// CIR: %[[B_IMAG_F32:.*]] = cir.cast(floating, %[[B_IMAG]] : !cir.f16), !cir.float
+// CIR: %[[B_REAL_F32:.*]] = cir.cast floating %[[B_REAL]] : !cir.f16 -> !cir.float
+// CIR: %[[B_IMAG_F32:.*]] = cir.cast floating %[[B_IMAG]] : !cir.f16 -> !cir.float
 // CIR: %[[B_COMPLEX_F32:.*]] = cir.complex.create %[[B_REAL_F32]], %[[B_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float
 // CIR: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float
@@ -729,8 +729,8 @@ void foo13() {
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.complex
 // CIR: %[[B_REAL:.*]] = cir.complex.real %[[TMP_B]] : !cir.complex -> !cir.f16
 // CIR: %[[B_IMAG:.*]] = cir.complex.imag %[[TMP_B]] : !cir.complex -> !cir.f16
-// CIR: %[[B_REAL_F32:.*]] = cir.cast(floating, %[[B_REAL]] : !cir.f16), !cir.float
-// CIR: %[[B_IMAG_F32:.*]] = cir.cast(floating, %[[B_IMAG]] : !cir.f16), !cir.float
+// CIR: %[[B_REAL_F32:.*]] = cir.cast floating %[[B_REAL]] : !cir.f16 -> !cir.float
+// CIR: %[[B_IMAG_F32:.*]] = cir.cast floating %[[B_IMAG]] : !cir.f16 -> !cir.float
 // CIR: %[[B_COMPLEX_F32:.*]] = cir.complex.create %[[B_REAL_F32]], %[[B_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR: %[[B_REAL_F32:.*]] = cir.complex.real %[[B_COMPLEX_F32]] : !cir.complex -> !cir.float
 // CIR: %[[B_IMAG_F32:.*]] = cir.complex.imag %[[B_COMPLEX_F32]] : !cir.complex -> !cir.float
@@ -739,8 +739,8 @@ void foo13() {
 // CIR: %[[RESULT:.*]] = cir.call @__divsc3(%[[B_REAL_F32]], %[[B_IMAG_F32]], %[[DIV_AB_REAL]], %[[DIV_AB_IMAG]]) : (!cir.float, !cir.float, !cir.float, !cir.float) -> !cir.complex
 // CIR: %[[RESULT_REAL_F32:.*]] = cir.complex.real %[[RESULT]] : !cir.complex -> !cir.float
 // CIR: %[[RESULT_IMAG_F32:.*]] = cir.complex.imag %[[RESULT]] : !cir.complex -> !cir.float
-// CIR: %[[RESULT_REAL_F16:.*]] = cir.cast(floating, %[[RESULT_REAL_F32]] : !cir.float), !cir.f16
-// CIR: %[[RESULT_IMAG_F16:.*]] = cir.cast(floating, %[[RESULT_IMAG_F32]] : !cir.float), !cir.f16
+// CIR: %[[RESULT_REAL_F16:.*]] = cir.cast floating %[[RESULT_REAL_F32]] : !cir.float -> !cir.f16
+// CIR: %[[RESULT_IMAG_F16:.*]] = cir.cast floating %[[RESULT_IMAG_F32]] : !cir.float -> !cir.f16
 // CIR: %[[RESULT_COMPLEX_F16:.*]] = cir.complex.create %[[RESULT_REAL_F16]], %[[RESULT_IMAG_F16]] : !cir.f16 -> !cir.complex
 // CIR: cir.store{{.*}} %[[RESULT_COMPLEX_F16]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
diff --git a/clang/test/CIR/CodeGen/complex-mul-div.cpp b/clang/test/CIR/CodeGen/complex-mul-div.cpp
index d49304660b4d4..b306981434dc6 100644
--- a/clang/test/CIR/CodeGen/complex-mul-div.cpp
+++ b/clang/test/CIR/CodeGen/complex-mul-div.cpp
@@ -549,10 +549,10 @@ void foo3() {
 // CIR-AFTER-PROMOTED: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.float
 // CIR-AFTER-PROMOTED: %[[B_REAL:.*]] = cir.complex.real %[[TMP_B]] : !cir.complex -> !cir.float
 // CIR-AFTER-PROMOTED: %[[B_IMAG:.*]] = cir.complex.imag %[[TMP_B]] : !cir.complex -> !cir.float
-// CIR-AFTER-PROMOTED: %[[A_REAL_F64:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.float), !cir.double
-// CIR-AFTER-PROMOTED: %[[A_IMAG_F64:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.float), !cir.double
-// CIR-AFTER-PROMOTED: %[[B_REAL_F64:.*]] = cir.cast(floating, %[[B_REAL]] : !cir.float), !cir.double
-// CIR-AFTER-PROMOTED: %[[B_IMAG_F64:.*]] = cir.cast(floating, %[[B_IMAG]] : !cir.float), !cir.double
+// CIR-AFTER-PROMOTED: %[[A_REAL_F64:.*]] = cir.cast floating %[[A_REAL]] : !cir.float -> !cir.double
+// CIR-AFTER-PROMOTED: %[[A_IMAG_F64:.*]] = cir.cast floating %[[A_IMAG]] : !cir.float -> !cir.double
+// CIR-AFTER-PROMOTED: %[[B_REAL_F64:.*]] = cir.cast floating %[[B_REAL]] : !cir.float -> !cir.double
+// CIR-AFTER-PROMOTED: %[[B_IMAG_F64:.*]] = cir.cast floating %[[B_IMAG]] : !cir.float -> !cir.double
 // CIR-AFTER-PROMOTED: %[[MUL_AR_BR:.*]] = cir.binop(mul, %[[A_REAL_F64]], %[[B_REAL_F64]]) : !cir.double
 // CIR-AFTER-PROMOTED: %[[MUL_AI_BI:.*]] = cir.binop(mul, %[[A_IMAG_F64]], %[[B_IMAG_F64]]) : !cir.double
 // CIR-AFTER-PROMOTED: %[[MUL_BR_BR:.*]] = cir.binop(mul, %[[B_REAL_F64]], %[[B_REAL_F64]]) : !cir.double
@@ -567,8 +567,8 @@ void foo3() {
 // CIR-AFTER-PROMOTED: %[[RESULT_F64:.*]] = cir.complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : !cir.double -> !cir.complex
 // CIR-AFTER-PROMOTED: %[[RESULT_REAL_F64:.*]] = cir.complex.real %[[RESULT_F64]] : !cir.complex -> !cir.double
 // CIR-AFTER-PROMOTED: %[[RESULT_IMAG_F64:.*]] = cir.complex.imag %[[RESULT_F64]] : !cir.complex -> !cir.double
-// CIR-AFTER-PROMOTED: %[[RESULT_REAL_F32:.*]] = cir.cast(floating, %[[RESULT_REAL_F64]] : !cir.double), !cir.float
-// CIR-AFTER-PROMOTED: %[[RESULT_IMAG_F32:.*]] = cir.cast(floating, %[[RESULT_IMAG_F64]] : !cir.double), !cir.float
+// CIR-AFTER-PROMOTED: %[[RESULT_REAL_F32:.*]] = cir.cast floating %[[RESULT_REAL_F64]] : !cir.double -> !cir.float
+// CIR-AFTER-PROMOTED: %[[RESULT_IMAG_F32:.*]] = cir.cast floating %[[RESULT_IMAG_F64]] : !cir.double -> !cir.float
 // CIR-AFTER-PROMOTED: %[[RESULT_F32:.*]] = cir.complex.create %[[RESULT_REAL_F32]], %[[RESULT_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR-AFTER-PROMOTED: cir.store{{.*}} %[[RESULT_F32]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
@@ -1044,10 +1044,10 @@ void foo6() {
 // CIR-AFTER-PROMOTED: %[[A_IMAG:.*]] = cir.complex.imag %[[COMPLEX_A]] : !cir.complex -> !cir.float
 // CIR-AFTER-PROMOTED: %[[B_REAL:.*]] = cir.complex.real %[[TMP_B]] : !cir.complex -> !cir.float
 // CIR-AFTER-PROMOTED: %[[B_IMAG:.*]] = cir.complex.imag %[[TMP_B]] : !cir.complex -> !cir.float
-// CIR-AFTER-PROMOTED: %[[A_REAL_F64:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.float), !cir.double
-// CIR-AFTER-PROMOTED: %[[A_IMAG_F64:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.float), !cir.double
-// CIR-AFTER-PROMOTED: %[[B_REAL_F64:.*]] = cir.cast(floating, %[[B_REAL]] : !cir.float), !cir.double
-// CIR-AFTER-PROMOTED: %[[B_IMAG_F64:.*]] = cir.cast(floating, %[[B_IMAG]] : !cir.float), !cir.double
+// CIR-AFTER-PROMOTED: %[[A_REAL_F64:.*]] = cir.cast floating %[[A_REAL]] : !cir.float -> !cir.double
+// CIR-AFTER-PROMOTED: %[[A_IMAG_F64:.*]] = cir.cast floating %[[A_IMAG]] : !cir.float -> !cir.double
+// CIR-AFTER-PROMOTED: %[[B_REAL_F64:.*]] = cir.cast floating %[[B_REAL]] : !cir.float -> !cir.double
+// CIR-AFTER-PROMOTED: %[[B_IMAG_F64:.*]] = cir.cast floating %[[B_IMAG]] : !cir.float -> !cir.double
 // CIR-AFTER-PROMOTED: %[[MUL_AR_BR:.*]] = cir.binop(mul, %[[A_REAL_F64]], %[[B_REAL_F64]]) : !cir.double
 // CIR-AFTER-PROMOTED: %[[MUL_AI_BI:.*]] = cir.binop(mul, %[[A_IMAG_F64]], %[[B_IMAG_F64]]) : !cir.double
 // CIR-AFTER-PROMOTED: %[[MUL_BR_BR:.*]] = cir.binop(mul, %[[B_REAL_F64]], %[[B_REAL_F64]]) : !cir.double
@@ -1062,8 +1062,8 @@ void foo6() {
 // CIR-AFTER-PROMOTED: %[[RESULT_F64:.*]] = cir.complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : !cir.double -> !cir.complex
 // CIR-AFTER-PROMOTED: %[[RESULT_REAL_F64:.*]] = cir.complex.real %[[RESULT_F64]] : !cir.complex -> !cir.double
 // CIR-AFTER-PROMOTED: %[[RESULT_IMAG_F64:.*]] = cir.complex.imag %[[RESULT_F64]] : !cir.complex -> !cir.double
-// CIR-AFTER-PROMOTED: %[[RESULT_REAL_F32:.*]] = cir.cast(floating, %[[RESULT_REAL_F64]] : !cir.double), !cir.float
-// CIR-AFTER-PROMOTED: %[[RESULT_IMAG_F32:.*]] = cir.cast(floating, %[[RESULT_IMAG_F64]] : !cir.double), !cir.float
+// CIR-AFTER-PROMOTED: %[[RESULT_REAL_F32:.*]] = cir.cast floating %[[RESULT_REAL_F64]] : !cir.double -> !cir.float
+// CIR-AFTER-PROMOTED: %[[RESULT_IMAG_F32:.*]] = cir.cast floating %[[RESULT_IMAG_F64]] : !cir.double -> !cir.float
 // CIR-AFTER-PROMOTED: %[[RESULT_F32:.*]] = cir.complex.create %[[RESULT_REAL_F32]], %[[RESULT_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR-AFTER-PROMOTED: cir.store{{.*}} %[[RESULT_F32]], %[[C_ADDR]] : !cir.complex, !cir.ptr>
diff --git a/clang/test/CIR/CodeGen/complex-unary.cpp b/clang/test/CIR/CodeGen/complex-unary.cpp
index d79199f23bbfd..a8e434b903763 100644
--- a/clang/test/CIR/CodeGen/complex-unary.cpp
+++ b/clang/test/CIR/CodeGen/complex-unary.cpp
@@ -380,9 +380,9 @@ void foo9() {
 // CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
 // CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init]
 // CIR-BEFORE: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
-// CIR-BEFORE: %[[A_COMPLEX_F32:.*]] = cir.cast(float_complex, %[[TMP_A]] : !cir.complex), !cir.complex
+// CIR-BEFORE: %[[A_COMPLEX_F32:.*]] = cir.cast float_complex %[[TMP_A]] : !cir.complex -> !cir.complex
 // CIR-BEFORE: %[[RESULT:.*]] = cir.unary(plus, %[[A_COMPLEX_F32]]) : !cir.complex, !cir.complex
-// CIR-BEFORE: %[[A_COMPLEX_F16:.*]] = cir.cast(float_complex, %[[RESULT]] : !cir.complex), !cir.complex
+// CIR-BEFORE: %[[A_COMPLEX_F16:.*]] = cir.cast float_complex %[[RESULT]] : !cir.complex -> !cir.complex
 // CIR-BEFORE: cir.store{{.*}} %[[A_COMPLEX_F16]], %[[B_ADDR]] : !cir.complex, !cir.ptr>
 // CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"]
@@ -390,8 +390,8 @@ void foo9() {
 // CIR-AFTER: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex
 // CIR-AFTER: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16
 // CIR-AFTER: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16
-// CIR-AFTER: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float
-// CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float
+// CIR-AFTER: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float
+// CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float
 // CIR-AFTER: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR-AFTER: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float
 // CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float
@@ -400,8 +400,8 @@ void foo9() {
 // CIR-AFTER: %[[RESULT_COMPLEX_F32:.*]] = cir.complex.create %[[RESULT_REAL_F32]], %[[RESULT_IMAG_F32]] : !cir.float -> !cir.complex
 // CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.complex.real %[[RESULT_COMPLEX_F32]] : !cir.complex -> !cir.float
 // CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.complex.imag %[[RESULT_COMPLEX_F32]] : !cir.complex -> !cir.float
-// CIR-AFTER: %[[RESULT_REAL_F16:.*]] = cir.cast(floating, %[[RESULT_REAL_F32]] : !cir.float), !cir.f16
-// CIR-AFTER: %[[RESULT_IMAG_F16:.*]] = cir.cast(floating, %[[RESULT_IMAG_F32]] : !cir.float), !cir.f16
+// CIR-AFTER: %[[RESULT_REAL_F16:.*]] = cir.cast floating %[[RESULT_REAL_F32]] : !cir.float -> !cir.f16
+// CIR-AFTER: %[[RESULT_IMAG_F16:.*]] = cir.cast floating
%[[RESULT_IMAG_F32]] : !cir.float -> !cir.f16 // CIR-AFTER: %[[RESULT_COMPLEX_F16:.*]] = cir.complex.create %[[RESULT_REAL_F16]], %[[RESULT_IMAG_F16]] : !cir.f16 -> !cir.complex // CIR-AFTER: cir.store{{.*}} %[[RESULT_COMPLEX_F16]], %[[B_ADDR]] : !cir.complex, !cir.ptr> @@ -445,9 +445,9 @@ void foo10() { // CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] // CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] // CIR-BEFORE: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex -// CIR-BEFORE: %[[A_COMPLEX_F32:.*]] = cir.cast(float_complex, %[[TMP_A]] : !cir.complex), !cir.complex +// CIR-BEFORE: %[[A_COMPLEX_F32:.*]] = cir.cast float_complex %[[TMP_A]] : !cir.complex -> !cir.complex // CIR-BEFORE: %[[RESULT:.*]] = cir.unary(minus, %[[A_COMPLEX_F32]]) : !cir.complex, !cir.complex -// CIR-BEFORE: %[[A_COMPLEX_F16:.*]] = cir.cast(float_complex, %[[RESULT]] : !cir.complex), !cir.complex +// CIR-BEFORE: %[[A_COMPLEX_F16:.*]] = cir.cast float_complex %[[RESULT]] : !cir.complex -> !cir.complex // CIR-BEFORE: cir.store{{.*}} %[[A_COMPLEX_F16]], %[[B_ADDR]] : !cir.complex, !cir.ptr> // CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] @@ -455,8 +455,8 @@ void foo10() { // CIR-AFTER: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR-AFTER: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16 // CIR-AFTER: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16 -// CIR-AFTER: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float -// CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float +// CIR-AFTER: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float +// CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float // CIR-AFTER: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex // CIR-AFTER: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float // CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float @@ -465,8 +465,8 @@ void foo10() { // CIR-AFTER: %[[RESULT_COMPLEX_F32:.*]] = cir.complex.create %[[RESULT_REAL_F32]], %[[RESULT_IMAG_F32]] : !cir.float -> !cir.complex // CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.complex.real %[[RESULT_COMPLEX_F32]] : !cir.complex -> !cir.float // CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.complex.imag %[[RESULT_COMPLEX_F32]] : !cir.complex -> !cir.float -// CIR-AFTER: %[[RESULT_REAL_F16:.*]] = cir.cast(floating, %[[RESULT_REAL_F32]] : !cir.float), !cir.f16 -// CIR-AFTER: %[[RESULT_IMAG_F16:.*]] = cir.cast(floating, %[[RESULT_IMAG_F32]] : !cir.float), !cir.f16 +// CIR-AFTER: %[[RESULT_REAL_F16:.*]] = cir.cast floating %[[RESULT_REAL_F32]] : !cir.float -> !cir.f16 +// CIR-AFTER: %[[RESULT_IMAG_F16:.*]] = cir.cast floating %[[RESULT_IMAG_F32]] : !cir.float -> !cir.f16 // CIR-AFTER: %[[RESULT_COMPLEX_F16:.*]] = cir.complex.create %[[RESULT_REAL_F16]], %[[RESULT_IMAG_F16]] : !cir.f16 -> !cir.complex // CIR-AFTER: cir.store{{.*}} %[[RESULT_COMPLEX_F16]], %[[B_ADDR]] : !cir.complex, !cir.ptr> diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 4c396d312d148..2d58c380c844a 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -612,7 +612,7 @@ void foo24() { // CIR: %[[ARR:.*]] = cir.alloca !cir.array x 2>, !cir.ptr x 2>>, ["arr"] // 
CIR: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["r", init] // CIR: %[[IDX:.*]] = cir.const #cir.int<1> : !s32i -// CIR: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr x 2>>), !cir.ptr> +// CIR: %[[ARR_PTR:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr x 2>> -> !cir.ptr> // CIR: %[[RESULT_VAL:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr>, %[[IDX]] : !s32i), !cir.ptr> // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[RESULT_VAL]] : !cir.ptr>, !cir.complex // CIR: cir.store{{.*}} %[[TMP]], %[[RESULT]] : !cir.complex, !cir.ptr> @@ -938,11 +938,11 @@ void foo35() { // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16 // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16 -// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float -// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float +// CIR: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float +// CIR: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float // CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex // CIR: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float -// CIR: %[[A_REAL_F16:.*]] = cir.cast(floating, %[[A_REAL_F32]] : !cir.float), !cir.f16 +// CIR: %[[A_REAL_F16:.*]] = cir.cast floating %[[A_REAL_F32]] : !cir.float -> !cir.f16 // CIR: cir.store{{.*}} %[[A_REAL_F16]], %[[REAL_ADDR]] : !cir.f16, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2 @@ -975,11 +975,11 @@ void foo36() { // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16 // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16 -// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float -// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float +// CIR: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float +// CIR: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float // CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex // CIR: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float -// CIR: %[[A_IMAG_F16:.*]] = cir.cast(floating, %[[A_IMAG_F32]] : !cir.float), !cir.f16 +// CIR: %[[A_IMAG_F16:.*]] = cir.cast floating %[[A_IMAG_F32]] : !cir.float -> !cir.f16 // CIR: cir.store{{.*}} %[[A_IMAG_F16]], %[[IMAG_ADDR]] : !cir.f16, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2 @@ -1102,11 +1102,11 @@ void atomic_complex_type() { // CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] // CIR: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] // CIR: %[[ATOMIC_TMP_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["atomic-temp"] -// CIR: %[[A_PTR:.*]] = cir.cast(bitcast, %[[A_ADDR]] : !cir.ptr>), !cir.ptr -// CIR: %[[ATOMIC_TMP_PTR:.*]] = cir.cast(bitcast, %[[ATOMIC_TMP_ADDR]] : !cir.ptr>), !cir.ptr +// CIR: %[[A_PTR:.*]] = cir.cast bitcast %[[A_ADDR]] : !cir.ptr> -> !cir.ptr +// CIR: %[[ATOMIC_TMP_PTR:.*]] = cir.cast bitcast %[[ATOMIC_TMP_ADDR]] : !cir.ptr> -> !cir.ptr // CIR: %[[TMP_A_ATOMIC:.*]] = cir.load{{.*}} atomic(relaxed) %[[A_PTR]] : !cir.ptr, 
!u64i // CIR: cir.store{{.*}} %[[TMP_A_ATOMIC]], %[[ATOMIC_TMP_PTR]] : !u64i, !cir.ptr -// CIR: %[[TMP_ATOMIC_PTR:.*]] = cir.cast(bitcast, %[[ATOMIC_TMP_PTR]] : !cir.ptr), !cir.ptr> +// CIR: %[[TMP_ATOMIC_PTR:.*]] = cir.cast bitcast %[[ATOMIC_TMP_PTR]] : !cir.ptr -> !cir.ptr> // CIR: %[[TMP_ATOMIC:.*]] = cir.load{{.*}} %[[TMP_ATOMIC_PTR]] : !cir.ptr>, !cir.complex // CIR: cir.store{{.*}} %[[TMP_ATOMIC]], %[[B_ADDR]] : !cir.complex, !cir.ptr> @@ -1178,8 +1178,8 @@ void real_on_scalar_with_type_promotion() { // CIR: %[[A_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["a"] // CIR: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["b", init] // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !cir.f16 -// CIR: %[[TMP_A_F32:.*]] = cir.cast(floating, %[[TMP_A]] : !cir.f16), !cir.float -// CIR: %[[TMP_A_F16:.*]] = cir.cast(floating, %[[TMP_A_F32]] : !cir.float), !cir.f16 +// CIR: %[[TMP_A_F32:.*]] = cir.cast floating %[[TMP_A]] : !cir.f16 -> !cir.float +// CIR: %[[TMP_A_F16:.*]] = cir.cast floating %[[TMP_A_F32]] : !cir.float -> !cir.f16 // CIR: cir.store{{.*}} %[[TMP_A_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca half, i64 1, align 2 @@ -1204,7 +1204,7 @@ void imag_on_scalar_with_type_promotion() { // CIR: %[[A_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["a"] // CIR: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["b", init] // CIR: %[[CONST_ZERO:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.float -// CIR: %[[CONST_ZERO_F16:.*]] = cir.cast(floating, %[[CONST_ZERO]] : !cir.float), !cir.f16 +// CIR: %[[CONST_ZERO_F16:.*]] = cir.cast floating %[[CONST_ZERO]] : !cir.float -> !cir.f16 // CIR: cir.store{{.*}} %[[CONST_ZERO_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca half, i64 1, align 2 @@ -1244,11 +1244,11 @@ void real_on_scalar_from_real_with_type_promotion() { // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16 // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16 -// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float -// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float +// CIR: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float +// CIR: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float // CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex // CIR: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float -// CIR: %[[A_REAL_F16:.*]] = cir.cast(floating, %[[A_REAL_F32]] : !cir.float), !cir.f16 +// CIR: %[[A_REAL_F16:.*]] = cir.cast floating %[[A_REAL_F32]] : !cir.float -> !cir.f16 // CIR: cir.store{{.*}} %[[A_REAL_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2 @@ -1281,11 +1281,11 @@ void real_on_scalar_from_imag_with_type_promotion() { // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.f16 // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.f16 -// CIR: %[[A_REAL_F32:.*]] = cir.cast(floating, %[[A_REAL]] : !cir.f16), !cir.float -// CIR: %[[A_IMAG_F32:.*]] = cir.cast(floating, %[[A_IMAG]] : !cir.f16), !cir.float +// CIR: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float +// CIR: %[[A_IMAG_F32:.*]] = 
cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float // CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex // CIR: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex -> !cir.float -// CIR: %[[A_IMAG_F16:.*]] = cir.cast(floating, %[[A_IMAG_F32]] : !cir.float), !cir.f16 +// CIR: %[[A_IMAG_F16:.*]] = cir.cast floating %[[A_IMAG_F32]] : !cir.float -> !cir.f16 // CIR: cir.store{{.*}} %[[A_IMAG_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2 diff --git a/clang/test/CIR/CodeGen/cxx-default-init.cpp b/clang/test/CIR/CodeGen/cxx-default-init.cpp index 06d3a27f61cc9..b3d706ffa831f 100644 --- a/clang/test/CIR/CodeGen/cxx-default-init.cpp +++ b/clang/test/CIR/CodeGen/cxx-default-init.cpp @@ -33,7 +33,7 @@ struct ZeroInit { // CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i // CIR: cir.store{{.*}} %[[ZERO]], %[[P_B]] // CIR: %[[ARR:.*]] = cir.get_member %[[THIS]][2] {name = "arr"} -// CIR: %[[ARR_BEGIN:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_BEGIN:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr // CIR: cir.store{{.*}} %[[ARR_BEGIN]], %[[ITER]] // CIR: %[[FOUR:.*]] = cir.const #cir.int<4> : !s64i // CIR: %[[END:.*]] = cir.ptr_stride(%[[ARR_BEGIN]] : !cir.ptr, %[[FOUR]] : !s64i) @@ -139,7 +139,7 @@ struct ValueInit { // CIR: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i // CIR: cir.store{{.*}} %[[THREE]], %[[P_B]] // CIR: %[[ARR:.*]] = cir.get_member %[[THIS]][2] {name = "arr"} -// CIR: %[[ARR_BEGIN:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>), !cir.ptr +// CIR: %[[ARR_BEGIN:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr // CIR: %[[FOUR:.*]] = cir.const #cir.int<4> : !s32i // CIR: cir.store{{.*}} %[[FOUR]], %[[ARR_BEGIN]] // CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i @@ -169,7 +169,7 @@ struct ValueInit { // CIR: cir.store{{.*}} %[[FOUR_FIVEI]], %[[C]] // CIR: %[[BF:.*]] = cir.get_member %[[THIS]][4] {name = "bf"} // CIR: %[[FF:.*]] = cir.const #cir.int<255> : !s32i -// CIR: %[[FF_CAST:.*]] = cir.cast(integral, %[[FF]] : !s32i), !u32i +// CIR: %[[FF_CAST:.*]] = cir.cast integral %[[FF]] : !s32i -> !u32i // CIR: %[[BF_VAL:.*]] = cir.set_bitfield{{.*}} (#bfi_bf, %[[BF]] : !cir.ptr, %[[FF_CAST]] : !u32i) // LLVM: define{{.*}} void @_ZN9ValueInitC2Ev(ptr %[[THIS_ARG:.*]]) diff --git a/clang/test/CIR/CodeGen/delegating-ctor.cpp b/clang/test/CIR/CodeGen/delegating-ctor.cpp index 73ee6b719940a..c95ecf44dcb10 100644 --- a/clang/test/CIR/CodeGen/delegating-ctor.cpp +++ b/clang/test/CIR/CodeGen/delegating-ctor.cpp @@ -116,23 +116,23 @@ Derived::Derived(const void *inVoid) { squawk(); } // CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] // CIR: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] // CIR: %[[VPTR_GLOBAL_ADDR:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> -// CIR: %[[VPTR_PTR:.*]] = cir.cast(bitcast, %[[VPTR_GLOBAL_ADDR]] : !cir.ptr>), !cir.ptr +// CIR: %[[VPTR_PTR:.*]] = cir.cast bitcast %[[VPTR_GLOBAL_ADDR]] : !cir.ptr> -> !cir.ptr // CIR: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_PTR]] : !cir.ptr, !cir.vptr // CIR: %[[VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr // CIR: cir.store{{.*}} %[[VPTR]], %[[VPTR_ADDR]] : !cir.vptr, !cir.ptr // CIR: %[[VPTR_BASE_ADDR:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> -// CIR: %[[VPTR_BASE_PTR:.*]] = cir.cast(bitcast, %[[VPTR_BASE_ADDR]] : !cir.ptr>), !cir.ptr +// 
CIR: %[[VPTR_BASE_PTR:.*]] = cir.cast bitcast %[[VPTR_BASE_ADDR]] : !cir.ptr> -> !cir.ptr
// CIR: %[[VPTR_BASE:.*]] = cir.load{{.*}} %[[VPTR_BASE_PTR]] : !cir.ptr, !cir.vptr
// CIR: %[[VPTR_DERIVED_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr
// CIR: %[[VPTR_DERIVED:.*]] = cir.load{{.*}} %[[VPTR_DERIVED_ADDR]] : !cir.ptr, !cir.vptr
-// CIR: %[[VPTR_DERIVED_AS_I8PTR:.*]] = cir.cast(bitcast, %[[VPTR_DERIVED]] : !cir.vptr), !cir.ptr
+// CIR: %[[VPTR_DERIVED_AS_I8PTR:.*]] = cir.cast bitcast %[[VPTR_DERIVED]] : !cir.vptr -> !cir.ptr
// CIR: %[[BASE_LOC_OFFSET:.*]] = cir.const #cir.int<-32> : !s64i
// CIR: %[[BASE_OFFSET_PTR:.*]] = cir.ptr_stride(%[[VPTR_DERIVED_AS_I8PTR]] : !cir.ptr, %[[BASE_LOC_OFFSET]] : !s64i), !cir.ptr
-// CIR: %[[BASE_OFFSET_I64PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_PTR]] : !cir.ptr), !cir.ptr
+// CIR: %[[BASE_OFFSET_I64PTR:.*]] = cir.cast bitcast %[[BASE_OFFSET_PTR]] : !cir.ptr -> !cir.ptr
// CIR: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_I64PTR]] : !cir.ptr, !s64i
-// CIR: %[[THIS_AS_I8PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr
+// CIR: %[[THIS_AS_I8PTR:.*]] = cir.cast bitcast %[[THIS]] : !cir.ptr -> !cir.ptr
// CIR: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_AS_I8PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr
-// CIR: %[[BASE_AS_I8PTR:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr
+// CIR: %[[BASE_AS_I8PTR:.*]] = cir.cast bitcast %[[BASE_PTR]] : !cir.ptr -> !cir.ptr
// CIR: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_AS_I8PTR]] : !cir.ptr -> !cir.ptr
// CIR: cir.store{{.*}} %[[VPTR_BASE]], %[[BASE_VPTR_ADDR]] : !cir.vptr, !cir.ptr
// CIR: %[[VPTR_BASE_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr
diff --git a/clang/test/CIR/CodeGen/delete.cpp b/clang/test/CIR/CodeGen/delete.cpp
index f21d203f266e5..69640aa04531f 100644
--- a/clang/test/CIR/CodeGen/delete.cpp
+++ b/clang/test/CIR/CodeGen/delete.cpp
@@ -21,7 +21,7 @@ void test_sized_delete(SizedDelete *x) {
// CIR: cir.func dso_local @_Z17test_sized_deleteP11SizedDelete
// CIR: %[[X:.*]] = cir.load{{.*}} %{{.*}}
-// CIR: %[[X_CAST:.*]] = cir.cast(bitcast, %[[X]] : !cir.ptr), !cir.ptr
+// CIR: %[[X_CAST:.*]] = cir.cast bitcast %[[X]] : !cir.ptr -> !cir.ptr
// CIR: %[[OBJ_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CIR: cir.call @_ZN11SizedDeletedlEPvm(%[[X_CAST]], %[[OBJ_SIZE]]) nothrow : (!cir.ptr, !u64i) -> ()
@@ -62,7 +62,7 @@ Container::~Container() { delete contents; }
// CIR: %[[CONTENTS_PTR_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "contents"} : !cir.ptr -> !cir.ptr>
// CIR: %[[CONTENTS_PTR:.*]] = cir.load{{.*}} %[[CONTENTS_PTR_ADDR]]
// CIR: cir.call @_ZN8ContentsD2Ev(%[[CONTENTS_PTR]]) nothrow : (!cir.ptr) -> ()
-// CIR: %[[CONTENTS_CAST:.*]] = cir.cast(bitcast, %[[CONTENTS_PTR]] : !cir.ptr), !cir.ptr
+// CIR: %[[CONTENTS_CAST:.*]] = cir.cast bitcast %[[CONTENTS_PTR]] : !cir.ptr -> !cir.ptr
// CIR: %[[OBJ_SIZE:.*]] = cir.const #cir.int<1> : !u64i
// CIR: cir.call @_ZdlPvm(%[[CONTENTS_CAST]], %[[OBJ_SIZE]]) nothrow : (!cir.ptr, !u64i) -> ()
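Note: every hunk in this series is the same mechanical rewrite of the cir.cast assembly format; the operation's semantics are untouched, only its printed form changes. A minimal before/after sketch, written in the CIR notation used throughout these tests (the value names here are illustrative, not taken from any one file):

    // OLD: %b = cir.cast(int_to_bool, %x : !s32i), !cir.bool
    // NEW: %b = cir.cast int_to_bool %x : !s32i -> !cir.bool

The cast kind (bitcast, integral, floating, int_to_bool, array_to_ptrdecay, float_complex, and so on) and the source and result types carry over unchanged; the parentheses and trailing comma give way to a bare kind keyword and an -> arrow.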
diff --git a/clang/test/CIR/CodeGen/destructors.cpp b/clang/test/CIR/CodeGen/destructors.cpp
index fde0732a4352f..1ede1569a826f 100644
--- a/clang/test/CIR/CodeGen/destructors.cpp
+++ b/clang/test/CIR/CodeGen/destructors.cpp
@@ -64,7 +64,7 @@ void test_array_destructor() {
// CIR: cir.func dso_local @_Z21test_array_destructorv()
// CIR: %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr", init]
// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp", init]
-// CIR: %[[BEGIN:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>)
+// CIR: %[[BEGIN:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr>
// CIR: cir.store{{.*}} %[[BEGIN]], %[[ARR_PTR]]
// CIR: %[[FIVE:.*]] = cir.const #cir.int<5> : !s64i
// CIR: %[[ARR_END:.*]] = cir.ptr_stride(%[[BEGIN]] : !cir.ptr, %[[FIVE]] : !s64i)
@@ -80,7 +80,7 @@ void test_array_destructor() {
// CIR: cir.condition(%[[CMP]])
// CIR: }
// CIR: %[[FOUR:.*]] = cir.const #cir.int<4> : !u64i
-// CIR: %[[BEGIN:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr>)
+// CIR: %[[BEGIN:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr>
// CIR: %[[END:.*]] = cir.ptr_stride(%[[BEGIN]] : !cir.ptr, %[[FOUR]] : !u64i)
// CIR: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"]
// CIR: cir.store %[[END]], %[[ARR_PTR]]
diff --git a/clang/test/CIR/CodeGen/finegrain-bitfield-access.cpp b/clang/test/CIR/CodeGen/finegrain-bitfield-access.cpp
index 930b0a9c70059..d9ccd273ff3ba 100644
--- a/clang/test/CIR/CodeGen/finegrain-bitfield-access.cpp
+++ b/clang/test/CIR/CodeGen/finegrain-bitfield-access.cpp
@@ -70,7 +70,7 @@ void write8_1() {
// CIR-LABEL: @_Z8write8_1v
// CIR: [[CONST3:%.*]] = cir.const #cir.int<3> : !s32i
-// CIR: [[INT3:%.*]] = cir.cast(integral, [[CONST3]] : !s32i), !u32i
+// CIR: [[INT3:%.*]] = cir.cast integral [[CONST3]] : !s32i -> !u32i
// CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[1] {name = "f3"} : !cir.ptr -> !cir.ptr
// CIR: cir.set_bitfield align(1) (#bfi_f3, [[MEMBER]] : !cir.ptr, [[INT3]] : !u32i) -> !u32i
@@ -116,7 +116,7 @@ void write8_2() {
// CIR-LABEL: @_Z8write8_2v
// CIR: [[CONST3:%.*]] = cir.const #cir.int<3> : !s32i
-// CIR: [[INT3:%.*]] = cir.cast(integral, [[CONST3]] : !s32i), !u32i
+// CIR: [[INT3:%.*]] = cir.cast integral [[CONST3]] : !s32i -> !u32i
// CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[2] {name = "f5"} : !cir.ptr -> !cir.ptr
// CIR: cir.set_bitfield align(2) (#bfi_f5, %3 : !cir.ptr, {{.*}} : !u32i) -> !u32i
@@ -141,7 +141,7 @@ unsigned read16_1() {
// CIR-LABEL: @_Z8read16_1v
// CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[0] {name = "f1"} : !cir.ptr -> !cir.ptr
// CIR: [[BITFI:%.*]] = cir.get_bitfield align(8) (#bfi_f1, [[MEMBER]] : !cir.ptr) -> !u64i
-// CIR: [[BFCAST:%.*]] = cir.cast(integral, [[BITFI]] : !u64i), !u32i
+// CIR: [[BFCAST:%.*]] = cir.cast integral [[BITFI]] : !u64i -> !u32i
// CIR: cir.store [[BFCAST]], {{.*}} : !u32i, !cir.ptr
// CIR: [[RET:%.*]] = cir.load {{.*}} : !cir.ptr, !u32i
// CIR: cir.return [[RET]] : !u32i
@@ -167,7 +167,7 @@ unsigned read16_2() {
// CIR-LABEL: @_Z8read16_2v
// CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[1] {name = "f2"} : !cir.ptr -> !cir.ptr
// CIR: [[BITFI:%.*]] = cir.get_bitfield align(2) (#bfi_f2, [[MEMBER]] : !cir.ptr) -> !u64i
-// CIR: [[BFCAST:%.*]] = cir.cast(integral, [[BITFI]] : !u64i), !u32i
+// CIR: [[BFCAST:%.*]] = cir.cast integral [[BITFI]] : !u64i -> !u32i
// CIR: cir.store [[BFCAST]], {{.*}} : !u32i, !cir.ptr
// CIR: [[RET:%.*]] = cir.load {{.*}} : !cir.ptr, !u32i
// CIR: cir.return [[RET]] : !u32i
@@ -192,7 +192,7 @@ void write16_1() {
// CIR-LABEL: @_Z9write16_1v
// CIR: [[CONST5:%.*]] = cir.const #cir.int<5> : !s32i
-// CIR: [[INT5:%.*]] = cir.cast(integral, [[CONST5]] : !s32i), !u64i
+// CIR: [[INT5:%.*]] = cir.cast integral [[CONST5]] : !s32i -> !u64i
// CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[0] {name = "f1"} : !cir.ptr -> !cir.ptr
// CIR: cir.set_bitfield align(8) (#bfi_f1, [[MEMBER]] : !cir.ptr, [[INT5]] : !u64i) -> !u64i
// CIR: cir.return
@@ -212,7 +212,7 @@ void write16_2() {
// CIR-LABEL: @_Z9write16_2v // CIR: [[CONST5:%.*]] = cir.const #cir.int<5> : !s32i -// CIR: [[INT5:%.*]] = cir.cast(integral, [[CONST5]] : !s32i), !u64i +// CIR: [[INT5:%.*]] = cir.cast integral [[CONST5]] : !s32i -> !u64i // CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[1] {name = "f2"} : !cir.ptr -> !cir.ptr // CIR: cir.set_bitfield align(2) (#bfi_f2, [[MEMBER]] : !cir.ptr, {{.*}} : !u64i) -> !u64i // CIR: cir.return @@ -232,7 +232,7 @@ unsigned read32_1() { // CIR-LABEL: @_Z8read32_1v // CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[1] {name = "f3"} : !cir.ptr -> !cir.ptr // CIR: [[BITFI:%.*]] = cir.get_bitfield align(4) (#bfi_f3_1, [[MEMBER]] : !cir.ptr) -> !u64i -// CIR: [[BFCAST:%.*]] = cir.cast(integral, [[BITFI]] : !u64i), !u32i +// CIR: [[BFCAST:%.*]] = cir.cast integral [[BITFI]] : !u64i -> !u32i // CIR: cir.store [[BFCAST]], {{.*}} : !u32i, !cir.ptr // CIR: [[RET:%.*]] = cir.load {{.*}} : !cir.ptr, !u32i // CIR: cir.return [[RET]] : !u32i @@ -257,7 +257,7 @@ void write32_1() { // CIR-LABEL: @_Z9write32_1v // CIR: [[CONST5:%.*]] = cir.const #cir.int<5> : !s32i -// CIR: [[INT5:%.*]] = cir.cast(integral, [[CONST5]] : !s32i), !u64i +// CIR: [[INT5:%.*]] = cir.cast integral [[CONST5]] : !s32i -> !u64i // CIR: [[MEMBER:%.*]] = cir.get_member {{.*}}[1] {name = "f3"} : !cir.ptr -> !cir.ptr // CIR: cir.set_bitfield align(4) (#bfi_f3_1, [[MEMBER]] : !cir.ptr, [[INT5]] : !u64i) -> !u64i // CIR: cir.return diff --git a/clang/test/CIR/CodeGen/if.cpp b/clang/test/CIR/CodeGen/if.cpp index daaec8a61484d..823539b88834f 100644 --- a/clang/test/CIR/CodeGen/if.cpp +++ b/clang/test/CIR/CodeGen/if.cpp @@ -74,7 +74,7 @@ void if1(int a) { // CIR: cir.func{{.*}} @_Z3if1i(%arg0: !s32i loc({{.*}})) // CIR: cir.scope { // CIR: %3 = cir.load{{.*}} %0 : !cir.ptr, !s32i -// CIR: %4 = cir.cast(int_to_bool, %3 : !s32i), !cir.bool +// CIR: %4 = cir.cast int_to_bool %3 : !s32i -> !cir.bool // CIR-NEXT: cir.if %4 { // CIR-NEXT: %5 = cir.const #cir.int<3> : !s32i // CIR-NEXT: cir.store{{.*}} %5, %1 : !s32i, !cir.ptr @@ -141,7 +141,7 @@ void if2(int a, bool b, bool c) { // CIR: cir.func{{.*}} @_Z3if2ibb(%arg0: !s32i loc({{.*}}), %arg1: !cir.bool loc({{.*}}), %arg2: !cir.bool loc({{.*}})) // CIR: cir.scope { // CIR: %5 = cir.load{{.*}} %0 : !cir.ptr, !s32i -// CIR: %6 = cir.cast(int_to_bool, %5 : !s32i), !cir.bool +// CIR: %6 = cir.cast int_to_bool %5 : !s32i -> !cir.bool // CIR: cir.if %6 { // CIR: %7 = cir.const #cir.int<3> : !s32i // CIR: cir.store{{.*}} %7, %3 : !s32i, !cir.ptr @@ -267,7 +267,7 @@ int if_init() { // CIR: %[[CONST42:.*]] = cir.const #cir.int<42> : !s32i // CIR: cir.store{{.*}} %[[CONST42]], %[[X]] : !s32i, !cir.ptr // CIR: %[[X_VAL:.*]] = cir.load{{.*}} %[[X]] : !cir.ptr, !s32i -// CIR: %[[COND:.*]] = cir.cast(int_to_bool, %[[X_VAL]] : !s32i), !cir.bool +// CIR: %[[COND:.*]] = cir.cast int_to_bool %[[X_VAL]] : !s32i -> !cir.bool // CIR: cir.if %[[COND]] { // CIR: %[[X_IF:.*]] = cir.load{{.*}} %[[X]] : !cir.ptr, !s32i // CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i diff --git a/clang/test/CIR/CodeGen/int-to-bool.cpp b/clang/test/CIR/CodeGen/int-to-bool.cpp index ad36af4552c2f..97b799b60d25f 100644 --- a/clang/test/CIR/CodeGen/int-to-bool.cpp +++ b/clang/test/CIR/CodeGen/int-to-bool.cpp @@ -10,7 +10,7 @@ bool f1(unsigned char c) { } // CIR: cir.func{{.*}} @_Z2f1h -// CIR: cir.cast(int_to_bool, %{{.*}} : !u8i), !cir.bool +// CIR: cir.cast int_to_bool %{{.*}} : !u8i -> !cir.bool // Note: The full zext/store/load/trunc sequence is checked here to show what // CIR is being lowered to. 
There's no need to check it for every function since @@ -33,7 +33,7 @@ bool f2(short s) { } // CIR: cir.func{{.*}} @_Z2f2s -// CIR: cir.cast(int_to_bool, %{{.*}} : !s16i), !cir.bool +// CIR: cir.cast int_to_bool %{{.*}} : !s16i -> !cir.bool // LLVM: define{{.*}} i1 @_Z2f2s // LLVM: %[[CMP:.*]] = icmp ne i16 %4, 0 @@ -48,7 +48,7 @@ bool f3(unsigned u) { } // CIR: cir.func{{.*}} @_Z2f3j -// CIR: cir.cast(int_to_bool, %{{.*}} : !u32i), !cir.bool +// CIR: cir.cast int_to_bool %{{.*}} : !u32i -> !cir.bool // LLVM: define{{.*}} i1 @_Z2f3j // LLVM: %[[CMP:.*]] = icmp ne i32 %4, 0 @@ -63,7 +63,7 @@ bool f4(long l) { } // CIR: cir.func{{.*}} @_Z2f4l -// CIR: cir.cast(int_to_bool, %{{.*}} : !s64i), !cir.bool +// CIR: cir.cast int_to_bool %{{.*}} : !s64i -> !cir.bool // LLVM: define{{.*}} i1 @_Z2f4l // LLVM: %[[CMP:.*]] = icmp ne i64 %4, 0 diff --git a/clang/test/CIR/CodeGen/loop.cpp b/clang/test/CIR/CodeGen/loop.cpp index 0eba0bbc97c15..b30589cd1b6ec 100644 --- a/clang/test/CIR/CodeGen/loop.cpp +++ b/clang/test/CIR/CodeGen/loop.cpp @@ -205,10 +205,10 @@ void l4() { // CIR: %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init] // CIR: cir.store{{.*}} %[[A_ADDR]], %[[RANGE_ADDR]] // CIR: %[[RANGE_LOAD:.*]] = cir.load{{.*}} %[[RANGE_ADDR]] -// CIR: %[[RANGE_CAST:.*]] = cir.cast(array_to_ptrdecay, %[[RANGE_LOAD]] : {{.*}}) +// CIR: %[[RANGE_CAST:.*]] = cir.cast array_to_ptrdecay %[[RANGE_LOAD]] : {{.*}} // CIR: cir.store{{.*}} %[[RANGE_CAST]], %[[BEGIN_ADDR]] // CIR: %[[BEGIN:.*]] = cir.load{{.*}} %[[RANGE_ADDR]] -// CIR: %[[BEGIN_CAST:.*]] = cir.cast(array_to_ptrdecay, %[[BEGIN]] : {{.*}}) +// CIR: %[[BEGIN_CAST:.*]] = cir.cast array_to_ptrdecay %[[BEGIN]] : {{.*}} // CIR: %[[TEN:.*]] = cir.const #cir.int<10> // CIR: %[[END_PTR:.*]] = cir.ptr_stride(%[[BEGIN_CAST]] : {{.*}}, %[[TEN]] : {{.*}}) // CIR: cir.store{{.*}} %[[END_PTR]], %[[END_ADDR]] @@ -312,7 +312,7 @@ void l5() { // CIR: %[[BEGIN_ADDR:.*]] = cir.alloca {{.*}} ["__begin1", init] // CIR: %[[END_ADDR:.*]] = cir.alloca {{.*}} ["__end1", init] // CIR: %[[X_ADDR:.*]] = cir.alloca {{.*}} ["x", init] -// CIR: %[[ARR_CAST:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ADDR]] : {{.*}}) +// CIR: %[[ARR_CAST:.*]] = cir.cast array_to_ptrdecay %[[ARR_ADDR]] : {{.*}} // CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CIR: cir.store{{.*}} %[[ONE]], %[[ARR_CAST]] // CIR: %[[OFFSET1:.*]] = cir.const #cir.int<1> : !s64i @@ -329,10 +329,10 @@ void l5() { // CIR: cir.store{{.*}} %[[FOUR]], %[[STRIDE3]] // CIR: cir.store{{.*}} %[[ARR_ADDR]], %[[RANGE_ADDR]] // CIR: %[[RANGE_LOAD:.*]] = cir.load{{.*}} %[[RANGE_ADDR]] -// CIR: %[[RANGE_CAST:.*]] = cir.cast(array_to_ptrdecay, %[[RANGE_LOAD]] : {{.*}}) +// CIR: %[[RANGE_CAST:.*]] = cir.cast array_to_ptrdecay %[[RANGE_LOAD]] : {{.*}} // CIR: cir.store{{.*}} %[[RANGE_CAST]], %[[BEGIN_ADDR]] // CIR: %[[BEGIN:.*]] = cir.load{{.*}} %[[RANGE_ADDR]] -// CIR: %[[BEGIN_CAST:.*]] = cir.cast(array_to_ptrdecay, %[[BEGIN]] : {{.*}}) +// CIR: %[[BEGIN_CAST:.*]] = cir.cast array_to_ptrdecay %[[BEGIN]] : {{.*}} // CIR: %[[FOUR:.*]] = cir.const #cir.int<4> : !s64i // CIR: %[[END_PTR:.*]] = cir.ptr_stride(%[[BEGIN_CAST]] : {{.*}}, %[[FOUR]] : {{.*}}) // CIR: cir.store{{.*}} %[[END_PTR]], %[[END_ADDR]] @@ -445,7 +445,7 @@ void test_do_while_false() { // CIR-NEXT: cir.yield // CIR-NEXT: } while { // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i -// CIR-NEXT: %[[FALSE:.*]] = cir.cast(int_to_bool, %[[ZERO]] : !s32i), !cir.bool +// CIR-NEXT: %[[FALSE:.*]] = cir.cast int_to_bool %[[ZERO]] : !s32i -> !cir.bool // CIR-NEXT: 
cir.condition(%[[FALSE]]) // LLVM: define{{.*}} void @_Z19test_do_while_falsev() diff --git a/clang/test/CIR/CodeGen/new.cpp b/clang/test/CIR/CodeGen/new.cpp index b14bf077cd154..91dae3f28c572 100644 --- a/clang/test/CIR/CodeGen/new.cpp +++ b/clang/test/CIR/CodeGen/new.cpp @@ -22,15 +22,15 @@ void test_basic_new() { // CHECK: %[[PD_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["pd", init] // CHECK: %[[EIGHT:.*]] = cir.const #cir.int<8> // CHECK: %[[NEW_S:.*]] = cir.call @_Znwm(%[[EIGHT]]) -// CHECK: %[[NEW_S_PTR:.*]] = cir.cast(bitcast, %[[NEW_S]] +// CHECK: %[[NEW_S_PTR:.*]] = cir.cast bitcast %[[NEW_S]] // CHECK: cir.store{{.*}} %[[NEW_S_PTR]], %[[PS_ADDR]] // CHECK: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK: %[[NEW_INT:.*]] = cir.call @_Znwm(%[[FOUR]]) -// CHECK: %[[NEW_INT_PTR:.*]] = cir.cast(bitcast, %[[NEW_INT]] +// CHECK: %[[NEW_INT_PTR:.*]] = cir.cast bitcast %[[NEW_INT]] // CHECK: cir.store{{.*}} %[[NEW_INT_PTR]], %[[PN_ADDR]] // CHECK: %[[EIGHT:.*]] = cir.const #cir.int<8> // CHECK: %[[NEW_DOUBLE:.*]] = cir.call @_Znwm(%[[EIGHT]]) -// CHECK: %[[NEW_DOUBLE_PTR:.*]] = cir.cast(bitcast, %[[NEW_DOUBLE]] +// CHECK: %[[NEW_DOUBLE_PTR:.*]] = cir.cast bitcast %[[NEW_DOUBLE]] // CHECK: cir.store{{.*}} %[[NEW_DOUBLE_PTR]], %[[PD_ADDR]] // CHECK: cir.return @@ -68,13 +68,13 @@ void test_new_with_init() { // CHECK: %[[PD_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["pd", init] // CHECK: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK: %[[NEW_INT:.*]] = cir.call @_Znwm(%[[FOUR]]) -// CHECK: %[[NEW_INT_PTR:.*]] = cir.cast(bitcast, %[[NEW_INT]] +// CHECK: %[[NEW_INT_PTR:.*]] = cir.cast bitcast %[[NEW_INT]] // CHECK: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK: cir.store{{.*}} %[[TWO]], %[[NEW_INT_PTR]] // CHECK: cir.store{{.*}} %[[NEW_INT_PTR]], %[[PN_ADDR]] // CHECK: %[[EIGHT:.*]] = cir.const #cir.int<8> // CHECK: %[[NEW_DOUBLE:.*]] = cir.call @_Znwm(%[[EIGHT]]) -// CHECK: %[[NEW_DOUBLE_PTR:.*]] = cir.cast(bitcast, %[[NEW_DOUBLE]] +// CHECK: %[[NEW_DOUBLE_PTR:.*]] = cir.cast bitcast %[[NEW_DOUBLE]] // CHECK: %[[THREE:.*]] = cir.const #cir.fp<3.000000e+00> // CHECK: cir.store{{.*}} %[[THREE]], %[[NEW_DOUBLE_PTR]] // CHECK: cir.store{{.*}} %[[NEW_DOUBLE_PTR]], %[[PD_ADDR]] @@ -119,12 +119,12 @@ void test_new_with_ctor() { // CHECK: %[[PS2_2_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ps2_2", init] // CHECK: %[[EIGHT:.*]] = cir.const #cir.int<8> // CHECK: %[[NEW_S2:.*]] = cir.call @_Znwm(%[[EIGHT]]) -// CHECK: %[[NEW_S2_PTR:.*]] = cir.cast(bitcast, %[[NEW_S2]] +// CHECK: %[[NEW_S2_PTR:.*]] = cir.cast bitcast %[[NEW_S2]] // CHECK: cir.call @_ZN2S2C1Ev(%[[NEW_S2_PTR]]) // CHECK: cir.store{{.*}} %[[NEW_S2_PTR]], %[[PS2_ADDR]] // CHECK: %[[EIGHT:.*]] = cir.const #cir.int<8> // CHECK: %[[NEW_S2_2:.*]] = cir.call @_Znwm(%[[EIGHT]]) -// CHECK: %[[NEW_S2_2_PTR:.*]] = cir.cast(bitcast, %[[NEW_S2_2]] +// CHECK: %[[NEW_S2_2_PTR:.*]] = cir.cast bitcast %[[NEW_S2_2]] // CHECK: %[[ONE:.*]] = cir.const #cir.int<1> // CHECK: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK: cir.call @_ZN2S2C1Eii(%[[NEW_S2_2_PTR]], %[[ONE]], %[[TWO]]) @@ -161,7 +161,7 @@ void test_new_with_complex_type() { // CHECK: %0 = cir.alloca !cir.ptr>, !cir.ptr>>, ["a", init] // CHECK: %1 = cir.const #cir.int<8> : !u64i // CHECK: %2 = cir.call @_Znwm(%1) : (!u64i) -> !cir.ptr -// CHECK: %3 = cir.cast(bitcast, %2 : !cir.ptr), !cir.ptr> +// CHECK: %3 = cir.cast bitcast %2 : !cir.ptr -> !cir.ptr> // CHECK: %4 = cir.const #cir.const_complex<#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float> : !cir.complex // CHECK: 
cir.store align(8) %4, %3 : !cir.complex, !cir.ptr> // CHECK: cir.store align(8) %3, %0 : !cir.ptr>, !cir.ptr>> diff --git a/clang/test/CIR/CodeGen/no-prototype.c b/clang/test/CIR/CodeGen/no-prototype.c index 4be6a94c12129..728c4b80b95a2 100644 --- a/clang/test/CIR/CodeGen/no-prototype.c +++ b/clang/test/CIR/CodeGen/no-prototype.c @@ -51,7 +51,7 @@ int test3(int x) { // CHECK: cir.func dso_local @test3 return noProto3(x); // CHECK: [[GGO:%.*]] = cir.get_global @noProto3 : !cir.ptr !s32i>> - // CHECK: [[CAST:%.*]] = cir.cast(bitcast, [[GGO]] : !cir.ptr !s32i>>), !cir.ptr !s32i>> + // CHECK: [[CAST:%.*]] = cir.cast bitcast [[GGO]] : !cir.ptr !s32i>> -> !cir.ptr !s32i>> // CHECK: {{%.*}} = cir.call [[CAST]](%{{[0-9]+}}) : (!cir.ptr !s32i>>, !s32i) -> !s32i } @@ -68,7 +68,7 @@ int noProto4() { return 0; } int test4(int x) { return noProto4(x); // Even if we know the definition, this should compile. // CHECK: [[GGO:%.*]] = cir.get_global @noProto4 : !cir.ptr !s32i>> - // CHECK: [[CAST:%.*]] = cir.cast(bitcast, [[GGO]] : !cir.ptr !s32i>>), !cir.ptr !s32i>> + // CHECK: [[CAST:%.*]] = cir.cast bitcast [[GGO]] : !cir.ptr !s32i>> -> !cir.ptr !s32i>> // CHECK: {{%.*}} = cir.call [[CAST]]({{%.*}}) : (!cir.ptr !s32i>>, !s32i) -> !s32i } @@ -77,7 +77,7 @@ int noProto5(); int test5(int x) { return noProto5(); // CHECK: [[GGO:%.*]] = cir.get_global @noProto5 : !cir.ptr !s32i>> - // CHECK: [[CAST:%.*]] = cir.cast(bitcast, [[GGO]] : !cir.ptr !s32i>>), !cir.ptr !s32i>> + // CHECK: [[CAST:%.*]] = cir.cast bitcast [[GGO]] : !cir.ptr !s32i>> -> !cir.ptr !s32i>> // CHECK: {{%.*}} = cir.call [[CAST]]() : (!cir.ptr !s32i>>) -> !s32i } int noProto5(int x) { return x; } diff --git a/clang/test/CIR/CodeGen/opaque.c b/clang/test/CIR/CodeGen/opaque.c index 96ecdfc4cd978..73f6402e8a484 100644 --- a/clang/test/CIR/CodeGen/opaque.c +++ b/clang/test/CIR/CodeGen/opaque.c @@ -17,8 +17,8 @@ void foo2() { // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.float // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.float -// CIR: %[[A_REAL_BOOL:.*]] = cir.cast(float_to_bool, %[[A_REAL]] : !cir.float), !cir.bool -// CIR: %[[A_IMAG_BOOL:.*]] = cir.cast(float_to_bool, %[[A_IMAG]] : !cir.float), !cir.bool +// CIR: %[[A_REAL_BOOL:.*]] = cir.cast float_to_bool %[[A_REAL]] : !cir.float -> !cir.bool +// CIR: %[[A_IMAG_BOOL:.*]] = cir.cast float_to_bool %[[A_IMAG]] : !cir.float -> !cir.bool // CIR: %[[CONST_TRUE:.*]] = cir.const #true // CIR: %[[COND:.*]] = cir.select if %[[A_REAL_BOOL]] then %[[CONST_TRUE]] else %[[A_IMAG_BOOL]] : (!cir.bool, !cir.bool, !cir.bool) -> !cir.bool // CIR: %[[RESULT:.*]] = cir.ternary(%[[COND]], true { diff --git a/clang/test/CIR/CodeGen/opaque.cpp b/clang/test/CIR/CodeGen/opaque.cpp index a48c013e5c20b..028bfd9ef4cd0 100644 --- a/clang/test/CIR/CodeGen/opaque.cpp +++ b/clang/test/CIR/CodeGen/opaque.cpp @@ -35,8 +35,8 @@ void foo2() { // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex -> !cir.float // CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex -> !cir.float -// CIR: %[[A_REAL_BOOL:.*]] = cir.cast(float_to_bool, %[[A_REAL]] : !cir.float), !cir.bool -// CIR: %[[A_IMAG_BOOL:.*]] = cir.cast(float_to_bool, %[[A_IMAG]] : !cir.float), !cir.bool +// CIR: %[[A_REAL_BOOL:.*]] = cir.cast float_to_bool %[[A_REAL]] : !cir.float -> !cir.bool +// CIR: %[[A_IMAG_BOOL:.*]] = cir.cast 
float_to_bool %[[A_IMAG]] : !cir.float -> !cir.bool
// CIR: %[[CONST_TRUE:.*]] = cir.const #true
// CIR: %[[COND:.*]] = cir.select if %[[A_REAL_BOOL]] then %[[CONST_TRUE]] else %[[A_IMAG_BOOL]] : (!cir.bool, !cir.bool, !cir.bool) -> !cir.bool
// CIR: %[[RESULT:.*]] = cir.ternary(%[[COND]], true {
@@ -111,7 +111,7 @@ void foo3() {
// CIR: %[[B_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["b"]
// CIR: %[[C_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["c", init]
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !s32i
-// CIR: %[[A_BOOL:.*]] = cir.cast(int_to_bool, %[[TMP_A]] : !s32i), !cir.bool
+// CIR: %[[A_BOOL:.*]] = cir.cast int_to_bool %[[TMP_A]] : !s32i -> !cir.bool
// CIR: %[[RESULT:.*]] = cir.ternary(%[[A_BOOL]], true {
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !s32i
// CIR: cir.yield %[[TMP_A]] : !s32i
diff --git a/clang/test/CIR/CodeGen/pointers.cpp b/clang/test/CIR/CodeGen/pointers.cpp
index dcfcc723f4da1..2c3dbb0fd6c58 100644
--- a/clang/test/CIR/CodeGen/pointers.cpp
+++ b/clang/test/CIR/CodeGen/pointers.cpp
@@ -24,7 +24,7 @@ void foo(int *iptr, char *cptr, unsigned ustride) {
// Must convert unsigned stride to a signed one.
iptr - ustride;
// CHECK: %[[#STRIDE:]] = cir.load{{.*}} %{{.+}} : !cir.ptr, !u32i
- // CHECK: %[[#SIGNSTRIDE:]] = cir.cast(integral, %[[#STRIDE]] : !u32i), !s32i
+ // CHECK: %[[#SIGNSTRIDE:]] = cir.cast integral %[[#STRIDE]] : !u32i -> !s32i
// CHECK: %[[#NEGSTRIDE:]] = cir.unary(minus, %[[#SIGNSTRIDE]]) : !s32i, !s32i
// CHECK: cir.ptr_stride(%{{.+}} : !cir.ptr, %[[#NEGSTRIDE]] : !s32i), !cir.ptr
diff --git a/clang/test/CIR/CodeGen/ternary.cpp b/clang/test/CIR/CodeGen/ternary.cpp
index 781286a94cc2e..eb38ee3083e5c 100644
--- a/clang/test/CIR/CodeGen/ternary.cpp
+++ b/clang/test/CIR/CodeGen/ternary.cpp
@@ -69,7 +69,7 @@ int foo(int a, int b) {
// CIR: [[ALOAD2:%.+]] = cir.load align(4) [[A]] : !cir.ptr, !s32i
// CIR: cir.yield [[ALOAD2]] : !s32i
// CIR: }) : (!cir.bool) -> !s32i
-// CIR: [[CAST:%.+]] = cir.cast(int_to_bool, [[TERNARY_RES]] : !s32i), !cir.bool
+// CIR: [[CAST:%.+]] = cir.cast int_to_bool [[TERNARY_RES]] : !s32i -> !cir.bool
// CIR: cir.if [[CAST]] {
// CIR: [[ONE:%.+]] = cir.const #cir.int<1> : !s32i
// CIR: [[MINUS_ONE:%.+]] = cir.unary(minus, [[ONE]]) nsw : !s32i, !s32i
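For readers outside the CIR test suite: the patterns being updated live in ordinary Clang lit tests, C++ sources with FileCheck expectations in comments. A tiny hypothetical example in that style (not a file in this patch) showing how a check against the new spelling reads:

    // Hypothetical test snippet, in the style of the files in this patch.
    bool cast_demo(int x) {
      return x; // the int-to-bool conversion lowers to a cir.cast
    }
    // CIR-LABEL: cir.func
    // CIR: %{{.*}} = cir.cast int_to_bool %{{.*}} : !s32i -> !cir.bool

Because FileCheck matches these lines literally, a single stale cir.cast(...) spelling fails the whole test, which is why every pattern in each file is rewritten in one pass.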
diff --git a/clang/test/CIR/CodeGen/unary.cpp b/clang/test/CIR/CodeGen/unary.cpp
index c37524bc8b2c9..ac1ae344c6b48 100644
--- a/clang/test/CIR/CodeGen/unary.cpp
+++ b/clang/test/CIR/CodeGen/unary.cpp
@@ -410,10 +410,10 @@ void chars(char c) {
// CHECK: cir.func{{.*}} @_Z5charsc
int c1 = +c;
- // CHECK: %[[PROMO:.*]] = cir.cast(integral, %{{.+}} : !s8i), !s32i
+ // CHECK: %[[PROMO:.*]] = cir.cast integral %{{.+}} : !s8i -> !s32i
// CHECK: cir.unary(plus, %[[PROMO]]) : !s32i, !s32i
int c2 = -c;
- // CHECK: %[[PROMO:.*]] = cir.cast(integral, %{{.+}} : !s8i), !s32i
+ // CHECK: %[[PROMO:.*]] = cir.cast integral %{{.+}} : !s8i -> !s32i
// CHECK: cir.unary(minus, %[[PROMO]]) nsw : !s32i, !s32i
// Chars can go through some integer promotion codegen paths even when not promoted.
@@ -431,9 +431,9 @@ _Float16 fp16UPlus(_Float16 f) {
// CHECK: cir.func{{.*}} @_Z9fp16UPlusDF16_({{.*}}) -> !cir.f16
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[F:.*]]
-// CHECK: %[[PROMOTED:.*]] = cir.cast(floating, %[[INPUT]] : !cir.f16), !cir.float
+// CHECK: %[[PROMOTED:.*]] = cir.cast floating %[[INPUT]] : !cir.f16 -> !cir.float
// CHECK: %[[RESULT:.*]] = cir.unary(plus, %[[PROMOTED]])
-// CHECK: %[[UNPROMOTED:.*]] = cir.cast(floating, %[[RESULT]] : !cir.float), !cir.f16
+// CHECK: %[[UNPROMOTED:.*]] = cir.cast floating %[[RESULT]] : !cir.float -> !cir.f16
// LLVM: define{{.*}} half @_Z9fp16UPlusDF16_({{.*}})
// LLVM: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2
@@ -451,9 +451,9 @@ _Float16 fp16UMinus(_Float16 f) {
// CHECK: cir.func{{.*}} @_Z10fp16UMinusDF16_({{.*}}) -> !cir.f16
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[F:.*]]
-// CHECK: %[[PROMOTED:.*]] = cir.cast(floating, %[[INPUT]] : !cir.f16), !cir.float
+// CHECK: %[[PROMOTED:.*]] = cir.cast floating %[[INPUT]] : !cir.f16 -> !cir.float
// CHECK: %[[RESULT:.*]] = cir.unary(minus, %[[PROMOTED]])
-// CHECK: %[[UNPROMOTED:.*]] = cir.cast(floating, %[[RESULT]] : !cir.float), !cir.f16
+// CHECK: %[[UNPROMOTED:.*]] = cir.cast floating %[[RESULT]] : !cir.float -> !cir.f16
// LLVM: define{{.*}} half @_Z10fp16UMinusDF16_({{.*}})
// LLVM: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2
@@ -482,24 +482,24 @@ void test_logical_not() {
// CHECK: cir.func{{.*}} @_Z16test_logical_notv()
// CHECK: %[[A:.*]] = cir.load{{.*}} %[[A_ADDR:.*]] : !cir.ptr, !s32i
-// CHECK: %[[A_BOOL:.*]] = cir.cast(int_to_bool, %[[A]] : !s32i), !cir.bool
+// CHECK: %[[A_BOOL:.*]] = cir.cast int_to_bool %[[A]] : !s32i -> !cir.bool
// CHECK: %[[A_NOT:.*]] = cir.unary(not, %[[A_BOOL]]) : !cir.bool, !cir.bool
-// CHECK: %[[A_CAST:.*]] = cir.cast(bool_to_int, %[[A_NOT]] : !cir.bool), !s32i
+// CHECK: %[[A_CAST:.*]] = cir.cast bool_to_int %[[A_NOT]] : !cir.bool -> !s32i
// CHECK: cir.store{{.*}} %[[A_CAST]], %[[A_ADDR]] : !s32i, !cir.ptr
// CHECK: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR:.*]] : !cir.ptr, !cir.bool
// CHECK: %[[B_NOT:.*]] = cir.unary(not, %[[B]]) : !cir.bool, !cir.bool
// CHECK: cir.store{{.*}} %[[B_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr
// CHECK: %[[C:.*]] = cir.load{{.*}} %[[C_ADDR:.*]] : !cir.ptr, !cir.float
-// CHECK: %[[C_BOOL:.*]] = cir.cast(float_to_bool, %[[C]] : !cir.float), !cir.bool
+// CHECK: %[[C_BOOL:.*]] = cir.cast float_to_bool %[[C]] : !cir.float -> !cir.bool
// CHECK: %[[C_NOT:.*]] = cir.unary(not, %[[C_BOOL]]) : !cir.bool, !cir.bool
-// CHECK: %[[C_CAST:.*]] = cir.cast(bool_to_float, %[[C_NOT]] : !cir.bool), !cir.float
+// CHECK: %[[C_CAST:.*]] = cir.cast bool_to_float %[[C_NOT]] : !cir.bool -> !cir.float
// CHECK: cir.store{{.*}} %[[C_CAST]], %[[C_ADDR]] : !cir.float, !cir.ptr
// CHECK: %[[P:.*]] = cir.load{{.*}} %[[P_ADDR:.*]] : !cir.ptr>, !cir.ptr
-// CHECK: %[[P_BOOL:.*]] = cir.cast(ptr_to_bool, %[[P]] : !cir.ptr), !cir.bool
+// CHECK: %[[P_BOOL:.*]] = cir.cast ptr_to_bool %[[P]] : !cir.ptr -> !cir.bool
// CHECK: %[[P_NOT:.*]] = cir.unary(not, %[[P_BOOL]]) : !cir.bool, !cir.bool
// CHECK: cir.store{{.*}} %[[P_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr
// CHECK: %[[D:.*]] = cir.load{{.*}} %[[D_ADDR:.*]] : !cir.ptr, !cir.double
-// CHECK: %[[D_BOOL:.*]] = cir.cast(float_to_bool, %[[D]] : !cir.double), !cir.bool
+// CHECK: %[[D_BOOL:.*]] = cir.cast float_to_bool %[[D]] : !cir.double -> !cir.bool
// CHECK: %[[D_NOT:.*]] = cir.unary(not, %[[D_BOOL]]) : !cir.bool, !cir.bool
// CHECK: cir.store{{.*}} %[[D_NOT]], %[[B_ADDR]] : !cir.bool,
!cir.ptr @@ -566,10 +566,10 @@ void f16NestedUPlus() { // CHECK: %[[A_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["a"] // CHECK: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["b", init] // CHECK: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !cir.f16 -// CHECK: %[[A_F32:.*]] = cir.cast(floating, %[[TMP_A]] : !cir.f16), !cir.float +// CHECK: %[[A_F32:.*]] = cir.cast floating %[[TMP_A]] : !cir.f16 -> !cir.float // CHECK: %[[A_PLUS:.*]] = cir.unary(plus, %[[A_F32]]) : !cir.float, !cir.float // CHECK: %[[RESULT_F32:.*]] = cir.unary(plus, %[[A_PLUS]]) : !cir.float, !cir.float -// CHECK: %[[RESULT:.*]] = cir.cast(floating, %[[RESULT_F32]] : !cir.float), !cir.f16 +// CHECK: %[[RESULT:.*]] = cir.cast floating %[[RESULT_F32]] : !cir.float -> !cir.f16 // CHECK: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.f16, !cir.ptr // LLVM: define{{.*}} void @_Z14f16NestedUPlusv() @@ -597,10 +597,10 @@ void f16NestedUMinus() { // CHECK: %[[A_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["a"] // CHECK: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["b", init] // CHECK: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !cir.f16 -// CHECK: %[[A_F32:.*]] = cir.cast(floating, %[[TMP_A]] : !cir.f16), !cir.float +// CHECK: %[[A_F32:.*]] = cir.cast floating %[[TMP_A]] : !cir.f16 -> !cir.float // CHECK: %[[A_MINUS:.*]] = cir.unary(minus, %[[A_F32]]) : !cir.float, !cir.float // CHECK: %[[RESULT_F32:.*]] = cir.unary(minus, %[[A_MINUS]]) : !cir.float, !cir.float -// CHECK: %[[RESULT:.*]] = cir.cast(floating, %[[RESULT_F32]] : !cir.float), !cir.f16 +// CHECK: %[[RESULT:.*]] = cir.cast floating %[[RESULT_F32]] : !cir.float -> !cir.f16 // CHECK: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.f16, !cir.ptr // LLVM: define{{.*}} void @_Z15f16NestedUMinusv() diff --git a/clang/test/CIR/CodeGen/union.c b/clang/test/CIR/CodeGen/union.c index 23e862b24517d..bda8e77b89048 100644 --- a/clang/test/CIR/CodeGen/union.c +++ b/clang/test/CIR/CodeGen/union.c @@ -116,7 +116,7 @@ void shouldGenerateUnionAccess(union U2 u) { // CIR-NEXT: %[[U:.*]] = cir.alloca !rec_U2, !cir.ptr, ["u", init] {alignment = 8 : i64} // CIR-NEXT: cir.store{{.*}} %[[ARG]], %[[U]] : !rec_U2, !cir.ptr // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i -// CIR-NEXT: %[[ZERO_CHAR:.*]] = cir.cast(integral, %[[ZERO]] : !s32i), !s8i +// CIR-NEXT: %[[ZERO_CHAR:.*]] = cir.cast integral %[[ZERO]] : !s32i -> !s8i // CIR-NEXT: %[[B_PTR:.*]] = cir.get_member %[[U]][0] {name = "b"} : !cir.ptr -> !cir.ptr // CIR-NEXT: cir.store{{.*}} %[[ZERO_CHAR]], %[[B_PTR]] : !s8i, !cir.ptr // CIR-NEXT: %[[B_PTR2:.*]] = cir.get_member %[[U]][0] {name = "b"} : !cir.ptr -> !cir.ptr @@ -174,10 +174,10 @@ void f3(union U3 u) { // CIR-NEXT: %[[U:.*]] = cir.alloca !rec_U3, !cir.ptr, ["u", init] {alignment = 1 : i64} // CIR-NEXT: cir.store{{.*}} %[[ARG]], %[[U]] : !rec_U3, !cir.ptr // CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i -// CIR-NEXT: %[[ZERO_CHAR:.*]] = cir.cast(integral, %[[ZERO]] : !s32i), !s8i +// CIR-NEXT: %[[ZERO_CHAR:.*]] = cir.cast integral %[[ZERO]] : !s32i -> !s8i // CIR-NEXT: %[[IDX:.*]] = cir.const #cir.int<2> : !s32i // CIR-NEXT: %[[C_PTR:.*]] = cir.get_member %[[U]][0] {name = "c"} : !cir.ptr -> !cir.ptr> -// CIR-NEXT: %[[C_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[C_PTR]] : !cir.ptr>), !cir.ptr +// CIR-NEXT: %[[C_DECAY:.*]] = cir.cast array_to_ptrdecay %[[C_PTR]] : !cir.ptr> -> !cir.ptr // CIR-NEXT: %[[ELEM_PTR:.*]] = cir.ptr_stride(%[[C_DECAY]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr // CIR-NEXT: cir.store{{.*}} %[[ZERO_CHAR]], 
%[[ELEM_PTR]] : !s8i, !cir.ptr
// CIR-NEXT: cir.return
@@ -206,10 +206,10 @@ void f5(union U4 u) {
// CIR-NEXT: %[[U:.*]] = cir.alloca !rec_U4, !cir.ptr, ["u", init] {alignment = 4 : i64}
// CIR-NEXT: cir.store{{.*}} %[[ARG]], %[[U]] : !rec_U4, !cir.ptr
// CIR-NEXT: %[[CHAR_VAL:.*]] = cir.const #cir.int<65> : !s32i
-// CIR-NEXT: %[[CHAR_CAST:.*]] = cir.cast(integral, %[[CHAR_VAL]] : !s32i), !s8i
+// CIR-NEXT: %[[CHAR_CAST:.*]] = cir.cast integral %[[CHAR_VAL]] : !s32i -> !s8i
// CIR-NEXT: %[[IDX:.*]] = cir.const #cir.int<4> : !s32i
// CIR-NEXT: %[[C_PTR:.*]] = cir.get_member %[[U]][0] {name = "c"} : !cir.ptr -> !cir.ptr>
-// CIR-NEXT: %[[C_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[C_PTR]] : !cir.ptr>), !cir.ptr
+// CIR-NEXT: %[[C_DECAY:.*]] = cir.cast array_to_ptrdecay %[[C_PTR]] : !cir.ptr> -> !cir.ptr
// CIR-NEXT: %[[ELEM_PTR:.*]] = cir.ptr_stride(%[[C_DECAY]] : !cir.ptr, %[[IDX]] : !s32i), !cir.ptr
// CIR-NEXT: cir.store{{.*}} %[[CHAR_CAST]], %[[ELEM_PTR]] : !s8i, !cir.ptr
// CIR-NEXT: cir.return
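For context on why a pure printer change forces edits across all of these files: the CIR codegen tests are driven by lit RUN lines of roughly the following shape (paraphrased from memory as an assumption, not quoted from this patch; check each file's actual header):

    // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
    // RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR

so the expectations compare directly against the textual CIR the compiler prints, and any file still spelling the old cast form fails as soon as the printer changes.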
diff --git a/clang/test/CIR/CodeGen/var_arg.c b/clang/test/CIR/CodeGen/var_arg.c
index e9c4acb15d009..f5b92c61e11ad 100644
--- a/clang/test/CIR/CodeGen/var_arg.c
+++ b/clang/test/CIR/CodeGen/var_arg.c
@@ -23,13 +23,13 @@ int varargs(int count, ...) {
// CIR: %[[VAAREA:.+]] = cir.alloca !cir.array, !cir.ptr>, ["args"]
// CIR: %[[RES_ADDR:.+]] = cir.alloca !s32i, !cir.ptr, ["res", init]
// CIR: cir.store %arg0, %[[COUNT_ADDR]] : !s32i, !cir.ptr
-// CIR: %[[VA_PTR0:.+]] = cir.cast(array_to_ptrdecay, %[[VAAREA]] : !cir.ptr>), !cir.ptr
+// CIR: %[[VA_PTR0:.+]] = cir.cast array_to_ptrdecay %[[VAAREA]] : !cir.ptr> -> !cir.ptr
// CIR: %[[COUNT_VAL:.+]] = cir.load{{.*}} %[[COUNT_ADDR]] : !cir.ptr, !s32i
// CIR: cir.va_start %[[VA_PTR0]] %[[COUNT_VAL]] : !cir.ptr, !s32i
-// CIR: %[[VA_PTR1:.+]] = cir.cast(array_to_ptrdecay, %[[VAAREA]] : !cir.ptr>), !cir.ptr
+// CIR: %[[VA_PTR1:.+]] = cir.cast array_to_ptrdecay %[[VAAREA]] : !cir.ptr> -> !cir.ptr
// CIR: %[[VA_ARG:.+]] = cir.va_arg %[[VA_PTR1]] : (!cir.ptr) -> !s32i
// CIR: cir.store{{.*}} %[[VA_ARG]], %[[RES_ADDR]] : !s32i, !cir.ptr
-// CIR: %[[VA_PTR2:.+]] = cir.cast(array_to_ptrdecay, %[[VAAREA]] : !cir.ptr>), !cir.ptr
+// CIR: %[[VA_PTR2:.+]] = cir.cast array_to_ptrdecay %[[VAAREA]] : !cir.ptr> -> !cir.ptr
// CIR: cir.va_end %[[VA_PTR2]] : !cir.ptr
// CIR: %[[RESULT:.+]] = cir.load{{.*}} %[[RES_ADDR]] : !cir.ptr, !s32i
// CIR: cir.store %[[RESULT]], %[[RET_ADDR]] : !s32i, !cir.ptr
@@ -99,13 +99,13 @@ int stdarg_start(int count, ...) {
// CIR: %[[VAAREA:.+]] = cir.alloca !cir.array, !cir.ptr>, ["args"]
// CIR: %[[RES_ADDR:.+]] = cir.alloca !s32i, !cir.ptr, ["res", init]
// CIR: cir.store %arg0, %[[COUNT_ADDR]] : !s32i, !cir.ptr
-// CIR: %[[VA_PTR0:.+]] = cir.cast(array_to_ptrdecay, %[[VAAREA]] : !cir.ptr>), !cir.ptr
+// CIR: %[[VA_PTR0:.+]] = cir.cast array_to_ptrdecay %[[VAAREA]] : !cir.ptr> -> !cir.ptr
// CIR: %[[C12345:.+]] = cir.const #cir.int<12345> : !s32i
// CIR: cir.va_start %[[VA_PTR0]] %[[C12345]] : !cir.ptr, !s32i
-// CIR: %[[VA_PTR1:.+]] = cir.cast(array_to_ptrdecay, %[[VAAREA]] : !cir.ptr>), !cir.ptr
+// CIR: %[[VA_PTR1:.+]] = cir.cast array_to_ptrdecay %[[VAAREA]] : !cir.ptr> -> !cir.ptr
// CIR: %[[VA_ARG:.+]] = cir.va_arg %[[VA_PTR1]] : (!cir.ptr) -> !s32i
// CIR: cir.store{{.*}} %[[VA_ARG]], %[[RES_ADDR]] : !s32i, !cir.ptr
-// CIR: %[[VA_PTR2:.+]] = cir.cast(array_to_ptrdecay, %[[VAAREA]] : !cir.ptr>), !cir.ptr
+// CIR: %[[VA_PTR2:.+]] = cir.cast array_to_ptrdecay %[[VAAREA]] : !cir.ptr> -> !cir.ptr
// CIR: cir.va_end %[[VA_PTR2]] : !cir.ptr
// CIR: %[[RESULT:.+]] = cir.load{{.*}} %[[RES_ADDR]] : !cir.ptr, !s32i
// CIR: cir.store %[[RESULT]], %[[RET_ADDR]] : !s32i, !cir.ptr
diff --git a/clang/test/CIR/CodeGen/variable-decomposition.cpp b/clang/test/CIR/CodeGen/variable-decomposition.cpp
index 40dfe73c411c9..ba59109ab625f 100644
--- a/clang/test/CIR/CodeGen/variable-decomposition.cpp
+++ b/clang/test/CIR/CodeGen/variable-decomposition.cpp
@@ -27,7 +27,7 @@ float function() {
// CIR: cir.store{{.*}} %[[TWO_FP]], %[[MEMBER_B]]
// CIR: %[[MEMBER_A:.+]] = cir.get_member %[[STRUCT]][0] {name = "a"} : !cir.ptr -> !cir.ptr
// CIR: %[[LOAD_A:.+]] = cir.load align(4) %[[MEMBER_A]] : !cir.ptr, !s32i
-// CIR: %[[CAST_A:.+]] = cir.cast(int_to_float, %[[LOAD_A]] : !s32i), !cir.float
+// CIR: %[[CAST_A:.+]] = cir.cast int_to_float %[[LOAD_A]] : !s32i -> !cir.float
// CIR: %[[MEMBER_B:.+]] = cir.get_member %[[STRUCT]][1] {name = "b"} : !cir.ptr -> !cir.ptr
// CIR: %[[LOAD_B:.+]] = cir.load align(4) %[[MEMBER_B]] : !cir.ptr, !cir.float
// CIR: %[[ADD:.+]] = cir.binop(add, %[[CAST_A]], %[[LOAD_B]]) : !cir.float
diff --git a/clang/test/CIR/CodeGen/vbase.cpp b/clang/test/CIR/CodeGen/vbase.cpp
index 4d57f8ea74e0c..86469c5d6ae7d 100644
--- a/clang/test/CIR/CodeGen/vbase.cpp
+++ b/clang/test/CIR/CodeGen/vbase.cpp
@@ -62,15 +62,15 @@ void ppp() { B b; }
// CIR: cir.call @_ZN7DerivedC1Ev(%[[D]]) nothrow : (!cir.ptr) -> ()
// CIR: %[[VPTR_PTR:.+]] = cir.vtable.get_vptr %[[D]] : !cir.ptr -> !cir.ptr
// CIR: %[[VPTR:.+]] = cir.load {{.*}} %[[VPTR_PTR]] : !cir.ptr, !cir.vptr
-// CIR: %[[VPTR_I8:.+]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr
+// CIR: %[[VPTR_I8:.+]] = cir.cast bitcast %[[VPTR]] : !cir.vptr -> !cir.ptr
// CIR: %[[NEG32:.+]] = cir.const #cir.int<-32> : !s64i
// CIR: %[[ADJ_VPTR_I8:.+]] = cir.ptr_stride(%[[VPTR_I8]] : !cir.ptr, %[[NEG32]] : !s64i), !cir.ptr
-// CIR: %[[OFFSET_PTR:.+]] = cir.cast(bitcast, %[[ADJ_VPTR_I8]] : !cir.ptr), !cir.ptr
+// CIR: %[[OFFSET_PTR:.+]] = cir.cast bitcast %[[ADJ_VPTR_I8]] : !cir.ptr -> !cir.ptr
// CIR: %[[OFFSET:.+]] = cir.load {{.*}} %[[OFFSET_PTR]] : !cir.ptr, !s64i
-// CIR: %[[D_I8:.+]] = cir.cast(bitcast, %[[D]] : !cir.ptr), !cir.ptr
+// CIR: %[[D_I8:.+]] = cir.cast bitcast %[[D]] : !cir.ptr -> !cir.ptr
// CIR: %[[ADJ_THIS_I8:.+]] = cir.ptr_stride(%[[D_I8]] : !cir.ptr, %[[OFFSET]] : !s64i), !cir.ptr
-// CIR: %[[ADJ_THIS_D:.+]] = cir.cast(bitcast, %[[ADJ_THIS_I8]] : !cir.ptr), !cir.ptr
-// CIR: %[[BASE_THIS:.+]] = cir.cast(bitcast, %[[ADJ_THIS_D]] : !cir.ptr), !cir.ptr
+// CIR: %[[ADJ_THIS_D:.+]] = cir.cast bitcast %[[ADJ_THIS_I8]] : !cir.ptr -> !cir.ptr +// CIR: %[[BASE_THIS:.+]] = cir.cast bitcast %[[ADJ_THIS_D]] : !cir.ptr -> !cir.ptr // CIR: %[[BASE_VPTR_PTR:.+]] = cir.vtable.get_vptr %[[BASE_THIS]] : !cir.ptr -> !cir.ptr // CIR: %[[BASE_VPTR:.+]] = cir.load {{.*}} %[[BASE_VPTR_PTR]] : !cir.ptr, !cir.vptr // CIR: %[[SLOT_PTR:.+]] = cir.vtable.get_virtual_fn_addr %[[BASE_VPTR]][0] : !cir.vptr -> !cir.ptr)>>> diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp index 8bca48d8ffe0c..2fd493f87c1ee 100644 --- a/clang/test/CIR/CodeGen/vector-ext.cpp +++ b/clang/test/CIR/CodeGen/vector-ext.cpp @@ -1048,7 +1048,7 @@ void foo17() { // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["a"] // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr>, !cir.vector<2 x !cir.double> -// CIR: %[[RES:.*]] = cir.cast(float_to_int, %[[TMP]] : !cir.vector<2 x !cir.double>), !cir.vector<2 x !u16i> +// CIR: %[[RES:.*]] = cir.cast float_to_int %[[TMP]] : !cir.vector<2 x !cir.double> -> !cir.vector<2 x !u16i> // LLVM: %[[VEC_A:.*]] = alloca <2 x double>, i64 1, align 16 // LLVM: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16 @@ -1228,11 +1228,11 @@ void foo24() { // CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<4 x !cir.f16>, !cir.ptr>, ["b"] // CIR: %[[C_ADDR:.*]] = cir.alloca !cir.vector<4 x !cir.f16>, !cir.ptr>, ["c", init] // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.vector<4 x !cir.f16> -// CIR: %[[TMP_A_F16:.*]] = cir.cast(floating, %[[TMP_A]] : !cir.vector<4 x !cir.f16>), !cir.vector<4 x !cir.float> +// CIR: %[[TMP_A_F16:.*]] = cir.cast floating %[[TMP_A]] : !cir.vector<4 x !cir.f16> -> !cir.vector<4 x !cir.float> // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.vector<4 x !cir.f16> -// CIR: %[[TMP_B_F16:.*]] = cir.cast(floating, %[[TMP_B]] : !cir.vector<4 x !cir.f16>), !cir.vector<4 x !cir.float> +// CIR: %[[TMP_B_F16:.*]] = cir.cast floating %[[TMP_B]] : !cir.vector<4 x !cir.f16> -> !cir.vector<4 x !cir.float> // CIR: %[[RESULT:.*]] = cir.binop(add, %[[TMP_A_F16]], %[[TMP_B_F16]]) : !cir.vector<4 x !cir.float> -// CIR: %[[RESULT_VF16:.*]] = cir.cast(floating, %[[RESULT]] : !cir.vector<4 x !cir.float>), !cir.vector<4 x !cir.f16> +// CIR: %[[RESULT_VF16:.*]] = cir.cast floating %[[RESULT]] : !cir.vector<4 x !cir.float> -> !cir.vector<4 x !cir.f16> // CIR: cir.store{{.*}} %[[RESULT_VF16]], %[[C_ADDR]] : !cir.vector<4 x !cir.f16>, !cir.ptr> // LLVM: %[[A_ADDR:.*]] = alloca <4 x half>, i64 1, align 8 diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp index f242779502148..86551d277fa71 100644 --- a/clang/test/CIR/CodeGen/vector.cpp +++ b/clang/test/CIR/CodeGen/vector.cpp @@ -1035,7 +1035,7 @@ void foo17() { // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<2 x !cir.double>, !cir.ptr>, ["a"] // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr>, !cir.vector<2 x !cir.double> -// CIR: %[[RES:.*]] = cir.cast(float_to_int, %[[TMP]] : !cir.vector<2 x !cir.double>), !cir.vector<2 x !u16i> +// CIR: %[[RES:.*]] = cir.cast float_to_int %[[TMP]] : !cir.vector<2 x !cir.double> -> !cir.vector<2 x !u16i> // LLVM: %[[VEC_A:.*]] = alloca <2 x double>, i64 1, align 16 // LLVM: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16 @@ -1270,11 +1270,11 @@ void foo27() { // CIR: %[[B_ADDR:.*]] = cir.alloca !cir.vector<4 x !cir.f16>, !cir.ptr>, ["b"] // CIR: %[[C_ADDR:.*]] = cir.alloca !cir.vector<4 x !cir.f16>, !cir.ptr>, ["c", init] // 
CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.vector<4 x !cir.f16> -// CIR: %[[TMP_A_F16:.*]] = cir.cast(floating, %[[TMP_A]] : !cir.vector<4 x !cir.f16>), !cir.vector<4 x !cir.float> +// CIR: %[[TMP_A_F16:.*]] = cir.cast floating %[[TMP_A]] : !cir.vector<4 x !cir.f16> -> !cir.vector<4 x !cir.float> // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr>, !cir.vector<4 x !cir.f16> -// CIR: %[[TMP_B_F16:.*]] = cir.cast(floating, %[[TMP_B]] : !cir.vector<4 x !cir.f16>), !cir.vector<4 x !cir.float> +// CIR: %[[TMP_B_F16:.*]] = cir.cast floating %[[TMP_B]] : !cir.vector<4 x !cir.f16> -> !cir.vector<4 x !cir.float> // CIR: %[[RESULT:.*]] = cir.binop(add, %[[TMP_A_F16]], %[[TMP_B_F16]]) : !cir.vector<4 x !cir.float> -// CIR: %[[RESULT_VF16:.*]] = cir.cast(floating, %[[RESULT]] : !cir.vector<4 x !cir.float>), !cir.vector<4 x !cir.f16> +// CIR: %[[RESULT_VF16:.*]] = cir.cast floating %[[RESULT]] : !cir.vector<4 x !cir.float> -> !cir.vector<4 x !cir.f16> // CIR: cir.store{{.*}} %[[RESULT_VF16]], %[[C_ADDR]] : !cir.vector<4 x !cir.f16>, !cir.ptr> // LLVM: %[[A_ADDR:.*]] = alloca <4 x half>, i64 1, align 8 diff --git a/clang/test/CIR/CodeGen/vtt.cpp b/clang/test/CIR/CodeGen/vtt.cpp index baab972bce696..f47da41e5b200 100644 --- a/clang/test/CIR/CodeGen/vtt.cpp +++ b/clang/test/CIR/CodeGen/vtt.cpp @@ -281,23 +281,23 @@ D::D() {} // CIR-COMMON: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] // CIR-COMMON: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] // CIR-COMMON: %[[VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> -// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast bitcast %[[VTT_ADDR_POINT]] : !cir.ptr> -> !cir.ptr // CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] // CIR-COMMON: %[[B_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] // CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[B_VPTR_ADDR]] // CIR-COMMON: %[[B_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> -// CIR-COMMON: %[[B_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[B_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[B_VPTR_ADDR:.*]] = cir.cast bitcast %[[B_VTT_ADDR_POINT]] : !cir.ptr> -> !cir.ptr // CIR-COMMON: %[[B_VPTR:.*]] = cir.load{{.*}} %[[B_VPTR_ADDR]] // CIR-COMMON: %[[B_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] // CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[B_VPTR_ADDR]] -// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr +// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast bitcast %[[VPTR]] : !cir.vptr -> !cir.ptr // CIR-COMMON: %[[CONST_24:.*]] = cir.const #cir.int<-24> // CIR-COMMON: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr -// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast bitcast %[[BASE_OFFSET_ADDR]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i -// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast bitcast %[[THIS]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr -// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast bitcast 
%[[BASE_PTR]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] // CIR-COMMON: cir.store{{.*}} %[[B_VPTR]], %[[BASE_VPTR_ADDR]] @@ -347,23 +347,23 @@ D::D() {} // CIR-COMMON: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] // CIR-COMMON: %[[VTT:.*]] = cir.load{{.*}} %[[VTT_ADDR]] // CIR-COMMON: %[[VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> -// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast bitcast %[[VTT_ADDR_POINT]] : !cir.ptr> -> !cir.ptr // CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] // CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] // CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[C_VPTR_ADDR]] // CIR-COMMON: %[[C_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 1 -> !cir.ptr> -// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[C_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.cast bitcast %[[C_VTT_ADDR_POINT]] : !cir.ptr> -> !cir.ptr // CIR-COMMON: %[[C_VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] // CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] // CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] -// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR]] : !cir.vptr), !cir.ptr +// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast bitcast %[[VPTR]] : !cir.vptr -> !cir.ptr // CIR-COMMON: %[[CONST_24:.*]] = cir.const #cir.int<-24> // CIR-COMMON: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr -// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast bitcast %[[BASE_OFFSET_ADDR]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i -// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast bitcast %[[THIS]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr -// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast bitcast %[[BASE_PTR]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] // CIR-COMMON: cir.store{{.*}} %[[C_VPTR]], %[[BASE_VPTR_ADDR]] @@ -419,27 +419,27 @@ D::D() {} // CIR-COMMON: %[[C_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 3 -> !cir.ptr> // CIR-COMMON: cir.call @_ZN1CC2Ev(%[[C_ADDR]], %[[C_VTT]]) nothrow : (!cir.ptr, !cir.ptr>) -> () // CIR-COMMON: %[[D_VTT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 0 -> !cir.ptr> -// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast(bitcast, %[[D_VTT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[VPTR_ADDR:.*]] = cir.cast bitcast %[[D_VTT]] : !cir.ptr> -> !cir.ptr // CIR-COMMON: %[[VPTR:.*]] = cir.load{{.*}} %[[VPTR_ADDR]] : !cir.ptr, !cir.vptr // CIR-COMMON: %[[D_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[THIS]] // CIR-COMMON: cir.store{{.*}} %[[VPTR]], %[[D_VPTR_ADDR]] // CIR-COMMON: %[[D_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 5 -> !cir.ptr> -// CIR-COMMON: %[[D_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[D_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[D_VPTR_ADDR:.*]] = cir.cast bitcast 
%[[D_VTT_ADDR_POINT]] : !cir.ptr> -> !cir.ptr // CIR-COMMON: %[[D_VPTR:.*]] = cir.load{{.*}} %[[D_VPTR_ADDR]] : !cir.ptr, !cir.vptr // CIR-COMMON: %[[D_VPTR_ADDR2:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[VPTR2:.*]] = cir.load{{.*}} %[[D_VPTR_ADDR2]] : !cir.ptr, !cir.vptr -// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast(bitcast, %[[VPTR2]] : !cir.vptr), !cir.ptr +// CIR-COMMON: %[[VPTR_ADDR2:.*]] = cir.cast bitcast %[[VPTR2]] : !cir.vptr -> !cir.ptr // CIR-COMMON: %[[CONST_24:.*]] = cir.const #cir.int<-24> : !s64i // CIR-COMMON: %[[BASE_OFFSET_ADDR:.*]] = cir.ptr_stride(%[[VPTR_ADDR2]] : !cir.ptr, %[[CONST_24]] : !s64i), !cir.ptr -// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast(bitcast, %[[BASE_OFFSET_ADDR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_OFFSET_PTR:.*]] = cir.cast bitcast %[[BASE_OFFSET_ADDR]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_OFFSET:.*]] = cir.load{{.*}} %[[BASE_OFFSET_PTR]] : !cir.ptr, !s64i -// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[THIS_PTR:.*]] = cir.cast bitcast %[[THIS]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_PTR:.*]] = cir.ptr_stride(%[[THIS_PTR]] : !cir.ptr, %[[BASE_OFFSET]] : !s64i), !cir.ptr -// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast(bitcast, %[[BASE_PTR]] : !cir.ptr), !cir.ptr +// CIR-COMMON: %[[BASE_CAST:.*]] = cir.cast bitcast %[[BASE_PTR]] : !cir.ptr -> !cir.ptr // CIR-COMMON: %[[BASE_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[BASE_CAST]] // CIR-COMMON: cir.store{{.*}} %[[D_VPTR]], %[[BASE_VPTR_ADDR]] // CIR-COMMON: %[[C_VTT_ADDR_POINT:.*]] = cir.vtt.address_point %[[VTT]] : !cir.ptr>, offset = 6 -> !cir.ptr> -// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.cast(bitcast, %[[C_VTT_ADDR_POINT]] : !cir.ptr>), !cir.ptr +// CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.cast bitcast %[[C_VTT_ADDR_POINT]] : !cir.ptr> -> !cir.ptr // CIR-COMMON: %[[C_VPTR:.*]] = cir.load{{.*}} %[[C_VPTR_ADDR]] : !cir.ptr, !cir.vptr // CIR-COMMON: %[[C_ADDR:.*]] = cir.base_class_addr %[[THIS]] : !cir.ptr nonnull [16] -> !cir.ptr // CIR-COMMON: %[[C_VPTR_ADDR:.*]] = cir.vtable.get_vptr %[[C_ADDR]] : !cir.ptr -> !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/combined-copy.c b/clang/test/CIR/CodeGenOpenACC/combined-copy.c index b4573e66f24a5..c1dc938912845 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-copy.c +++ b/clang/test/CIR/CodeGenOpenACC/combined-copy.c @@ -1090,7 +1090,7 @@ void copy_member_of_array_element_member() { for(int i = 0; i < 5; ++i); // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i // CHECK-NEXT: %[[GETINNER:.*]] = cir.get_member %[[OUTER]][0] {name = "inner"} : !cir.ptr -> !cir.ptr> - // CHECK-NEXT: %[[INNERDECAY:.*]] = cir.cast(array_to_ptrdecay, %[[GETINNER]] : !cir.ptr>), !cir.ptr + // CHECK-NEXT: %[[INNERDECAY:.*]] = cir.cast array_to_ptrdecay %[[GETINNER]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[INNERDECAY]] : !cir.ptr, %[[TWO]] : !s32i), !cir.ptr // CHECK-NEXT: %[[GETB:.*]] = cir.get_member %[[STRIDE]][1] {name = "b"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[COPYIN1:.*]] = acc.copyin varPtr(%[[GETB]] : !cir.ptr) -> !cir.ptr {dataClause = #acc, name = "outer.inner[2].b"} diff --git a/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp index 57e70df957ae6..e836a37a9bccd 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp @@ -87,9 +87,9 @@ struct 
HasDtor { // CHECK-NEXT: acc.yield // CHECK-NEXT: } copy { // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): -// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> -// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_DECAY]] : !s32i, !cir.ptr @@ -97,7 +97,7 @@ struct HasDtor { // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr @@ -105,7 +105,7 @@ struct HasDtor { // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr @@ -113,7 +113,7 @@ struct HasDtor { // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr @@ -121,7 +121,7 @@ struct HasDtor { // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = 
cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr @@ -134,9 +134,9 @@ struct HasDtor { // CHECK-NEXT: acc.yield // CHECK-NEXT: } copy { // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): -// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> -// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_DECAY]] : !cir.float, !cir.ptr @@ -144,7 +144,7 @@ struct HasDtor { // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr @@ -152,7 +152,7 @@ struct HasDtor { // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr @@ -160,7 +160,7 @@ struct HasDtor { // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr @@ -168,7 +168,7 @@ struct HasDtor { // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), 
!cir.ptr // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr @@ -181,37 +181,37 @@ struct HasDtor { // CHECK-NEXT: acc.yield // CHECK-NEXT: } copy { // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): -// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> -// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr // CHECK-NEXT: 
%[[FOUR_2:.*]] = cir.const #cir.int<4> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // @@ -224,37 +224,37 @@ struct HasDtor { // CHECK-NEXT: acc.yield // CHECK-NEXT: } copy { // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): -// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> -// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : 
!cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> () // @@ -267,37 +267,37 @@ struct HasDtor { // CHECK-NEXT: acc.yield // CHECK-NEXT: } copy { // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): -// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> -// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay 
%[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // @@ -310,37 +310,37 @@ struct HasDtor { // CHECK-NEXT: acc.yield // CHECK-NEXT: } copy { // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): -// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4> // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4> -// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, 
%[[FOUR_2]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> () // @@ -349,7 +349,7 @@ struct HasDtor { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[ARRPTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[ARRPTR:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ELEM:.*]] = cir.ptr_stride(%[[ARRPTR]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] // CHECK-NEXT: cir.store %[[ELEM]], %[[ITR]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp index 639320275ab0f..10f4482fee54f 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp @@ -125,7 +125,7 @@ struct HasDtor { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ITR_LOAD]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN7HasDtorD1Ev(%[[STRIDE]]) nothrow : (!cir.ptr) -> () // CHECK-NEXT: cir.yield diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp index 8cce119bc847e..3d295d58d1026 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp @@ -263,7 +263,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -305,7 +305,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -407,7 +407,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe 
@reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr @@ -509,7 +509,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr @@ -611,7 +611,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr @@ -714,7 +714,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -758,7 +758,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), 
!cir.ptr @@ -800,7 +800,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -903,7 +903,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp index 2265a9a7744a4..be33afe07e363 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp @@ -131,7 +131,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -160,7 +160,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -191,7 +191,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), 
!cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -222,7 +222,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -253,7 +253,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.fp<0xF{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -285,7 +285,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -315,7 +315,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -344,7 +344,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: 
%[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -376,7 +376,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp index a2b9d4015aeb3..f13d96d171123 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp @@ -310,7 +310,7 @@ void acc_combined() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -349,7 +349,7 @@ void acc_combined() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -372,7 +372,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -471,7 +471,7 @@ void acc_combined() { // CHECK-NEXT: } destroy { // CHECK-NEXT: 
^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -494,7 +494,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr @@ -593,7 +593,7 @@ void acc_combined() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -616,7 +616,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr @@ -715,7 +715,7 @@ void acc_combined() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -738,7 +738,7 @@ void acc_combined() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator 
init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
 // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
@@ -837,7 +837,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -861,7 +861,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -901,7 +901,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -925,7 +925,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -964,7 +964,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -988,7 +988,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -1087,7 +1087,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -1111,7 +1111,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -1151,7 +1151,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
index e7caf83e9b862..952fee9b1ac1a 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
@@ -134,7 +134,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -163,7 +163,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
 // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
 // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -194,7 +194,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
 // CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr
 // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -225,7 +225,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
 // CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr
 // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -256,7 +256,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
 // CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr
 // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -288,7 +288,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -318,7 +318,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -347,7 +347,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
 // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
 // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -379,7 +379,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
index bf9aa0ad59d60..15646ed87b284 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
@@ -310,7 +310,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -349,7 +349,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -372,7 +372,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -471,7 +471,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -494,7 +494,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
 // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -593,7 +593,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -616,7 +616,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
 // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -715,7 +715,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -738,7 +738,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
 // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
@@ -837,7 +837,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -861,7 +861,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -901,7 +901,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -925,7 +925,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -964,7 +964,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -987,7 +987,7 @@ void acc_combined() {
 // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -1086,7 +1086,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -1111,7 +1111,7 @@ void acc_combined() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
 // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -1151,7 +1151,7 @@ void acc_combined() {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
 // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp
index b8140335f7c29..98f2ffd2cb12a 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp
@@ -191,7 +191,7 @@ extern "C" void acc_combined(int N, int cond) {
 #pragma acc serial loop self(N)
 for(unsigned I = 0; I < N; ++I);
 // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load{{.*}} %[[ALLOCA_N]] : !cir.ptr, !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[N_LOAD]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[N_LOAD]] : !s32i -> !cir.bool
 // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
 // CHECK-NEXT: acc.serial combined(loop) self(%[[CONV_CAST]]) {
 // CHECK-NEXT: acc.loop combined(serial) {
@@ -203,7 +203,7 @@ extern "C" void acc_combined(int N, int cond) {
 #pragma acc parallel loop if(N)
 for(unsigned I = 0; I < N; ++I);
 // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load{{.*}} %[[ALLOCA_N]] : !cir.ptr, !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[N_LOAD]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[N_LOAD]] : !s32i -> !cir.bool
 // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
 // CHECK-NEXT: acc.parallel combined(loop) if(%[[CONV_CAST]]) {
 // CHECK-NEXT: acc.loop combined(parallel) {
@@ -215,7 +215,7 @@ extern "C" void acc_combined(int N, int cond) {
 #pragma acc serial loop if(1)
 for(unsigned I = 0; I < N; ++I);
 // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ONE_LITERAL]] : !s32i -> !cir.bool
 // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
 // CHECK-NEXT: acc.serial combined(loop) if(%[[CONV_CAST]]) {
 // CHECK-NEXT: acc.loop combined(serial) {
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c
index 947b281bfd9db..de6e7b0314fa9 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c
@@ -40,9 +40,9 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_DECAY]] : !s32i, !cir.ptr
@@ -50,7 +50,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -58,7 +58,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -66,7 +66,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -74,7 +74,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -87,9 +87,9 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_DECAY]] : !cir.float, !cir.ptr
@@ -97,7 +97,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -105,7 +105,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -113,7 +113,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -121,7 +121,7 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -134,37 +134,37 @@ struct NoCopyConstruct {};
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.copy %[[FROM_OFFSET:.*]] to %[[TO_DECAY]] : !cir.ptr
 //
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.copy %[[FROM_OFFSET]] to %[[TO_OFFSET]] : !cir.ptr
 //
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.copy %[[FROM_OFFSET]] to %[[TO_OFFSET]] : !cir.ptr
 //
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.copy %[[FROM_OFFSET]] to %[[TO_OFFSET]] : !cir.ptr
 //
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.copy %[[FROM_OFFSET]] to %[[TO_OFFSET]] : !cir.ptr
 //
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp
index 49fd78cb385e6..fca3ca85c9edf 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp
@@ -87,9 +87,9 @@ struct HasDtor {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_DECAY]] : !s32i, !cir.ptr
@@ -97,7 +97,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -105,7 +105,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -113,7 +113,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -121,7 +121,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !s32i
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !s32i, !cir.ptr
@@ -134,9 +134,9 @@ struct HasDtor {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_DECAY]] : !cir.float, !cir.ptr
@@ -144,7 +144,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -152,7 +152,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -160,7 +160,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -168,7 +168,7 @@ struct HasDtor {
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[FROM_LOAD:.*]] = cir.load {{.*}}%[[FROM_OFFSET]] : !cir.ptr, !cir.float
 // CHECK-NEXT: cir.store {{.*}} %[[FROM_LOAD]], %[[TO_OFFSET]] : !cir.float, !cir.ptr
@@ -181,37 +181,37 @@ struct HasDtor {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN15NoCopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
@@ -224,37 +224,37 @@ struct HasDtor {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN13CopyConstructC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) : (!cir.ptr, !cir.ptr) -> ()
 //
@@ -267,37 +267,37 @@ struct HasDtor {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[FROM_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[FROM_DECAY]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
@@ -310,37 +310,37 @@ struct HasDtor {
 // CHECK-NEXT: acc.yield
 // CHECK-NEXT: } copy {
 // CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}):
-// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_TO]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[TO_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG_TO]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ZERO]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_DECAY]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[ONE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[ONE_2:.*]] = cir.const #cir.int<1>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[ONE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[TWO:.*]] = cir.const #cir.int<2>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[TWO]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[TWO_2:.*]] = cir.const #cir.int<2>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[TWO_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[THREE:.*]] = cir.const #cir.int<3>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[THREE]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[THREE_2:.*]] = cir.const #cir.int<3>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[THREE_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
 // CHECK-NEXT: %[[FOUR:.*]] = cir.const #cir.int<4>
 // CHECK-NEXT: %[[TO_OFFSET:.*]] = cir.ptr_stride(%[[TO_DECAY]] : !cir.ptr, %[[FOUR]] : !s64i), !cir.ptr
 // CHECK-NEXT: %[[FOUR_2:.*]] = cir.const #cir.int<4>
-// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast(array_to_ptrdecay, %[[ARG_FROM]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY_FROM:.*]] = cir.cast array_to_ptrdecay %[[ARG_FROM]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[FROM_OFFSET:.*]] = cir.ptr_stride(%[[DECAY_FROM]] : !cir.ptr, %[[FOUR_2]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN7HasDtorC1ERKS_(%[[TO_OFFSET]], %[[FROM_OFFSET]]) nothrow : (!cir.ptr, !cir.ptr) -> ()
 //
@@ -349,7 +349,7 @@ struct HasDtor {
 // CHECK-NEXT: } destroy {
 // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
 // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[ARRPTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[ARRPTR:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[ELEM:.*]] = cir.ptr_stride(%[[ARRPTR]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr
 // CHECK-NEXT: %[[ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"]
 // CHECK-NEXT: cir.store %[[ELEM]], %[[ITR]] : !cir.ptr, !cir.ptr>
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp
index 97399d9d4620e..d8542225b45fd 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp
@@ -113,7 +113,7 @@ struct HasDtor {
 // CHECK-NEXT: cir.condition(%[[COND]])
 // CHECK-NEXT: } body {
 // CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ITR_LOAD]] : !u64i), !cir.ptr
 // CHECK-NEXT: cir.call @_ZN7HasDtorD1Ev(%[[STRIDE]]) nothrow : (!cir.ptr) -> ()
 // CHECK-NEXT: cir.yield
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
index fff72dcbdd204..e357f440eb4c3 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
@@ -260,7 +260,7 @@ void acc_compute() {
 // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
 // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
 // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
 // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
!s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -289,7 +289,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -392,7 +392,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr @@ -494,7 +494,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr @@ -596,7 +596,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr @@ -699,7 +699,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], 
%[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -729,7 +729,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -758,7 +758,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -861,7 +861,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp index c5b45f2d2efe1..e0098bc625459 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp @@ -263,7 +263,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -305,7 +305,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> 
reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -407,7 +407,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr @@ -509,7 +509,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr @@ -611,7 +611,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr @@ -714,7 +714,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -758,7 +758,7 @@ void acc_compute() { // 
CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -800,7 +800,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -903,7 +903,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c index 5b0dcadece4f8..5336fadc9fd0c 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c @@ -131,7 +131,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -160,7 +160,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = 
cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -191,7 +191,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -222,7 +222,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -253,7 +253,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.fp<0xF{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -285,7 +285,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -315,7 +315,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // 
CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -344,7 +344,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -376,7 +376,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp index 35a79d15d7756..a51388203a3d8 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp @@ -132,7 +132,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -161,7 +161,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -192,7 +192,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: 
!cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -223,7 +223,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -254,7 +254,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.fp<0xF{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -286,7 +286,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -316,7 +316,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -345,7 +345,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: 
%[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -377,7 +377,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp index 1844440a47857..1968c0ac740dd 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp @@ -310,7 +310,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -349,7 +349,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -372,7 +372,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const 
#cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -471,7 +471,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -494,7 +494,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr @@ -593,7 +593,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -616,7 +616,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr @@ -715,7 +715,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : 
!cir.ptr, !cir.ptr> @@ -738,7 +738,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr @@ -837,7 +837,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -861,7 +861,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -901,7 +901,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -925,7 +925,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -964,7 +964,7 @@ void acc_compute() 
{ // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -988,7 +988,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr @@ -1087,7 +1087,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -1111,7 +1111,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -1151,7 +1151,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c index 
363e88502e815..f63e340b29aa7 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c @@ -132,7 +132,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -161,7 +161,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -192,7 +192,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i // CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -223,7 +223,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i // CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -254,7 +254,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i // CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i 
@@ -286,7 +286,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -316,7 +316,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -345,7 +345,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -377,7 +377,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
index a4320e6db5f7a..48e5ac94627f5 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
@@ -134,7 +134,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -163,7 +163,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -194,7 +194,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -225,7 +225,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -256,7 +256,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -288,7 +288,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -318,7 +318,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -347,7 +347,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -379,7 +379,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
index b56c1698e9eeb..6d204bc9060b0 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
@@ -310,7 +310,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -349,7 +349,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -372,7 +372,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -471,7 +471,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -494,7 +494,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -593,7 +593,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -616,7 +616,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -715,7 +715,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -738,7 +738,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
@@ -837,7 +837,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -861,7 +861,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -901,7 +901,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -925,7 +925,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -964,7 +964,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -987,7 +987,7 @@ void acc_compute() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -1086,7 +1086,7 @@ void acc_compute() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -1111,7 +1111,7 @@ void acc_compute() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> ->
!cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -1151,7 +1151,7 @@ void acc_compute() { // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c index 0e815b7c7059a..35a7e7a951f74 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c @@ -132,7 +132,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -161,7 +161,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_j : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !u32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -192,7 +192,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_j : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<0> : !u32i // CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !u32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -223,7 +223,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_j : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, 
["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<4294967295> : !u32i // CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !u32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -254,7 +254,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_j : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<4294967295> : !u32i // CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !u32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -286,7 +286,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -316,7 +316,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr @@ -345,7 +345,7 @@ void acc_compute() { // CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_j : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i // CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !u32i, !cir.ptr // CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i @@ -377,7 +377,7 @@ void acc_compute() { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : 
!cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i // CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr diff --git a/clang/test/CIR/CodeGenOpenACC/data.c b/clang/test/CIR/CodeGenOpenACC/data.c index 1f6a76ce1ea7c..4e13f17f4bfd7 100644 --- a/clang/test/CIR/CodeGenOpenACC/data.c +++ b/clang/test/CIR/CodeGenOpenACC/data.c @@ -87,7 +87,7 @@ void acc_data(int cond) { #pragma acc data default(none) if(cond) {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.data if(%[[CONV_CAST]]) { // CHECK-NEXT: acc.terminator @@ -96,7 +96,7 @@ void acc_data(int cond) { #pragma acc data default(none) if(1) {} // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ONE_LITERAL]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.data if(%[[CONV_CAST]]) { // CHECK-NEXT: acc.terminator diff --git a/clang/test/CIR/CodeGenOpenACC/host_data.c b/clang/test/CIR/CodeGenOpenACC/host_data.c index fa06d2a1cbd26..bcfa175f4e525 100644 --- a/clang/test/CIR/CodeGenOpenACC/host_data.c +++ b/clang/test/CIR/CodeGenOpenACC/host_data.c @@ -38,7 +38,7 @@ void acc_host_data(int cond, int var1, int var2, int *arr) { // CHECK-NEXT: %[[USE_DEV1:.*]] = acc.use_device varPtr(%[[V1]] : !cir.ptr) -> !cir.ptr {name = "var1"} // CHECK-NEXT: %[[USE_DEV2:.*]] = acc.use_device varPtr(%[[V2]] : !cir.ptr) -> !cir.ptr {name = "var2"} // CHECK-NEXT: %[[LOAD_COND:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_BOOL:.*]] = cir.cast(int_to_bool, %[[LOAD_COND]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_BOOL:.*]] = cir.cast int_to_bool %[[LOAD_COND]] : !s32i -> !cir.bool // CHECK-NEXT: %[[COND_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_BOOL]] : !cir.bool to i1 // CHECK-NEXT: acc.host_data if(%[[COND_CAST]]) dataOperands(%[[USE_DEV1]], %[[USE_DEV2]] : !cir.ptr, !cir.ptr) { // CHECK-NEXT: acc.terminator @@ -49,7 +49,7 @@ void acc_host_data(int cond, int var1, int var2, int *arr) { // CHECK-NEXT: %[[USE_DEV1:.*]] = acc.use_device varPtr(%[[V1]] : !cir.ptr) -> !cir.ptr {name = "var1"} // CHECK-NEXT: %[[USE_DEV2:.*]] = acc.use_device varPtr(%[[V2]] : !cir.ptr) -> !cir.ptr {name = "var2"} // CHECK-NEXT: %[[LOAD_COND:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_BOOL:.*]] = cir.cast(int_to_bool, %[[LOAD_COND]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_BOOL:.*]] = cir.cast int_to_bool %[[LOAD_COND]] : !s32i -> !cir.bool // CHECK-NEXT: %[[COND_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_BOOL]] : !cir.bool to i1 // CHECK-NEXT: acc.host_data if(%[[COND_CAST]]) dataOperands(%[[USE_DEV1]], %[[USE_DEV2]] : !cir.ptr, !cir.ptr) { // CHECK-NEXT: acc.terminator diff --git a/clang/test/CIR/CodeGenOpenACC/init.c 
b/clang/test/CIR/CodeGenOpenACC/init.c index 805fb08dbf487..829850f2c82d6 100644 --- a/clang/test/CIR/CodeGenOpenACC/init.c +++ b/clang/test/CIR/CodeGenOpenACC/init.c @@ -18,13 +18,13 @@ void acc_init(int cond) { #pragma acc init if(cond) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.init if(%[[BOOL_CONV]]) #pragma acc init if(1) // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i - // CHECK-NEXT: %[[ONE_TO_BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[ONE_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ONE_LITERAL]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[ONE_TO_BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.init if(%[[BOOL_CONV]]) @@ -40,7 +40,7 @@ void acc_init(int cond) { #pragma acc init if(cond) device_num(cond) device_type(*) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_CAST]] : !cir.bool to i1 // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[COND_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c index 9b10b7489e814..9f33e54a345b1 100644 --- a/clang/test/CIR/CodeGenOpenACC/kernels.c +++ b/clang/test/CIR/CodeGenOpenACC/kernels.c @@ -29,7 +29,7 @@ void acc_kernels(int cond) { // CHECK-NEXT: cir.scope { // CHECK-NEXT: cir.while { // CHECK-NEXT: %[[INT:.*]] = cir.const #cir.int<1> - // CHECK-NEXT: %[[CAST:.*]] = cir.cast(int_to_bool, %[[INT]] : + // CHECK-NEXT: %[[CAST:.*]] = cir.cast int_to_bool %[[INT]] // CHECK-NEXT: cir.condition(%[[CAST]]) // CHECK-NEXT: } do { // CHECK-NEXT: cir.yield @@ -49,7 +49,7 @@ void acc_kernels(int cond) { #pragma acc kernels self(cond) {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.kernels self(%[[CONV_CAST]]) { // CHECK-NEXT: acc.terminator @@ -58,7 +58,7 @@ void acc_kernels(int cond) { #pragma acc kernels self(0) {} // CHECK-NEXT: %[[ZERO_LITERAL:.*]] = cir.const #cir.int<0> : !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ZERO_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ZERO_LITERAL]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.kernels self(%[[CONV_CAST]]) { // CHECK-NEXT: acc.terminator @@ -67,7 +67,7 @@ void acc_kernels(int cond) { #pragma acc kernels if(cond) {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i 
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
// CHECK-NEXT: acc.kernels self(%[[CONV_CAST]]) {
// CHECK-NEXT: acc.terminator
@@ -58,7 +58,7 @@ void acc_kernels(int cond) {
#pragma acc kernels self(0)
{}
// CHECK-NEXT: %[[ZERO_LITERAL:.*]] = cir.const #cir.int<0> : !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ZERO_LITERAL]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ZERO_LITERAL]] : !s32i -> !cir.bool
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
// CHECK-NEXT: acc.kernels self(%[[CONV_CAST]]) {
// CHECK-NEXT: acc.terminator
@@ -67,7 +67,7 @@ void acc_kernels(int cond) {
#pragma acc kernels if(cond)
{}
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
// CHECK-NEXT: acc.kernels if(%[[CONV_CAST]]) {
// CHECK-NEXT: acc.terminator
@@ -76,7 +76,7 @@ void acc_kernels(int cond) {
#pragma acc kernels if(1)
{}
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ONE_LITERAL]] : !s32i -> !cir.bool
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
// CHECK-NEXT: acc.kernels if(%[[CONV_CAST]]) {
// CHECK-NEXT: acc.terminator
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp
index d4fd4ccc68f7a..b356f0fb26cc4 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp
@@ -125,7 +125,7 @@ struct HasDtor {
// CHECK-NEXT: cir.condition(%[[COND]])
// CHECK-NEXT: } body {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ITR_LOAD]] : !u64i), !cir.ptr
// CHECK-NEXT: cir.call @_ZN7HasDtorD1Ev(%[[STRIDE]]) nothrow : (!cir.ptr) -> ()
// CHECK-NEXT: cir.yield
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
index 7130a2bdccdcc..73b8fe27c6aa1 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
@@ -263,7 +263,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -305,7 +305,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -407,7 +407,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -509,7 +509,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -611,7 +611,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
@@ -714,7 +714,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -758,7 +758,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -800,7 +800,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_16DefaultOperators : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -903,7 +903,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
index e549104e0fedb..77c61382c06bf 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
@@ -132,7 +132,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -161,7 +161,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_f : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -192,7 +192,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_f : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.fp<-3.4{{.*}}E+38> : !cir.float
// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !cir.float, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -223,7 +223,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_f : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.fp<3.4{{.*}}E+38> : !cir.float
// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !cir.float, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -254,7 +254,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_f : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.fp<0xF{{.*}}> : !cir.float
// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !cir.float, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -286,7 +286,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -316,7 +316,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -345,7 +345,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_f : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.fp<1{{.*}}> : !cir.float
// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !cir.float, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -377,7 +377,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
index c2ece70bae7a2..6ca0654b0384d 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
@@ -310,7 +310,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -349,7 +349,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -372,7 +372,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -471,7 +471,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -494,7 +494,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -593,7 +593,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -616,7 +616,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -715,7 +715,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -738,7 +738,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
@@ -837,7 +837,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -861,7 +861,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -901,7 +901,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -925,7 +925,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -964,7 +964,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -988,7 +988,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -1087,7 +1087,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -1111,7 +1111,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -1151,7 +1151,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
index f9169df97f1e6..dd3c54fa8f023 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
@@ -134,7 +134,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -163,7 +163,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -194,7 +194,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[LEAST]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -225,7 +225,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[LARGEST]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -256,7 +256,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[ALL_ONES]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -288,7 +288,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -318,7 +318,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -347,7 +347,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_i : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[ONE]], %[[DECAY]] : !s32i, !cir.ptr
// CHECK-NEXT: %[[ONE_IDX:.*]] = cir.const #cir.int<1> : !s64i
@@ -379,7 +379,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
index a3bf17356b81c..d36f9c608920e 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
@@ -310,7 +310,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -349,7 +349,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -372,7 +372,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_mul__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -471,7 +471,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -494,7 +494,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_max__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LEAST:.*]] = cir.const #cir.int<-2147483648> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LEAST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -593,7 +593,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -616,7 +616,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_min__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[LARGEST:.*]] = cir.const #cir.int<2147483647> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[LARGEST]], %[[GET_I]] : !s32i, !cir.ptr
@@ -715,7 +715,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -738,7 +738,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_iand__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ALL_ONES:.*]] = cir.const #cir.int<-1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ALL_ONES]], %[[GET_I]] : !s32i, !cir.ptr
@@ -837,7 +837,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -861,7 +861,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -901,7 +901,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -925,7 +925,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -964,7 +964,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -987,7 +987,7 @@ void acc_loop() {
// CHECK-NEXT: acc.reduction.recipe @reduction_land__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[DECAY]][0] {name = "i"} : !cir.ptr -> !cir.ptr
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CHECK-NEXT: cir.store {{.*}} %[[ONE]], %[[GET_I]] : !s32i, !cir.ptr
@@ -1086,7 +1086,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
@@ -1111,7 +1111,7 @@ void acc_loop() {
// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}})
// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init]
// CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"]
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ALLOCA]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ALLOCA]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: cir.store {{.*}} %[[DECAY]], %[[TEMP_ITR]] : !cir.ptr, !cir.ptr>
// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<5> : !s64i
// CHECK-NEXT: %[[END_ITR:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !s64i), !cir.ptr
@@ -1151,7 +1151,7 @@ void acc_loop() {
// CHECK-NEXT: } destroy {
// CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}):
// CHECK-NEXT: %[[SIZE:.*]] = cir.const #cir.int<4> : !u64i
-// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARG]] : !cir.ptr> -> !cir.ptr
// CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[SIZE]] : !u64i), !cir.ptr
// CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64}
// CHECK-NEXT: cir.store %[[LAST_ELT]], %[[IDX]] : !cir.ptr, !cir.ptr>
diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 5db174fb6549b..7080a8d5e579a 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -28,7 +28,7 @@ void acc_parallel(int cond) {
// CHECK-NEXT: cir.scope {
// CHECK-NEXT: cir.while {
// CHECK-NEXT: %[[INT:.*]] = cir.const #cir.int<1>
- // CHECK-NEXT: %[[CAST:.*]] = cir.cast(int_to_bool, %[[INT]] :
+ // CHECK-NEXT: %[[CAST:.*]] = cir.cast int_to_bool %[[INT]]
// CHECK-NEXT: cir.condition(%[[CAST]])
// CHECK-NEXT: } do {
// CHECK-NEXT: cir.yield
@@ -48,7 +48,7 @@ void acc_parallel(int cond) {
#pragma acc parallel self(cond)
{}
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
// CHECK-NEXT: acc.parallel self(%[[CONV_CAST]]) {
// CHECK-NEXT: acc.yield
@@ -57,7 +57,7 @@ void acc_parallel(int cond) {
#pragma acc parallel self(0)
{}
// CHECK-NEXT: %[[ZERO_LITERAL:.*]] = cir.const #cir.int<0> : !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ZERO_LITERAL]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ZERO_LITERAL]] : !s32i -> !cir.bool
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
// CHECK-NEXT: acc.parallel self(%[[CONV_CAST]]) {
// CHECK-NEXT: acc.yield
@@ -66,7 +66,7 @@ void acc_parallel(int cond) {
#pragma acc parallel if(cond)
{}
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1
// CHECK-NEXT: acc.parallel if(%[[CONV_CAST]]) {
// CHECK-NEXT: acc.yield
@@ -75,7 +75,7 @@ void acc_parallel(int cond) {
#pragma acc parallel if(1)
{}
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
- // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool
+ // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast
int_to_bool %[[ONE_LITERAL]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.parallel if(%[[CONV_CAST]]) { // CHECK-NEXT: acc.yield diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp index c62ebe26584b8..30a14ac836e8e 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp @@ -34,7 +34,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[PRIVATE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast array_to_ptrdecay %[[PRIVATE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[TLA_DECAY]] : !cir.ptr, %[[ITR1_LOAD]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN8CtorDtorD1Ev(%[[STRIDE]]) nothrow : (!cir.ptr) -> () // CHECK-NEXT: cir.yield @@ -55,7 +55,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init", init] {alignment = 16 : i64} // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<5> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE_PAST_LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ARR_SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[DECAY]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -75,7 +75,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT:} destroy { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[PRIVATE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[PRIVATE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -120,7 +120,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: } body { // // CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[PRIVATE]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast array_to_ptrdecay %[[PRIVATE]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[BOUND2_STRIDE:.*]] = cir.ptr_stride(%[[TLA_DECAY]] : !cir.ptr>, %[[ITR2_LOAD]] : !u64i), !cir.ptr> // // CHECK-NEXT: cir.scope { @@ -139,7 +139,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BOUND2_STRIDE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: 
%[[BOUND2_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[BOUND2_STRIDE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[BOUND2_STRIDE_DECAY]] : !cir.ptr, %[[ITR1_LOAD]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN8CtorDtorD1Ev(%[[STRIDE]]) nothrow : (!cir.ptr) -> () // CHECK-NEXT: cir.yield @@ -169,9 +169,9 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__ZTSA5_A5_8CtorDtor : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5>> {{.*}}): // CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init", init] {alignment = 16 : i64} -// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast(bitcast, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[TL_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<25> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BITCAST]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[BITCAST]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE_PAST_LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ARR_SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[DECAY]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -190,9 +190,9 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { // CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5>> {{.*}}): -// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast(bitcast, %[[PRIVATE]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[PRIVATE]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<24> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BITCAST]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[BITCAST]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -236,7 +236,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[PRIVATE]] : !cir.ptr x 5> x 5>>), !cir.ptr x 5>> +// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast array_to_ptrdecay %[[PRIVATE]] : !cir.ptr x 5> x 5>> -> !cir.ptr x 5>> // CHECK-NEXT: %[[BOUND3_STRIDE:.*]] = cir.ptr_stride(%[[TLA_DECAY]] : !cir.ptr x 5>>, %[[ITR3_LOAD]] : !u64i), !cir.ptr x 5>> // CHECK-NEXT: cir.scope { // CHECK-NEXT: %[[LB2:.*]] = acc.get_lowerbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index @@ -253,7 +253,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[BOUND3_STRIDE_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BOUND3_STRIDE]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[BOUND2_STRIDE:.*]] = cir.ptr_stride(%[[BOUND3_STRIDE_DECAY]] : !cir.ptr>, 
%[[ITR2_LOAD]] : !u64i), !cir.ptr> // CHECK-NEXT: cir.scope { // CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -270,7 +270,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BOUND2_STRIDE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[BOUND2_STRIDE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[BOUND1_STRIDE:.*]] = cir.ptr_stride(%[[BOUND2_STRIDE_DECAY]] : !cir.ptr, %[[ITR1_LOAD]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN8CtorDtorD1Ev(%[[BOUND1_STRIDE]]) nothrow : (!cir.ptr) -> () // CHECK-NEXT: cir.yield @@ -330,7 +330,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: } body { // // CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[PRIVATE]] : !cir.ptr x 5> x 5>>), !cir.ptr x 5>> +// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast array_to_ptrdecay %[[PRIVATE]] : !cir.ptr x 5> x 5>> -> !cir.ptr x 5>> // CHECK-NEXT: %[[BOUND2_STRIDE:.*]] = cir.ptr_stride(%[[TLA_DECAY]] : !cir.ptr x 5>>, %[[ITR2_LOAD]] : !u64i), !cir.ptr x 5>> // // CHECK-NEXT: cir.scope { @@ -349,10 +349,10 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BOUND2_STRIDE]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[BOUND2_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[BOUND2_STRIDE_DECAY]] : !cir.ptr>, %[[ITR1_LOAD]] : !u64i), !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[ARR_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[STRIDE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[ARR_DECAY:.*]] = cir.cast array_to_ptrdecay %[[STRIDE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[ARR_DECAY]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -395,9 +395,9 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__ZTSA5_A5_A5_8CtorDtor : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}): // CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init", init] {alignment = 16 : i64} -// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast(bitcast, %[[TL_ALLOCA]] : !cir.ptr x 5> x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[TL_ALLOCA]] : !cir.ptr x 5> x 5>> -> !cir.ptr> // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<125> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BITCAST]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[BITCAST]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE_PAST_LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ARR_SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store 
%[[DECAY]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -416,9 +416,9 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.yield // CHECK-NEXT:} destroy { // CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5> x 5>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5> x 5>> {{.*}})): -// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast(bitcast, %[[PRIVATE]] : !cir.ptr x 5> x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[PRIVATE]] : !cir.ptr x 5> x 5>> -> !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<124> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BITCAST]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[BITCAST]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[LAST_ELT]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp index 38df8133a38c0..753389f2a3f47 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp @@ -20,7 +20,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr> {{.*}}): // CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.private.init", init] {alignment = 16 : i64} // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<5> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE_PAST_LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ARR_SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[DECAY]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -57,9 +57,9 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__ZTSA5_A5_5NoOps : !cir.ptr x 5>> init { // CHECK-NEXT: ^bb0(%arg0: !cir.ptr x 5>> {{.*}}): // CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, ["openacc.private.init", init] {alignment = 16 : i64} -// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast(bitcast, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[TL_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<25> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BITCAST]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[BITCAST]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE_PAST_LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ARR_SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[DECAY]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> @@ -110,9 +110,9 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: acc.private.recipe @privatization__ZTSA5_A5_A5_5NoOps : !cir.ptr x 5> x 5>> init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr x 5> x 5>> {{.*}}): // CHECK-NEXT: %[[TL_ALLOCA:.*]] = cir.alloca !cir.array x 5> x 5>, !cir.ptr x 5> x 5>>, ["openacc.private.init", init] {alignment = 
16 : i64} -// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast(bitcast, %[[TL_ALLOCA]] : !cir.ptr x 5> x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BITCAST:.*]] = cir.cast bitcast %[[TL_ALLOCA]] : !cir.ptr x 5> x 5>> -> !cir.ptr> // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<125> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BITCAST]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[BITCAST]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ONE_PAST_LAST_ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[ARR_SIZE]] : !u64i), !cir.ptr // CHECK-NEXT: %[[ARR_IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[DECAY]], %[[ARR_IDX]] : !cir.ptr, !cir.ptr> diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp index 52bcd7cd539f2..e17ef90d01212 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp @@ -336,7 +336,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[PRIVATE]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast array_to_ptrdecay %[[PRIVATE]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[BOUND2_STRIDE:.*]] = cir.ptr_stride(%[[TLA_DECAY]] : !cir.ptr>, %[[ITR2_LOAD]] : !u64i), !cir.ptr> // CHECK-NEXT: cir.scope { // CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -435,7 +435,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BOUND2_STRIDE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[BOUND2_STRIDE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[BOUND2_STRIDE_DECAY]] : !cir.ptr, %[[ITR1_LOAD]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN8CtorDtorD1Ev(%[[STRIDE]]) nothrow : (!cir.ptr) -> () // CHECK-NEXT: cir.yield @@ -511,7 +511,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[PRIVATE]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[TLA_DECAY:.*]] = cir.cast array_to_ptrdecay %[[PRIVATE]] : !cir.ptr> x 5>> -> !cir.ptr>> // CHECK-NEXT: %[[BOUND3_STRIDE:.*]] = cir.ptr_stride(%[[TLA_DECAY]] : !cir.ptr>>, %[[ITR3_LOAD]] : !u64i), !cir.ptr>> // CHECK-NEXT: cir.scope { // CHECK-NEXT: %[[LB2:.*]] = acc.get_lowerbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index @@ -683,7 +683,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BOUND2_STRIDE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[BOUND2_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[BOUND2_STRIDE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: 
%[[STRIDE:.*]] = cir.ptr_stride(%[[BOUND2_STRIDE_DECAY]] : !cir.ptr, %[[ITR1_LOAD]] : !u64i), !cir.ptr // CHECK-NEXT: cir.call @_ZN8CtorDtorD1Ev(%[[STRIDE]]) nothrow : (!cir.ptr) -> () // CHECK-NEXT: cir.yield @@ -777,7 +777,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[BOUND2_STRIDE_LOAD:.*]] = cir.load %[[TLA_STRIDE]] : !cir.ptr>>, !cir.ptr> // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[BOUND2_STRIDE_LOAD]] : !cir.ptr>, %[[ITR1_LOAD]] : !u64i), !cir.ptr> // CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<4> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[STRIDE]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[STRIDE]] : !cir.ptr> -> !cir.ptr // CHECK-NEXT: %[[ELT:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr // CHECK-NEXT: %[[IDX:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] {alignment = 1 : i64} // CHECK-NEXT: cir.store %[[ELT]], %[[IDX]] : !cir.ptr, !cir.ptr> @@ -885,7 +885,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: cir.condition(%[[COND]]) // CHECK-NEXT: } body { // CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i -// CHECK-NEXT: %[[BOUND3_STRIDE_DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[BOUND3_STRIDE]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[BOUND2_STRIDE:.*]] = cir.ptr_stride(%[[BOUND3_STRIDE_DECAY]] : !cir.ptr>, %[[ITR2_LOAD]] : !u64i), !cir.ptr> // CHECK-NEXT: cir.scope { // CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c index 9e3359141838f..aae4a92b13b0e 100644 --- a/clang/test/CIR/CodeGenOpenACC/serial.c +++ b/clang/test/CIR/CodeGenOpenACC/serial.c @@ -29,7 +29,7 @@ void acc_serial(int cond) { // CHECK-NEXT: cir.scope { // CHECK-NEXT: cir.while { // CHECK-NEXT: %[[INT:.*]] = cir.const #cir.int<1> - // CHECK-NEXT: %[[CAST:.*]] = cir.cast(int_to_bool, %[[INT]] : + // CHECK-NEXT: %[[CAST:.*]] = cir.cast int_to_bool %[[INT]] // CHECK-NEXT: cir.condition(%[[CAST]]) // CHECK-NEXT: } do { // CHECK-NEXT: cir.yield @@ -49,7 +49,7 @@ void acc_serial(int cond) { #pragma acc serial self(cond) {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.serial self(%[[CONV_CAST]]) { // CHECK-NEXT: acc.yield @@ -58,7 +58,7 @@ void acc_serial(int cond) { #pragma acc serial self(0) {} // CHECK-NEXT: %[[ZERO_LITERAL:.*]] = cir.const #cir.int<0> : !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ZERO_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ZERO_LITERAL]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.serial self(%[[CONV_CAST]]) { // CHECK-NEXT: acc.yield @@ -67,7 +67,7 @@ void acc_serial(int cond) { #pragma acc serial if(cond) {} // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : 
!s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.serial if(%[[CONV_CAST]]) { // CHECK-NEXT: acc.yield @@ -76,7 +76,7 @@ void acc_serial(int cond) { #pragma acc serial if(1) {} // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[ONE_LITERAL]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.serial if(%[[CONV_CAST]]) { // CHECK-NEXT: acc.yield diff --git a/clang/test/CIR/CodeGenOpenACC/set.c b/clang/test/CIR/CodeGenOpenACC/set.c index 0b87f42603776..b8030dfd9d883 100644 --- a/clang/test/CIR/CodeGenOpenACC/set.c +++ b/clang/test/CIR/CodeGenOpenACC/set.c @@ -26,7 +26,7 @@ void acc_set(int cond) { // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[COND_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.set device_num(%[[COND_CONV]] : si32) if(%[[BOOL_CONV]]) @@ -36,7 +36,7 @@ void acc_set(int cond) { // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[COND_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.set default_async(%[[ONE_CONV]] : si32) device_num(%[[COND_CONV]] : si32) if(%[[BOOL_CONV]]) attributes {device_type = #acc.device_type} diff --git a/clang/test/CIR/CodeGenOpenACC/shutdown.c b/clang/test/CIR/CodeGenOpenACC/shutdown.c index b68ef90e07252..8c27fa6c2d544 100644 --- a/clang/test/CIR/CodeGenOpenACC/shutdown.c +++ b/clang/test/CIR/CodeGenOpenACC/shutdown.c @@ -18,13 +18,13 @@ void acc_shutdown(int cond) { #pragma acc shutdown if(cond) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.shutdown if(%[[BOOL_CONV]]) #pragma acc shutdown if(1) // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i - // CHECK-NEXT: %[[ONE_TO_BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[ONE_LITERAL]] : !s32i), !cir.bool + // CHECK-NEXT: %[[ONE_TO_BOOL_CAST:.*]] = cir.cast int_to_bool %[[ONE_LITERAL]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[ONE_TO_BOOL_CAST]] : !cir.bool 
to i1 // CHECK-NEXT: acc.shutdown if(%[[BOOL_CONV]]) @@ -40,7 +40,7 @@ void acc_shutdown(int cond) { #pragma acc shutdown if(cond) device_num(cond) device_type(*) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[COND_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[BOOL_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_CAST]] : !cir.bool to i1 // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i // CHECK-NEXT: %[[COND_CONV:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 diff --git a/clang/test/CIR/CodeGenOpenACC/wait.c b/clang/test/CIR/CodeGenOpenACC/wait.c index aeda8b955a6d0..8be8665923c59 100644 --- a/clang/test/CIR/CodeGenOpenACC/wait.c +++ b/clang/test/CIR/CodeGenOpenACC/wait.c @@ -10,7 +10,7 @@ void acc_wait(int cond) { #pragma acc wait if (cond) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: acc.wait if(%[[CONV_CAST]]) @@ -37,7 +37,7 @@ void acc_wait(int cond) { #pragma acc wait(queues:1) if (cond) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32 @@ -54,7 +54,7 @@ void acc_wait(int cond) { #pragma acc wait(devnum:1: 2, 3) if (cond) // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load{{.*}} %[[COND]] : !cir.ptr, !s32i - // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast(int_to_bool, %[[COND_LOAD]] : !s32i), !cir.bool + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[COND_LOAD]] : !s32i -> !cir.bool // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 // CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32 diff --git a/clang/test/CIR/IR/alloca.cir b/clang/test/CIR/IR/alloca.cir index 12f7e6ac6a914..4a13c44292b35 100644 --- a/clang/test/CIR/IR/alloca.cir +++ b/clang/test/CIR/IR/alloca.cir @@ -12,7 +12,7 @@ module { %2 = cir.load align(8) %0 : !cir.ptr, !u64i // Dynamically sized alloca %3 = cir.alloca !u8i, !cir.ptr, %2 : !u64i, ["bi_alloca"] {alignment = 16 : i64} - %4 = cir.cast(bitcast, %3 : !cir.ptr), !cir.ptr + %4 = cir.cast bitcast %3 : !cir.ptr -> !cir.ptr cir.store %4, %1 : !cir.ptr, !cir.ptr> %5 = cir.load %1 : !cir.ptr>, !cir.ptr cir.return %5 : !cir.ptr @@ -24,7 +24,7 @@ module { // CHECK: cir.store %arg0, %0 : !u64i, !cir.ptr // CHECK: %2 = cir.load align(8) %0 : !cir.ptr, !u64i // CHECK: %3 = cir.alloca !u8i, !cir.ptr, %2 : !u64i, ["bi_alloca"] {alignment = 16 : i64} - // CHECK: %4 = cir.cast(bitcast, %3 : !cir.ptr), !cir.ptr + // CHECK: %4 = cir.cast bitcast %3 : !cir.ptr -> 
!cir.ptr // CHECK: cir.store %4, %1 : !cir.ptr, !cir.ptr> // CHECK: %5 = cir.load %1 : !cir.ptr>, !cir.ptr // CHECK: cir.return %5 : !cir.ptr diff --git a/clang/test/CIR/IR/binassign.cir b/clang/test/CIR/IR/binassign.cir index a25729635094e..6d2c5c8ab6962 100644 --- a/clang/test/CIR/IR/binassign.cir +++ b/clang/test/CIR/IR/binassign.cir @@ -12,7 +12,7 @@ module { %4 = cir.const #true cir.store %4, %0 : !cir.bool, !cir.ptr %5 = cir.const #cir.int<65> : !s32i - %6 = cir.cast(integral, %5 : !s32i), !s8i + %6 = cir.cast integral %5 : !s32i -> !s8i cir.store %6, %1 : !s8i, !cir.ptr %7 = cir.const #cir.fp<3.140000e+00> : !cir.float cir.store %7, %2 : !cir.float, !cir.ptr @@ -34,7 +34,7 @@ module { // CHECK: %4 = cir.const #true // CHECK: cir.store %4, %0 : !cir.bool, !cir.ptr // CHECK: %5 = cir.const #cir.int<65> : !s32i -// CHECK: %6 = cir.cast(integral, %5 : !s32i), !s8i +// CHECK: %6 = cir.cast integral %5 : !s32i -> !s8i // CHECK: cir.store %6, %1 : !s8i, !cir.ptr // CHECK: %7 = cir.const #cir.fp<3.140000e+00> : !cir.float // CHECK: cir.store %7, %2 : !cir.float, !cir.ptr diff --git a/clang/test/CIR/IR/cast.cir b/clang/test/CIR/IR/cast.cir index a335887de7ec7..11b1664871ef7 100644 --- a/clang/test/CIR/IR/cast.cir +++ b/clang/test/CIR/IR/cast.cir @@ -3,21 +3,21 @@ module { cir.func @yolo(%arg0 : !s32i) { - %a = cir.cast (int_to_bool, %arg0 : !s32i), !cir.bool + %a = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool %0 = cir.const #cir.int<0> : !s32i cir.return } cir.func @bitcast(%p: !cir.ptr) { - %0 = cir.cast(bitcast, %p : !cir.ptr), !cir.ptr + %0 = cir.cast bitcast %p : !cir.ptr -> !cir.ptr cir.return } } // CHECK: cir.func{{.*}} @yolo(%arg0: !s32i) -// CHECK: %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool +// CHECK: %0 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool // CHECK: %1 = cir.const #cir.int<0> : !s32i // CHECK: cir.func{{.*}} @bitcast -// CHECK: %0 = cir.cast(bitcast, %arg0 : !cir.ptr), !cir.ptr +// CHECK: %0 = cir.cast bitcast %arg0 : !cir.ptr -> !cir.ptr diff --git a/clang/test/CIR/IR/cmp.cir b/clang/test/CIR/IR/cmp.cir index 818527189af01..fdf538d7eef92 100644 --- a/clang/test/CIR/IR/cmp.cir +++ b/clang/test/CIR/IR/cmp.cir @@ -274,39 +274,39 @@ module { cir.store %arg0, %0 : !cir.bool, !cir.ptr cir.store %arg1, %1 : !cir.bool, !cir.ptr %3 = cir.load %0 : !cir.ptr, !cir.bool - %4 = cir.cast(bool_to_int, %3 : !cir.bool), !s32i + %4 = cir.cast bool_to_int %3 : !cir.bool -> !s32i %5 = cir.load %1 : !cir.ptr, !cir.bool - %6 = cir.cast(bool_to_int, %5 : !cir.bool), !s32i + %6 = cir.cast bool_to_int %5 : !cir.bool -> !s32i %7 = cir.cmp(gt, %4, %6) : !s32i, !cir.bool cir.store %7, %2 : !cir.bool, !cir.ptr %8 = cir.load %0 : !cir.ptr, !cir.bool - %9 = cir.cast(bool_to_int, %8 : !cir.bool), !s32i + %9 = cir.cast bool_to_int %8 : !cir.bool -> !s32i %10 = cir.load %1 : !cir.ptr, !cir.bool - %11 = cir.cast(bool_to_int, %10 : !cir.bool), !s32i + %11 = cir.cast bool_to_int %10 : !cir.bool -> !s32i %12 = cir.cmp(lt, %9, %11) : !s32i, !cir.bool cir.store %12, %2 : !cir.bool, !cir.ptr %13 = cir.load %0 : !cir.ptr, !cir.bool - %14 = cir.cast(bool_to_int, %13 : !cir.bool), !s32i + %14 = cir.cast bool_to_int %13 : !cir.bool -> !s32i %15 = cir.load %1 : !cir.ptr, !cir.bool - %16 = cir.cast(bool_to_int, %15 : !cir.bool), !s32i + %16 = cir.cast bool_to_int %15 : !cir.bool -> !s32i %17 = cir.cmp(ge, %14, %16) : !s32i, !cir.bool cir.store %17, %2 : !cir.bool, !cir.ptr %18 = cir.load %0 : !cir.ptr, !cir.bool - %19 = cir.cast(bool_to_int, %18 : !cir.bool), !s32i + %19 = cir.cast bool_to_int %18 : 
!cir.bool -> !s32i %20 = cir.load %1 : !cir.ptr, !cir.bool - %21 = cir.cast(bool_to_int, %20 : !cir.bool), !s32i + %21 = cir.cast bool_to_int %20 : !cir.bool -> !s32i %22 = cir.cmp(le, %19, %21) : !s32i, !cir.bool cir.store %22, %2 : !cir.bool, !cir.ptr %23 = cir.load %0 : !cir.ptr, !cir.bool - %24 = cir.cast(bool_to_int, %23 : !cir.bool), !s32i + %24 = cir.cast bool_to_int %23 : !cir.bool -> !s32i %25 = cir.load %1 : !cir.ptr, !cir.bool - %26 = cir.cast(bool_to_int, %25 : !cir.bool), !s32i + %26 = cir.cast bool_to_int %25 : !cir.bool -> !s32i %27 = cir.cmp(eq, %24, %26) : !s32i, !cir.bool cir.store %27, %2 : !cir.bool, !cir.ptr %28 = cir.load %0 : !cir.ptr, !cir.bool - %29 = cir.cast(bool_to_int, %28 : !cir.bool), !s32i + %29 = cir.cast bool_to_int %28 : !cir.bool -> !s32i %30 = cir.load %1 : !cir.ptr, !cir.bool - %31 = cir.cast(bool_to_int, %30 : !cir.bool), !s32i + %31 = cir.cast bool_to_int %30 : !cir.bool -> !s32i %32 = cir.cmp(ne, %29, %31) : !s32i, !cir.bool cir.store %32, %2 : !cir.bool, !cir.ptr cir.return @@ -319,39 +319,39 @@ module { // CHECK-NEXT: cir.store %arg0, %0 : !cir.bool, !cir.ptr // CHECK-NEXT: cir.store %arg1, %1 : !cir.bool, !cir.ptr // CHECK-NEXT: %3 = cir.load %0 : !cir.ptr, !cir.bool - // CHECK-NEXT: %4 = cir.cast(bool_to_int, %3 : !cir.bool), !s32i + // CHECK-NEXT: %4 = cir.cast bool_to_int %3 : !cir.bool -> !s32i // CHECK-NEXT: %5 = cir.load %1 : !cir.ptr, !cir.bool - // CHECK-NEXT: %6 = cir.cast(bool_to_int, %5 : !cir.bool), !s32i + // CHECK-NEXT: %6 = cir.cast bool_to_int %5 : !cir.bool -> !s32i // CHECK-NEXT: %7 = cir.cmp(gt, %4, %6) : !s32i, !cir.bool // CHECK-NEXT: cir.store %7, %2 : !cir.bool, !cir.ptr // CHECK-NEXT: %8 = cir.load %0 : !cir.ptr, !cir.bool - // CHECK-NEXT: %9 = cir.cast(bool_to_int, %8 : !cir.bool), !s32i + // CHECK-NEXT: %9 = cir.cast bool_to_int %8 : !cir.bool -> !s32i // CHECK-NEXT: %10 = cir.load %1 : !cir.ptr, !cir.bool - // CHECK-NEXT: %11 = cir.cast(bool_to_int, %10 : !cir.bool), !s32i + // CHECK-NEXT: %11 = cir.cast bool_to_int %10 : !cir.bool -> !s32i // CHECK-NEXT: %12 = cir.cmp(lt, %9, %11) : !s32i, !cir.bool // CHECK-NEXT: cir.store %12, %2 : !cir.bool, !cir.ptr // CHECK-NEXT: %13 = cir.load %0 : !cir.ptr, !cir.bool - // CHECK-NEXT: %14 = cir.cast(bool_to_int, %13 : !cir.bool), !s32i + // CHECK-NEXT: %14 = cir.cast bool_to_int %13 : !cir.bool -> !s32i // CHECK-NEXT: %15 = cir.load %1 : !cir.ptr, !cir.bool - // CHECK-NEXT: %16 = cir.cast(bool_to_int, %15 : !cir.bool), !s32i + // CHECK-NEXT: %16 = cir.cast bool_to_int %15 : !cir.bool -> !s32i // CHECK-NEXT: %17 = cir.cmp(ge, %14, %16) : !s32i, !cir.bool // CHECK-NEXT: cir.store %17, %2 : !cir.bool, !cir.ptr // CHECK-NEXT: %18 = cir.load %0 : !cir.ptr, !cir.bool - // CHECK-NEXT: %19 = cir.cast(bool_to_int, %18 : !cir.bool), !s32i + // CHECK-NEXT: %19 = cir.cast bool_to_int %18 : !cir.bool -> !s32i // CHECK-NEXT: %20 = cir.load %1 : !cir.ptr, !cir.bool - // CHECK-NEXT: %21 = cir.cast(bool_to_int, %20 : !cir.bool), !s32i + // CHECK-NEXT: %21 = cir.cast bool_to_int %20 : !cir.bool -> !s32i // CHECK-NEXT: %22 = cir.cmp(le, %19, %21) : !s32i, !cir.bool // CHECK-NEXT: cir.store %22, %2 : !cir.bool, !cir.ptr // CHECK-NEXT: %23 = cir.load %0 : !cir.ptr, !cir.bool - // CHECK-NEXT: %24 = cir.cast(bool_to_int, %23 : !cir.bool), !s32i + // CHECK-NEXT: %24 = cir.cast bool_to_int %23 : !cir.bool -> !s32i // CHECK-NEXT: %25 = cir.load %1 : !cir.ptr, !cir.bool - // CHECK-NEXT: %26 = cir.cast(bool_to_int, %25 : !cir.bool), !s32i + // CHECK-NEXT: %26 = cir.cast bool_to_int %25 : !cir.bool -> !s32i 
// CHECK-NEXT: %27 = cir.cmp(eq, %24, %26) : !s32i, !cir.bool // CHECK-NEXT: cir.store %27, %2 : !cir.bool, !cir.ptr // CHECK-NEXT: %28 = cir.load %0 : !cir.ptr, !cir.bool - // CHECK-NEXT: %29 = cir.cast(bool_to_int, %28 : !cir.bool), !s32i + // CHECK-NEXT: %29 = cir.cast bool_to_int %28 : !cir.bool -> !s32i // CHECK-NEXT: %30 = cir.load %1 : !cir.ptr, !cir.bool - // CHECK-NEXT: %31 = cir.cast(bool_to_int, %30 : !cir.bool), !s32i + // CHECK-NEXT: %31 = cir.cast bool_to_int %30 : !cir.bool -> !s32i // CHECK-NEXT: %32 = cir.cmp(ne, %29, %31) : !s32i, !cir.bool // CHECK-NEXT: cir.store %32, %2 : !cir.bool, !cir.ptr // CHECK-NEXT: cir.return diff --git a/clang/test/CIR/IR/vtable-addrpt.cir b/clang/test/CIR/IR/vtable-addrpt.cir index 0b809cc2506e6..106e7485fbbcf 100644 --- a/clang/test/CIR/IR/vtable-addrpt.cir +++ b/clang/test/CIR/IR/vtable-addrpt.cir @@ -14,7 +14,7 @@ module { cir.store %arg0, %0 : !cir.ptr, !cir.ptr> %1 = cir.load %0 : !cir.ptr>, !cir.ptr %2 = cir.vtable.address_point(@_ZTV1S, address_point = ) : !cir.vptr - %3 = cir.cast(bitcast, %1 : !cir.ptr), !cir.ptr + %3 = cir.cast bitcast %1 : !cir.ptr -> !cir.ptr cir.store align(8) %2, %3 : !cir.vptr, !cir.ptr cir.return } diff --git a/clang/test/CIR/IR/vtt-addrpoint.cir b/clang/test/CIR/IR/vtt-addrpoint.cir index f05bb782c6911..11e5f4da83b50 100644 --- a/clang/test/CIR/IR/vtt-addrpoint.cir +++ b/clang/test/CIR/IR/vtt-addrpoint.cir @@ -26,7 +26,7 @@ module { cir.call @_ZN1BC2Ev(%4, %5) : (!cir.ptr, !cir.ptr>) -> () %6 = cir.vtt.address_point %3 : !cir.ptr>, offset = 0 -> !cir.ptr> - %7 = cir.cast(bitcast, %6 : !cir.ptr>), !cir.ptr + %7 = cir.cast bitcast %6 : !cir.ptr> -> !cir.ptr %8 = cir.load align(8) %7 : !cir.ptr, !cir.vptr %9 = cir.vtable.get_vptr %2 : !cir.ptr -> !cir.ptr cir.store align(8) %8, %9 : !cir.vptr, !cir.ptr diff --git a/clang/test/CIR/Lowering/cast.cir b/clang/test/CIR/Lowering/cast.cir index 6842905dae6a4..ec104edec2405 100644 --- a/clang/test/CIR/Lowering/cast.cir +++ b/clang/test/CIR/Lowering/cast.cir @@ -26,51 +26,51 @@ module { // Integer casts. %9 = cir.load %0 : !cir.ptr, !u32i - %10 = cir.cast(integral, %9 : !u32i), !s8i + %10 = cir.cast integral %9 : !u32i -> !s8i // CHECK: %{{[0-9]+}} = llvm.trunc %{{[0-9]+}} : i32 to i8 cir.store %10, %3 : !s8i, !cir.ptr %11 = cir.load %1 : !cir.ptr, !s32i - %12 = cir.cast(integral, %11 : !s32i), !s16i + %12 = cir.cast integral %11 : !s32i -> !s16i // CHECK: %{{[0-9]+}} = llvm.trunc %{{[0-9]+}} : i32 to i16 cir.store %12, %4 : !s16i, !cir.ptr %13 = cir.load %0 : !cir.ptr, !u32i - %14 = cir.cast(integral, %13 : !u32i), !s64i + %14 = cir.cast integral %13 : !u32i -> !s64i // CHECK: %{{[0-9]+}} = llvm.zext %{{[0-9]+}} : i32 to i64 cir.store %14, %5 : !s64i, !cir.ptr %15 = cir.load %1 : !cir.ptr, !s32i - %16 = cir.cast(integral, %15 : !s32i), !s64i + %16 = cir.cast integral %15 : !s32i -> !s64i // CHECK: %{{[0-9]+}} = llvm.sext %{{[0-9]+}} : i32 to i64 - %30 = cir.cast(integral, %arg1 : !s32i), !u32i + %30 = cir.cast integral %arg1 : !s32i -> !u32i // Should not produce a cast. - %32 = cir.cast(integral, %arg0 : !u32i), !s32i + %32 = cir.cast integral %arg0 : !u32i -> !s32i // Should not produce a cast. 
%21 = cir.load %20 : !cir.ptr, !s16i - %22 = cir.cast(integral, %21 : !s16i), !u64i + %22 = cir.cast integral %21 : !s16i -> !u64i // CHECK: %[[TMP:[0-9]+]] = llvm.sext %{{[0-9]+}} : i16 to i64 - %33 = cir.cast(int_to_bool, %arg1 : !s32i), !cir.bool + %33 = cir.cast int_to_bool %arg1 : !s32i -> !cir.bool // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[#CMP:]] = llvm.icmp "ne" %arg1, %[[#ZERO]] : i32 // Pointer casts. cir.store %16, %6 : !s64i, !cir.ptr - %23 = cir.cast(int_to_ptr, %22 : !u64i), !cir.ptr + %23 = cir.cast int_to_ptr %22 : !u64i -> !cir.ptr // CHECK: %[[TMP2:[0-9]+]] = llvm.inttoptr %[[TMP]] : i64 to !llvm.ptr - %24 = cir.cast(ptr_to_int, %23 : !cir.ptr), !s32i + %24 = cir.cast ptr_to_int %23 : !cir.ptr -> !s32i // CHECK: %{{[0-9]+}} = llvm.ptrtoint %[[TMP2]] : !llvm.ptr to i32 - %29 = cir.cast(ptr_to_bool, %23 : !cir.ptr), !cir.bool + %29 = cir.cast ptr_to_bool %23 : !cir.ptr -> !cir.bool // Floating point casts. - %25 = cir.cast(int_to_float, %arg1 : !s32i), !cir.float + %25 = cir.cast int_to_float %arg1 : !s32i -> !cir.float // CHECK: %{{.+}} = llvm.sitofp %{{.+}} : i32 to f32 - %26 = cir.cast(int_to_float, %arg0 : !u32i), !cir.float + %26 = cir.cast int_to_float %arg0 : !u32i -> !cir.float // CHECK: %{{.+}} = llvm.uitofp %{{.+}} : i32 to f32 - %27 = cir.cast(float_to_int, %arg2 : !cir.float), !s32i + %27 = cir.cast float_to_int %arg2 : !cir.float -> !s32i // CHECK: %{{.+}} = llvm.fptosi %{{.+}} : f32 to i32 - %28 = cir.cast(float_to_int, %arg2 : !cir.float), !u32i + %28 = cir.cast float_to_int %arg2 : !cir.float -> !u32i // CHECK: %{{.+}} = llvm.fptoui %{{.+}} : f32 to i32 %18 = cir.const #cir.int<0> : !s32i // CHECK: %{{.+}} = llvm.fptrunc %{{.+}} : f64 to f32 - %34 = cir.cast(floating, %arg3 : !cir.double), !cir.float + %34 = cir.cast floating %arg3 : !cir.double -> !cir.float cir.store %18, %2 : !s32i, !cir.ptr %19 = cir.load %2 : !cir.ptr, !s32i @@ -84,7 +84,7 @@ module { cir.store %arg0, %0 : !cir.bool, !cir.ptr %2 = cir.load %0 : !cir.ptr, !cir.bool - %3 = cir.cast(bool_to_int, %2 : !cir.bool), !u8i + %3 = cir.cast bool_to_int %2 : !cir.bool -> !u8i // CHECK: %[[LOAD_BOOL:.*]] = llvm.load %{{.*}} : !llvm.ptr -> i8 // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[LOAD_BOOL]] : i8 to i1 // CHECK: %[[EXT:.*]] = llvm.zext %[[TRUNC]] : i1 to i8 diff --git a/clang/test/CIR/Lowering/if.cir b/clang/test/CIR/Lowering/if.cir index 3a077aa9ef057..888fb38e2d77c 100644 --- a/clang/test/CIR/Lowering/if.cir +++ b/clang/test/CIR/Lowering/if.cir @@ -4,7 +4,7 @@ module { cir.func @foo(%arg0: !s32i) -> !s32i { - %4 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool + %4 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool cir.if %4 { %5 = cir.const #cir.int<1> : !s32i cir.return %5 : !s32i @@ -44,7 +44,7 @@ module { // LLVM-NEXT: } cir.func @onlyIf(%arg0: !s32i) -> !s32i { - %4 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool + %4 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool cir.if %4 { %5 = cir.const #cir.int<1> : !s32i cir.return %5 : !s32i @@ -66,7 +66,7 @@ module { // Verify empty if clause is properly lowered to empty block cir.func @emptyIfClause(%arg0: !s32i) -> !s32i { // MLIR-LABEL: llvm.func @emptyIfClause - %4 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool + %4 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool // MLIR: llvm.cond_br {{%.*}}, ^[[T:.*]], ^[[PHI:.*]] cir.if %4 { // MLIR-NEXT: ^[[T]]: @@ -82,7 +82,7 @@ module { // addressed cir.func @emptyIfElseClause(%arg0: !s32i) -> !s32i { // MLIR-LABEL: llvm.func @emptyIfElseClause - %4 = 
cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool + %4 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool // MLIR: llvm.cond_br {{%.*}}, ^[[T:.*]], ^[[F:.*]] cir.if %4 { // MLIR-NEXT: ^[[T]]: diff --git a/clang/test/CIR/Lowering/vtt-addrpoint.cir b/clang/test/CIR/Lowering/vtt-addrpoint.cir index 96dc27d991cd4..e1bfd00245b1b 100644 --- a/clang/test/CIR/Lowering/vtt-addrpoint.cir +++ b/clang/test/CIR/Lowering/vtt-addrpoint.cir @@ -24,7 +24,7 @@ module { %5 = cir.vtt.address_point %3 : !cir.ptr>, offset = 1 -> !cir.ptr> cir.call @_ZN1BC2Ev(%4, %5) : (!cir.ptr, !cir.ptr>) -> () %6 = cir.vtt.address_point %3 : !cir.ptr>, offset = 0 -> !cir.ptr> - %7 = cir.cast(bitcast, %6 : !cir.ptr>), !cir.ptr + %7 = cir.cast bitcast %6 : !cir.ptr> -> !cir.ptr %8 = cir.load align(8) %7 : !cir.ptr, !cir.vptr %9 = cir.vtable.get_vptr %2 : !cir.ptr -> !cir.ptr cir.store align(8) %8, %9 : !cir.vptr, !cir.ptr diff --git a/clang/test/CIR/Transforms/canonicalize.cir b/clang/test/CIR/Transforms/canonicalize.cir index 5daff119a626f..5606f9e16a690 100644 --- a/clang/test/CIR/Transforms/canonicalize.cir +++ b/clang/test/CIR/Transforms/canonicalize.cir @@ -50,39 +50,39 @@ module { // CHECK-NEXT: } cir.func @cast1(%arg0: !cir.bool) -> !cir.bool { - %0 = cir.cast(bool_to_int, %arg0 : !cir.bool), !s32i - %1 = cir.cast(int_to_bool, %0 : !s32i), !cir.bool + %0 = cir.cast bool_to_int %arg0 : !cir.bool -> !s32i + %1 = cir.cast int_to_bool %0 : !s32i -> !cir.bool cir.return %1 : !cir.bool } // CHECK: cir.func{{.*}} @cast1(%[[ARG0:.*]]: !cir.bool) -> !cir.bool // CHECK-NEXT: cir.return %[[ARG0]] : !cir.bool cir.func @cast2(%arg0: !s32i) -> !cir.bool { - %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool - %1 = cir.cast(bool_to_int, %0 : !cir.bool), !s32i - %2 = cir.cast(integral, %1 : !s32i), !s64i - %3 = cir.cast(int_to_bool, %2 : !s64i), !cir.bool + %0 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool + %1 = cir.cast bool_to_int %0 : !cir.bool -> !s32i + %2 = cir.cast integral %1 : !s32i -> !s64i + %3 = cir.cast int_to_bool %2 : !s64i -> !cir.bool cir.return %3 : !cir.bool } // CHECK: cir.func{{.*}} @cast2(%[[ARG0:.*]]: !s32i) -> !cir.bool - // CHECK-NEXT: %[[CAST:.*]] = cir.cast(int_to_bool, %[[ARG0]] : !s32i), !cir.bool + // CHECK-NEXT: %[[CAST:.*]] = cir.cast int_to_bool %[[ARG0]] : !s32i -> !cir.bool // CHECK-NEXT: cir.return %[[CAST]] : !cir.bool cir.func @no_fold_cast(%arg0: !s32i) -> !s64i { - %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool - %1 = cir.cast(bool_to_int, %0 : !cir.bool), !s32i - %2 = cir.cast(integral, %1 : !s32i), !s64i + %0 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool + %1 = cir.cast bool_to_int %0 : !cir.bool -> !s32i + %2 = cir.cast integral %1 : !s32i -> !s64i cir.return %2 : !s64i } // CHECK: cir.func{{.*}} @no_fold_cast(%[[ARG0:.*]]: !s32i) -> !s64i - // CHECK-NEXT: %[[CAST:.*]] = cir.cast(int_to_bool, %[[ARG0]] : !s32i), !cir.bool - // CHECK-NEXT: %[[CAST2:.*]] = cir.cast(bool_to_int, %[[CAST]] : !cir.bool), !s32i - // CHECK-NEXT: %[[CAST3:.*]] = cir.cast(integral, %[[CAST2]] : !s32i), !s64i + // CHECK-NEXT: %[[CAST:.*]] = cir.cast int_to_bool %[[ARG0]] : !s32i -> !cir.bool + // CHECK-NEXT: %[[CAST2:.*]] = cir.cast bool_to_int %[[CAST]] : !cir.bool -> !s32i + // CHECK-NEXT: %[[CAST3:.*]] = cir.cast integral %[[CAST2]] : !s32i -> !s64i // CHECK-NEXT: cir.return %[[CAST3]] : !s64i cir.func @cast_poison() -> !s64i { %0 = cir.const #cir.poison : !s32i - %1 = cir.cast(integral, %0 : !s32i), !s64i + %1 = cir.cast integral %0 : !s32i -> !s64i cir.return %1 : !s64i } // CHECK: @cast_poison diff 
--git a/clang/test/CIR/Transforms/if.cir b/clang/test/CIR/Transforms/if.cir index 3f817c793643f..ced288f7ecf29 100644 --- a/clang/test/CIR/Transforms/if.cir +++ b/clang/test/CIR/Transforms/if.cir @@ -4,7 +4,7 @@ module { cir.func @foo(%arg0: !s32i) -> !s32i { - %4 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool + %4 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool cir.if %4 { %5 = cir.const #cir.int<1> : !s32i cir.return %5 : !s32i @@ -15,7 +15,7 @@ module { cir.return %arg0 : !s32i } // CHECK: cir.func{{.*}} @foo(%arg0: !s32i) -> !s32i { -// CHECK-NEXT: %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool +// CHECK-NEXT: %0 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool // CHECK-NEXT: cir.brcond %0 ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // pred: ^bb0 // CHECK-NEXT: %1 = cir.const #cir.int<1> : !s32i @@ -28,7 +28,7 @@ module { // CHECK-NEXT: } cir.func @onlyIf(%arg0: !s32i) -> !s32i { - %4 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool + %4 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool cir.if %4 { %5 = cir.const #cir.int<1> : !s32i cir.return %5 : !s32i @@ -36,7 +36,7 @@ module { cir.return %arg0 : !s32i } // CHECK: cir.func{{.*}} @onlyIf(%arg0: !s32i) -> !s32i { -// CHECK-NEXT: %0 = cir.cast(int_to_bool, %arg0 : !s32i), !cir.bool +// CHECK-NEXT: %0 = cir.cast int_to_bool %arg0 : !s32i -> !cir.bool // CHECK-NEXT: cir.brcond %0 ^bb1, ^bb2 // CHECK-NEXT: ^bb1: // pred: ^bb0 // CHECK-NEXT: %1 = cir.const #cir.int<1> : !s32i diff --git a/clang/test/CIR/Transforms/switch.cir b/clang/test/CIR/Transforms/switch.cir index a000d6b70fbcc..3addfe37061cd 100644 --- a/clang/test/CIR/Transforms/switch.cir +++ b/clang/test/CIR/Transforms/switch.cir @@ -261,8 +261,8 @@ module { // CHECK-NEXT: %[[RANGE:[0-9]+]] = cir.const #cir.int<99> // CHECK-NEXT: %[[LOWER_BOUND:[0-9]+]] = cir.const #cir.int<1> // CHECK-NEXT: %[[DIFF:[0-9]+]] = cir.binop(sub, %[[X]], %[[LOWER_BOUND]]) -// CHECK-NEXT: %[[U_DIFF:[0-9]+]] = cir.cast(integral, %[[DIFF]] : !s32i), !u32i -// CHECK-NEXT: %[[U_RANGE:[0-9]+]] = cir.cast(integral, %[[RANGE]] : !s32i), !u32i +// CHECK-NEXT: %[[U_DIFF:[0-9]+]] = cir.cast integral %[[DIFF]] : !s32i -> !u32i +// CHECK-NEXT: %[[U_RANGE:[0-9]+]] = cir.cast integral %[[RANGE]] : !s32i -> !u32i // CHECK-NEXT: %[[CMP_RESULT:[0-9]+]] = cir.cmp(le, %[[U_DIFF]], %[[U_RANGE]]) // CHECK-NEXT: cir.brcond %[[CMP_RESULT]] ^[[CASE_RANGE]], ^[[CASE_DEFAULT:bb[0-9]+]] // CHECK-NEXT: ^[[CASE_DEFAULT]]: @@ -304,8 +304,8 @@ module { // CHECK: %[[CONST97:.*]] = cir.const #cir.int<97> : !s32i // CHECK: %[[CONST3:.*]] = cir.const #cir.int<3> : !s32i // CHECK: %[[SUB:.*]] = cir.binop(sub, %[[COND]], %[[CONST3]]) : !s32i -// CHECK: %[[CAST1:.*]] = cir.cast(integral, %[[SUB]] : !s32i), !u32i -// CHECK: %[[CAST2:.*]] = cir.cast(integral, %[[CONST97]] : !s32i), !u32i +// CHECK: %[[CAST1:.*]] = cir.cast integral %[[SUB]] : !s32i -> !u32i +// CHECK: %[[CAST2:.*]] = cir.cast integral %[[CONST97]] : !s32i -> !u32i // CHECK: %[[CMP:.*]] = cir.cmp(le, %[[CAST1]], %[[CAST2]]) : !u32i, !cir.bool // CHECK: cir.brcond %7 ^bb[[#DEFAULT_BB]], ^bb[[#RANGE_BB:]] // CHECK: ^bb[[#RANGE_BB]]: // pred: ^bb[[#RANGE_BR]] From f80e7e139e3e677638233037499f8cba50c66b9e Mon Sep 17 00:00:00 2001 From: Hongyu Chen Date: Wed, 1 Oct 2025 16:47:41 +0800 Subject: [PATCH 347/878] [GlobalOpt] Check if users are CallBase when changing CC (#161399) Fixes https://github.com/llvm/llvm-project/issues/156656 `hasChangeableCCImpl` guarantees the address of the function is not taken, but it ignores assume-like calls. 
This patch ignores assume-like calls when changing CC.
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp    | 16 +++--
 llvm/test/Transforms/GlobalOpt/fastcc.ll | 90 +++++++++++++++++++-----
 2 files changed, 85 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index f88d51f443bcf..99c4982c58b47 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1680,7 +1680,9 @@ processGlobal(GlobalValue &GV,
 /// FastCC.
 static void ChangeCalleesToFastCall(Function *F) {
   for (User *U : F->users())
-    cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
+    if (auto *Call = dyn_cast<CallBase>(U))
+      if (Call->getCalledOperand() == F)
+        Call->setCallingConv(CallingConv::Fast);
 }
 
 static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
@@ -1766,10 +1768,12 @@ isValidCandidateForColdCC(Function &F,
     return false;
 
   for (User *U : F.users()) {
-    CallBase &CB = cast<CallBase>(*U);
-    Function *CallerFunc = CB.getParent()->getParent();
+    CallBase *CB = dyn_cast<CallBase>(U);
+    if (!CB || CB->getCalledOperand() != &F)
+      continue;
+    Function *CallerFunc = CB->getParent()->getParent();
     BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
-    if (!isColdCallSite(CB, CallerBFI))
+    if (!isColdCallSite(*CB, CallerBFI))
       return false;
     if (!llvm::is_contained(AllCallsCold, CallerFunc))
       return false;
@@ -1779,7 +1783,9 @@
 
 static void changeCallSitesToColdCC(Function *F) {
   for (User *U : F->users())
-    cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
+    if (auto *Call = dyn_cast<CallBase>(U))
+      if (Call->getCalledOperand() == F)
+        Call->setCallingConv(CallingConv::Cold);
 }
 
 // This function iterates over all the call instructions in the input Function
diff --git a/llvm/test/Transforms/GlobalOpt/fastcc.ll b/llvm/test/Transforms/GlobalOpt/fastcc.ll
index 854357e6fad97..edbd602a97f3b 100644
--- a/llvm/test/Transforms/GlobalOpt/fastcc.ll
+++ b/llvm/test/Transforms/GlobalOpt/fastcc.ll
@@ -1,16 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt < %s -passes=globalopt -S | FileCheck %s
 
 declare token @llvm.call.preallocated.setup(i32)
 declare ptr @llvm.call.preallocated.arg(token, i32)
 
 define internal i32 @f(ptr %m) {
-; CHECK-LABEL: define internal fastcc i32 @f
+; CHECK-LABEL: define internal fastcc i32 @f(
+; CHECK-SAME: ptr [[M:%.*]]) unnamed_addr {
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[M]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
   %v = load i32, ptr %m
   ret i32 %v
 }
 
 define internal x86_thiscallcc i32 @g(ptr %m) {
-; CHECK-LABEL: define internal fastcc i32 @g
+; CHECK-LABEL: define internal fastcc i32 @g(
+; CHECK-SAME: ptr [[M:%.*]]) unnamed_addr {
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[M]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
   %v = load i32, ptr %m
   ret i32 %v
 }
 
 ; Leave this one alone, because the user went out of their way to request this
 ; convention.
define internal coldcc i32 @h(ptr %m) { -; CHECK-LABEL: define internal coldcc i32 @h +; CHECK-LABEL: define internal coldcc i32 @h( +; CHECK-SAME: ptr [[M:%.*]]) unnamed_addr { +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[M]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; %v = load i32, ptr %m ret i32 %v } define internal i32 @j(ptr %m) { -; CHECK-LABEL: define internal i32 @j +; CHECK-LABEL: define internal i32 @j( +; CHECK-SAME: ptr [[M:%.*]]) { +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[M]], align 4 +; CHECK-NEXT: ret i32 [[V]] +; %v = load i32, ptr %m ret i32 %v } define internal i32 @inalloca(ptr inalloca(i32) %p) { -; CHECK-LABEL: define internal fastcc i32 @inalloca(ptr %p) +; CHECK-LABEL: define internal fastcc i32 @inalloca( +; CHECK-SAME: ptr [[P:%.*]]) unnamed_addr { +; CHECK-NEXT: [[RV:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: ret i32 [[RV]] +; %rv = load i32, ptr %p ret i32 %rv } define i32 @inalloca2_caller(ptr inalloca(i32) %p) { +; CHECK-LABEL: define i32 @inalloca2_caller( +; CHECK-SAME: ptr inalloca(i32) [[P:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[RV:%.*]] = musttail call i32 @inalloca2(ptr inalloca(i32) [[P]]) +; CHECK-NEXT: ret i32 [[RV]] +; %rv = musttail call i32 @inalloca2(ptr inalloca(i32) %p) ret i32 %rv } define internal i32 @inalloca2(ptr inalloca(i32) %p) { ; Because of the musttail caller, this inalloca cannot be dropped. -; CHECK-LABEL: define internal i32 @inalloca2(ptr inalloca(i32) %p) +; CHECK-LABEL: define internal i32 @inalloca2( +; CHECK-SAME: ptr inalloca(i32) [[P:%.*]]) unnamed_addr { +; CHECK-NEXT: [[RV:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: ret i32 [[RV]] +; %rv = load i32, ptr %p ret i32 %rv } define internal i32 @preallocated(ptr preallocated(i32) %p) { -; CHECK-LABEL: define internal fastcc i32 @preallocated(ptr %p) +; CHECK-LABEL: define internal fastcc i32 @preallocated( +; CHECK-SAME: ptr [[P:%.*]]) unnamed_addr { +; CHECK-NEXT: [[RV:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: ret i32 [[RV]] +; %rv = load i32, ptr %p ret i32 %rv } define void @call_things() { +; CHECK-LABEL: define void @call_things() local_unnamed_addr { +; CHECK-NEXT: [[M:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call fastcc i32 @f(ptr [[M]]) +; CHECK-NEXT: [[TMP2:%.*]] = call fastcc i32 @g(ptr [[M]]) +; CHECK-NEXT: [[TMP3:%.*]] = call coldcc i32 @h(ptr [[M]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @j(ptr [[M]]) +; CHECK-NEXT: [[ARGS:%.*]] = alloca inalloca i32, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call fastcc i32 @inalloca(ptr [[ARGS]]) +; CHECK-NEXT: [[TMP6:%.*]] = call ptr @llvm.stacksave.p0() +; CHECK-NEXT: [[PAARG:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = call fastcc i32 @preallocated(ptr [[PAARG]]) +; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[TMP6]]) +; CHECK-NEXT: ret void +; %m = alloca i32 call i32 @f(ptr %m) call x86_thiscallcc i32 @g(ptr %m) @@ -65,15 +113,25 @@ define void @call_things() { call i32 @preallocated(ptr preallocated(i32) %N) ["preallocated"(token %c)] ret void } -; CHECK-LABEL: define void @call_things() -; CHECK: call fastcc i32 @f -; CHECK: call fastcc i32 @g -; CHECK: call coldcc i32 @h -; CHECK: call i32 @j -; CHECK: call fastcc i32 @inalloca(ptr %args) -; CHECK-NOT: llvm.call.preallocated -; CHECK: call fastcc i32 @preallocated(ptr %paarg) @llvm.used = appending global [1 x ptr] [ - ptr @j + ptr @j ], section "llvm.metadata" + +define internal i32 @assume_fastcc() { +; CHECK-LABEL: define internal fastcc i32 @assume_fastcc() { +; CHECK-NEXT: [[OBJSIZE:%.*]] = call i32 
@llvm.objectsize.i32.p0(ptr @assume_fastcc, i1 false, i1 false, i1 false) +; CHECK-NEXT: ret i32 [[OBJSIZE]] +; + %objsize = call i32 @llvm.objectsize.i32.p0(ptr @assume_fastcc, i1 false, i1 false, i1 false) + ret i32 %objsize +} + +define internal i32 @constexpr_self_user() addrspace(1) { +; CHECK-LABEL: define internal fastcc i32 @constexpr_self_user() addrspace(1) { +; CHECK-NEXT: [[OBJSIZE:%.*]] = call i32 @llvm.objectsize.i32.p0(ptr addrspacecast (ptr addrspace(1) @constexpr_self_user to ptr), i1 false, i1 false, i1 false) +; CHECK-NEXT: ret i32 [[OBJSIZE]] +; + %objsize = call i32 @llvm.objectsize.i32.p0(ptr addrspacecast (ptr addrspace(1) @constexpr_self_user to ptr), i1 false, i1 false, i1 false) + ret i32 %objsize +} From 14fcd81861aa1576d204b9146345c4426d81fc49 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Wed, 1 Oct 2025 10:51:00 +0200 Subject: [PATCH 348/878] [AMDGPU][InsertWaitCnts] Refactor some helper functions, NFC (#161160) - Remove one-line wrappers around a simple function call when they're only used once or twice. - Move very generic helpers into SIInstrInfo - Delete unused functions The goal is simply to reduce the noise in SIInsertWaitCnts without hiding functionality. I focused on moving trivial helpers, or helpers with very descriptive/verbose names (so it doesn't hide too much logic away from the pass), and that have some reusability potential. I'm also trying to make the code style more consistent. It doesn't make sense to see a function call `TII->isXXX` then suddenly call a random `isY` method that just wraps around `TII->isY`. The context of this work is that I'm trying to learn how this pass works, and while going through the code I noticed some little things here and there that I thought would be good to fix. 
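To make the mechanical effect concrete, here is one call site before and
after (condensed from the diff below; not a verbatim excerpt):

  // Before: mayAccessVMEMThroughFlat was a private SIInsertWaitcnts member.
  bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
    return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) ||
           (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
  }

  // After: the helper lives on SIInstrInfo and is reached through TII,
  // like its sibling queries.
  bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
    return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
           (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
  }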
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 115 +++----------------- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53 +++++++++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 23 ++++ 3 files changed, 89 insertions(+), 102 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f291191dbfd5c..91136fd85c545 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -495,13 +495,6 @@ class SIInsertWaitcnts { bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); - bool isForceEmitWaitcnt() const { - for (auto T : inst_counter_types()) - if (ForceEmitWaitcnt[T]) - return true; - return false; - } - void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -570,10 +563,6 @@ class SIInsertWaitcnts { return VmemReadMapping[getVmemType(Inst)]; } - bool hasXcnt() const { return ST->hasWaitXCnt(); } - - bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; - bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool isVmemAccess(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -591,7 +580,6 @@ class SIInsertWaitcnts { WaitcntBrackets &ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); - static bool asynchronouslyWritesSCC(unsigned Opcode); }; // This objects maintains the current score brackets of each wait counter, and @@ -1109,7 +1097,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, setRegScore(FIRST_LDS_VGPR, T, CurrScore); } - if (Context->asynchronouslyWritesSCC(Inst.getOpcode())) { + if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { setRegScore(SCC, T, CurrScore); PendingSCCWrite = &Inst; } @@ -1831,12 +1819,6 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt( return Modified; } -static bool readsVCCZ(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && - !MI.getOperand(1).isUndef(); -} - /// \returns true if the callee inserts an s_waitcnt 0 on function entry. static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { // Currently all conventions wait, but this may not always be the case. @@ -2061,7 +2043,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait); } - if (hasXcnt() && Op.isDef()) + if (ST->hasWaitXCnt() && Op.isDef()) ScoreBrackets.determineWait(X_CNT, Interval, Wait); } } @@ -2087,10 +2069,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->hasReadVCCZBug()) { - if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { - Wait.DsCnt = 0; - } + if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + Wait.DsCnt = 0; } // Verify that the wait is actually needed. @@ -2185,75 +2166,11 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, return Modified; } -// This is a flat memory operation. Check to see if it has memory tokens other -// than LDS. Other address spaces supported by flat memory operations involve -// global memory. 
-bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // All flat instructions use the VMEM counter except prefetch. - if (!TII->usesVM_CNT(MI)) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access VMEM. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves VMEM. - // Flat operations only supported FLAT, LOCAL (LDS), or address spaces - // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION - // (GDS) address space is not supported by flat operations. Therefore, simply - // return true unless only the LDS address space is found. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - assert(AS != AMDGPUAS::REGION_ADDRESS); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - return true; - } - - return false; -} - -// This is a flat memory operation. Check to see if it has memory tokens for -// either LDS or FLAT. -bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { - assert(TII->isFLAT(MI)); - - // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. - if (!TII->usesLGKM_CNT(MI)) - return false; - - // If in tgsplit mode then there can be no use of LDS. - if (ST->isTgSplitEnabled()) - return false; - - // If there are no memory operands then conservatively assume the flat - // operation may access LDS. - if (MI.memoperands_empty()) - return true; - - // See if any memory operand specifies an address space that involves LDS. - for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) - return true; - } - - return false; -} - bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const { - return (TII->isFLAT(MI) && mayAccessVMEMThroughFlat(MI)) || + return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) || (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode())); } -static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) { - auto Opc = Inst.getOpcode(); - return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || - Opc == AMDGPU::GLOBAL_WBINV; -} - // Return true if the next instruction is S_ENDPGM, following fallthrough // blocks if necessary. 
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It, @@ -2331,7 +2248,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { - if (isGFX12CacheInvOrWBInst(Inst)) { + if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); return; @@ -2341,14 +2258,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, int FlatASCount = 0; - if (mayAccessVMEMThroughFlat(Inst)) { + if (TII->mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; IsVMEMAccess = true; ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), Inst); } - if (mayAccessLDSThroughFlat(Inst)) { + if (TII->mayAccessLDSThroughFlat(Inst)) { ++FlatASCount; ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } @@ -2394,7 +2311,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); else ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); - } else if (asynchronouslyWritesSCC(Inst.getOpcode())) { + } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst); } else { switch (Inst.getOpcode()) { @@ -2413,7 +2330,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } } - if (!hasXcnt()) + if (!ST->hasWaitXCnt()) return; if (IsVMEMAccess) @@ -2516,12 +2433,6 @@ static bool isWaitInstr(MachineInstr &Inst) { counterTypeForInstr(Opcode).has_value(); } -bool SIInsertWaitcnts::asynchronouslyWritesSCC(unsigned Opcode) { - return Opcode == AMDGPU::S_BARRIER_LEAVE || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || - Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; -} - // Generate s_waitcnt instructions where needed. bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -2578,7 +2489,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, OldWaitcntInstr = nullptr; // Restore vccz if it's not known to be correct already. - bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst); + bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst); // Don't examine operands unless we need to track vccz correctness. if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { @@ -2701,7 +2612,7 @@ bool SIInsertWaitcnts::isPreheaderToFlush( bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { if (SIInstrInfo::isFLAT(MI)) - return mayAccessVMEMThroughFlat(MI); + return TII->mayAccessVMEMThroughFlat(MI); return SIInstrInfo::isVMEM(MI); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 044ea866342c2..56435a50c87ad 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4344,6 +4344,59 @@ bool SIInstrInfo::mayAccessScratchThroughFlat(const MachineInstr &MI) const { }); } +bool SIInstrInfo::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // All flat instructions use the VMEM counter except prefetch. + if (!usesVM_CNT(MI)) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access VMEM. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves VMEM. 
+ // Flat operations only supported FLAT, LOCAL (LDS), or address spaces + // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION + // (GDS) address space is not supported by flat operations. Therefore, simply + // return true unless only the LDS address space is found. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + assert(AS != AMDGPUAS::REGION_ADDRESS); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + return true; + } + + return false; +} + +bool SIInstrInfo::mayAccessLDSThroughFlat(const MachineInstr &MI) const { + assert(isFLAT(MI)); + + // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. + if (!usesLGKM_CNT(MI)) + return false; + + // If in tgsplit mode then there can be no use of LDS. + if (ST.isTgSplitEnabled()) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access LDS. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves LDS. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) + return true; + } + + return false; +} + bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { // Skip the full operand and register alias search modifiesRegister // does. There's only a handful of instructions that touch this, it's only an diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c2252afdbb064..a21089f8e0fcc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -688,6 +688,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// to not hit scratch. bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; + /// \returns true for FLAT instructions that can access VMEM. + bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; + + /// \returns true for FLAT instructions that can access LDS. 
+ bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; + static bool isBlockLoadStore(uint16_t Opcode) { switch (Opcode) { case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: @@ -748,6 +754,18 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD; } + static bool isSBarrierSCCWrite(unsigned Opcode) { + return Opcode == AMDGPU::S_BARRIER_LEAVE || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM || + Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0; + } + + static bool isCBranchVCCZRead(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && + !MI.getOperand(1).isUndef(); + } + static bool isWQM(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::WQM; } @@ -1010,6 +1028,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { Opcode == AMDGPU::DS_GWS_BARRIER; } + static bool isGFX12CacheInvOrWBInst(unsigned Opc) { + return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || + Opc == AMDGPU::GLOBAL_WBINV; + } + static bool isF16PseudoScalarTrans(unsigned Opcode) { return Opcode == AMDGPU::V_S_EXP_F16_e64 || Opcode == AMDGPU::V_S_LOG_F16_e64 || From 88c668d050aceb27c161f82474efa0004eced9b2 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Wed, 1 Oct 2025 10:53:32 +0200 Subject: [PATCH 349/878] [AMDGPU][SIInsertWaitCnts] De-duplicate code (NFC) (#161161) I'm reading through the pass over and over again to try and learn how it works. I noticed some code duplication here and there while doing that. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 64 ++++++++++----------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 ++ 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 91136fd85c545..3f9a1f492ace5 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1853,26 +1853,24 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, assert(!MI.isMetaInstruction()); AMDGPU::Waitcnt Wait; + const unsigned Opc = MI.getOpcode(); // FIXME: This should have already been handled by the memory legalizer. // Removing this currently doesn't affect any lit tests, but we need to // verify that nothing was relying on this. The number of buffer invalidates // being handled here should not be expanded. - if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL || - MI.getOpcode() == AMDGPU::BUFFER_GL0_INV || - MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) { + if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC || + Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV || + Opc == AMDGPU::BUFFER_GL1_INV) { Wait.LoadCnt = 0; } // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. 
- if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || - MI.getOpcode() == AMDGPU::SI_RETURN || - MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || - MI.getOpcode() == AMDGPU::S_SETPC_B64_return || + if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN || + Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN || + Opc == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } @@ -1884,8 +1882,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // send a message to explicitly release all VGPRs before the stores have // completed, but it is only safe to do this if there are no outstanding // scratch stores. - else if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) { if (!WCG->isOptNone() && (MI.getMF()->getInfo()->isDynamicVGPREnabled() || (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && @@ -1894,8 +1891,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. - else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || - MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && + else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) && ST->hasLegacyGeometry() && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) == AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) { @@ -1920,7 +1916,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // Wait for any pending GDS instruction to complete before any // "Always GDS" instruction. - if (TII->isAlwaysGDS(MI.getOpcode()) && ScoreBrackets.hasPendingGDS()) + if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS()) addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait()); if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { @@ -1946,7 +1942,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait); } } - } else if (MI.getOpcode() == AMDGPU::S_BARRIER_WAIT) { + } else if (Opc == AMDGPU::S_BARRIER_WAIT) { ScoreBrackets.tryClearSCCWriteEvent(&MI); } else { // FIXME: Should not be relying on memoperands. @@ -2061,8 +2057,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // // In all other cases, ensure safety by ensuring that there are no outstanding // memory operations. - if (MI.getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { + if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && + !ST->supportsBackOffBarrier()) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true)); } @@ -2146,19 +2142,19 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, } // XCnt may be already consumed by a load wait. - if (Wait.KmCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.XCnt != ~0u) { + if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) + Wait.XCnt = ~0u; - if (Wait.LoadCnt == 0 && Wait.XCnt != ~0u && - !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) - Wait.XCnt = ~0u; + if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) + Wait.XCnt = ~0u; - // Since the translation for VMEM addresses occur in-order, we can skip the - // XCnt if the current instruction is of VMEM type and has a memory dependency - // with another VMEM instruction in flight. 
- if (Wait.XCnt != ~0u && isVmemAccess(*It)) - Wait.XCnt = ~0u; + // Since the translation for VMEM addresses occur in-order, we can skip the + // XCnt if the current instruction is of VMEM type and has a memory + // dependency with another VMEM instruction in flight. + if (isVmemAccess(*It)) + Wait.XCnt = ~0u; + } if (WCG->createNewWaitcnt(Block, It, Wait)) Modified = true; @@ -2395,9 +2391,8 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE); if (!OldEventsHasSCCWrite) { PendingSCCWrite = Other.PendingSCCWrite; - } else { - if (PendingSCCWrite != Other.PendingSCCWrite) - PendingSCCWrite = nullptr; + } else if (PendingSCCWrite != Other.PendingSCCWrite) { + PendingSCCWrite = nullptr; } } } @@ -2635,11 +2630,10 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { if (isVMEMOrFlatVMEM(MI)) { - if (MI.mayLoad()) - HasVMemLoad = true; - if (MI.mayStore()) - HasVMemStore = true; + HasVMemLoad |= MI.mayLoad(); + HasVMemStore |= MI.mayStore(); } + for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a21089f8e0fcc..754f52a28e710 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1033,6 +1033,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { Opc == AMDGPU::GLOBAL_WBINV; } + static bool isGFX12CacheInvOrWBInst(unsigned Opc) { + return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || + Opc == AMDGPU::GLOBAL_WBINV; + } + static bool isF16PseudoScalarTrans(unsigned Opcode) { return Opcode == AMDGPU::V_S_EXP_F16_e64 || Opcode == AMDGPU::V_S_LOG_F16_e64 || From e86b3386fda91c17add6ae25710399b832b8cb9e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 1 Oct 2025 10:06:01 +0100 Subject: [PATCH 350/878] [DAGCombine] Support (shl %x, constant) in foldPartialReduceMLAMulOp. (#160663) Support shifts in foldPartialReduceMLAMulOp by treating (shl %x, %c) as (mul %x, (shl 1, %c)). PR: https://github.com/llvm/llvm-project/pull/160663 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++++- .../neon-partial-reduce-dot-product.ll | 86 +++++++++++++++---- 2 files changed, 88 insertions(+), 22 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 204e1f0c75e00..558c5a0390228 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12994,13 +12994,31 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt C; - if (Op1->getOpcode() != ISD::MUL || - !ISD::isConstantSplatVector(Op2.getNode(), C) || !C.isOne()) + unsigned Opc = Op1->getOpcode(); + if (Opc != ISD::MUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); SDValue RHS = Op1->getOperand(1); + + // Try to treat (shl %a, %c) as (mul %a, (1 << %c)) for constant %c. 
+ if (Opc == ISD::SHL) { + APInt C; + if (!ISD::isConstantSplatVector(RHS.getNode(), C)) + return SDValue(); + + RHS = + DAG.getSplatVector(RHS.getValueType(), DL, + DAG.getConstant(APInt(C.getBitWidth(), 1).shl(C), DL, + RHS.getValueType().getScalarType())); + Opc = ISD::MUL; + } + + APInt C; + if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || + !C.isOne()) + return SDValue(); + unsigned LHSOpcode = LHS->getOpcode(); if (!ISD::isExtOpcode(LHSOpcode)) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index d60c870003e4d..428750740fc56 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -1257,21 +1257,55 @@ entry: } define <4 x i32> @partial_reduce_shl_sext_const_rhs6(<16 x i8> %l, <4 x i32> %part) { -; CHECK-COMMON-LABEL: partial_reduce_shl_sext_const_rhs6: +; CHECK-NODOT-LABEL: partial_reduce_shl_sext_const_rhs6: +; CHECK-NODOT: // %bb.0: +; CHECK-NODOT-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-NODOT-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-NODOT-NEXT: sshll v3.4s, v0.4h, #6 +; CHECK-NODOT-NEXT: sshll2 v4.4s, v2.8h, #6 +; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #6 +; CHECK-NODOT-NEXT: sshll2 v0.4s, v0.8h, #6 +; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NODOT-NEXT: add v2.4s, v4.4s, v3.4s +; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NODOT-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: partial_reduce_shl_sext_const_rhs6: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: movi v2.16b, #64 +; CHECK-DOT-NEXT: sdot v1.4s, v0.16b, v2.16b +; CHECK-DOT-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: partial_reduce_shl_sext_const_rhs6: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v2.16b, #64 +; CHECK-DOT-I8MM-NEXT: sdot v1.4s, v0.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: ret + %ext = sext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, splat (i32 6) + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define <4 x i32> @partial_reduce_shl_sext_const_rhs7(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_sext_const_rhs7: ; CHECK-COMMON: // %bb.0: ; CHECK-COMMON-NEXT: sshll v2.8h, v0.8b, #0 ; CHECK-COMMON-NEXT: sshll2 v0.8h, v0.16b, #0 -; CHECK-COMMON-NEXT: sshll v3.4s, v0.4h, #6 -; CHECK-COMMON-NEXT: sshll2 v4.4s, v2.8h, #6 -; CHECK-COMMON-NEXT: sshll v2.4s, v2.4h, #6 -; CHECK-COMMON-NEXT: sshll2 v0.4s, v0.8h, #6 +; CHECK-COMMON-NEXT: sshll v3.4s, v0.4h, #7 +; CHECK-COMMON-NEXT: sshll2 v4.4s, v2.8h, #7 +; CHECK-COMMON-NEXT: sshll v2.4s, v2.4h, #7 +; CHECK-COMMON-NEXT: sshll2 v0.4s, v0.8h, #7 ; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-COMMON-NEXT: add v2.4s, v4.4s, v3.4s ; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-COMMON-NEXT: ret %ext = sext <16 x i8> %l to <16 x i32> - %shift = shl nsw <16 x i32> %ext, splat (i32 6) + %shift = shl nsw <16 x i32> %ext, splat (i32 7) %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) ret <4 x i32> %red } @@ -1331,19 +1365,33 @@ define <4 x i32> @partial_reduce_shl_sext_non_const_rhs(<16 x i8> %l, <4 x i32> } define <4 x i32> @partial_reduce_shl_zext_const_rhs6(<16 x i8> %l, <4 x i32> %part) { -; 
CHECK-COMMON-LABEL: partial_reduce_shl_zext_const_rhs6: -; CHECK-COMMON: // %bb.0: -; CHECK-COMMON-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-COMMON-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-COMMON-NEXT: ushll v3.4s, v0.4h, #6 -; CHECK-COMMON-NEXT: ushll2 v4.4s, v2.8h, #6 -; CHECK-COMMON-NEXT: ushll v2.4s, v2.4h, #6 -; CHECK-COMMON-NEXT: ushll2 v0.4s, v0.8h, #6 -; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-COMMON-NEXT: add v2.4s, v4.4s, v3.4s -; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-COMMON-NEXT: ret +; CHECK-NODOT-LABEL: partial_reduce_shl_zext_const_rhs6: +; CHECK-NODOT: // %bb.0: +; CHECK-NODOT-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-NODOT-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-NODOT-NEXT: ushll v3.4s, v0.4h, #6 +; CHECK-NODOT-NEXT: ushll2 v4.4s, v2.8h, #6 +; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #6 +; CHECK-NODOT-NEXT: ushll2 v0.4s, v0.8h, #6 +; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NODOT-NEXT: add v2.4s, v4.4s, v3.4s +; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NODOT-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NODOT-NEXT: ret +; +; CHECK-DOT-LABEL: partial_reduce_shl_zext_const_rhs6: +; CHECK-DOT: // %bb.0: +; CHECK-DOT-NEXT: movi v2.16b, #64 +; CHECK-DOT-NEXT: udot v1.4s, v0.16b, v2.16b +; CHECK-DOT-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-NEXT: ret +; +; CHECK-DOT-I8MM-LABEL: partial_reduce_shl_zext_const_rhs6: +; CHECK-DOT-I8MM: // %bb.0: +; CHECK-DOT-I8MM-NEXT: movi v2.16b, #64 +; CHECK-DOT-I8MM-NEXT: udot v1.4s, v0.16b, v2.16b +; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b +; CHECK-DOT-I8MM-NEXT: ret %ext = zext <16 x i8> %l to <16 x i32> %shift = shl nsw <16 x i32> %ext, splat (i32 6) %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) From da1eabd238d27c19ecc05580dff6361e5a0190d4 Mon Sep 17 00:00:00 2001 From: pvanhout Date: Wed, 1 Oct 2025 11:08:29 +0200 Subject: [PATCH 351/878] [AMDGPU] Remove duplicate definition of isGFX12CacheInvOrWBInst --- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 754f52a28e710..a21089f8e0fcc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1033,11 +1033,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { Opc == AMDGPU::GLOBAL_WBINV; } - static bool isGFX12CacheInvOrWBInst(unsigned Opc) { - return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || - Opc == AMDGPU::GLOBAL_WBINV; - } - static bool isF16PseudoScalarTrans(unsigned Opcode) { return Opcode == AMDGPU::V_S_EXP_F16_e64 || Opcode == AMDGPU::V_S_LOG_F16_e64 || From 8c2cece14deaaa175bc665e20228280fbe6d019d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 1 Oct 2025 11:06:36 +0100 Subject: [PATCH 352/878] Fix MSVC "result of 32-bit shift implicitly converted to 64 bits" warning. NFC. 
(#161496)
---
 llvm/lib/CAS/OnDiskTrieRawHashMap.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
index 9b382dd749ea5..940389336ce22 100644
--- a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
+++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp
@@ -114,7 +114,7 @@ class SubtrieHandle {
   using SlotT = std::atomic<int64_t>;
 
   static int64_t getSlotsSize(uint32_t NumBits) {
-    return sizeof(int64_t) * (1u << NumBits);
+    return sizeof(int64_t) * (1ull << NumBits);
   }
 
   static int64_t getSize(uint32_t NumBits) {
@@ -191,7 +191,8 @@ class SubtrieHandle {
   MutableArrayRef<SlotT> Slots;
 
   static MutableArrayRef<SlotT> getSlots(Header &H) {
-    return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1), 1u << H.NumBits);
+    return MutableArrayRef(reinterpret_cast<SlotT *>(&H + 1),
+                           1ull << H.NumBits);
   }
 };
 
From 5c50bdcea3977672d1183ee69cb840498f2fcf15 Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Wed, 1 Oct 2025 10:29:21 +0100
Subject: [PATCH 353/878] [lldb][Mangled][NFC] Remove redundant const-qualifier
 on llvm::StringRef argument

---
 lldb/include/lldb/Core/Mangled.h | 2 +-
 lldb/source/Core/Mangled.cpp     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/include/lldb/Core/Mangled.h b/lldb/include/lldb/Core/Mangled.h
index 47f1c6a8d80b7..665accb3119e3 100644
--- a/lldb/include/lldb/Core/Mangled.h
+++ b/lldb/include/lldb/Core/Mangled.h
@@ -251,7 +251,7 @@ class Mangled {
   /// \return
   ///     eManglingSchemeNone if no known mangling scheme could be identified
   ///     for s, otherwise the enumerator for the mangling scheme detected.
-  static Mangled::ManglingScheme GetManglingScheme(llvm::StringRef const name);
+  static Mangled::ManglingScheme GetManglingScheme(llvm::StringRef name);
 
   static bool IsMangledName(llvm::StringRef name);
 
diff --git a/lldb/source/Core/Mangled.cpp b/lldb/source/Core/Mangled.cpp
index 91b9c0007617d..0780846b0ed60 100644
--- a/lldb/source/Core/Mangled.cpp
+++ b/lldb/source/Core/Mangled.cpp
@@ -40,7 +40,7 @@ bool Mangled::IsMangledName(llvm::StringRef name) {
   return Mangled::GetManglingScheme(name) != Mangled::eManglingSchemeNone;
 }
 
-Mangled::ManglingScheme Mangled::GetManglingScheme(llvm::StringRef const name) {
+Mangled::ManglingScheme Mangled::GetManglingScheme(llvm::StringRef name) {
   if (name.empty())
     return Mangled::eManglingSchemeNone;
 
From a3f667bc08cb2f547ff2139c975dd684cf005885 Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve
Date: Wed, 1 Oct 2025 12:08:12 +0200
Subject: [PATCH 354/878] [AMDGPU][SIInsertWaitCnts] Remove redundant
 TII/TRI/MRI arguments (NFC) (#161357)

WaitcntBrackets already has a pointer to its SIInsertWaitcnts instance. With a
small change, it can directly access TII/TRI/MRI that way. This simplifies a
lot of call sites, which makes the code easier to follow.
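A representative call-site change from the diff below shows the effect:

  // Before: callers had to thread MRI and TRI through every query.
  RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);

  // After: WaitcntBrackets reaches TII/TRI/MRI through its Context pointer.
  RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op);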
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 121 +++++++++----------- 1 file changed, 54 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 3f9a1f492ace5..76bfce8c0f6f9 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -418,15 +418,14 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { class SIInsertWaitcnts { public: const GCNSubtarget *ST; + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; InstCounterType SmemAccessCounter; InstCounterType MaxCounter; const unsigned *WaitEventMaskForInst; private: - const SIInstrInfo *TII = nullptr; - const SIRegisterInfo *TRI = nullptr; - const MachineRegisterInfo *MRI = nullptr; - DenseMap SLoadAddresses; DenseMap PreheadersToFlush; MachineLoopInfo *MLI; @@ -631,8 +630,6 @@ class WaitcntBrackets { bool merge(const WaitcntBrackets &Other); RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, const MachineOperand &Op) const; bool counterOutOfOrder(InstCounterType T) const; @@ -650,9 +647,7 @@ class WaitcntBrackets { void applyWaitcnt(const AMDGPU::Waitcnt &Wait); void applyWaitcnt(InstCounterType T, unsigned Count); void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); + void updateByEvent(WaitEventType E, MachineInstr &MI); unsigned hasPendingEvent() const { return PendingEvents; } unsigned hasPendingEvent(WaitEventType E) const { @@ -761,10 +756,8 @@ class WaitcntBrackets { void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, unsigned Score); - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); + void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op, + InstCounterType CntTy, unsigned Val); const SIInsertWaitcnts *Context; @@ -821,12 +814,13 @@ class SIInsertWaitcntsLegacy : public MachineFunctionPass { } // end anonymous namespace RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, const MachineOperand &Op) const { if (Op.getReg() == AMDGPU::SCC) return {SCC, SCC + 1}; + const SIRegisterInfo *TRI = Context->TRI; + const MachineRegisterInfo *MRI = Context->MRI; + if (!TRI->isInAllocatableClass(Op.getReg())) return {-1, -1}; @@ -891,11 +885,9 @@ void WaitcntBrackets::setScoreByInterval(RegInterval Interval, } void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, const MachineOperand &Op, InstCounterType CntTy, unsigned Score) { - RegInterval Interval = getRegInterval(MI, MRI, TRI, Op); + RegInterval Interval = getRegInterval(MI, Op); setScoreByInterval(Interval, CntTy, Score); } @@ -927,10 +919,7 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes( return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER); } -void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - WaitEventType E, MachineInstr &Inst) { +void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) { InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = 
getScoreUB(T); @@ -943,6 +932,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, PendingEvents |= 1 << E; setScoreUB(T, CurrScore); + const SIRegisterInfo *TRI = Context->TRI; + const MachineRegisterInfo *MRI = Context->MRI; + const SIInstrInfo *TII = Context->TII; + if (T == EXP_CNT) { // Put score on the source vgprs. If this is a store, just use those // specific register(s). @@ -950,59 +943,56 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // All GDS operations must protect their address register (same as // export.) if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr)) - setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore); if (Inst.mayStore()) { if (const auto *Data0 = TII->getNamedOperand(Inst, AMDGPU::OpName::data0)) - setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore); if (const auto *Data1 = TII->getNamedOperand(Inst, AMDGPU::OpName::data1)) - setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) && Inst.getOpcode() != AMDGPU::DS_APPEND && Inst.getOpcode() != AMDGPU::DS_CONSUME && Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) { for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); } } } else if (TII->isFLAT(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMIMG(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, - CurrScore); + setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isMTBUF(Inst)) { if (Inst.mayStore()) - setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, - CurrScore); + setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); } else if (TII->isMUBUF(Inst)) { if (Inst.mayStore()) { - setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT, - CurrScore); + setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore); } else if (SIInstrInfo::isAtomicRet(Inst)) { - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::data), EXP_CNT, CurrScore); } } else if (TII->isLDSDIR(Inst)) { // LDSDIR instructions attach the score to the destination. - setScoreByOperand(&Inst, TRI, MRI, + setScoreByOperand(&Inst, *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst), EXP_CNT, CurrScore); } else { @@ -1013,18 +1003,18 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // score. 
for (MachineOperand &DefMO : Inst.all_defs()) { if (TRI->isVGPR(*MRI, DefMO.getReg())) { - setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore); } } } for (const MachineOperand &Op : Inst.all_uses()) { if (TRI->isVectorRegister(*MRI, Op.getReg())) - setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore); + setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore); } } } else if (T == X_CNT) { for (const MachineOperand &Op : Inst.all_uses()) - setScoreByOperand(&Inst, TRI, MRI, Op, T, CurrScore); + setScoreByOperand(&Inst, Op, T, CurrScore); } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ { // Match the score to the destination registers. // @@ -1036,7 +1026,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, // Special cases where implicit register defs exists, such as M0 or VCC, // but none with memory instructions. for (const MachineOperand &Op : Inst.defs()) { - RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op); + RegInterval Interval = getRegInterval(&Inst, Op); if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) { if (Interval.first >= NUM_ALL_VGPRS) continue; @@ -1928,7 +1918,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0); if (CallAddrOp.isReg()) { RegInterval CallAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp); + ScoreBrackets.getRegInterval(&MI, CallAddrOp); ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval, Wait); @@ -1936,7 +1926,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (const auto *RtnAddrOp = TII->getNamedOperand(MI, AMDGPU::OpName::dst)) { RegInterval RtnAddrOpInterval = - ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp); + ScoreBrackets.getRegInterval(&MI, *RtnAddrOp); ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval, Wait); @@ -2000,7 +1990,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) continue; - RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op); + RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op); const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg()); if (IsVGPR) { @@ -2237,16 +2227,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { if (TII->isAlwaysGDS(Inst.getOpcode()) || TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); - ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); + ScoreBrackets->updateByEvent(GDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst); ScoreBrackets->setPendingGDS(); } else { - ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), - Inst); + ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); return; } @@ -2257,13 +2246,12 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, if (TII->mayAccessVMEMThroughFlat(Inst)) { ++FlatASCount; IsVMEMAccess = true; - ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), - Inst); + ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); } if 
(TII->mayAccessLDSThroughFlat(Inst)) { ++FlatASCount; - ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(LDS_ACCESS, Inst); } // This is a flat memory operation that access both VMEM and LDS, so note it @@ -2274,16 +2262,15 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else if (SIInstrInfo::isVMEM(Inst) && !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) { IsVMEMAccess = true; - ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst), - Inst); + ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst); if (ST->vmemWriteNeedsExpWaitcnt() && (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); + ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst); } } else if (TII->isSMRD(Inst)) { IsSMEMAccess = true; - ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst); } else if (Inst.isCall()) { if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything @@ -2295,33 +2282,33 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); } } else if (SIInstrInfo::isLDSDIR(Inst)) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst); + ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst); } else if (TII->isVINTERP(Inst)) { int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm(); ScoreBrackets->applyWaitcnt(EXP_CNT, Imm); } else if (SIInstrInfo::isEXP(Inst)) { unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm(); if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31) - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst); + ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst); else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST) - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst); + ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst); else - ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst); + ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst); } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) { - ScoreBrackets->updateByEvent(TII, TRI, MRI, SCC_WRITE, Inst); + ScoreBrackets->updateByEvent(SCC_WRITE, Inst); } else { switch (Inst.getOpcode()) { case AMDGPU::S_SENDMSG: case AMDGPU::S_SENDMSG_RTN_B32: case AMDGPU::S_SENDMSG_RTN_B64: case AMDGPU::S_SENDMSGHALT: - ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst); + ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst); break; case AMDGPU::S_MEMTIME: case AMDGPU::S_MEMREALTIME: case AMDGPU::S_GET_BARRIER_STATE_M0: case AMDGPU::S_GET_BARRIER_STATE_IMM: - ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); + ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst); break; } } @@ -2330,10 +2317,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, return; if (IsVMEMAccess) - ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_GROUP, Inst); + ScoreBrackets->updateByEvent(VMEM_GROUP, Inst); if (IsSMEMAccess) - ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_GROUP, Inst); + ScoreBrackets->updateByEvent(SMEM_GROUP, Inst); } bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, @@ -2637,7 +2624,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (const MachineOperand &Op : MI.all_uses()) { if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; - RegInterval Interval = 
Brackets.getRegInterval(&MI, MRI, TRI, Op);
+        RegInterval Interval = Brackets.getRegInterval(&MI, Op);
         // Vgpr use
         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
           // If we find a register that is loaded inside the loop, 1. and 2.
@@ -2662,7 +2649,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
     // VMem load vgpr def
     if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
       for (const MachineOperand &Op : MI.all_defs()) {
-        RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
+        RegInterval Interval = Brackets.getRegInterval(&MI, Op);
         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
           // If we find a register that is loaded inside the loop, 1. and 2.
           // are invalidated and we can exit.

From 0a0d4979935cc13ecafdb8c9b00dd74779651781 Mon Sep 17 00:00:00 2001
From: Timur Golubovich
Date: Wed, 1 Oct 2025 13:20:58 +0300
Subject: [PATCH 355/878] [lldb][TypeSystemClang] Added unique builtin types
 for __bf16 and _Float16 (#157674)

While debugging an application with __bf16 and _Float16 float types, it was
discovered that lldb creates the same CompilerType for them. This can cause
an infinite recursion error, if one tries to create two struct
specializations with these types and then inherit one specialization from
another.
---
 .../TypeSystem/Clang/TypeSystemClang.cpp      |  8 +++++++
 .../floating-types-specialization/Makefile    |  3 +++
 .../TestCppFloatingTypesSpecialization.py     | 22 +++++++++++++++++++
 .../floating-types-specialization/main.cpp    | 11 ++++++++++
 .../TestCppTemplateArguments.py               |  2 +-
 5 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 lldb/test/API/lang/cpp/floating-types-specialization/Makefile
 create mode 100644 lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py
 create mode 100644 lldb/test/API/lang/cpp/floating-types-specialization/main.cpp

diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index a5aaf1f9cb5af..21c265ede0bc5 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -960,6 +960,12 @@ CompilerType TypeSystemClang::GetBuiltinTypeForDWARFEncodingAndBitSize(
   if (type_name == "long double" &&
       QualTypeMatchesBitSize(bit_size, ast, ast.LongDoubleTy))
     return GetType(ast.LongDoubleTy);
+  if (type_name == "__bf16" &&
+      QualTypeMatchesBitSize(bit_size, ast, ast.BFloat16Ty))
+    return GetType(ast.BFloat16Ty);
+  if (type_name == "_Float16" &&
+      QualTypeMatchesBitSize(bit_size, ast, ast.Float16Ty))
+    return GetType(ast.Float16Ty);
   // As Rust currently uses `TypeSystemClang`, match `f128` here as well so it
   // doesn't get misinterpreted as `long double` on targets where they are
   // the same size but different formats.
@@ -1792,6 +1798,8 @@ bool TypeSystemClang::RecordHasFields(const RecordDecl *record_decl) { for (base_class = cxx_record_decl->bases_begin(), base_class_end = cxx_record_decl->bases_end(); base_class != base_class_end; ++base_class) { + assert(record_decl != base_class->getType()->getAsCXXRecordDecl() && + "Base can't inherit from itself."); if (RecordHasFields(base_class->getType()->getAsCXXRecordDecl())) return true; } diff --git a/lldb/test/API/lang/cpp/floating-types-specialization/Makefile b/lldb/test/API/lang/cpp/floating-types-specialization/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/lang/cpp/floating-types-specialization/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py b/lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py new file mode 100644 index 0000000000000..9564a0bc31809 --- /dev/null +++ b/lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py @@ -0,0 +1,22 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestCase(TestBase): + def test(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "// break here", lldb.SBFileSpec("main.cpp", False) + ) + + self.expect_expr("f0", result_type="Foo<__bf16>") + self.expect_expr("f1", result_type="Foo<__fp16>") + + # Test sizeof to ensure while computing layout we don't do + # infinite recursion. + v = self.frame().EvaluateExpression("sizeof(f0)") + self.assertEqual(v.GetValueAsUnsigned() > 0, True) + v = self.frame().EvaluateExpression("sizeof(f1)") + self.assertEqual(v.GetValueAsUnsigned() > 0, True) diff --git a/lldb/test/API/lang/cpp/floating-types-specialization/main.cpp b/lldb/test/API/lang/cpp/floating-types-specialization/main.cpp new file mode 100644 index 0000000000000..e3e8a3767fef8 --- /dev/null +++ b/lldb/test/API/lang/cpp/floating-types-specialization/main.cpp @@ -0,0 +1,11 @@ +template struct Foo; + +template <> struct Foo<__bf16> {}; + +template <> struct Foo<_Float16> : Foo<__bf16> {}; + +int main() { + Foo<__bf16> f0; + Foo<_Float16> f1; + return 0; // break here +} diff --git a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py index eac7b5ef1099a..f26d382bf8582 100644 --- a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py +++ b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py @@ -82,7 +82,7 @@ def test(self): value = self.expect_expr("temp7", result_type="Foo<__fp16, __fp16>") self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) - value = self.expect_expr("temp8", result_type="Foo<__fp16, __fp16>") + value = self.expect_expr("temp8", result_type="Foo<__bf16, __bf16>") self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) value = self.expect_expr("temp9", result_type="Bar") From e95ed3352e5001a038d5fbf2989f2161fc526384 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 08:14:16 -0700 Subject: [PATCH 356/878] [MLIR] Apply clang-tidy fixes for misc-use-internal-linkage in OpenMPDialect.cpp (NFC) --- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp 
b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index f01ad05a778ec..a173cf13328cd 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -182,7 +182,7 @@ static ParseResult parseClauseAttr(AsmParser &parser, ClauseAttr &attr) { } template -void printClauseAttr(OpAsmPrinter &p, Operation *op, ClauseAttr attr) { +static void printClauseAttr(OpAsmPrinter &p, Operation *op, ClauseAttr attr) { p << stringifyEnum(attr.getValue()); } @@ -1511,8 +1511,8 @@ static LogicalResult verifySynchronizationHint(Operation *op, uint64_t hint) { //===----------------------------------------------------------------------===// // Helper function to get bitwise AND of `value` and 'flag' -uint64_t mapTypeToBitFlag(uint64_t value, - llvm::omp::OpenMPOffloadMappingFlags flag) { +static uint64_t mapTypeToBitFlag(uint64_t value, + llvm::omp::OpenMPOffloadMappingFlags flag) { return value & llvm::to_underlying(flag); } From 55bd45852cde2eeb0132767f54f48589d87ce2ad Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 1 Oct 2025 14:03:29 +0300 Subject: [PATCH 357/878] [BOLT] Gadget scanner: optionally assume auth traps on failure (#139778) On AArch64 it is possible for an auth instruction to either return an invalid address value on failure (without FEAT_FPAC) or generate an error (with FEAT_FPAC). It thus may be possible to never emit explicit pointer checks, if the target CPU is known to support FEAT_FPAC. This commit implements an --auth-traps-on-failure command line option, which essentially makes "safe-to-dereference" and "trusted" register properties identical and disables scanning for authentication oracles completely. --- bolt/lib/Passes/PAuthGadgetScanner.cpp | 112 +++++++---- .../binary-analysis/AArch64/cmdline-args.test | 1 + .../AArch64/gs-pauth-authentication-oracles.s | 6 +- .../binary-analysis/AArch64/gs-pauth-calls.s | 5 +- .../AArch64/gs-pauth-debug-output.s | 177 ++++++++++------- .../AArch64/gs-pauth-signing-oracles.s | 54 ++--- .../AArch64/gs-pauth-tail-calls.s | 184 +++++++++--------- 7 files changed, 314 insertions(+), 225 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index cfe4b6ba785e4..9d22d3c8c6cd7 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -14,6 +14,7 @@ #include "bolt/Passes/PAuthGadgetScanner.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Passes/DataflowAnalysis.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/MC/MCInst.h" @@ -26,6 +27,11 @@ namespace llvm { namespace bolt { namespace PAuthGadgetScanner { +static cl::opt AuthTrapsOnFailure( + "auth-traps-on-failure", + cl::desc("Assume authentication instructions always trap on failure"), + cl::cat(opts::BinaryAnalysisCategory)); + [[maybe_unused]] static void traceInst(const BinaryContext &BC, StringRef Label, const MCInst &MI) { dbgs() << " " << Label << ": "; @@ -364,6 +370,34 @@ class SrcSafetyAnalysis { return Clobbered; } + std::optional getRegMadeTrustedByChecking(const MCInst &Inst, + SrcState Cur) const { + // This function cannot return multiple registers. This is never the case + // on AArch64. 
+    std::optional<MCPhysReg> RegCheckedByInst =
+        BC.MIB->getAuthCheckedReg(Inst, /*MayOverwrite=*/false);
+    if (RegCheckedByInst && Cur.SafeToDerefRegs[*RegCheckedByInst])
+      return *RegCheckedByInst;
+
+    auto It = CheckerSequenceInfo.find(&Inst);
+    if (It == CheckerSequenceInfo.end())
+      return std::nullopt;
+
+    MCPhysReg RegCheckedBySequence = It->second.first;
+    const MCInst *FirstCheckerInst = It->second.second;
+
+    // FirstCheckerInst should belong to the same basic block (see the
+    // assertion in DataflowSrcSafetyAnalysis::run()), meaning it was
+    // deterministically processed a few steps before this instruction.
+    const SrcState &StateBeforeChecker = getStateBefore(*FirstCheckerInst);
+
+    // The sequence checks the register, but it should be authenticated before.
+    if (!StateBeforeChecker.SafeToDerefRegs[RegCheckedBySequence])
+      return std::nullopt;
+
+    return RegCheckedBySequence;
+  }
+
   // Returns all registers that can be treated as if they are written by an
   // authentication instruction.
   SmallVector<MCPhysReg> getRegsMadeSafeToDeref(const MCInst &Point,
@@ -386,18 +420,38 @@ class SrcSafetyAnalysis {
       Regs.push_back(DstAndSrc->first);
     }
 
+    // Make sure explicit checker sequence keeps register safe-to-dereference
+    // when the register would be clobbered according to the regular rules:
+    //
+    //    ; LR is safe to dereference here
+    //    mov   x16, x30  ; start of the sequence, LR is s-t-d right before
+    //    xpaclri         ; clobbers LR, LR is not safe anymore
+    //    cmp   x30, x16
+    //    b.eq  1f        ; end of the sequence: LR is marked as trusted
+    //    brk   0x1234
+    //  1:
+    //    ; at this point LR would be marked as trusted,
+    //    ; but not safe-to-dereference
+    //
+    // or even just
+    //
+    //    ; X1 is safe to dereference here
+    //    ldr x0, [x1, #8]!
+    //    ; X1 is trusted here, but it was clobbered due to address write-back
+    if (auto CheckedReg = getRegMadeTrustedByChecking(Point, Cur))
+      Regs.push_back(*CheckedReg);
+
     return Regs;
   }
 
   // Returns all registers made trusted by this instruction.
   SmallVector<MCPhysReg> getRegsMadeTrusted(const MCInst &Point,
                                             const SrcState &Cur) const {
+    assert(!AuthTrapsOnFailure && "Use getRegsMadeSafeToDeref instead");
     SmallVector<MCPhysReg> Regs;
 
     // An authenticated pointer can be checked, or
-    std::optional<MCPhysReg> CheckedReg =
-        BC.MIB->getAuthCheckedReg(Point, /*MayOverwrite=*/false);
-    if (CheckedReg && Cur.SafeToDerefRegs[*CheckedReg])
+    if (auto CheckedReg = getRegMadeTrustedByChecking(Point, Cur))
       Regs.push_back(*CheckedReg);
 
     // ... a pointer can be authenticated by an instruction that always checks
@@ -408,19 +462,6 @@ class SrcSafetyAnalysis {
     if (AutReg && IsChecked)
       Regs.push_back(*AutReg);
 
-    if (CheckerSequenceInfo.contains(&Point)) {
-      MCPhysReg CheckedReg;
-      const MCInst *FirstCheckerInst;
-      std::tie(CheckedReg, FirstCheckerInst) = CheckerSequenceInfo.at(&Point);
-
-      // FirstCheckerInst should belong to the same basic block (see the
-      // assertion in DataflowSrcSafetyAnalysis::run()), meaning it was
-      // deterministically processed a few steps before this instruction.
-      const SrcState &StateBeforeChecker = getStateBefore(*FirstCheckerInst);
-      if (StateBeforeChecker.SafeToDerefRegs[CheckedReg])
-        Regs.push_back(CheckedReg);
-    }
-
     // ... a safe address can be materialized, or
     if (auto NewAddrReg = BC.MIB->getMaterializedAddressRegForPtrAuth(Point))
       Regs.push_back(*NewAddrReg);
@@ -463,28 +504,11 @@ class SrcSafetyAnalysis {
     BitVector Clobbered = getClobberedRegs(Point);
     SmallVector<MCPhysReg> NewSafeToDerefRegs =
         getRegsMadeSafeToDeref(Point, Cur);
-    SmallVector<MCPhysReg> NewTrustedRegs = getRegsMadeTrusted(Point, Cur);
-
-    // Ideally, being trusted is a strictly stronger property than being
-    // safe-to-dereference. To simplify the computation of Next state, enforce
-    // this for NewSafeToDerefRegs and NewTrustedRegs. Additionally, this
-    // fixes the properly for "cumulative" register states in tricky cases
-    // like the following:
-    //
-    //    ; LR is safe to dereference here
-    //    mov   x16, x30  ; start of the sequence, LR is s-t-d right before
-    //    xpaclri         ; clobbers LR, LR is not safe anymore
-    //    cmp   x30, x16
-    //    b.eq  1f        ; end of the sequence: LR is marked as trusted
-    //    brk   0x1234
-    //  1:
-    //    ; at this point LR would be marked as trusted,
-    //    ; but not safe-to-dereference
-    //
-    for (auto TrustedReg : NewTrustedRegs) {
-      if (!is_contained(NewSafeToDerefRegs, TrustedReg))
-        NewSafeToDerefRegs.push_back(TrustedReg);
-    }
+    // If authentication instructions trap on failure, safe-to-dereference
+    // registers are always trusted.
+    SmallVector<MCPhysReg> NewTrustedRegs =
+        AuthTrapsOnFailure ? NewSafeToDerefRegs
+                           : getRegsMadeTrusted(Point, Cur);
 
     // Then, compute the state after this instruction is executed.
     SrcState Next = Cur;
@@ -521,6 +545,11 @@ class SrcSafetyAnalysis {
       dbgs() << ")\n";
     });
 
+    // Being trusted is a strictly stronger property than being
+    // safe-to-dereference.
+    assert(!Next.TrustedRegs.test(Next.SafeToDerefRegs) &&
+           "SafeToDerefRegs should contain all TrustedRegs");
+
     return Next;
   }
 
@@ -1130,6 +1159,11 @@ class DataflowDstSafetyAnalysis
   }
 
   void run() override {
+    // As long as DstSafetyAnalysis is only computed to detect authentication
+    // oracles, it is a waste of time to compute it when authentication
+    // instructions are known to always trap on failure.
+    assert(!AuthTrapsOnFailure &&
+           "DstSafetyAnalysis is useless with faulting auth");
     for (BinaryBasicBlock &BB : Func) {
       if (auto CheckerInfo = BC.MIB->getAuthCheckedReg(BB)) {
         LLVM_DEBUG({
@@ -1571,6 +1605,8 @@ void FunctionAnalysisContext::findUnsafeDefs(
     SmallVector<PartialReport<MCPhysReg>> &Reports) {
   if (PacRetGadgetsOnly)
     return;
+  if (AuthTrapsOnFailure)
+    return;
 
   auto Analysis = DstSafetyAnalysis::create(BF, AllocatorId, {});
   LLVM_DEBUG({ dbgs() << "Running dst register safety analysis...\n"; });
diff --git a/bolt/test/binary-analysis/AArch64/cmdline-args.test b/bolt/test/binary-analysis/AArch64/cmdline-args.test
index 3e70b2c0d3bb9..9660ad3bf80f7 100644
--- a/bolt/test/binary-analysis/AArch64/cmdline-args.test
+++ b/bolt/test/binary-analysis/AArch64/cmdline-args.test
@@ -33,6 +33,7 @@ HELP-NEXT: OPTIONS:
 HELP-EMPTY:
 HELP-NEXT: BinaryAnalysis options:
 HELP-EMPTY:
+HELP-NEXT: --auth-traps-on-failure - Assume authentication instructions always trap on failure
 HELP-NEXT: --scanners=<value> - which gadget scanners to run
 HELP-NEXT: =pacret - pac-ret: return address protection (subset of "pauth")
 HELP-NEXT: =pauth - All Pointer Authentication scanners
diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s
index f44ba21b9d484..9f580b66f47c7 100644
--- a/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s
+++ b/bolt/test/binary-analysis/AArch64/gs-pauth-authentication-oracles.s
@@ -1,6 +1,7 @@
 // RUN: %clang %cflags -march=armv8.3-a %s -o %t.exe
-// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s
-// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s
+// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s
+// RUN: llvm-bolt-binary-analysis --scanners=pauth --auth-traps-on-failure %t.exe 2>&1 | FileCheck -check-prefix=FPAC %s
+// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s
 
 // The detection of compiler-generated explicit pointer checks is tested in
 // gs-pauth-address-checks.s, for that reason only test here "dummy-load" and
 // detected per-instruction and per-BB.
// PACRET-NOT: authentication oracle found in function +// FPAC-NOT: authentication oracle found in function .text diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s b/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s index fb0bc7cff2377..5e88e105a33f0 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s @@ -1,6 +1,7 @@ // RUN: %clang %cflags -march=armv8.3-a %s -o %t.exe -// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s -// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s +// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth --auth-traps-on-failure %t.exe 2>&1 | FileCheck %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s // PACRET-NOT: non-protected call found in function diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s index b1cec7f92ad05..ee8521ff1f810 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s @@ -1,10 +1,14 @@ // REQUIRES: asserts // // RUN: %clang %cflags -march=armv8.3-a %s -o %t.exe -// RUN: llvm-bolt-binary-analysis --scanners=pacret -no-threads \ -// RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck %s -// RUN: llvm-bolt-binary-analysis --scanners=pauth -no-threads \ -// RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,PAUTH %s +// RUN: llvm-bolt-binary-analysis --scanners=pacret --no-threads \ +// RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,NOFPAC %s +// RUN: llvm-bolt-binary-analysis --scanners=pacret --no-threads --auth-traps-on-failure \ +// RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,FPAC %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth --no-threads \ +// RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,NOFPAC,AUTH-ORACLES,PAUTH %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth --no-threads --auth-traps-on-failure \ +// RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,FPAC,PAUTH %s // Check the debug output generated by PAuth gadget scanner to make sure the // that output is kept meaningful and to provide an overview of what happens @@ -61,30 +65,54 @@ simple: // CHECK-NEXT: State 1: src-state // CHECK-NEXT: State 2: src-state) // CHECK-NEXT: merged state: src-state -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( autiza x0, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( blr x0, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( hint #29, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: DataflowSrcSafetyAnalysis::Confluence( -// CHECK-NEXT: State 1: src-state -// CHECK-NEXT: State 2: src-state) -// CHECK-NEXT: merged state: src-state -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( autiza x0, src-state) -// CHECK-NEXT: .. 
result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( blr x0, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( hint #29, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) -// CHECK-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( autiza x0, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( blr x0, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( hint #29, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: DataflowSrcSafetyAnalysis::Confluence( +// NOFPAC-NEXT: State 1: src-state +// NOFPAC-NEXT: State 2: src-state) +// NOFPAC-NEXT: merged state: src-state +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( autiza x0, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( blr x0, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( hint #29, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// NOFPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) +// NOFPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( autiza x0, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( blr x0, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( hint #29, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: DataflowSrcSafetyAnalysis::Confluence( +// FPAC-NEXT: State 1: src-state +// FPAC-NEXT: State 2: src-state) +// FPAC-NEXT: merged state: src-state +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( autiza x0, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( blr x0, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( hint #29, src-state) +// FPAC-NEXT: .. result: (src-state) +// FPAC-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) +// FPAC-NEXT: .. result: (src-state) // CHECK-NEXT: After src register safety analysis: // CHECK-NEXT: Binary Function "simple" { // CHECK-NEXT: Number : 1 @@ -255,53 +283,56 @@ auth_oracle: // ... // CHECK: End of Function "auth_oracle" // ... -// PAUTH: Running dst register safety analysis... -// PAUTH-NEXT: DstSafetyAnalysis::ComputeNext( ret x30, dst-state) -// PAUTH-NEXT: .. result: (dst-state) -// PAUTH-NEXT: DstSafetyAnalysis::ComputeNext( autia x0, x1, dst-state) -// PAUTH-NEXT: .. 
result: (dst-state) -// PAUTH-NEXT: After dst register safety analysis: -// PAUTH-NEXT: Binary Function "auth_oracle" { -// PAUTH-NEXT: Number : 4 -// PAUTH-NEXT: State : CFG constructed +// FPAC-NOT: Running dst register safety analysis +// FPAC-NOT: DstSafetyAnalysis::ComputeNext +// FPAC-NOT: {{.*dst-state.*}} +// AUTH-ORACLES: Running dst register safety analysis... +// AUTH-ORACLES-NEXT: DstSafetyAnalysis::ComputeNext( ret x30, dst-state) +// AUTH-ORACLES-NEXT: .. result: (dst-state) +// AUTH-ORACLES-NEXT: DstSafetyAnalysis::ComputeNext( autia x0, x1, dst-state) +// AUTH-ORACLES-NEXT: .. result: (dst-state) +// AUTH-ORACLES-NEXT: After dst register safety analysis: +// AUTH-ORACLES-NEXT: Binary Function "auth_oracle" { +// AUTH-ORACLES-NEXT: Number : 4 +// AUTH-ORACLES-NEXT: State : CFG constructed // ... -// PAUTH: BB Layout : [[BB0]] -// PAUTH-NEXT: } -// PAUTH-NEXT: [[BB0]] (2 instructions, align : 1) -// PAUTH-NEXT: Entry Point -// PAUTH-NEXT: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state -// PAUTH-NEXT: 00000004: ret # DataflowDstSafetyAnalysis: dst-state -// PAUTH-EMPTY: -// PAUTH-NEXT: DWARF CFI Instructions: -// PAUTH-NEXT: -// PAUTH-NEXT: End of Function "auth_oracle" -// PAUTH-EMPTY: -// PAUTH-NEXT: Found auth inst: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state -// PAUTH-NEXT: Authenticated reg: X0 -// PAUTH-NEXT: safe output registers: LR W30 W30_HI{{[ \t]*$}} -// PAUTH-EMPTY: -// PAUTH-NEXT: Running detailed dst register safety analysis... -// PAUTH-NEXT: DstSafetyAnalysis::ComputeNext( ret x30, dst-state) -// PAUTH-NEXT: .. result: (dst-state) -// PAUTH-NEXT: DstSafetyAnalysis::ComputeNext( autia x0, x1, dst-state) -// PAUTH-NEXT: .. result: (dst-state) -// PAUTH-NEXT: After detailed dst register safety analysis: -// PAUTH-NEXT: Binary Function "auth_oracle" { -// PAUTH-NEXT: Number : 4 -// PAUTH-NEXT: State : CFG constructed +// AUTH-ORACLES: BB Layout : [[BB0]] +// AUTH-ORACLES-NEXT: } +// AUTH-ORACLES-NEXT: [[BB0]] (2 instructions, align : 1) +// AUTH-ORACLES-NEXT: Entry Point +// AUTH-ORACLES-NEXT: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES-NEXT: 00000004: ret # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES-EMPTY: +// AUTH-ORACLES-NEXT: DWARF CFI Instructions: +// AUTH-ORACLES-NEXT: +// AUTH-ORACLES-NEXT: End of Function "auth_oracle" +// AUTH-ORACLES-EMPTY: +// AUTH-ORACLES-NEXT: Found auth inst: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES-NEXT: Authenticated reg: X0 +// AUTH-ORACLES-NEXT: safe output registers: LR W30 W30_HI{{[ \t]*$}} +// AUTH-ORACLES-EMPTY: +// AUTH-ORACLES-NEXT: Running detailed dst register safety analysis... +// AUTH-ORACLES-NEXT: DstSafetyAnalysis::ComputeNext( ret x30, dst-state) +// AUTH-ORACLES-NEXT: .. result: (dst-state) +// AUTH-ORACLES-NEXT: DstSafetyAnalysis::ComputeNext( autia x0, x1, dst-state) +// AUTH-ORACLES-NEXT: .. result: (dst-state) +// AUTH-ORACLES-NEXT: After detailed dst register safety analysis: +// AUTH-ORACLES-NEXT: Binary Function "auth_oracle" { +// AUTH-ORACLES-NEXT: Number : 4 +// AUTH-ORACLES-NEXT: State : CFG constructed // ... 
-// PAUTH: BB Layout : [[BB0]] -// PAUTH-NEXT: } -// PAUTH-NEXT: [[BB0]] (2 instructions, align : 1) -// PAUTH-NEXT: Entry Point -// PAUTH-NEXT: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state -// PAUTH-NEXT: 00000004: ret # DataflowDstSafetyAnalysis: dst-state -// PAUTH-EMPTY: -// PAUTH-NEXT: DWARF CFI Instructions: -// PAUTH-NEXT: -// PAUTH-NEXT: End of Function "auth_oracle" -// PAUTH-EMPTY: -// PAUTH-NEXT: Attaching leakage info to: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES: BB Layout : [[BB0]] +// AUTH-ORACLES-NEXT: } +// AUTH-ORACLES-NEXT: [[BB0]] (2 instructions, align : 1) +// AUTH-ORACLES-NEXT: Entry Point +// AUTH-ORACLES-NEXT: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES-NEXT: 00000004: ret # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES-EMPTY: +// AUTH-ORACLES-NEXT: DWARF CFI Instructions: +// AUTH-ORACLES-NEXT: +// AUTH-ORACLES-NEXT: End of Function "auth_oracle" +// AUTH-ORACLES-EMPTY: +// AUTH-ORACLES-NEXT: Attaching leakage info to: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state // Gadget scanner should not crash on CFI instructions, including when debug-printing them. // Note that the particular debug output is not checked, but BOLT should be diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s b/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s index 4d4bb7b0fb251..7d908f234d852 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-signing-oracles.s @@ -1,6 +1,7 @@ // RUN: %clang %cflags -march=armv8.3-a+pauth-lr -Wl,--no-relax %s -o %t.exe -// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s -// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s +// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,NOFPAC %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth --auth-traps-on-failure %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,FPAC %s // The detection of compiler-generated explicit pointer checks is tested in // gs-pauth-address-checks.s, for that reason only test here "dummy-load" and @@ -66,9 +67,10 @@ good_sign_auted_checked_brk: .globl bad_sign_authed_unchecked .type bad_sign_authed_unchecked,@function bad_sign_authed_unchecked: -// CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_sign_authed_unchecked, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// FPAC-NOT: bad_sign_authed_unchecked +// NOFPAC-LABEL: GS-PAUTH: signing oracle found in function bad_sign_authed_unchecked, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: autda x0, x2 pacda x0, x1 ret @@ -266,9 +268,10 @@ bad_call_between_checked_and_used: .globl bad_transition_check_then_auth .type bad_transition_check_then_auth,@function bad_transition_check_then_auth: -// CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_transition_check_then_auth, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 -// CHECK-NEXT: The 0 
instructions that write to the affected registers after any authentication are: +// FPAC-NOT: bad_transition_check_then_auth +// NOFPAC-LABEL: GS-PAUTH: signing oracle found in function bad_transition_check_then_auth, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: ldr x2, [x0] autda x0, x2 pacda x0, x1 @@ -278,9 +281,10 @@ bad_transition_check_then_auth: .globl bad_transition_auth_then_auth .type bad_transition_auth_then_auth,@function bad_transition_auth_then_auth: -// CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_transition_auth_then_auth, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// FPAC-NOT: bad_transition_auth_then_auth +// NOFPAC-LABEL: GS-PAUTH: signing oracle found in function bad_transition_auth_then_auth, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: autda x0, x2 autda x0, x2 pacda x0, x1 @@ -363,9 +367,10 @@ good_sign_auted_checked_brk_multi_bb: .globl bad_sign_authed_unchecked_multi_bb .type bad_sign_authed_unchecked_multi_bb,@function bad_sign_authed_unchecked_multi_bb: -// CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_sign_authed_unchecked_multi_bb, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// FPAC-NOT: bad_sign_authed_unchecked_multi_bb +// NOFPAC-LABEL: GS-PAUTH: signing oracle found in function bad_sign_authed_unchecked_multi_bb, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: autda x0, x2 cbz x3, 1f ldr x2, [x0] @@ -534,9 +539,10 @@ good_sign_auted_checked_ldr_nocfg: .globl bad_sign_authed_unchecked_nocfg .type bad_sign_authed_unchecked_nocfg,@function bad_sign_authed_unchecked_nocfg: -// CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_sign_authed_unchecked_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// FPAC-NOT: bad_sign_authed_unchecked_nocfg +// NOFPAC-LABEL: GS-PAUTH: signing oracle found in function bad_sign_authed_unchecked_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: adr x3, 1f br x3 1: @@ -640,9 +646,10 @@ bad_clobber_between_checked_and_used_nocfg: .globl bad_transition_check_then_auth_nocfg .type bad_transition_check_then_auth_nocfg,@function bad_transition_check_then_auth_nocfg: -// CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_transition_check_then_auth_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// FPAC-NOT: bad_transition_check_then_auth_nocfg +// NOFPAC-LABEL: GS-PAUTH: signing oracle found in function 
bad_transition_check_then_auth_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: adr x3, 1f br x3 1: @@ -655,9 +662,10 @@ bad_transition_check_then_auth_nocfg: .globl bad_transition_auth_then_auth_nocfg .type bad_transition_auth_then_auth_nocfg,@function bad_transition_auth_then_auth_nocfg: -// CHECK-LABEL: GS-PAUTH: signing oracle found in function bad_transition_auth_then_auth_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// FPAC-NOT: bad_transition_auth_then_auth_nocfg +// NOFPAC-LABEL: GS-PAUTH: signing oracle found in function bad_transition_auth_then_auth_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: pacda x0, x1 +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: adr x3, 1f br x3 1: diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s b/bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s index 2d3c2f1a632ca..59b7d929275a9 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-tail-calls.s @@ -1,6 +1,7 @@ // RUN: %clang %cflags -Wl,--entry=_custom_start -march=armv8.3-a %s -o %t.exe -// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s -// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s +// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth --auth-traps-on-failure %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,FPAC %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,NOFPAC %s // PACRET-NOT: untrusted link register found before tail call @@ -89,19 +90,20 @@ bad_indirect_tailcall_not_auted: .globl bad_direct_tailcall_untrusted .type bad_direct_tailcall_untrusted,@function bad_direct_tailcall_untrusted: -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_direct_tailcall_untrusted, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: b callee # TAILCALL -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_direct_tailcall_untrusted, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 1 instructions that leak the affected registers are: -// CHECK-NEXT: 1. {{[0-9a-f]+}}: b callee # TAILCALL -// CHECK-NEXT: This happens in the following basic block: -// CHECK-NEXT: {{[0-9a-f]+}}: paciasp -// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! 
-// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 -// CHECK-NEXT: {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: {{[0-9a-f]+}}: b callee # TAILCALL +// FPAC-NOT: bad_direct_tailcall_untrusted +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_direct_tailcall_untrusted, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: b callee # TAILCALL +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_direct_tailcall_untrusted, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 1 instructions that leak the affected registers are: +// NOFPAC-NEXT: 1. {{[0-9a-f]+}}: b callee # TAILCALL +// NOFPAC-NEXT: This happens in the following basic block: +// NOFPAC-NEXT: {{[0-9a-f]+}}: paciasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// NOFPAC-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// NOFPAC-NEXT: {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: b callee # TAILCALL paciasp stp x29, x30, [sp, #-0x10]! ldp x29, x30, [sp], #0x10 @@ -114,19 +116,20 @@ bad_direct_tailcall_untrusted: bad_plt_tailcall_untrusted: // FIXME: Calls via PLT are disassembled incorrectly. Nevertheless, they are // still detected as tail calls. -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_plt_tailcall_untrusted, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted # TAILCALL -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_plt_tailcall_untrusted, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 1 instructions that leak the affected registers are: -// CHECK-NEXT: 1. {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted # TAILCALL -// CHECK-NEXT: This happens in the following basic block: -// CHECK-NEXT: {{[0-9a-f]+}}: paciasp -// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! -// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 -// CHECK-NEXT: {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted # TAILCALL +// FPAC-NOT: bad_plt_tailcall_untrusted +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_plt_tailcall_untrusted, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted # TAILCALL +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_plt_tailcall_untrusted, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 1 instructions that leak the affected registers are: +// NOFPAC-NEXT: 1. {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted # TAILCALL +// NOFPAC-NEXT: This happens in the following basic block: +// NOFPAC-NEXT: {{[0-9a-f]+}}: paciasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// NOFPAC-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// NOFPAC-NEXT: {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted # TAILCALL paciasp stp x29, x30, [sp, #-0x10]! 
ldp x29, x30, [sp], #0x10 @@ -137,20 +140,21 @@ bad_plt_tailcall_untrusted: .globl bad_indirect_tailcall_untrusted .type bad_indirect_tailcall_untrusted,@function bad_indirect_tailcall_untrusted: -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_indirect_tailcall_untrusted, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # TAILCALL -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 1 instructions that leak the affected registers are: -// CHECK-NEXT: 1. {{[0-9a-f]+}}: br x0 # TAILCALL -// CHECK-NEXT: This happens in the following basic block: -// CHECK-NEXT: {{[0-9a-f]+}}: paciasp -// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! -// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 -// CHECK-NEXT: {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: {{[0-9a-f]+}}: autia x0, x1 -// CHECK-NEXT: {{[0-9a-f]+}}: br x0 # TAILCALL +// FPAC-NOT: bad_indirect_tailcall_untrusted +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_indirect_tailcall_untrusted, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # TAILCALL +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 1 instructions that leak the affected registers are: +// NOFPAC-NEXT: 1. {{[0-9a-f]+}}: br x0 # TAILCALL +// NOFPAC-NEXT: This happens in the following basic block: +// NOFPAC-NEXT: {{[0-9a-f]+}}: paciasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// NOFPAC-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// NOFPAC-NEXT: {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: autia x0, x1 +// NOFPAC-NEXT: {{[0-9a-f]+}}: br x0 # TAILCALL paciasp stp x29, x30, [sp, #-0x10]! ldp x29, x30, [sp], #0x10 @@ -251,13 +255,14 @@ bad_indirect_tailcall_not_auted_multi_bb: .globl bad_direct_tailcall_untrusted_multi_bb .type bad_direct_tailcall_untrusted_multi_bb,@function bad_direct_tailcall_untrusted_multi_bb: -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_direct_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: b callee # TAILCALL -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_direct_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 1 instructions that leak the affected registers are: -// CHECK-NEXT: 1. 
{{[0-9a-f]+}}: b callee # TAILCALL +// FPAC-NOT: bad_direct_tailcall_untrusted_multi_bb +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_direct_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: b callee # TAILCALL +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_direct_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 1 instructions that leak the affected registers are: +// NOFPAC-NEXT: 1. {{[0-9a-f]+}}: b callee # TAILCALL paciasp stp x29, x30, [sp, #-0x10]! ldp x29, x30, [sp], #0x10 @@ -271,12 +276,13 @@ bad_direct_tailcall_untrusted_multi_bb: .globl bad_indirect_tailcall_untrusted_multi_bb .type bad_indirect_tailcall_untrusted_multi_bb,@function bad_indirect_tailcall_untrusted_multi_bb: -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_indirect_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # UNKNOWN CONTROL FLOW -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 0 instructions that leak the affected registers are: +// FPAC-NOT: bad_indirect_tailcall_untrusted_multi_bb +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_indirect_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # UNKNOWN CONTROL FLOW +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted_multi_bb, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 0 instructions that leak the affected registers are: paciasp stp x29, x30, [sp, #-0x10]! ldp x29, x30, [sp], #0x10 @@ -397,13 +403,14 @@ bad_indirect_tailcall_not_auted_nocfg: .globl bad_direct_tailcall_untrusted_nocfg .type bad_direct_tailcall_untrusted_nocfg,@function bad_direct_tailcall_untrusted_nocfg: -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_direct_tailcall_untrusted_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: b callee # TAILCALL -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_direct_tailcall_untrusted_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 1 instructions that leak the affected registers are: -// CHECK-NEXT: 1. 
{{[0-9a-f]+}}: b callee # TAILCALL +// FPAC-NOT: bad_direct_tailcall_untrusted_nocfg +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_direct_tailcall_untrusted_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: b callee # TAILCALL +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_direct_tailcall_untrusted_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 1 instructions that leak the affected registers are: +// NOFPAC-NEXT: 1. {{[0-9a-f]+}}: b callee # TAILCALL paciasp stp x29, x30, [sp, #-0x10]! adr x3, 1f @@ -419,13 +426,14 @@ bad_direct_tailcall_untrusted_nocfg: bad_plt_tailcall_untrusted_nocfg: // FIXME: Calls via PLT are disassembled incorrectly. Nevertheless, they are // still detected as tail calls. -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_plt_tailcall_untrusted_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted_nocfg # TAILCALL -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_plt_tailcall_untrusted_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 1 instructions that leak the affected registers are: -// CHECK-NEXT: 1. {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted_nocfg # TAILCALL +// FPAC-NOT: bad_plt_tailcall_untrusted_nocfg +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_plt_tailcall_untrusted_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted_nocfg # TAILCALL +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_plt_tailcall_untrusted_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 1 instructions that leak the affected registers are: +// NOFPAC-NEXT: 1. {{[0-9a-f]+}}: b bad_indirect_tailcall_untrusted_nocfg # TAILCALL paciasp stp x29, x30, [sp, #-0x10]! adr x3, 1f @@ -441,11 +449,12 @@ bad_plt_tailcall_untrusted_nocfg: bad_indirect_tailcall_untrusted_nocfg: // Known false negative: ignoring UNKNOWN CONTROL FLOW without CFG. // Authentication oracle is found by a generic checker, though. -// CHECK-NOT: untrusted link register{{.*}}bad_indirect_tailcall_untrusted_nocfg -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted_nocfg, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 0 instructions that leak the affected registers are: -// CHECK-NOT: untrusted link register{{.*}}bad_indirect_tailcall_untrusted_nocfg +// FPAC-NOT: bad_indirect_tailcall_untrusted_nocfg +// NOFPAC-NOT: untrusted link register{{.*}}bad_indirect_tailcall_untrusted_nocfg +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted_nocfg, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 0 instructions that leak the affected registers are: +// NOFPAC-NOT: untrusted link register{{.*}}bad_indirect_tailcall_untrusted_nocfg paciasp stp x29, x30, [sp, #-0x10]! 
adr x3, 1f @@ -515,19 +524,20 @@ good_indirect_tailcall_no_clobber_v83: .globl bad_indirect_tailcall_untrusted_v83 .type bad_indirect_tailcall_untrusted_v83,@function bad_indirect_tailcall_untrusted_v83: -// CHECK-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_indirect_tailcall_untrusted_v83, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: braa x0, x1 # TAILCALL -// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: -// CHECK-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted_v83, basic block {{[^,]+}}, at address -// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: The 1 instructions that leak the affected registers are: -// CHECK-NEXT: 1. {{[0-9a-f]+}}: braa x0, x1 # TAILCALL -// CHECK-NEXT: This happens in the following basic block: -// CHECK-NEXT: {{[0-9a-f]+}}: paciasp -// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! -// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 -// CHECK-NEXT: {{[0-9a-f]+}}: autiasp -// CHECK-NEXT: {{[0-9a-f]+}}: braa x0, x1 # TAILCALL +// FPAC-NOT: bad_indirect_tailcall_untrusted_v83 +// NOFPAC-LABEL: GS-PAUTH: untrusted link register found before tail call in function bad_indirect_tailcall_untrusted_v83, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: braa x0, x1 # TAILCALL +// NOFPAC-NEXT: The 0 instructions that write to the affected registers after any authentication are: +// NOFPAC-LABEL: GS-PAUTH: authentication oracle found in function bad_indirect_tailcall_untrusted_v83, basic block {{[^,]+}}, at address +// NOFPAC-NEXT: The instruction is {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: The 1 instructions that leak the affected registers are: +// NOFPAC-NEXT: 1. {{[0-9a-f]+}}: braa x0, x1 # TAILCALL +// NOFPAC-NEXT: This happens in the following basic block: +// NOFPAC-NEXT: {{[0-9a-f]+}}: paciasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// NOFPAC-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// NOFPAC-NEXT: {{[0-9a-f]+}}: autiasp +// NOFPAC-NEXT: {{[0-9a-f]+}}: braa x0, x1 # TAILCALL paciasp stp x29, x30, [sp, #-0x10]! ldp x29, x30, [sp], #0x10 From f7f37fb71ebe12721b9ad22f21ecd7e0823210d4 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Wed, 1 Oct 2025 14:12:45 +0300 Subject: [PATCH 358/878] [BOLT] Gadget scanner: make use of C++17 features and LLVM helpers (#141665) Perform trivial syntactical cleanups: - make use of structured binding declarations - use LLVM utility functions when appropriate - omit braces around single expression inside single-line LLVM_DEBUG() This patch is NFC aside from minor debug output changes. 
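For illustration, here is a minimal, self-contained sketch of the two idioms
this cleanup switches to — llvm::enumerate() with a structured binding instead
of a manual index loop, and llvm::zip_equal() for iterating two equally sized
containers in lockstep. The function and container names below are
hypothetical and do not come from this patch:

    #include "llvm/ADT/STLExtras.h"   // llvm::enumerate, llvm::zip_equal
    #include "llvm/ADT/SmallVector.h"

    // Hypothetical example, not part of the patch.
    static void mergeCounts(llvm::SmallVectorImpl<unsigned> &Dst,
                            const llvm::SmallVectorImpl<unsigned> &Src) {
      // Index and element bound in one declaration; replaces
      // for (unsigned I = 0; I < Dst.size(); ++I) ... Dst[I] ...
      for (auto [I, Elem] : llvm::enumerate(Dst))
        Elem += I;

      // Lockstep iteration; zip_equal asserts that the sizes match.
      for (auto [D, S] : llvm::zip_equal(Dst, Src))
        D += S;
    }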
---
 bolt/lib/Passes/PAuthGadgetScanner.cpp        | 60 +++++++++----------
 .../AArch64/gs-pauth-debug-output.s           | 14 ++---
 2 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 9d22d3c8c6cd7..01b350b2f11fe 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -88,8 +88,8 @@ class TrackedRegisters {
   TrackedRegisters(ArrayRef<MCPhysReg> RegsToTrack)
       : Registers(RegsToTrack),
         RegToIndexMapping(getMappingSize(RegsToTrack), NoIndex) {
-    for (unsigned I = 0; I < RegsToTrack.size(); ++I)
-      RegToIndexMapping[RegsToTrack[I]] = I;
+    for (auto [MappedIndex, Reg] : llvm::enumerate(RegsToTrack))
+      RegToIndexMapping[Reg] = MappedIndex;
   }
 
   ArrayRef<MCPhysReg> getRegisters() const { return Registers; }
@@ -203,9 +203,9 @@ struct SrcState {
     SafeToDerefRegs &= StateIn.SafeToDerefRegs;
     TrustedRegs &= StateIn.TrustedRegs;
-    for (unsigned I = 0; I < LastInstWritingReg.size(); ++I)
-      for (const MCInst *J : StateIn.LastInstWritingReg[I])
-        LastInstWritingReg[I].insert(J);
+    for (auto [ThisSet, OtherSet] :
+         llvm::zip_equal(LastInstWritingReg, StateIn.LastInstWritingReg))
+      ThisSet.insert_range(OtherSet);
     return *this;
   }
 
@@ -224,11 +224,9 @@ struct SrcState {
 static void printInstsShort(raw_ostream &OS, ArrayRef Insts) {
   OS << "Insts: ";
-  for (unsigned I = 0; I < Insts.size(); ++I) {
-    auto &Set = Insts[I];
+  for (auto [I, PtrSet] : llvm::enumerate(Insts)) {
     OS << "[" << I << "](";
-    for (const MCInst *MCInstP : Set)
-      OS << MCInstP << " ";
+    interleave(PtrSet, OS, " ");
     OS << ")";
   }
 }
@@ -416,8 +414,9 @@ class SrcSafetyAnalysis {
     // ... an address can be updated in a safe manner, producing the result
     // which is as trusted as the input address.
     if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Point)) {
-      if (Cur.SafeToDerefRegs[DstAndSrc->second])
-        Regs.push_back(DstAndSrc->first);
+      auto [DstReg, SrcReg] = *DstAndSrc;
+      if (Cur.SafeToDerefRegs[SrcReg])
+        Regs.push_back(DstReg);
     }
 
     // Make sure explicit checker sequence keeps register safe-to-dereference
@@ -469,8 +468,9 @@ class SrcSafetyAnalysis {
     // ... an address can be updated in a safe manner, producing the result
     // which is as trusted as the input address.
     if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Point)) {
-      if (Cur.TrustedRegs[DstAndSrc->second])
-        Regs.push_back(DstAndSrc->first);
+      auto [DstReg, SrcReg] = *DstAndSrc;
+      if (Cur.TrustedRegs[SrcReg])
+        Regs.push_back(DstReg);
     }
 
     return Regs;
@@ -865,9 +865,9 @@ struct DstState {
       return (*this = StateIn);
 
     CannotEscapeUnchecked &= StateIn.CannotEscapeUnchecked;
-    for (unsigned I = 0; I < FirstInstLeakingReg.size(); ++I)
-      for (const MCInst *J : StateIn.FirstInstLeakingReg[I])
-        FirstInstLeakingReg[I].insert(J);
+    for (auto [ThisSet, OtherSet] :
+         llvm::zip_equal(FirstInstLeakingReg, StateIn.FirstInstLeakingReg))
+      ThisSet.insert_range(OtherSet);
     return *this;
   }
 
@@ -1033,8 +1033,7 @@ class DstSafetyAnalysis {
 
     // ... an address can be updated in a safe manner, or
    if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Inst)) {
-      MCPhysReg DstReg, SrcReg;
-      std::tie(DstReg, SrcReg) = *DstAndSrc;
+      auto [DstReg, SrcReg] = *DstAndSrc;
       // Note that *all* registers containing the derived values must be safe,
       // both source and destination ones. No temporaries are supported at now.
       if (Cur.CannotEscapeUnchecked[SrcReg] &&
@@ -1074,7 +1073,7 @@ class DstSafetyAnalysis {
     // If this instruction terminates the program immediately, no
     // authentication oracles are possible past this point.
     if (BC.MIB->isTrap(Point)) {
-      LLVM_DEBUG({ traceInst(BC, "Trap instruction found", Point); });
+      LLVM_DEBUG(traceInst(BC, "Trap instruction found", Point));
       DstState Next(NumRegs, RegsToTrackInstsFor.getNumTrackedRegisters());
       Next.CannotEscapeUnchecked.set();
       return Next;
@@ -1249,7 +1248,7 @@ class CFGUnawareDstSafetyAnalysis : public DstSafetyAnalysis,
     // starting to analyze Inst.
     if (BC.MIB->isCall(Inst) || BC.MIB->isBranch(Inst) ||
         BC.MIB->isReturn(Inst)) {
-      LLVM_DEBUG({ traceInst(BC, "Control flow instruction", Inst); });
+      LLVM_DEBUG(traceInst(BC, "Control flow instruction", Inst));
       S = createUnsafeState();
     }
@@ -1394,7 +1393,7 @@ shouldReportUnsafeTailCall(const BinaryContext &BC, const BinaryFunction &BF,
   // such libc, ignore tail calls performed by ELF entry function.
   if (BC.StartFunctionAddress &&
       *BC.StartFunctionAddress == Inst.getFunction()->getAddress()) {
-    LLVM_DEBUG({ dbgs() << "  Skipping tail call in ELF entry function.\n"; });
+    LLVM_DEBUG(dbgs() << "  Skipping tail call in ELF entry function.\n");
     return std::nullopt;
   }
@@ -1468,7 +1467,7 @@ shouldReportAuthOracle(const BinaryContext &BC, const MCInstReference &Inst,
   });
 
   if (S.empty()) {
-    LLVM_DEBUG({ dbgs() << " DstState is empty!\n"; });
+    LLVM_DEBUG(dbgs() << " DstState is empty!\n");
     return make_generic_report(
         Inst, "Warning: no state computed for an authentication instruction "
               "(possibly unreachable)");
@@ -1495,7 +1494,7 @@ collectRegsToTrack(ArrayRef<PartialReport<MCPhysReg>> Reports) {
 
 void FunctionAnalysisContext::findUnsafeUses(
     SmallVector<PartialReport<MCPhysReg>> &Reports) {
   auto Analysis = SrcSafetyAnalysis::create(BF, AllocatorId, {});
-  LLVM_DEBUG({ dbgs() << "Running src register safety analysis...\n"; });
+  LLVM_DEBUG(dbgs() << "Running src register safety analysis...\n");
   Analysis->run();
   LLVM_DEBUG({
     dbgs() << "After src register safety analysis:\n";
@@ -1552,8 +1551,7 @@ void FunctionAnalysisContext::findUnsafeUses(
 
     const SrcState &S = Analysis->getStateBefore(Inst);
     if (S.empty()) {
-      LLVM_DEBUG(
-          { traceInst(BC, "Instruction has no state, skipping", Inst); });
+      LLVM_DEBUG(traceInst(BC, "Instruction has no state, skipping", Inst));
       assert(UnreachableBBReported && "Should be reported at least once");
       (void)UnreachableBBReported;
       return;
@@ -1580,8 +1578,7 @@ void FunctionAnalysisContext::augmentUnsafeUseReports(
   SmallVector<MCPhysReg> RegsToTrack = collectRegsToTrack(Reports);
   // Re-compute the analysis with register tracking.
   auto Analysis = SrcSafetyAnalysis::create(BF, AllocatorId, RegsToTrack);
-  LLVM_DEBUG(
-      { dbgs() << "\nRunning detailed src register safety analysis...\n"; });
+  LLVM_DEBUG(dbgs() << "\nRunning detailed src register safety analysis...\n");
   Analysis->run();
   LLVM_DEBUG({
     dbgs() << "After detailed src register safety analysis:\n";
@@ -1591,7 +1588,7 @@ void FunctionAnalysisContext::augmentUnsafeUseReports(
   // Augment gadget reports.
for (auto &Report : Reports) { MCInstReference Location = Report.Issue->Location; - LLVM_DEBUG({ traceInst(BC, "Attaching clobbering info to", Location); }); + LLVM_DEBUG(traceInst(BC, "Attaching clobbering info to", Location)); assert(Report.RequestedDetails && "Should be removed by handleSimpleReports"); auto DetailedInfo = @@ -1609,7 +1606,7 @@ void FunctionAnalysisContext::findUnsafeDefs( return; auto Analysis = DstSafetyAnalysis::create(BF, AllocatorId, {}); - LLVM_DEBUG({ dbgs() << "Running dst register safety analysis...\n"; }); + LLVM_DEBUG(dbgs() << "Running dst register safety analysis...\n"); Analysis->run(); LLVM_DEBUG({ dbgs() << "After dst register safety analysis:\n"; @@ -1632,8 +1629,7 @@ void FunctionAnalysisContext::augmentUnsafeDefReports( SmallVector RegsToTrack = collectRegsToTrack(Reports); // Re-compute the analysis with register tracking. auto Analysis = DstSafetyAnalysis::create(BF, AllocatorId, RegsToTrack); - LLVM_DEBUG( - { dbgs() << "\nRunning detailed dst register safety analysis...\n"; }); + LLVM_DEBUG(dbgs() << "\nRunning detailed dst register safety analysis...\n"); Analysis->run(); LLVM_DEBUG({ dbgs() << "After detailed dst register safety analysis:\n"; @@ -1643,7 +1639,7 @@ void FunctionAnalysisContext::augmentUnsafeDefReports( // Augment gadget reports. for (auto &Report : Reports) { MCInstReference Location = Report.Issue->Location; - LLVM_DEBUG({ traceInst(BC, "Attaching leakage info to", Location); }); + LLVM_DEBUG(traceInst(BC, "Attaching leakage info to", Location)); assert(Report.RequestedDetails && "Should be removed by handleSimpleReports"); auto DetailedInfo = std::make_shared( diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s index ee8521ff1f810..a3ad7effe4b0d 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s @@ -177,9 +177,9 @@ clobber: // CHECK-EMPTY: // CHECK-NEXT: Running detailed src register safety analysis... // CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( mov w30, #0x0, src-state) -// CHECK-NEXT: .. result: (src-state) -// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) -// CHECK-NEXT: .. result: (src-state) +// CHECK-NEXT: .. result: (src-state) +// CHECK-NEXT: SrcSafetyAnalysis::ComputeNext( ret x30, src-state) +// CHECK-NEXT: .. result: (src-state) // CHECK-NEXT: After detailed src register safety analysis: // CHECK-NEXT: Binary Function "clobber" { // ... @@ -189,7 +189,7 @@ clobber: // Iterating over the reports and attaching clobbering info: // CHECK-EMPTY: -// CHECK-NEXT: Attaching clobbering info to: 00000000: ret # DataflowSrcSafetyAnalysis: src-state +// CHECK-NEXT: Attaching clobbering info to: 00000000: ret # DataflowSrcSafetyAnalysis: src-state .globl nocfg .type nocfg,@function @@ -315,7 +315,7 @@ auth_oracle: // AUTH-ORACLES-NEXT: DstSafetyAnalysis::ComputeNext( ret x30, dst-state) // AUTH-ORACLES-NEXT: .. result: (dst-state) // AUTH-ORACLES-NEXT: DstSafetyAnalysis::ComputeNext( autia x0, x1, dst-state) -// AUTH-ORACLES-NEXT: .. result: (dst-state) +// AUTH-ORACLES-NEXT: .. 
result: (dst-state) // AUTH-ORACLES-NEXT: After detailed dst register safety analysis: // AUTH-ORACLES-NEXT: Binary Function "auth_oracle" { // AUTH-ORACLES-NEXT: Number : 4 @@ -325,14 +325,14 @@ auth_oracle: // AUTH-ORACLES-NEXT: } // AUTH-ORACLES-NEXT: [[BB0]] (2 instructions, align : 1) // AUTH-ORACLES-NEXT: Entry Point -// AUTH-ORACLES-NEXT: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES-NEXT: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state // AUTH-ORACLES-NEXT: 00000004: ret # DataflowDstSafetyAnalysis: dst-state // AUTH-ORACLES-EMPTY: // AUTH-ORACLES-NEXT: DWARF CFI Instructions: // AUTH-ORACLES-NEXT: // AUTH-ORACLES-NEXT: End of Function "auth_oracle" // AUTH-ORACLES-EMPTY: -// AUTH-ORACLES-NEXT: Attaching leakage info to: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state +// AUTH-ORACLES-NEXT: Attaching leakage info to: 00000000: autia x0, x1 # DataflowDstSafetyAnalysis: dst-state // Gadget scanner should not crash on CFI instructions, including when debug-printing them. // Note that the particular debug output is not checked, but BOLT should be From 65829ffd3eb92a1b2da2d204059da035c1a75197 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 06:13:53 -0700 Subject: [PATCH 359/878] [MLIR] Apply clang-tidy fixes for performance-move-const-arg in SimplifyAffineMinMax.cpp (NFC) --- mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp index f3e065a12ded0..9821a75a55f49 100644 --- a/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineMinMax.cpp @@ -246,6 +246,6 @@ void SimplifyAffineMinMaxPass::runOnOperation() { patterns.add( func.getContext()); FrozenRewritePatternSet frozenPatterns(std::move(patterns)); - if (failed(applyPatternsGreedily(func, std::move(frozenPatterns)))) + if (failed(applyPatternsGreedily(func, frozenPatterns))) return signalPassFailure(); } From fef7753454a51f00d1300f30d1991696c00ba6f7 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 1 Oct 2025 12:41:03 +0100 Subject: [PATCH 360/878] [AArch64] Some tests for cbz/tbz with wzr. 
NFC --- llvm/test/CodeGen/AArch64/cbz_wzr.mir | 260 +++++++++++++++++++++ llvm/test/CodeGen/AArch64/tbz-tbnz.ll | 324 +++++++++++++++++++++++++- 2 files changed, 582 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/cbz_wzr.mir diff --git a/llvm/test/CodeGen/AArch64/cbz_wzr.mir b/llvm/test/CodeGen/AArch64/cbz_wzr.mir new file mode 100644 index 0000000000000..7deea56ba23a1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cbz_wzr.mir @@ -0,0 +1,260 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -o - %s -mtriple=aarch64-none-eabi -run-pass=machine-cp -mcp-use-is-copy-instr | FileCheck %s + +--- +name: cbz_wzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cbz_wzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CBZW $wzr, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $w8 = ORRWrs $wzr, $wzr, 0 + CBZW killed renamable $w8, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... +--- +name: cbnz_wzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cbnz_wzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CBNZW $wzr, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $w8 = ORRWrs $wzr, $wzr, 0 + CBNZW killed renamable $w8, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... +--- +name: tbz_wzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: tbz_wzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBZW $wzr, 0, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $w8 = ORRWrs $wzr, $wzr, 0 + TBZW killed renamable $w8, 0, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... 
+--- +name: tbnz_wzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: tbnz_wzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBNZW $wzr, 0, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $w8 = ORRWrs $wzr, $wzr, 0 + TBNZW killed renamable $w8, 0, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... + +--- +name: cbz_xzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cbz_xzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CBZX $xzr, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $x8 = ORRXrs $xzr, $xzr, 0 + CBZX killed renamable $x8, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... +--- +name: cbnz_xzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: cbnz_xzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CBNZX $xzr, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $x8 = ORRXrs $xzr, $xzr, 0 + CBNZX killed renamable $x8, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... +--- +name: tbz_xzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: tbz_xzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBZX $xzr, 0, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $x8 = ORRXrs $xzr, $xzr, 0 + TBZX killed renamable $x8, 0, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... 
+--- +name: tbnz_xzr +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: tbnz_xzr + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: TBNZX $xzr, 0, %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $w0 = MOVZWi 10, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $w0 = MOVZWi 20, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + bb.0: + liveins: $x0 + + $x8 = ORRXrs $xzr, $xzr, 0 + TBNZX killed renamable $x8, 0, %bb.2 + + bb.1: + $w0 = MOVZWi 10, 0 + RET undef $lr, implicit $w0 + + bb.2: + $w0 = MOVZWi 20, 0 + RET undef $lr, implicit $w0 +... diff --git a/llvm/test/CodeGen/AArch64/tbz-tbnz.ll b/llvm/test/CodeGen/AArch64/tbz-tbnz.ll index 4a04934971711..6946cc23d867d 100644 --- a/llvm/test/CodeGen/AArch64/tbz-tbnz.ll +++ b/llvm/test/CodeGen/AArch64/tbz-tbnz.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -O3 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -O3 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI declare void @t() @@ -581,3 +581,323 @@ end: ret void } +define ptr @tbnz_wzr(i1 %cmp1.not.i, ptr %locflg) { +; CHECK-SD-LABEL: tbnz_wzr: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tbz w0, #0, .LBB20_2 +; CHECK-SD-NEXT: // %bb.1: +; CHECK-SD-NEXT: tbnz wzr, #0, .LBB20_3 +; CHECK-SD-NEXT: b .LBB20_4 +; CHECK-SD-NEXT: .LBB20_2: // %opnfil.exit.thread +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: tbz w8, #0, .LBB20_4 +; CHECK-SD-NEXT: .LBB20_3: // %if.else25 +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: .LBB20_4: // %common.ret +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbnz_wzr: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #0 // =0x0 +; CHECK-GI-NEXT: tbz w0, #0, .LBB20_3 +; CHECK-GI-NEXT: // %bb.1: // %if.end10 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB20_4 +; CHECK-GI-NEXT: .LBB20_2: // %common.ret +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB20_3: // %opnfil.exit.thread +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: tbz w8, #0, .LBB20_2 +; CHECK-GI-NEXT: .LBB20_4: // %if.else25 +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: ret +entry: + br i1 %cmp1.not.i, label %if.end10, label %opnfil.exit.thread + +opnfil.exit.thread: ; preds = %entry + store i32 0, ptr %locflg, align 4 + br label %if.end10 + +if.end10: ; preds = %opnfil.exit.thread, %entry + %cmp5 = phi i1 [ true, %opnfil.exit.thread ], [ false, %entry ] + br i1 %cmp5, label %if.else25, label %if.then12 + +if.then12: ; preds = %if.end10 + %call20 = load i32, ptr null, align 4 + br label %if.end26 + +if.else25: ; preds = %if.end10 + store i32 0, ptr %locflg, align 4 + br label %if.end26 + +if.end26: ; preds = %if.else25, %if.then12 + br i1 %cmp5, label %common.ret, label %if.then28 + +common.ret: ; preds = %if.then28, %if.end26 + %common.ret.op = phi ptr [ null, %if.then28 ], [ null, %if.end26 ] + ret ptr %common.ret.op + +if.then28: ; preds = %if.end26 + %0 = load ptr, ptr null, align 8 + br label %common.ret +} + +define ptr @tbz_wzr(i1 %cmp1.not.i, 
ptr %locflg) { +; CHECK-SD-LABEL: tbz_wzr: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tbz w0, #0, .LBB21_2 +; CHECK-SD-NEXT: // %bb.1: +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: tbnz w8, #0, .LBB21_3 +; CHECK-SD-NEXT: b .LBB21_4 +; CHECK-SD-NEXT: .LBB21_2: // %opnfil.exit.thread +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: tbz wzr, #0, .LBB21_4 +; CHECK-SD-NEXT: .LBB21_3: // %if.else25 +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: .LBB21_4: // %common.ret +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: tbz_wzr: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: tbz w0, #0, .LBB21_3 +; CHECK-GI-NEXT: // %bb.1: // %if.end10 +; CHECK-GI-NEXT: tbnz w8, #0, .LBB21_4 +; CHECK-GI-NEXT: .LBB21_2: // %common.ret +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: ret +; CHECK-GI-NEXT: .LBB21_3: // %opnfil.exit.thread +; CHECK-GI-NEXT: mov w8, #0 // =0x0 +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: tbz w8, #0, .LBB21_2 +; CHECK-GI-NEXT: .LBB21_4: // %if.else25 +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: ret +entry: + br i1 %cmp1.not.i, label %if.end10, label %opnfil.exit.thread + +opnfil.exit.thread: ; preds = %entry + store i32 0, ptr %locflg, align 4 + br label %if.end10 + +if.end10: ; preds = %opnfil.exit.thread, %entry + %cmp5 = phi i1 [ false, %opnfil.exit.thread ], [ true, %entry ] + br i1 %cmp5, label %if.else25, label %if.then12 + +if.then12: ; preds = %if.end10 + %call20 = load i32, ptr null, align 4 + br label %if.end26 + +if.else25: ; preds = %if.end10 + store i32 0, ptr %locflg, align 4 + br label %if.end26 + +if.end26: ; preds = %if.else25, %if.then12 + br i1 %cmp5, label %common.ret, label %if.then28 + +common.ret: ; preds = %if.then28, %if.end26 + %common.ret.op = phi ptr [ null, %if.then28 ], [ null, %if.end26 ] + ret ptr %common.ret.op + +if.then28: ; preds = %if.end26 + %0 = load ptr, ptr null, align 8 + br label %common.ret +} + +define ptr @cbnz_wzr(i1 %cmp1.not.i, ptr %locflg) { +; CHECK-SD-LABEL: cbnz_wzr: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tbz w0, #0, .LBB22_2 +; CHECK-SD-NEXT: // %bb.1: +; CHECK-SD-NEXT: cbnz wzr, .LBB22_3 +; CHECK-SD-NEXT: b .LBB22_4 +; CHECK-SD-NEXT: .LBB22_2: // %opnfil.exit.thread +; CHECK-SD-NEXT: mov w8, #10 // =0xa +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: cbz w8, .LBB22_4 +; CHECK-SD-NEXT: .LBB22_3: // %if.else25 +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: .LBB22_4: // %common.ret +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cbnz_wzr: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, wzr +; CHECK-GI-NEXT: tbnz w0, #0, .LBB22_2 +; CHECK-GI-NEXT: // %bb.1: // %opnfil.exit.thread +; CHECK-GI-NEXT: mov w8, #10 // =0xa +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: .LBB22_2: // %if.end10 +; CHECK-GI-NEXT: cbz w8, .LBB22_4 +; CHECK-GI-NEXT: // %bb.3: // %if.else25 +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: .LBB22_4: // %common.ret +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: ret +entry: + br i1 %cmp1.not.i, label %if.end10, label %opnfil.exit.thread + +opnfil.exit.thread: ; preds = %entry + store i32 0, ptr %locflg, align 4 + br label %if.end10 + +if.end10: ; preds = %opnfil.exit.thread, %entry + %cmp5 = phi i32 [ 10, %opnfil.exit.thread ], [ 0, %entry ] + %cmp5b = icmp ne i32 %cmp5, 0 + br i1 %cmp5b, label %if.else25, label %if.then12 + +if.then12: ; preds = %if.end10 + %call20 = load i32, ptr null, align 4 + br label %if.end26 + +if.else25: ; preds 
= %if.end10 + store i32 0, ptr %locflg, align 4 + br label %if.end26 + +if.end26: ; preds = %if.else25, %if.then12 + br i1 %cmp5b, label %common.ret, label %if.then28 + +common.ret: ; preds = %if.then28, %if.end26 + %common.ret.op = phi ptr [ null, %if.then28 ], [ null, %if.end26 ] + ret ptr %common.ret.op + +if.then28: ; preds = %if.end26 + %0 = load ptr, ptr null, align 8 + br label %common.ret +} + +define ptr @cbz_wzr(i1 %cmp1.not.i, ptr %locflg) { +; CHECK-SD-LABEL: cbz_wzr: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: tbz w0, #0, .LBB23_2 +; CHECK-SD-NEXT: // %bb.1: +; CHECK-SD-NEXT: mov w8, #10 // =0xa +; CHECK-SD-NEXT: cbnz w8, .LBB23_3 +; CHECK-SD-NEXT: b .LBB23_4 +; CHECK-SD-NEXT: .LBB23_2: // %opnfil.exit.thread +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: cbz wzr, .LBB23_4 +; CHECK-SD-NEXT: .LBB23_3: // %if.else25 +; CHECK-SD-NEXT: str wzr, [x1] +; CHECK-SD-NEXT: .LBB23_4: // %common.ret +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cbz_wzr: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #10 // =0xa +; CHECK-GI-NEXT: tbnz w0, #0, .LBB23_2 +; CHECK-GI-NEXT: // %bb.1: // %opnfil.exit.thread +; CHECK-GI-NEXT: mov w8, wzr +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: .LBB23_2: // %if.end10 +; CHECK-GI-NEXT: cbz w8, .LBB23_4 +; CHECK-GI-NEXT: // %bb.3: // %if.else25 +; CHECK-GI-NEXT: str wzr, [x1] +; CHECK-GI-NEXT: .LBB23_4: // %common.ret +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: ret +entry: + br i1 %cmp1.not.i, label %if.end10, label %opnfil.exit.thread + +opnfil.exit.thread: ; preds = %entry + store i32 0, ptr %locflg, align 4 + br label %if.end10 + +if.end10: ; preds = %opnfil.exit.thread, %entry + %cmp5 = phi i32 [ 0, %opnfil.exit.thread ], [ 10, %entry ] + %cmp5b = icmp ne i32 %cmp5, 0 + br i1 %cmp5b, label %if.else25, label %if.then12 + +if.then12: ; preds = %if.end10 + %call20 = load i32, ptr null, align 4 + br label %if.end26 + +if.else25: ; preds = %if.end10 + store i32 0, ptr %locflg, align 4 + br label %if.end26 + +if.end26: ; preds = %if.else25, %if.then12 + br i1 %cmp5b, label %common.ret, label %if.then28 + +common.ret: ; preds = %if.then28, %if.end26 + %common.ret.op = phi ptr [ null, %if.then28 ], [ null, %if.end26 ] + ret ptr %common.ret.op + +if.then28: ; preds = %if.end26 + %0 = load ptr, ptr null, align 8 + br label %common.ret +} + +define i1 @avifSequenceHeaderParse() { +; CHECK-SD-LABEL: avifSequenceHeaderParse: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: cbz w8, .LBB24_2 +; CHECK-SD-NEXT: .LBB24_1: // %bb6 +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; CHECK-SD-NEXT: .LBB24_2: // %bb1 +; CHECK-SD-NEXT: cbz w8, .LBB24_4 +; CHECK-SD-NEXT: // %bb.3: +; CHECK-SD-NEXT: tbz xzr, #63, .LBB24_1 +; CHECK-SD-NEXT: b .LBB24_5 +; CHECK-SD-NEXT: .LBB24_4: // %bb2 +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: tbz x8, #63, .LBB24_1 +; CHECK-SD-NEXT: .LBB24_5: // %bb4 +; CHECK-SD-NEXT: mov w8, #1 // =0x1 +; CHECK-SD-NEXT: mov w0, wzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: avifSequenceHeaderParse: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w0, wzr +; CHECK-GI-NEXT: mov w8, #1 // =0x1 +; CHECK-GI-NEXT: ret +entry: + %a = icmp slt i64 0, 0 + br i1 %a, label %bb1, label %bb6 + +bb1: ; preds = %entry + %b = icmp eq i32 1, 0 + br i1 %b, label %bb2, label %bb3 + +bb2: ; preds = %bb1 + %c = load i8, ptr null, align 1 + %d = zext i8 1 to i64 + %e = shl i64 %d, 0 + br label %bb3 + +bb3: ; preds = %bb2, %bb1 + %f = phi i64 [ %e, %bb2 ], [ 0, %bb1 ] 
+ %g = icmp slt i64 %f, 0
+ br i1 %g, label %bb4, label %bb6
+
+bb4: ; preds = %bb3
+ %h = icmp eq i32 1, 0
+ br i1 %h, label %bb5, label %bb7
+
+bb5: ; preds = %bb4
+ %i = load i8, ptr null, align 1
+ %j = shl i64 0, 0
+ br label %bb7
+
+bb6: ; preds = %bb7, %bb3, %entry
+ %k = phi i1 [ false, %bb7 ], [ false, %bb3 ], [ false, %entry ]
+ ret i1 %k
+
+bb7: ; preds = %bb5, %bb4
+ %l = phi ptr [ inttoptr (i64 1 to ptr), %bb5 ], [ null, %bb4 ]
+ %m = phi i64 [ %j, %bb5 ], [ 0, %bb4 ]
+ %n = icmp ult ptr %l, null
+ br label %bb6
+}

From 0d0cc06afe07f288ae18cf26e8a66bcb204146cd Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Thu, 21 Aug 2025 05:05:47 -0700
Subject: [PATCH 361/878] [MLIR] Apply clang-tidy fixes for performance-unnecessary-value-param in Rewrite.cpp (NFC)

---
 mlir/lib/Bindings/Python/Rewrite.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp
index f18298ecaf415..836f44fd7d4be 100644
--- a/mlir/lib/Bindings/Python/Rewrite.cpp
+++ b/mlir/lib/Bindings/Python/Rewrite.cpp
@@ -127,7 +127,7 @@ class PyFrozenRewritePatternSet {
         mlirPythonFrozenRewritePatternSetToCapsule(get()));
   }

-  static nb::object createFromCapsule(nb::object capsule) {
+  static nb::object createFromCapsule(const nb::object &capsule) {
     MlirFrozenRewritePatternSet rawPm =
         mlirPythonCapsuleToFrozenRewritePatternSet(capsule.ptr());
     if (rawPm.ptr == nullptr)

From d5f98f3b01d8cece369d62366c180a5deb89e42f Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Thu, 21 Aug 2025 10:29:19 -0700
Subject: [PATCH 362/878] [MLIR] Apply clang-tidy fixes for performance-unnecessary-copy-initialization in InferIntRangeCommon.cpp (NFC)

---
 mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index af4ea5ac1cec8..0f28cbc751c1c 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -304,7 +304,7 @@ static ConstantIntRanges inferDivURange(const ConstantIntRanges &lhs,
     umin = lhsMin.udiv(rhsMax);

   // X u/ Y u<= X.
-  APInt umax = lhsMax;
+  const APInt &umax = lhsMax;

   return ConstantIntRanges::fromUnsigned(umin, umax);
 }

From 93c830597cd1c68059a165de3eabea3a0b8f4526 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 1 Oct 2025 13:56:38 +0200
Subject: [PATCH 363/878] [clang][bytecode] Fix integral cast edge case (#161506)

We were converting the `APSInt` to a sign-less `APInt` too early and
losing the sign information.
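To make the failure mode concrete, here is a minimal illustration (the
constant and the check come from the `literals.cpp` regression test added
below):

```cpp
// 0x00000000FFFF0000 has type 'unsigned int' (bit pattern 0xFFFF0000), so
// the widening cast must zero-extend:
//   zero-extend: 0x00000000FFFF0000 == 4294901760   (correct)
//   sign-extend: 0xFFFFFFFFFFFF0000 == -65536       (the old, buggy result)
static_assert((long long)0x00000000FFFF0000 == 4294901760, "");
```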
--- clang/lib/AST/ByteCode/Compiler.cpp | 12 ++++++++++-- clang/test/AST/ByteCode/literals.cpp | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 0b7b6cd64dd97..c71fd22fe9d7e 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -540,7 +540,8 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { if (const auto *IL = dyn_cast(SubExpr)) { if (ToT != PT_IntAP && ToT != PT_IntAPS && FromT != PT_IntAP && FromT != PT_IntAPS && !CE->getType()->isEnumeralType()) - return this->emitConst(IL->getValue(), CE); + return this->emitConst(APSInt(IL->getValue(), !isSignedType(*FromT)), + CE); if (!this->emitConst(IL->getValue(), SubExpr)) return false; } else { @@ -4541,7 +4542,14 @@ bool Compiler::emitConst(T Value, const Expr *E) { template bool Compiler::emitConst(const APSInt &Value, PrimType Ty, const Expr *E) { - return this->emitConst(static_cast(Value), Ty, E); + if (Ty == PT_IntAPS) + return this->emitConstIntAPS(Value, E); + if (Ty == PT_IntAP) + return this->emitConstIntAP(Value, E); + + if (Value.isSigned()) + return this->emitConst(Value.getSExtValue(), Ty, E); + return this->emitConst(Value.getZExtValue(), Ty, E); } template diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp index 5bc3f7f4c815c..5028ebfa3de30 100644 --- a/clang/test/AST/ByteCode/literals.cpp +++ b/clang/test/AST/ByteCode/literals.cpp @@ -28,6 +28,8 @@ static_assert(number != 10, ""); // both-error{{failed}} \ static_assert(__objc_yes, ""); static_assert(!__objc_no, ""); +static_assert((long long)0x00000000FFFF0000 == 4294901760, ""); + constexpr bool b = number; static_assert(b, ""); constexpr int one = true; From a374017bbcc8191fa6b7b2939ffcb9bb831df419 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Wed, 1 Oct 2025 15:01:37 +0300 Subject: [PATCH 364/878] [mlir][memref] Introduce `memref.distinct_objects` op (#156913) The `distinct_objects` operation takes a list of memrefs and returns a list of memrefs of the same types, with the additional assumption that accesses to these memrefs will never alias with each other. This means that loads and stores to different memrefs in the list can be safely reordered. The discussion https://discourse.llvm.org/t/rfc-introducing-memref-aliasing-attributes/88049 --- .../mlir/Dialect/MemRef/IR/MemRefOps.td | 39 +++++++++++++- .../Conversion/MemRefToLLVM/MemRefToLLVM.cpp | 52 +++++++++++++++++-- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 23 ++++++++ .../MemRefToLLVM/memref-to-llvm.mlir | 30 +++++++++++ mlir/test/Dialect/MemRef/invalid.mlir | 16 ++++++ mlir/test/Dialect/MemRef/ops.mlir | 9 ++++ 6 files changed, 164 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 2bf953e32ccce..d4d67bfb278d5 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -155,7 +155,7 @@ def AssumeAlignmentOp : MemRef_Op<"assume_alignment", [ The `assume_alignment` operation takes a memref and an integer alignment value. It returns a new SSA value of the same memref type, but associated with the assumption that the underlying buffer is aligned to the given - alignment. + alignment. If the buffer isn't aligned to the given alignment, its result is poison. 
This operation doesn't affect the semantics of a program where the @@ -170,7 +170,7 @@ def AssumeAlignmentOp : MemRef_Op<"assume_alignment", [ let assemblyFormat = "$memref `,` $alignment attr-dict `:` type($memref)"; let extraClassDeclaration = [{ MemRefType getType() { return ::llvm::cast(getResult().getType()); } - + Value getViewSource() { return getMemref(); } }]; @@ -178,6 +178,41 @@ def AssumeAlignmentOp : MemRef_Op<"assume_alignment", [ let hasFolder = 1; } +//===----------------------------------------------------------------------===// +// DistinctObjectsOp +//===----------------------------------------------------------------------===// + +def DistinctObjectsOp : MemRef_Op<"distinct_objects", [ + Pure, + DeclareOpInterfaceMethods + // ViewLikeOpInterface TODO: ViewLikeOpInterface only supports a single argument + ]> { + let summary = "assumption that acesses to specific memrefs will never alias"; + let description = [{ + The `distinct_objects` operation takes a list of memrefs and returns the same + memrefs, with the additional assumption that accesses to them will never + alias with each other. This means that loads and stores to different + memrefs in the list can be safely reordered. + + If the memrefs do alias, the load/store behavior is undefined. This + operation doesn't affect the semantics of a valid program. It is + intended for optimization purposes, allowing the compiler to generate more + efficient code based on the non-aliasing assumption. The optimization is + best-effort. + + Example: + + ```mlir + %1, %2 = memref.distinct_objects %a, %b : memref, memref + ``` + }]; + let arguments = (ins Variadic:$operands); + let results = (outs Variadic:$results); + + let assemblyFormat = "$operands attr-dict `:` type($operands)"; + let hasVerifier = 1; +} + //===----------------------------------------------------------------------===// // AllocOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp index cc6314cbd1ffe..a6f816aa07377 100644 --- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp @@ -465,6 +465,51 @@ struct AssumeAlignmentOpLowering } }; +struct DistinctObjectsOpLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern< + memref::DistinctObjectsOp>::ConvertOpToLLVMPattern; + explicit DistinctObjectsOpLowering(const LLVMTypeConverter &converter) + : ConvertOpToLLVMPattern(converter) {} + + LogicalResult + matchAndRewrite(memref::DistinctObjectsOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + ValueRange operands = adaptor.getOperands(); + if (operands.size() <= 1) { + // Fast path. + rewriter.replaceOp(op, operands); + return success(); + } + + Location loc = op.getLoc(); + SmallVector ptrs; + for (auto [origOperand, newOperand] : + llvm::zip_equal(op.getOperands(), operands)) { + auto memrefType = cast(origOperand.getType()); + MemRefDescriptor memRefDescriptor(newOperand); + Value ptr = memRefDescriptor.bufferPtr(rewriter, loc, *getTypeConverter(), + memrefType); + ptrs.push_back(ptr); + } + + auto cond = + LLVM::ConstantOp::create(rewriter, loc, rewriter.getI1Type(), 1); + // Generate separate_storage assumptions for each pair of pointers. 
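+    // Note: this is quadratic in the number of operands; N memrefs yield
+    // N*(N-1)/2 `llvm.intr.assume` ops, one per unordered pair (three for
+    // the three-operand test below).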
+ for (auto i : llvm::seq(ptrs.size() - 1)) { + for (auto j : llvm::seq(i + 1, ptrs.size())) { + Value ptr1 = ptrs[i]; + Value ptr2 = ptrs[j]; + LLVM::AssumeOp::create(rewriter, loc, cond, + LLVM::AssumeSeparateStorageTag{}, ptr1, ptr2); + } + } + + rewriter.replaceOp(op, operands); + return success(); + } +}; + // A `dealloc` is converted into a call to `free` on the underlying data buffer. // The memref descriptor being an SSA value, there is no need to clean it up // in any way. @@ -1997,22 +2042,23 @@ void mlir::populateFinalizeMemRefToLLVMConversionPatterns( patterns.add< AllocaOpLowering, AllocaScopeOpLowering, - AtomicRMWOpLowering, AssumeAlignmentOpLowering, + AtomicRMWOpLowering, ConvertExtractAlignedPointerAsIndex, DimOpLowering, + DistinctObjectsOpLowering, ExtractStridedMetadataOpLowering, GenericAtomicRMWOpLowering, GetGlobalMemrefOpLowering, LoadOpLowering, MemRefCastOpLowering, - MemorySpaceCastOpLowering, MemRefReinterpretCastOpLowering, MemRefReshapeOpLowering, + MemorySpaceCastOpLowering, PrefetchOpLowering, RankOpLowering, - ReassociatingReshapeOpConversion, ReassociatingReshapeOpConversion, + ReassociatingReshapeOpConversion, StoreOpLowering, SubViewOpLowering, TransposeOpLowering, diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 349b4deb29023..e9bdcda296da5 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -606,6 +606,29 @@ AssumeAlignmentOp::bubbleDownCasts(OpBuilder &builder) { return bubbleDownCastsPassthroughOpImpl(*this, builder, getMemrefMutable()); } +//===----------------------------------------------------------------------===// +// DistinctObjectsOp +//===----------------------------------------------------------------------===// + +LogicalResult DistinctObjectsOp::verify() { + if (getOperandTypes() != getResultTypes()) + return emitOpError("operand types and result types must match"); + + if (getOperandTypes().empty()) + return emitOpError("expected at least one operand"); + + return success(); +} + +LogicalResult DistinctObjectsOp::inferReturnTypes( + MLIRContext * /*context*/, std::optional /*location*/, + ValueRange operands, DictionaryAttr /*attributes*/, + OpaqueProperties /*properties*/, RegionRange /*regions*/, + SmallVectorImpl &inferredReturnTypes) { + llvm::copy(operands.getTypes(), std::back_inserter(inferredReturnTypes)); + return success(); +} + //===----------------------------------------------------------------------===// // CastOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index 45b1a1f1ca40c..0cbe064572911 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -195,6 +195,36 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) { // ----- +// ALL-LABEL: func @distinct_objects +// ALL-SAME: (%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref) +func.func @distinct_objects(%arg0: memref, %arg1: memref, %arg2: memref) -> (memref, memref, memref) { +// ALL-DAG: %[[CAST_0:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] : memref to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL-DAG: %[[CAST_1:.*]] = builtin.unrealized_conversion_cast %[[ARG1]] : memref to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// ALL-DAG: %[[CAST_2:.*]] = 
builtin.unrealized_conversion_cast %[[ARG2]] : memref to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// ALL: %[[PTR_0:.*]] = llvm.extractvalue %[[CAST_0]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// ALL: %[[PTR_1:.*]] = llvm.extractvalue %[[CAST_1]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// ALL: %[[PTR_2:.*]] = llvm.extractvalue %[[CAST_2]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+// ALL: %[[TRUE:.*]] = llvm.mlir.constant(true) : i1
+// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_0]], %[[PTR_1]] : !llvm.ptr, !llvm.ptr)] : i1
+// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_0]], %[[PTR_2]] : !llvm.ptr, !llvm.ptr)] : i1
+// ALL: llvm.intr.assume %[[TRUE]] ["separate_storage"(%[[PTR_1]], %[[PTR_2]] : !llvm.ptr, !llvm.ptr)] : i1
+  %1, %2, %3 = memref.distinct_objects %arg0, %arg1, %arg2 : memref, memref, memref
+  return %1, %2, %3 : memref, memref, memref
+}
+
+// -----
+
+// ALL-LABEL: func @distinct_objects_noop
+// ALL-SAME: (%[[ARG0:.*]]: memref)
+func.func @distinct_objects_noop(%arg0: memref) -> memref {
+// 1-operand version is noop
+// ALL-NEXT: return %[[ARG0]]
+  %1 = memref.distinct_objects %arg0 : memref
+  return %1 : memref
+}
+
+// -----
+
 // CHECK-LABEL: func @assume_alignment_w_offset
 // CHECK-INTERFACE-LABEL: func @assume_alignment_w_offset
 func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset: ?>>) {
diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir
index 3f96d907632b7..5ff292058ccc1 100644
--- a/mlir/test/Dialect/MemRef/invalid.mlir
+++ b/mlir/test/Dialect/MemRef/invalid.mlir
@@ -1169,3 +1169,19 @@ func.func @expand_shape_invalid_output_shape(
       into memref<2x15x20xf32, strided<[60000, 4000, 2], offset: 100>>
   return
 }
+
+// -----
+
+func.func @distinct_objects_types_mismatch(%arg0: memref, %arg1: memref) -> (memref, memref) {
+  // expected-error @+1 {{operand types and result types must match}}
+  %0, %1 = "memref.distinct_objects"(%arg0, %arg1) : (memref, memref) -> (memref, memref)
+  return %0, %1 : memref, memref
+}
+
+// -----
+
+func.func @distinct_objects_0_operands() {
+  // expected-error @+1 {{expected at least one operand}}
+  "memref.distinct_objects"() : () -> ()
+  return
+}
diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir
index 6c2298a3f8acb..a90c9505a8405 100644
--- a/mlir/test/Dialect/MemRef/ops.mlir
+++ b/mlir/test/Dialect/MemRef/ops.mlir
@@ -302,6 +302,15 @@ func.func @assume_alignment(%0: memref<4x4xf16>) {
   return
 }
+
+// CHECK-LABEL: func @distinct_objects
+// CHECK-SAME: (%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref)
+func.func @distinct_objects(%arg0: memref, %arg1: memref, %arg2: memref) -> (memref, memref, memref) {
+  // CHECK: %[[RES:.*]]:3 = memref.distinct_objects %[[ARG0]], %[[ARG1]], %[[ARG2]] : memref, memref, memref
+  %1, %2, %3 = memref.distinct_objects %arg0, %arg1, %arg2 : memref, memref, memref
+  // CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2 : memref, memref, memref
+  return %1, %2, %3 : memref, memref, memref
+}
+
 // CHECK-LABEL: func @expand_collapse_shape_static
 func.func @expand_collapse_shape_static(
     %arg0: memref<3x4x5xf32>,

From 2a96d19ab01a4b5d992f492233f6a21d1e7cc4e0 Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Wed, 1 Oct 2025 13:07:21 +0100
Subject: [PATCH 365/878] [lldb][CPlusPlusLanguage] Avoid redundant const char* -> StringRef roundtrip (#161499)

We've been seeing (very
sporadic) lifetime issues around this area. Here's an example backtrace: ``` [ 8] 0x0000000188e56743 libsystem_platform.dylib`_sigtramp + 55 [ 9] 0x00000001181e041f LLDB`lldb_private::CPlusPlusLanguage::SymbolNameFitsToLanguage(lldb_private::Mangled) const [inlined] unsigned long std::1::constexpr_strlen[abi:nn200100](char const*) + 7 at constexpr_c_functions.h:63:10 [ 9] 0x00000001181e0418 LLDB`lldb_private::CPlusPlusLanguage::SymbolNameFitsToLanguage(lldb_private::Mangled) const [inlined] std::__1::char_traits::length[abi:nn200100](char const*) at char_traits.h:232:12 [ 9] 0x00000001181e0418 LLDB`lldb_private::CPlusPlusLanguage::SymbolNameFitsToLanguage(lldb_private::Mangled) const [inlined] llvm::StringRef::StringRef(char const*) at StringRef.h:90:33 [ 9] 0x00000001181e0418 LLDB`lldb_private::CPlusPlusLanguage::SymbolNameFitsToLanguage(lldb_private::Mangled) const [inlined] llvm::StringRef::StringRef(char const*) at StringRef.h:92:38 [ 9] 0x00000001181e0418 LLDB`lldb_private::CPlusPlusLanguage::SymbolNameFitsToLanguage(lldb_private::Mangled) const + 20 at CPlusPlusLanguage.cpp:68:62 ``` Looks like we're calling `strlen` on a nullptr. I stared at this codepath for a while but am still not sure how that could happen unless the underlying `ConstString` somehow pointed to corrupted data. But `SymbolNameFitsToLanguage` does some roundtripping through a `const char*` before calling `GetManglingScheme`. No other callsite does this and it just seems redundant. This patch cleans this up. rdar://161128180 --- .../Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 4e8a430af8c6c..a2199cb65cd35 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -104,10 +104,10 @@ CPlusPlusLanguage::GetFunctionNameInfo(ConstString name) const { } bool CPlusPlusLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { - const char *mangled_name = mangled.GetMangledName().GetCString(); - auto mangling_scheme = Mangled::GetManglingScheme(mangled_name); - return mangled_name && (mangling_scheme == Mangled::eManglingSchemeItanium || - mangling_scheme == Mangled::eManglingSchemeMSVC); + auto mangling_scheme = + Mangled::GetManglingScheme(mangled.GetMangledName().GetStringRef()); + return mangling_scheme == Mangled::eManglingSchemeItanium || + mangling_scheme == Mangled::eManglingSchemeMSVC; } ConstString CPlusPlusLanguage::GetDemangledFunctionNameWithoutArguments( From da160574e0b28e279de4e06edfc66ff3c0a06c9a Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 1 Oct 2025 05:09:18 -0700 Subject: [PATCH 366/878] [MLIR] Remove unused debug macros (NFC) --- mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 094ef0a45b8d2..e51cac4286f0c 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -173,8 +173,6 @@ struct TestXeGPUUnrollingPatterns #undef DEBUG_TYPE #define DEBUG_TYPE "test-xegpu-layout-interface" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") // Test pattern for distributing vector::StepOp from workgroup to subgroup. 
// Validates DistributeLayoutAttr interfaces for offset computation From 3a34710157a14d2ea3375322f259ff4f5cf4dac8 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 1 Oct 2025 05:19:42 -0700 Subject: [PATCH 367/878] [NFC][LLVM][AsmWriter] Move type printing to `WriteAsOperandInternal` (#161456) Add option to `WriteAsOperandInternal` to print the type and use that to eliminate explicit type printing code in several places. --- llvm/lib/IR/AsmWriter.cpp | 88 +++++++++++++-------------------------- 1 file changed, 30 insertions(+), 58 deletions(-) diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 54b92c9d35915..e29179b8f9955 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1465,7 +1465,8 @@ struct AsmWriterContext { //===----------------------------------------------------------------------===// static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, - AsmWriterContext &WriterCtx); + AsmWriterContext &WriterCtx, + bool PrintType = false); static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, AsmWriterContext &WriterCtx, @@ -1685,23 +1686,19 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (unsigned i = 0, e = NumOpsToWrite; i != e; ++i) { Out << LS; - WriterCtx.TypePrinter->print(CPA->getOperand(i)->getType(), Out); - Out << ' '; - WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx); + WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx, + /*PrintType=*/true); } Out << ')'; return; } if (const ConstantArray *CA = dyn_cast(CV)) { - Type *ETy = CA->getType()->getElementType(); Out << '['; ListSeparator LS; for (const Value *Op : CA->operands()) { Out << LS; - WriterCtx.TypePrinter->print(ETy, Out); - Out << ' '; - WriteAsOperandInternal(Out, Op, WriterCtx); + WriteAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); } Out << ']'; return; @@ -1717,14 +1714,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } - Type *ETy = CA->getType()->getElementType(); Out << '['; ListSeparator LS; for (uint64_t i = 0, e = CA->getNumElements(); i != e; ++i) { Out << LS; - WriterCtx.TypePrinter->print(ETy, Out); - Out << ' '; - WriteAsOperandInternal(Out, CA->getElementAsConstant(i), WriterCtx); + WriteAsOperandInternal(Out, CA->getElementAsConstant(i), WriterCtx, + /*PrintType=*/true); } Out << ']'; return; @@ -1739,9 +1734,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (const Value *Op : CS->operands()) { Out << LS; - WriterCtx.TypePrinter->print(Op->getType(), Out); - Out << ' '; - WriteAsOperandInternal(Out, Op, WriterCtx); + WriteAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); } Out << ' '; } @@ -1753,7 +1746,6 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, if (isa(CV) || isa(CV)) { auto *CVVTy = cast(CV->getType()); - Type *ETy = CVVTy->getElementType(); // Use the same shorthand for splat vector (i.e. 
"splat(Ty val)") as is // permitted on IR input to reduce the output changes when enabling @@ -1763,9 +1755,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, if (auto *SplatVal = CV->getSplatValue()) { if (isa(SplatVal) || isa(SplatVal)) { Out << "splat ("; - WriterCtx.TypePrinter->print(ETy, Out); - Out << ' '; - WriteAsOperandInternal(Out, SplatVal, WriterCtx); + WriteAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true); Out << ')'; return; } @@ -1775,9 +1765,8 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (unsigned i = 0, e = CVVTy->getNumElements(); i != e; ++i) { Out << LS; - WriterCtx.TypePrinter->print(ETy, Out); - Out << ' '; - WriteAsOperandInternal(Out, CV->getAggregateElement(i), WriterCtx); + WriteAsOperandInternal(Out, CV->getAggregateElement(i), WriterCtx, + /*PrintType=*/true); } Out << '>'; return; @@ -1813,9 +1802,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, if (auto *SplatVal = CE->getSplatValue()) { if (isa(SplatVal) || isa(SplatVal)) { Out << "splat ("; - WriterCtx.TypePrinter->print(SplatVal->getType(), Out); - Out << ' '; - WriteAsOperandInternal(Out, SplatVal, WriterCtx); + WriteAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true); Out << ')'; return; } @@ -1834,9 +1821,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (const Value *Op : CE->operands()) { Out << LS; - WriterCtx.TypePrinter->print(Op->getType(), Out); - Out << ' '; - WriteAsOperandInternal(Out, Op, WriterCtx); + WriteAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); } if (CE->isCast()) { @@ -1864,9 +1849,7 @@ static void writeMDTuple(raw_ostream &Out, const MDTuple *Node, Out << "null"; } else if (auto *MDV = dyn_cast(MD)) { Value *V = MDV->getValue(); - WriterCtx.TypePrinter->print(V->getType(), Out); - Out << ' '; - WriteAsOperandInternal(Out, V, WriterCtx); + WriteAsOperandInternal(Out, V, WriterCtx, /*PrintType=*/true); } else { WriteAsOperandInternal(Out, MD, WriterCtx); WriterCtx.onWriteMetadataAsOperand(MD); @@ -2634,7 +2617,7 @@ static void writeDIArgList(raw_ostream &Out, const DIArgList *N, Out << "!DIArgList("; ListSeparator FS; MDFieldPrinter Printer(Out, WriterCtx); - for (Metadata *Arg : N->getArgs()) { + for (const Metadata *Arg : N->getArgs()) { Out << FS; WriteAsOperandInternal(Out, Arg, WriterCtx, true); } @@ -2700,7 +2683,13 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node, // Full implementation of printing a Value as an operand with support for // TypePrinting, etc. 
static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, - AsmWriterContext &WriterCtx) { + AsmWriterContext &WriterCtx, + bool PrintType) { + if (PrintType) { + WriterCtx.TypePrinter->print(V->getType(), Out); + Out << ' '; + } + if (V->hasName()) { PrintLLVMName(Out, V); return; @@ -2825,9 +2814,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, assert((FromValue || !isa(V)) && "Unexpected function-local metadata outside of value argument"); - WriterCtx.TypePrinter->print(V->getValue()->getType(), Out); - Out << ' '; - WriteAsOperandInternal(Out, V->getValue(), WriterCtx); + WriteAsOperandInternal(Out, V->getValue(), WriterCtx, /*PrintType=*/true); } namespace { @@ -2965,12 +2952,8 @@ void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) { Out << ""; return; } - if (PrintType) { - TypePrinter.print(Operand->getType(), Out); - Out << ' '; - } - auto WriterCtx = getContext(); - WriteAsOperandInternal(Out, Operand, WriterCtx); + auto WriteCtx = getContext(); + WriteAsOperandInternal(Out, Operand, WriteCtx, PrintType); } void AssemblyWriter::writeSyncScope(const LLVMContext &Context, @@ -3049,20 +3032,14 @@ void AssemblyWriter::writeOperandBundles(const CallBase *Call) { Out << '('; - bool FirstInput = true; + ListSeparator InnerLS; auto WriterCtx = getContext(); for (const auto &Input : BU.Inputs) { - if (!FirstInput) - Out << ", "; - FirstInput = false; - + Out << InnerLS; if (Input == nullptr) Out << ""; - else { - TypePrinter.print(Input->getType(), Out); - Out << " "; - WriteAsOperandInternal(Out, Input, WriterCtx); - } + else + WriteAsOperandInternal(Out, Input, WriterCtx, /*PrintType=*/true); } Out << ')'; @@ -5265,13 +5242,8 @@ static bool printWithoutType(const Value &V, raw_ostream &O, static void printAsOperandImpl(const Value &V, raw_ostream &O, bool PrintType, ModuleSlotTracker &MST) { TypePrinting TypePrinter(MST.getModule()); - if (PrintType) { - TypePrinter.print(V.getType(), O); - O << ' '; - } - AsmWriterContext WriterCtx(&TypePrinter, MST.getMachine(), MST.getModule()); - WriteAsOperandInternal(O, &V, WriterCtx); + WriteAsOperandInternal(O, &V, WriterCtx, PrintType); } void Value::printAsOperand(raw_ostream &O, bool PrintType, From 372d3fb10c9dd0ff4e780d68b86220e30fd00b27 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 1 Oct 2025 13:28:37 +0100 Subject: [PATCH 368/878] [CodeGen] Remove `shouldExpandPartialReductionIntrinsic()` hook (NFC) (#161498) This is unused. Targets can lower/expand the `PARTIAL_REDUCE_*` ISD nodes. --- llvm/include/llvm/CodeGen/TargetLowering.h | 7 ------- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 4 ---- 2 files changed, 11 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index c45e03a7bdad8..7bbad172b2d42 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -480,13 +480,6 @@ class LLVM_ABI TargetLoweringBase { return true; } - /// Return true if the @llvm.vector.partial.reduce.* intrinsic - /// should be expanded using generic code in SelectionDAGBuilder. - virtual bool - shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const { - return true; - } - /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded /// using generic code in SelectionDAGBuilder. 
virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index b5201a311c591..c21890a0d856f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8103,10 +8103,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::vector_partial_reduce_add: { - if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { - visitTargetIntrinsic(I, Intrinsic); - return; - } SDValue Acc = getValue(I.getOperand(0)); SDValue Input = getValue(I.getOperand(1)); setValue(&I, From b413ac1af3b884af98925a9b422582ca7ebcbecd Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 30 Sep 2025 16:57:08 +0000 Subject: [PATCH 369/878] [NFC][AArch64][ISEL] Remove unnecessary predicates from partial_reduce_*mla patterns. --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 62 +++++++++---------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 1e30735b7a56a..36c9cb6c1d94f 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -707,16 +707,14 @@ let Predicates = [HasSVE_or_SME] in { defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", AArch64sdot>; defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", AArch64udot>; - let Predicates = [HasSVE_or_SME] in { - def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)), - (UDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>; - def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)), - (SDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>; - def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), - (UDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>; - def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), - (SDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>; - } // End HasSVE_or_SME + def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)), + (UDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>; + def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)), + (SDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>; + def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), + (UDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>; + def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), + (SDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>; defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>; @@ -3646,6 +3644,9 @@ let Predicates = [HasSVE_or_SME, HasMatMulInt8] in { defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", AArch64usdot>; defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>; defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>; + + def : Pat<(nxv4i32 (partial_reduce_sumla nxv4i32:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)), + (USDOT_ZZZ $Acc, $RHS, $LHS)>; } // End HasSVE_or_SME, HasMatMulInt8 let Predicates = [HasSVE, HasMatMulFP32] in { @@ -3752,6 +3753,19 @@ let Predicates = [HasSVE2_or_SME] in { defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>; defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>; + 
def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)), + (UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>; + def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)), + (SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>; + def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)), + (UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>; + def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)), + (SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>; + def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)), + (UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>; + def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)), + (SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>; + // SVE2 saturating multiply-add long (indexed) defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>; defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>; @@ -3880,19 +3894,6 @@ let Predicates = [HasSVE2_or_SME] in { def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$Input, (nxv16i8 (splat_vector (i32 1))))), (SADDWT_ZZZ_H (SADDWB_ZZZ_H $Acc, $Input), $Input)>; - def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)), - (UMLALT_ZZZ_D (UMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>; - def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv4i32:$LHS, nxv4i32:$RHS)), - (SMLALT_ZZZ_D (SMLALB_ZZZ_D $Acc, $LHS, $RHS), $LHS, $RHS)>; - def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)), - (UMLALT_ZZZ_S (UMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>; - def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$LHS, nxv8i16:$RHS)), - (SMLALT_ZZZ_S (SMLALB_ZZZ_S $Acc, $LHS, $RHS), $LHS, $RHS)>; - def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)), - (UMLALT_ZZZ_H (UMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>; - def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)), - (SMLALT_ZZZ_H (SMLALB_ZZZ_H $Acc, $LHS, $RHS), $LHS, $RHS)>; - // SVE2 integer multiply long defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>; defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>; @@ -4200,11 +4201,6 @@ let Predicates = [HasSVEAES2, HasNonStreamingSVE_or_SSVE_AES] in { def PMULL_2ZZZ_Q : sve_crypto_pmull_multi<"pmull">; } -let Predicates = [HasSVE_or_SME, HasMatMulInt8] in { - def : Pat<(nxv4i32 (partial_reduce_sumla nxv4i32:$Acc, nxv16i8:$LHS, nxv16i8:$RHS)), - (USDOT_ZZZ $Acc, $RHS, $LHS)>; - } // End HasSVE_or_SME, HasMatMulInt8 - //===----------------------------------------------------------------------===// // SME or SVE2.1 instructions //===----------------------------------------------------------------------===// @@ -4238,12 +4234,10 @@ defm UDOT_ZZZ_HtoS : sve2p1_two_way_dot_vv<"udot", 0b1, int_aarch64_sve_udot_x2 defm SDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"sdot", 0b0, int_aarch64_sve_sdot_lane_x2>; defm UDOT_ZZZI_HtoS : sve2p1_two_way_dot_vvi<"udot", 0b1, int_aarch64_sve_udot_lane_x2>; -let Predicates = [HasSVE2p1_or_SME2] in { - def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), - (UDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>; - def : Pat<(nxv4i32 (partial_reduce_smla 
nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), - (SDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>; -} // End HasSVE2p1_or_SME2 +def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), + (UDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>; +def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), + (SDOT_ZZZ_HtoS $Acc, $MulLHS, $MulRHS)>; defm SQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"sqcvtn", 0b00, int_aarch64_sve_sqcvtn_x2>; defm UQCVTN_Z2Z_StoH : sve2p1_multi_vec_extract_narrow<"uqcvtn", 0b01, int_aarch64_sve_uqcvtn_x2>; From a33544b83c80dcaa851fabd2979def6f68dd6e7a Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 1 Oct 2025 06:16:39 -0700 Subject: [PATCH 370/878] [OpenACC][CIR] Implement 'alloca copying' for private lowering (#161382) The previous patch ensured that we correctly got the allocas put in place. This patch takes the address of each element of each alloca, and copies it to the previous one. This allows us to re-form the pointer-structure for a recipe. --- clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp | 96 +++- clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h | 4 + ...-clause-pointer-array-recipes-CtorDtor.cpp | 489 ++++++++++++++++++ ...ate-clause-pointer-array-recipes-NoOps.cpp | 485 +++++++++++++++++ ...ivate-clause-pointer-array-recipes-int.cpp | 488 ++++++++++++++++- ...rivate-clause-pointer-recipes-CtorDtor.cpp | 266 ++++++++++ .../private-clause-pointer-recipes-NoOps.cpp | 268 ++++++++++ .../private-clause-pointer-recipes-int.cpp | 268 +++++++++- 8 files changed, 2354 insertions(+), 10 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index fc28ac552224c..25cacbb73eb03 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -36,6 +36,75 @@ mlir::Block *OpenACCRecipeBuilderBase::createRecipeBlock(mlir::Region ®ion, llvm::SmallVector locs{types.size(), loc}; return builder.createBlock(®ion, region.end(), types, locs); } +void OpenACCRecipeBuilderBase::makeAllocaCopy(mlir::Location loc, + mlir::Type copyType, + mlir::Value numEltsToCopy, + mlir::Value offsetPerSubarray, + mlir::Value destAlloca, + mlir::Value srcAlloca) { + mlir::OpBuilder::InsertionGuard guardCase(builder); + + mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy); + auto itrPtrTy = cir::PointerType::get(itrTy); + mlir::IntegerAttr itrAlign = + cgf.cgm.getSize(cgf.getContext().getTypeAlignInChars( + cgf.getContext().UnsignedLongLongTy)); + + auto loopBuilder = [&]() { + auto itr = + cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "itr", itrAlign); + cir::ConstantOp constZero = builder.getConstInt(loc, itrTy, 0); + builder.CIRBaseBuilderTy::createStore(loc, constZero, itr); + builder.createFor( + loc, + /*condBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // itr < numEltsToCopy + // Enforce a trip count of 1 if there wasn't any element count, this + // way we can just use this loop with a constant bounds instead of a + // separate code path. 
+ if (!numEltsToCopy) + numEltsToCopy = builder.getConstInt(loc, itrTy, 1); + + auto loadCur = cir::LoadOp::create(builder, loc, {itr}); + auto cmp = builder.createCompare(loc, cir::CmpOpKind::lt, loadCur, + numEltsToCopy); + builder.createCondition(cmp); + }, + /*bodyBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // destAlloca[itr] = srcAlloca[offsetPerSubArray * itr]; + auto loadCur = cir::LoadOp::create(builder, loc, {itr}); + auto srcOffset = builder.createMul(loc, offsetPerSubarray, loadCur); + + auto ptrToOffsetIntoSrc = cir::PtrStrideOp::create( + builder, loc, copyType, srcAlloca, srcOffset); + + auto offsetIntoDecayDest = cir::PtrStrideOp::create( + builder, loc, builder.getPointerTo(copyType), destAlloca, + loadCur); + + builder.CIRBaseBuilderTy::createStore(loc, ptrToOffsetIntoSrc, + offsetIntoDecayDest); + builder.createYield(loc); + }, + /*stepBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // Simple increment of the iterator. + auto load = cir::LoadOp::create(builder, loc, {itr}); + auto inc = cir::UnaryOp::create(builder, loc, load.getType(), + cir::UnaryOpKind::Inc, load); + builder.CIRBaseBuilderTy::createStore(loc, inc, itr); + builder.createYield(loc); + }); + }; + + cir::ScopeOp::create(builder, loc, + [&](mlir::OpBuilder &b, mlir::Location loc) { + loopBuilder(); + builder.createYield(loc); + }); +} mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca( mlir::Block *block, SourceRange exprRange, mlir::Location loc, @@ -78,6 +147,10 @@ mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca( bool lastBoundWasArray = isArrayTy(boundTypes.back()); + // Make sure we track a moving version of this so we can get our + // 'copying' back to correct. + mlir::Value lastAlloca = initialAlloca; + // Since we're iterating the types in reverse, this sets up for each index // corresponding to the boundsRange to be the 'after application of the // bounds. @@ -125,14 +198,21 @@ mlir::Value OpenACCRecipeBuilderBase::makeBoundsAlloca( mlir::Type eltTy = cgf.convertType(resultType); cir::PointerType ptrTy = builder.getPointerTo(eltTy); - builder.createAlloca(loc, ptrTy, eltTy, "openacc.init.bounds", - cgf.getContext().getTypeAlignInChars(resultType), - curSize); - - // TODO: OpenACC : At this point we should be copying the addresses of - // each element of this to the last allocation. At the moment, that is - // not yet implemented. - cgf.cgm.errorNYI(exprRange, "OpenACC recipe alloca copying"); + mlir::Value curAlloca = builder.createAlloca( + loc, ptrTy, eltTy, "openacc.init.bounds", + cgf.getContext().getTypeAlignInChars(resultType), curSize); + + makeAllocaCopy(loc, ptrTy, cumulativeElts, eltsPerSubArray, lastAlloca, + curAlloca); + lastAlloca = curAlloca; + } else { + // In the case of an array, we just need to decay the pointer, so just do + // a zero-offset stride on the last alloca to decay it down an array + // level. + cir::ConstantOp constZero = builder.getConstInt(loc, itrTy, 0); + lastAlloca = builder.getArrayElement(loc, loc, lastAlloca, + cgf.convertType(resultType), + constZero, /*shouldDecay=*/true); } cumulativeElts = eltsToAlloca; diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h index acd187bbaee0d..d802ccb151b64 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h @@ -24,6 +24,10 @@ namespace clang::CIRGen { class OpenACCRecipeBuilderBase { + // makes the copy of the addresses of an alloca to the previous allocation. 
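+  // That is, for each index i it stores &srcAlloca[i * offsetPerSubarray]
+  // into destAlloca[i], re-forming the pointer structure one level at a
+  // time.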
+ void makeAllocaCopy(mlir::Location loc, mlir::Type copyType, + mlir::Value numEltsToCopy, mlir::Value offsetPerSubarray, + mlir::Value destAlloca, mlir::Value srcAlloca); // This function generates the required alloca, similar to // 'emitAutoVarAlloca', except for the OpenACC array/pointer types. mlir::Value makeBoundsAlloca(mlir::Block *block, SourceRange exprRange, diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp index e17ef90d01212..da56de005d5f2 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp @@ -20,6 +20,30 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
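// (The copy loops checked above store the address of each sub-allocation's
// elements into the enclosing allocation, re-forming the pointer structure
// for the privatized copy.)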
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -77,12 +101,59 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -165,6 +236,30 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i @@ -172,12 +267,60 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: 
%[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA3]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -279,12 +422,59 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add 
Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -311,12 +501,40 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. 
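Reviewer note: the array case checked just above takes a different path. When a level is a constant-size array rather than a pointer, no new alloca or copy loop is emitted; instead the previous alloca is decayed via an array_to_ptrdecay cast plus a zero-offset cir.ptr_stride, so the next level's copy loop writes straight into the array's first element. A rough C++ analogue (illustrative, using the 5-element array level from this test):

// Illustrative only: decaying an array level instead of building a new
// pointer table. `tl` stands in for the top-level alloca of type T *(*)[5].
template <typename T>
T **decaySketch(T *(*tl)[5]) {
  // The zero-offset stride: &(*tl)[0] decays the array one level, yielding
  // a T** that the subsequent copy loop can index directly.
  return &(*tl)[0];
}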
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -399,6 +617,30 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -478,6 +720,10 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i // +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> +// // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i @@ -485,12 +731,58 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, 
%[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i // CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
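Reviewer note: to make the chained copy loops concrete, here is the layout they produce for a three-level section, using hypothetical bounds of 2 x 3 x 4 (these numbers are illustrative; the test itself uses symbolic upper bounds):

#include <cstddef>
#include <vector>

// Illustrative only: the pointer tables the two copy loops above build.
int main() {
  const std::size_t ub3 = 2, ub2 = 3, ub1 = 4;
  std::vector<int> inner(ub3 * ub2 * ub1);    // 24 innermost elements
  std::vector<int *> middle(ub3 * ub2);       // 6 middle pointer slots
  std::vector<int **> top(ub3);               // 2 top pointer slots
  for (std::size_t i = 0; i < ub3; ++i)       // first cir.scope above
    top[i] = &middle[ub2 * i];
  for (std::size_t j = 0; j < ub3 * ub2; ++j) // second cir.scope above
    middle[j] = &inner[ub1 * j];
  // After both loops, top[i][j][k] addresses inner[(i * ub2 + j) * ub1 + k].
  return 0;
}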
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -589,12 +881,39 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -622,6 +941,31 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i @@ -629,6 +973,29 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: 
cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -729,12 +1096,59 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i 
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -838,9 +1252,37 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr x 5>>, %[[SRC_IDX]] : !u64i), !cir.ptr x 5>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr x 5>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr x 5>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr x 5>>, !cir.ptr x 5>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i +// +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i @@ -848,6 +1290,29 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i 
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[STRIDE]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -948,6 +1413,30 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr x 5>>, %[[SRC_IDX]] : !u64i), !cir.ptr x 5>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr x 5>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr x 5>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr x 5>>, !cir.ptr x 5>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp index 4398216132616..f9230dcaf1691 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp @@ -15,6 +15,30 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -41,12 +65,59 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -75,6 +146,30 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i @@ -82,12 +177,58 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB1:.*]] 
= acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA3]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -109,12 +250,59 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -141,12 +329,39 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> +// // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i // CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i // CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -176,6 +391,30 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -200,6 +439,10 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i
//
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>>
+// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>>
+//
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
@@ -207,12 +450,58 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i
// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -231,12 +520,39 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
//
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>>
+// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>>
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -265,12 +581,59 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -292,12 +655,59 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -328,16 +738,67 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr x 5>>, %[[SRC_IDX]] : !u64i), !cir.ptr x 5>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr x 5>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr x 5>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr x 5>>, !cir.ptr x 5>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
//
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr>
+// CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr>
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i
// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[STRIDE]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -358,6 +819,30 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr x 5>>, %[[SRC_IDX]] : !u64i), !cir.ptr x 5>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr x 5>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr x 5>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr x 5>>, !cir.ptr x 5>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp
index 79692d3468295..45d8b78a4a128 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s

template
void do_things(unsigned A, unsigned B) {
@@ -13,6 +13,31 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -39,12 +64,59 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -73,6 +145,30 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
@@ -80,12 +176,57 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA3:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA3]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -107,12 +248,59 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -138,12 +326,39 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
//
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr>
+// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr>
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -172,6 +387,30 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -197,6 +436,10 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i
//
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>>
+// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>>
+//
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
@@ -204,12 +447,58 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i
// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -228,12 +517,39 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
//
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>>
+// CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>>
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<8> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_DEREF]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -262,12 +578,60 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB3_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
+//
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<20> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ELT_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -289,12 +653,59 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>>, !cir.ptr>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[UB2_CAST]]) : !u64i
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<20> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !cir.array, !cir.ptr>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr>, !cir.ptr>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -325,16 +736,67 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr x 5>>, %[[SRC_IDX]] : !u64i), !cir.ptr x 5>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr x 5>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr x 5>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr x 5>>, !cir.ptr x 5>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+//
// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i
//
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr>
+// CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr>
+//
// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index
// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i
// CHECK-NEXT: %[[NUM_ELTS2:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[NUM_ELTS]]) : !u64i
// CHECK-NEXT: %[[ELT_SIZE:.*]] = cir.const #cir.int<4> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS2]], %[[ELT_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA2:.*]] = cir.alloca !s32i, !cir.ptr, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB1_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA2]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[STRIDE]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr, !cir.ptr>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
@@ -356,6 +818,30 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[ARR_SIZE:.*]] = cir.const #cir.int<40> : !u64i
// CHECK-NEXT: %[[ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ARR_SIZE]]) : !u64i
// CHECK-NEXT: %[[ARR_ALLOCA:.*]] = cir.alloca !cir.array x 5>, !cir.ptr x 5>>, %[[ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// CHECK-NEXT: cir.scope {
+// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64}
+// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i
+// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.for : cond {
+// CHECK-NEXT: %[[UPP_BOUND:.*]] = cir.const #cir.int<1> : !u64i
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPP_BOUND]]) : !u64i, !cir.bool
+// CHECK-NEXT: cir.condition(%[[CMP]])
+// CHECK-NEXT: } body {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[ITR_LOAD]]) : !u64i
+// CHECK-NEXT: %[[SRC:.*]] = cir.ptr_stride(%[[ARR_ALLOCA]] : !cir.ptr x 5>>, %[[SRC_IDX]] : !u64i), !cir.ptr x 5>>
+// CHECK-NEXT: %[[DEST:.*]] = cir.ptr_stride(%[[TL_ALLOCA]] : !cir.ptr x 5>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr x 5>>>
+// CHECK-NEXT: cir.store %[[SRC]], %[[DEST]] : !cir.ptr x 5>>, !cir.ptr x 5>>>
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: } step {
+// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr
+// CHECK-NEXT: cir.yield
+// CHECK-NEXT: }
+// CHECK-NEXT: }
// TODO: Add Init here.
// CHECK-NEXT: acc.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp
index 77ff3571a98b4..65b0365521bf4 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp
@@ -29,6 +29,34 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i
// CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i
// CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64}
+//
+// Copy array pointer to the original alloca.
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -44,12 +72,64 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i // 
CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -71,6 +151,32 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = 
acc.get_upperbound %[[BOUNDS2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i @@ -78,12 +184,65 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i // CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i // CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// Copy array pointer to the original alloca. 
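+//
+// A rough C-level sketch of the innermost stitching loop below (names
+// mirror the FileCheck captures; illustrative only):
+//
+//   for (unsigned long itr = 0; itr < num_elts; ++itr)
+//     int_ptr_ptr_alloca[itr] = int_ptr_alloca + upper_bound_3 * itr;
+//
+// Each second-level pointer ends up addressing its own chunk of the flat
+// element allocation, so the multi-level VLA behaves like a contiguous
+// array of arrays.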
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -193,6 +352,33 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -208,12 +394,64 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUNDS1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i // CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// Copy array pointer to the original alloca. 
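+//
+// Note: this loop runs once per outer pointer (upper_bound iterations),
+// and the source index advances by the inner bound on every iteration, so
+// consecutive pointers receive disjoint chunks of the flat allocation.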
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { @@ -299,6 +537,34 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_CTORDTOR:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_CTORDTOR]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_CtorDtor, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } destroy { diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp index 4822dd70227f8..07e06f86924fb 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp @@ -23,6 +23,33 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -38,12 +65,65 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i // CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// Copy array pointer to the original alloca. 
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -63,6 +143,33 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// +// // CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i @@ -70,12 +177,66 @@ void do_things(unsigned A, unsigned B) { // 
CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i // CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i // CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// Copy array pointer to the original alloca. 
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -106,6 +267,33 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -121,12 +309,64 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i // CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// Copy array pointer to the original alloca. 
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -155,6 +395,34 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_NOOPS:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_NOOPS]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !rec_NoOps, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp index ddf25de34f74e..a3b7dcac7689d 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void do_things(unsigned A, unsigned B) { @@ -21,6 +21,33 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -36,12 +63,65 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// Copy array pointer to the original alloca. 
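+//
+// The outermost copy loop below always runs exactly once (note the
+// constant 1 upper limit): there is only a single top-level pointer to
+// patch up.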
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } + // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i // CHECK-NEXT: %[[SIZEOF_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -61,6 +141,33 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr>, !cir.ptr>>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[SRC_IDX]] : !u64i), !cir.ptr>> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>>, !cir.ptr>>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// +// // CHECK-NEXT: %[[INT_PTR_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i @@ -68,12 +175,64 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_PTR_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_PTR_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// Copy array pointer to the original alloca. 
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_VLA_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_3:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS_2:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[NUM_ELTS]]) : !u64i // CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS_2]], %[[SIZEOF_INT]]) : !u64i // CHECK-NEXT: %[[INT_PTR_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[NUM_ELTS]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_3]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_PTR_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -104,6 +263,34 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_INT_PTR:.*]] = cir.const #cir.int<8> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -119,6 +306,32 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_PTR]]) : !u64i // CHECK-NEXT: %[[INT_PTR_VLA_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 8 : i64} // +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[SRC_IDX]] : !u64i), !cir.ptr> +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>>, %[[ITR_LOAD]] : !u64i), !cir.ptr>> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr>, !cir.ptr>> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // CHECK-NEXT: %[[INT_PTR_UPPER_BOUND:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index // CHECK-NEXT: %[[UPPER_BOUND_CAST_2:.*]] = builtin.unrealized_conversion_cast %[[INT_PTR_UPPER_BOUND]] : index to !u64i // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[UPPER_BOUND_CAST]]) : !u64i @@ -126,6 +339,32 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[NUM_ELTS]], %[[SIZEOF_INT]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} // +// Copy array pointer to the original alloca. 
+// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_BOUND_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST_2]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[INT_PTR_VLA_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// // TODO: Add Init here. // CHECK-NEXT: acc.yield // CHECK-NEXT: } @@ -154,6 +393,33 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[SIZEOF_INT:.*]] = cir.const #cir.int<4> : !u64i // CHECK-NEXT: %[[CALC_ALLOCA_SIZE:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[SIZEOF_INT]]) : !u64i // CHECK-NEXT: %[[INT_VLA_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, %[[CALC_ALLOCA_SIZE]] : !u64i, ["openacc.init.bounds"] {alignment = 4 : i64} +// +// Copy array pointer to the original alloca. +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[UPPER_LIMIT:.*]] = cir.const #cir.int<1> : !u64i +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UPPER_LIMIT]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[SRC_IDX:.*]] = cir.binop(mul, %[[UPPER_BOUND_CAST]], %[[ITR_LOAD]]) : !u64i +// CHECK-NEXT: %[[SRC_STRIDE:.*]] = cir.ptr_stride(%[[INT_VLA_ALLOCA]] : !cir.ptr, %[[SRC_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[DEST_STRIDE:.*]] = cir.ptr_stride(%[[TOP_LEVEL_ALLOCA]] : !cir.ptr>, %[[ITR_LOAD]] : !u64i), !cir.ptr> +// CHECK-NEXT: cir.store %[[SRC_STRIDE]], %[[DEST_STRIDE]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // TODO: Add Init here. 
// CHECK-NEXT: acc.yield // CHECK-NEXT: } From f4d18c0ef8e3207b8ee2363fea60f21d4fa325bc Mon Sep 17 00:00:00 2001 From: Rolf Morel Date: Wed, 1 Oct 2025 14:47:35 +0100 Subject: [PATCH 371/878] [MLIR][Transform][Tune] Introduce `transform.tune.alternatives` op (#160724) This op enables expressing uncertainty regarding what should be happening at particular places in transform-dialect schedules. In particular, it enables representing a choice among alternative regions. This choice is resolved through providing a `selected_region` argument. When this argument is provided, the semantics are such that it is valid to rewrite the op through substituting in the selected region -- with the op's interpreted semantics corresponding to exactly this. This op represents another piece of the puzzle w.r.t. a toolkit for expressing autotuning problems with the transform dialect. Note that this goes beyond tuning knobs _on_ transforms, going further by making it tunable which (sequences of) transforms are to be applied. --- .../TuneExtension/TuneExtensionOps.h | 1 + .../TuneExtension/TuneExtensionOps.td | 54 +++++ .../TuneExtension/TuneExtensionOps.cpp | 184 ++++++++++++++++++ mlir/python/mlir/dialects/transform/tune.py | 66 ++++++- .../test-tune-extension-invalid.mlir | 85 ++++++++ .../Transform/test-tune-extension.mlir | 126 ++++++++++++ .../python/dialects/transform_tune_ext.py | 105 ++++++++-- 7 files changed, 604 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h index 74e1d28ffac82..ba11259790676 100644 --- a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h +++ b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h @@ -9,6 +9,7 @@ #ifndef MLIR_DIALECT_TRANSFORM_TUNEEXTENSION_TUNEEXTENSIONOPS_H #define MLIR_DIALECT_TRANSFORM_TUNEEXTENSION_TUNEEXTENSIONOPS_H +#include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/OpDefinition.h" diff --git a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td index d68d451afac40..d095659fc4838 100644 --- a/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td +++ b/mlir/include/mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.td @@ -11,10 +11,15 @@ include "mlir/Dialect/Transform/IR/TransformDialect.td" include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td" +include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/BuiltinAttributes.td" include "mlir/IR/CommonAttrConstraints.td" +//===----------------------------------------------------------------------===// +// KnobOp +//===----------------------------------------------------------------------===// + def KnobOp : Op, DeclareOpInterfaceMethods, @@ -52,4 +57,53 @@ def KnobOp : Op` (`=` $selected^ `from`)? 
`options` `=` $options attr-dict `->` type(results)";
 }
 
+//===----------------------------------------------------------------------===//
+// AlternativesOp
+//===----------------------------------------------------------------------===//
+
+def AlternativesOp : Op<Transform_Dialect, "tune.alternatives", [
+    DeclareOpInterfaceMethods<RegionBranchOpInterface,
+        ["getEntrySuccessorOperands", "getRegionInvocationBounds"]>,
+    DeclareOpInterfaceMethods<TransformOpInterface>,
+    DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+    SingleBlockImplicitTerminator<"::mlir::transform::YieldOp">,
+    NoRegionArguments
+]> {
+  let summary = "Represents a choice among its regions, i.e. sub-schedules";
+
+  let description = [{
+    This op represents a choice over which of its regions is to be used.
+
+    When `selected_region` is provided, the semantics are that this op is to be
+    substituted for by the selected region, meaning the region's results become
+    the results of this op. Without a provided `selected_region`, the semantics
+    are that this non-deterministic choice is yet to be resolved -- which in
+    terms of the op's interpreted semantics is a failure.
+
+    The `selected_region` argument is either an `IntegerAttr` or a param holding
+    an `IntegerAttr`, which should provide a valid zero-based index with respect
+    to the number of alternatives, i.e. regions.
+  }];
+  let cppNamespace = [{ mlir::transform::tune }];
+
+  let arguments = (ins Builtin_StringAttr:$name,
+                       OptionalAttr<APIntAttr>:$selected_region_attr,
+                       Optional<TransformParamTypeInterface>:$selected_region_param);
+  let results = (outs Variadic<Transform_AnyHandleOrParamType>:$results);
+  let regions = (region VariadicRegion<SizedRegion<1>>:$alternatives);
+
+  let assemblyFormat = [{
+    `<` $name `>`
+    (`selected_region` `=` custom<AlternativesOpSelectedRegion>(
+        $selected_region_attr, $selected_region_param)^)?
+    attr-dict-with-keyword
+    (`:` type($selected_region_param)^)?
+    (`->` type($results)^)?
+    regions
+  }];
+
+  let hasVerifier = 1;
+}
+
 #endif // MLIR_DIALECT_TRANSFORM_TUNEEXTENSION_TUNEEXTENSIONOPS
diff --git a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp
index 842e880ca9150..c627158e999ed 100644
--- a/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp
+++ b/mlir/lib/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp
@@ -6,13 +6,24 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/IR/OpImplementation.h"
 #include "llvm/Support/Debug.h"
 
 #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.h"
 
 using namespace mlir;
 
+static ParseResult parseAlternativesOpSelectedRegion(
+    OpAsmParser &parser, IntegerAttr &selectedRegionAttr,
+    std::optional<OpAsmParser::UnresolvedOperand> &selectedRegionParam);
+
+static void printAlternativesOpSelectedRegion(OpAsmPrinter &printer,
+                                              Operation *op,
+                                              IntegerAttr selectedRegionAttr,
+                                              Value selectedRegionParam);
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/Transform/TuneExtension/TuneExtensionOps.cpp.inc"
 
@@ -57,3 +68,176 @@ LogicalResult transform::tune::KnobOp::verify() {
 
   return success();
 }
+
+//===----------------------------------------------------------------------===//
+// AlternativesOp
+//===----------------------------------------------------------------------===//
+
+static ParseResult parseAlternativesOpSelectedRegion(
+    OpAsmParser &parser, IntegerAttr &selectedRegionAttr,
+    std::optional<OpAsmParser::UnresolvedOperand> &selectedRegionParam) {
+  size_t selectedRegionIdx;
+  OptionalParseResult attrParseRes =
+      parser.parseOptionalInteger(selectedRegionIdx);
+  if (attrParseRes.has_value()) {
+    if (failed(*attrParseRes))
+      return failure();
+
+    selectedRegionAttr = parser.getBuilder().getIndexAttr(selectedRegionIdx);
+    return success();
+  }
+
+  OpAsmParser::UnresolvedOperand param;
+  auto paramParseRes = parser.parseOptionalOperand(param);
+  if (paramParseRes.has_value()) {
+    if (failed(*paramParseRes))
+      return failure();
+
+    selectedRegionParam = param;
+    return success();
+  }
+
+  return parser.emitError(parser.getCurrentLocation())
+         << "expected either an integer attribute or a transform.param operand";
+}
+
+static void printAlternativesOpSelectedRegion(OpAsmPrinter &printer,
+                                              Operation *op,
+                                              IntegerAttr selectedRegionAttr,
+                                              Value selectedRegionParam) {
+  if (selectedRegionAttr)
+    printer << selectedRegionAttr.getValue();
+  if (selectedRegionParam)
+    printer << selectedRegionParam;
+}
+
+OperandRange transform::tune::AlternativesOp::getEntrySuccessorOperands(
+    RegionBranchPoint point) {
+  // No operands will be forwarded to the region(s).
+  return getOperands().slice(0, 0);
+}
+
+void transform::tune::AlternativesOp::getSuccessorRegions(
+    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
+  if (point.isParent())
+    if (auto selectedRegionIdx = getSelectedRegionAttr())
+      regions.emplace_back(
+          &getAlternatives()[selectedRegionIdx->getSExtValue()],
+          Block::BlockArgListType());
+    else
+      for (Region &alternative : getAlternatives())
+        regions.emplace_back(&alternative, Block::BlockArgListType());
+  else
+    regions.emplace_back(getOperation()->getResults());
+}
+
+void transform::tune::AlternativesOp::getRegionInvocationBounds(
+    ArrayRef<Attribute> operands, SmallVectorImpl<InvocationBounds> &bounds) {
+  (void)operands;
+  bounds.reserve(getNumRegions());
+
+  if (auto selectedRegionIdx = getSelectedRegionAttr()) {
+    bounds.resize(getNumRegions(), InvocationBounds(0, 0));
+    bounds[selectedRegionIdx->getSExtValue()] = InvocationBounds(1, 1);
+  } else {
+    bounds.resize(getNumRegions(), InvocationBounds(0, 1));
+  }
+}
+
+void transform::tune::AlternativesOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  onlyReadsHandle(getSelectedRegionParamMutable(), effects);
+  producesHandle(getOperation()->getOpResults(), effects);
+  // TODO: should effects from regions be forwarded?
+}
+
+DiagnosedSilenceableFailure
+transform::tune::AlternativesOp::apply(transform::TransformRewriter &rewriter,
+                                       transform::TransformResults &results,
+                                       transform::TransformState &state) {
+  std::optional<int64_t> selectedRegionIdx;
+
+  if (auto selectedRegionAttr = getSelectedRegionAttr())
+    selectedRegionIdx = selectedRegionAttr->getSExtValue();
+
+  if (Value selectedRegionParam = getSelectedRegionParam()) {
+    ArrayRef<Attribute> associatedAttrs = state.getParams(selectedRegionParam);
+    IntegerAttr selectedRegionAttr;
+    if (associatedAttrs.size() != 1 ||
+        !(selectedRegionAttr = dyn_cast<IntegerAttr>(associatedAttrs[0])))
+      return emitDefiniteFailure()
+             << "param should hold exactly one integer attribute, got: "
+             << associatedAttrs[0];
+    selectedRegionIdx = selectedRegionAttr.getValue().getSExtValue();
+  }
+
+  if (!selectedRegionIdx)
+    return emitDefiniteFailure() << "non-deterministic choice " << getName()
+                                 << " is only resolved through providing a "
+                                    "`selected_region` attr/param";
+
+  if (*selectedRegionIdx < 0 || *selectedRegionIdx >= getNumRegions())
+    return emitDefiniteFailure()
+           << "'selected_region' attribute/param specifies region at index "
+           << *selectedRegionIdx << " while op has only " << getNumRegions()
+           << " regions";
+
+  Region &selectedRegion = getRegion(*selectedRegionIdx);
+  auto scope = state.make_region_scope(selectedRegion);
+  Block &block = selectedRegion.front();
+  // Apply the region's ops one by one.
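+  // A definite failure aborts immediately; on a silenceable failure, this
+  // op's results are cleared first so that consumers do not observe stale
+  // mappings before the failure is propagated.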
+  for (Operation &transform : block.without_terminator()) {
+    DiagnosedSilenceableFailure result =
+        state.applyTransform(cast<transform::TransformOpInterface>(transform));
+    if (result.isDefiniteFailure())
+      return result;
+
+    if (result.isSilenceableFailure()) {
+      for (const auto &res : getResults())
+        results.set(res, {});
+      return result;
+    }
+  }
+  // Forward the operation mapping for values yielded from the region to the
+  // values produced by the alternatives op.
+  transform::detail::forwardTerminatorOperands(&block, state, results);
+  return DiagnosedSilenceableFailure::success();
+}
+
+LogicalResult transform::tune::AlternativesOp::verify() {
+  for (auto *region : getRegions()) {
+    auto yieldTerminator =
+        llvm::dyn_cast_if_present<transform::YieldOp>(region->front().back());
+    if (!yieldTerminator)
+      return emitOpError() << "expected '"
+                           << transform::YieldOp::getOperationName()
+                           << "' as terminator";
+
+    if (yieldTerminator->getNumOperands() != getNumResults())
+      return yieldTerminator.emitOpError()
+             << "expected terminator to have as many operands as the parent op "
+                "has results";
+
+    for (auto [i, operandType, resultType] : llvm::zip_equal(
+             llvm::seq<unsigned>(0, yieldTerminator->getNumOperands()),
+             yieldTerminator->getOperands().getType(), getResultTypes())) {
+      if (operandType == resultType)
+        continue;
+      return yieldTerminator.emitOpError()
+             << "the type of the terminator operand #" << i
+             << " must match the type of the corresponding parent op result ("
+             << operandType << " vs " << resultType << ")";
+    }
+  }
+
+  if (auto selectedRegionAttr = getSelectedRegionAttr()) {
+    size_t regionIdx = selectedRegionAttr->getSExtValue();
+    if (regionIdx < 0 || regionIdx >= getNumRegions())
+      return emitOpError()
+             << "'selected_region' attribute specifies region at index "
+             << regionIdx << " while op has only " << getNumRegions()
+             << " regions";
+  }
+
+  return success();
+}
diff --git a/mlir/python/mlir/dialects/transform/tune.py b/mlir/python/mlir/dialects/transform/tune.py
index f63f88a382422..b3bfa8015c4d8 100644
--- a/mlir/python/mlir/dialects/transform/tune.py
+++ b/mlir/python/mlir/dialects/transform/tune.py
@@ -6,6 +6,9 @@
 from ...ir import (
     Type,
+    Value,
+    Operation,
+    OpView,
     Attribute,
     ArrayAttr,
     StringAttr,
@@ -19,7 +22,10 @@
 from .._transform_tune_extension_ops_gen import _Dialect
 
 try:
-    from .._ods_common import _cext as _ods_cext
+    from .._ods_common import (
+        get_op_result_or_value as _get_op_result_or_value,
+        _cext as _ods_cext,
+    )
 except ImportError as e:
     raise RuntimeError("Error loading imports from extension module") from e
 
@@ -36,7 +42,7 @@ def __init__(
             ArrayAttr, Sequence[Union[Attribute, bool, int, float, str]], Attribute
         ],
         *,
-        selected: Optional[Attribute] = None,
+        selected: Optional[Union[Attribute, bool, int, float, str]] = None,
         loc=None,
         ip=None,
     ):
@@ -75,8 +81,62 @@ def knob(
         ArrayAttr, Sequence[Union[Attribute, bool, int, float, str]], Attribute
     ],
     *,
-    selected: Optional[Attribute] = None,
+    selected: Optional[Union[Attribute, bool, int, float, str]] = None,
     loc=None,
     ip=None,
 ):
     return KnobOp(result, name, options, selected=selected, loc=loc, ip=ip)
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class AlternativesOp(AlternativesOp):
+    def __init__(
+        self,
+        results: Sequence[Type],
+        name: Union[StringAttr, str],
+        num_alternatives: int,
+        *,
+        selected_region: Optional[
+            Union[int, IntegerAttr, Value, Operation, OpView]
+        ] = None,
+        loc=None,
+        ip=None,
+    ):
+        if isinstance(name, str):
+            name = StringAttr.get(name)
+
+        selected_region_attr = selected_region_param = None
+        if 
isinstance(selected_region, IntegerAttr): + selected_region_attr = selected_region + elif isinstance(selected_region, int): + selected_region_attr = IntegerAttr.get( + IntegerType.get_signless(32), selected_region + ) + elif isinstance(selected_region, (Value, Operation, OpView)): + selected_region_param = _get_op_result_or_value(selected_region) + + super().__init__( + results, + name, + num_alternatives, + selected_region_attr=selected_region_attr, + selected_region_param=selected_region_param, + loc=loc, + ip=ip, + ) + for region in self.regions: + region.blocks.append() + + +def alternatives( + results: Sequence[Type], + name: Union[StringAttr, str], + num_alternatives: int, + *, + selected_region: Optional[Union[int, IntegerAttr, Value, Operation, OpView]] = None, + loc=None, + ip=None, +): + return AlternativesOp( + results, name, num_alternatives, selected_region=selected_region, loc=loc, ip=ip + ) diff --git a/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir b/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir index 2e5f433abeb71..efc3890288456 100644 --- a/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir +++ b/mlir/test/Dialect/Transform/test-tune-extension-invalid.mlir @@ -19,3 +19,88 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + // expected-error@below {{'selected_region' attribute specifies region at index 2 while op has only 2 regions}} + transform.tune.alternatives<"bifurcation"> selected_region = 2 { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %singleton_of_c0 = transform.param.constant [0] -> !transform.any_param + // expected-error@below {{param should hold exactly one integer attribute, got: [0]}} + transform.tune.alternatives<"bifurcation"> selected_region = %singleton_of_c0 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %c0 = transform.param.constant 0 -> !transform.any_param + %c1 = transform.param.constant 1 -> !transform.any_param + %c0_and_c1 = transform.merge_handles %c0, %c1 : !transform.any_param + // expected-error@below {{param should hold exactly one integer attribute}} + transform.tune.alternatives<"bifurcation"> selected_region = %c0_and_c1 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %c2 = transform.param.constant 2 -> !transform.any_param + // expected-error@below {{'selected_region' attribute/param specifies region at index 2 while op has only 2 regions}} + transform.tune.alternatives<"bifurcation"> selected_region = %c2 : !transform.any_param { + transform.yield + }, { + transform.yield + } + transform.yield + } +} + +// ----- + +func.func private @f() + +module attributes {transform.with_named_sequence} { + 
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + // expected-error@below {{non-deterministic choice "bifurcation" is only resolved through providing a `selected_region` attr/param}} + transform.tune.alternatives<"bifurcation"> { + transform.yield + }, { + transform.yield + } + transform.yield + } +} diff --git a/mlir/test/Dialect/Transform/test-tune-extension.mlir b/mlir/test/Dialect/Transform/test-tune-extension.mlir index 0a253c6d5f837..5da48a2218ec6 100644 --- a/mlir/test/Dialect/Transform/test-tune-extension.mlir +++ b/mlir/test/Dialect/Transform/test-tune-extension.mlir @@ -59,3 +59,129 @@ module attributes {transform.with_named_sequence} { transform.yield } } + + +// ----- + +// CHECK-LABEL: schedule_with_two_independent_choices_already_made +func.func @schedule_with_two_independent_choices_already_made( + %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> { +// CHECK-NOT: scf.forall +// CHECK: scf.for +// CHECK-NOT: scf.for +// CHECK: scf.forall +// CHECK-NOT: scf.for +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: linalg.matmul +// CHECK: scf.forall.in_parallel +// CHECK: tensor.parallel_insert_slice +// CHECK: tensor.insert_slice +// CHECK: scf.yield + %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>) + outs(%arg2: tensor<128x128xf32>) -> tensor<128x128xf32> + return %0 : tensor<128x128xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op + + %tiled_matmul = transform.tune.alternatives<"outer_par_or_seq_tiling"> selected_region = 0 -> !transform.any_op + { // First alternative/region, with index = 0 + %contained_matmul, %loop = transform.structured.tile_using_for %matmul tile_sizes [8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { // Second alternative/region, with index = 1 + %contained_matmul, %loop = transform.structured.tile_using_forall %matmul tile_sizes [8] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + + transform.tune.alternatives<"inner_par_or_seq_tiling"> selected_region = 1 -> !transform.any_op { + %contained_matmul, %loop = transform.structured.tile_using_for %tiled_matmul tile_sizes [0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + }, { + %contained_matmul, %loop = transform.structured.tile_using_forall %tiled_matmul tile_sizes [0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield %contained_matmul : !transform.any_op + } + + transform.yield + } +} + +// ----- + +// CHECK-LABEL: subschedule_with_choice_resolved_in_main_schedule +func.func @subschedule_with_choice_resolved_in_main_schedule( + %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> { +// CHECK-NOT: scf.for +// CHECK: scf.forall +// CHECK-NOT: scf.forall +// CHECK: scf.for +// CHECK-NOT: scf.forall +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: tensor.extract_slice +// CHECK: linalg.matmul +// CHECK: tensor.insert_slice +// CHECK: scf.yield +// CHECK: 
scf.forall.in_parallel
+// CHECK:   tensor.parallel_insert_slice
+  %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>)
+                     outs(%arg2: tensor<128x128xf32>) -> tensor<128x128xf32>
+  return %0 : tensor<128x128xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @subschedule_with_embedded_choice(%matmul: !transform.any_op {transform.readonly},
+      %par_or_seq: !transform.param<i64> {transform.readonly},
+      %tile_size: !transform.param<i64> {transform.readonly}) -> !transform.any_op {
+    %tiled_matmul = transform.tune.alternatives<"par_or_seq_tiling"> selected_region = %par_or_seq : !transform.param<i64> -> !transform.any_op {
+      %contained_matmul, %loop = transform.structured.tile_using_for %matmul tile_sizes [%tile_size] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
+      transform.yield %contained_matmul : !transform.any_op
+    }, {
+      %contained_matmul, %loop = transform.structured.tile_using_forall %matmul tile_sizes [%tile_size] : (!transform.any_op, !transform.param<i64>) -> (!transform.any_op, !transform.any_op)
+      transform.yield %contained_matmul : !transform.any_op
+    }
+    transform.yield %tiled_matmul : !transform.any_op
+  }
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %outer_par = transform.param.constant 1 -> !transform.param<i64>
+    %outer_tile_size = transform.param.constant 32 -> !transform.param<i64>
+    %inner_seq = transform.tune.knob<"inner_par_or_seq"> = 0 from options = [0, 1] -> !transform.param<i64>
+    %inner_tile_size = transform.param.constant 8 -> !transform.param<i64>
+    %tiled_matmul = transform.include @subschedule_with_embedded_choice failures(propagate) (%matmul, %outer_par, %outer_tile_size) : (!transform.any_op, !transform.param<i64>, !transform.param<i64>) -> !transform.any_op
+    %tiled_tiled_matmul = transform.include @subschedule_with_embedded_choice failures(propagate) (%tiled_matmul, %inner_seq, %inner_tile_size) : (!transform.any_op, !transform.param<i64>, !transform.param<i64>) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: eeny_meeny_miny_moe
+func.func private @eeny_meeny_miny_moe()
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+
+    %tiled_matmul = transform.tune.alternatives<"4way"> selected_region = 3 -> !transform.any_param
+    { // First alternative/region, with index = 0
+      %out = transform.param.constant "eeny" -> !transform.any_param
+      transform.yield %out : !transform.any_param
+    }, { // Second alternative/region, with index = 1
+      %out = transform.param.constant "meeny" -> !transform.any_param
+      transform.yield %out : !transform.any_param
+    }, { // Third alternative/region, with index = 2
+      %out = transform.param.constant "miny" -> !transform.any_param
+      transform.yield %out : !transform.any_param
+    }, { // Fourth alternative/region, with index = 3
+      %out = transform.param.constant "moe" -> !transform.any_param
+      transform.yield %out : !transform.any_param
+    }
+    transform.yield
+  }
+}
\ No newline at end of file
diff --git a/mlir/test/python/dialects/transform_tune_ext.py b/mlir/test/python/dialects/transform_tune_ext.py
index dfb93594bca52..eb2a083211ef7 100644
--- a/mlir/test/python/dialects/transform_tune_ext.py
+++ 
b/mlir/test/python/dialects/transform_tune_ext.py @@ -1,21 +1,21 @@ # RUN: %PYTHON %s | FileCheck %s -from mlir.ir import * +from mlir import ir from mlir.dialects import transform from mlir.dialects.transform import tune, debug def run(f): - print("\nTEST:", f.__name__) - with Context(), Location.unknown(): - module = Module.create() - with InsertionPoint(module.body): + print("\n// TEST:", f.__name__) + with ir.Context(), ir.Location.unknown(): + module = ir.Module.create() + with ir.InsertionPoint(module.body): sequence = transform.SequenceOp( transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get(), ) - with InsertionPoint(sequence.body): + with ir.InsertionPoint(sequence.body): f(sequence.bodyTarget) transform.YieldOp() print(module) @@ -29,10 +29,10 @@ def testKnobOp(target): # CHECK: %[[HEADS_OR_TAILS:.*]] = transform.tune.knob<"coin"> options = [true, false] -> !transform.any_param heads_or_tails = tune.KnobOp( - result=any_param, name=StringAttr.get("coin"), options=[True, False] + result=any_param, name=ir.StringAttr.get("coin"), options=[True, False] ) # CHECK: transform.tune.knob<"animal"> options = ["cat", "dog", unit] -> !transform.any_param - tune.KnobOp(any_param, name="animal", options=["cat", "dog", UnitAttr.get()]) + tune.KnobOp(any_param, name="animal", options=["cat", "dog", ir.UnitAttr.get()]) # CHECK: transform.tune.knob<"tile_size"> options = [2, 4, 8, 16, 24, 32] -> !transform.any_param tune.KnobOp(any_param, "tile_size", [2, 4, 8, 16, 24, 32]) # CHECK: transform.tune.knob<"magic_value"> options = [2.000000e+00, 2.250000e+00, 2.500000e+00, 2.750000e+00, 3.000000e+00] -> !transform.any_param @@ -45,7 +45,10 @@ def testKnobOp(target): heads = tune.KnobOp(any_param, "coin", options=[True, False], selected=True) # CHECK: transform.tune.knob<"animal"> = "dog" from options = ["cat", "dog", unit] -> !transform.any_param tune.KnobOp( - any_param, name="animal", options=["cat", "dog", UnitAttr.get()], selected="dog" + any_param, + name="animal", + options=["cat", "dog", ir.UnitAttr.get()], + selected="dog", ) # CHECK: transform.tune.knob<"tile_size"> = 8 : i64 from options = [2, 4, 8, 16, 24, 32] -> !transform.any_param tune.KnobOp(any_param, "tile_size", [2, 4, 8, 16, 24, 32], selected=8) @@ -57,16 +60,90 @@ def testKnobOp(target): # CHECK: transform.tune.knob<"range_as_a_dict"> = 4 : i64 from options = {start = 2 : i64, step = 2 : i64, stop = 16 : i64} -> !transform.any_param # NB: Membership of `selected` in non-ArrayAttr `options` is _not_ verified. 
- i64 = IntegerType.get_signless(64) + i64 = ir.IntegerType.get_signless(64) tune.knob( any_param, "range_as_a_dict", - DictAttr.get( + ir.DictAttr.get( { - "start": IntegerAttr.get(i64, 2), - "stop": IntegerAttr.get(i64, 16), - "step": IntegerAttr.get(i64, 2), + "start": ir.IntegerAttr.get(i64, 2), + "stop": ir.IntegerAttr.get(i64, 16), + "step": ir.IntegerAttr.get(i64, 2), } ), selected=4, ) + + +# CHECK-LABEL: TEST: testAlternativesOp +@run +def testAlternativesOp(target): + any_param = transform.AnyParamType.get() + + # CHECK: %[[LEFT_OR_RIGHT_OUTCOME:.*]] = transform.tune.alternatives<"left_or_right"> -> !transform.any_param { + left_or_right = tune.AlternativesOp( + [transform.AnyParamType.get()], "left_or_right", 2 + ) + idx_for_left, idx_for_right = 0, 1 + with ir.InsertionPoint(left_or_right.alternatives[idx_for_left].blocks[0]): + # CHECK: %[[C0:.*]] = transform.param.constant 0 + i32_0 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0) + c0 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_0) + # CHECK: transform.yield %[[C0]] + transform.yield_(c0) + # CHECK-NEXT: }, { + with ir.InsertionPoint(left_or_right.alternatives[idx_for_right].blocks[0]): + # CHECK: %[[C1:.*]] = transform.param.constant 1 + i32_1 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1) + c1 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1) + # CHECK: transform.yield %[[C1]] + transform.yield_(c1) + # CHECK-NEXT: } + outcome_of_left_or_right_decision = left_or_right.results[0] + + # CHECK: transform.tune.alternatives<"fork_in_the_road"> selected_region = 0 -> !transform.any_param { + fork_in_the_road = tune.AlternativesOp( + [transform.AnyParamType.get()], "fork_in_the_road", 2, selected_region=0 + ) + with ir.InsertionPoint(fork_in_the_road.alternatives[idx_for_left].blocks[0]): + # CHECK: %[[C0:.*]] = transform.param.constant 0 + i32_0 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 0) + c0 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_0) + # CHECK: transform.yield %[[C0]] + transform.yield_(c0) + # CHECK-NEXT: }, { + with ir.InsertionPoint(fork_in_the_road.alternatives[idx_for_right].blocks[0]): + # CHECK: %[[C1:.*]] = transform.param.constant 1 + i32_1 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1) + c1 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1) + # CHECK: transform.yield %[[C1]] + transform.yield_(c1) + # CHECK-NEXT: } + + # CHECK: transform.tune.alternatives<"left_or_right_as_before"> selected_region = %[[LEFT_OR_RIGHT_OUTCOME]] : !transform.any_param { + left_or_right_as_before = tune.AlternativesOp( + [], + "left_or_right_as_before", + 2, + selected_region=outcome_of_left_or_right_decision, + ) + with ir.InsertionPoint( + left_or_right_as_before.alternatives[idx_for_left].blocks[0] + ): + # CHECK: transform.param.constant 1337 + i32_1337 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 1337) + c1337 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_1337) + # CHECK: transform.debug.emit_param_as_remark + debug.emit_param_as_remark(c1337) + transform.yield_([]) + # CHECK-NEXT: }, { + with ir.InsertionPoint( + left_or_right_as_before.alternatives[idx_for_right].blocks[0] + ): + # CHECK: transform.param.constant 42 + i32_42 = ir.IntegerAttr.get(ir.IntegerType.get_signless(32), 42) + c42 = transform.ParamConstantOp(transform.AnyParamType.get(), i32_42) + # CHECK: transform.debug.emit_param_as_remark + debug.emit_param_as_remark(c42) + transform.yield_([]) + # CHECK-NEXT: } From 
8df0575125c6f7575aff05a8a9effa07bedc7e92 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 1 Oct 2025 22:48:12 +0900 Subject: [PATCH 372/878] AMDGPU: Add peephole opt baseline tests (#161309) Add tests which show missed folds of subregister extracts with intermediate full copies. --- .../AMDGPU/peephole-opt-regseq-removal.mir | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir index f1f2eb6baf008..0c723a09809c6 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir @@ -80,3 +80,151 @@ body: | %4:vreg_128 = REG_SEQUENCE %3.sub0, %subreg.sub0, %3.sub1, %subreg.sub1, %3.sub2, %subreg.sub2, %3.sub3, %subreg.sub3 KILL implicit %4 ... + +--- +name: copy_vreg_64_subreg_from_vgpr_reg_sequence +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: copy_vreg_64_subreg_from_vgpr_reg_sequence + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; GCN-NEXT: $vgpr0 = COPY [[COPY2]] + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %3:vgpr_32 = COPY %2.sub0 + $vgpr0 = COPY %3 +... + +--- +name: copy_vreg_64_subreg_from_vgpr_reg_sequence_extra_copy +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: copy_vreg_64_subreg_from_vgpr_reg_sequence_extra_copy + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: $vgpr0 = COPY [[COPY3]] + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %3:vreg_64 = COPY %2 + %4:vgpr_32 = COPY %3.sub0 + $vgpr0 = COPY %4 +... + +--- +name: copy_av_64_subreg_from_vgpr_reg_sequence +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: copy_av_64_subreg_from_vgpr_reg_sequence + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE]] + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: $vgpr0 = COPY [[COPY3]] + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vreg_64_align2 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %3:av_64_align2 = COPY %2 + %4:vgpr_32 = COPY %3.sub0 + $vgpr0 = COPY %4 +... 
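+# The *_compose cases below exercise composition of subregister indices
+# through REG_SEQUENCE sources that are themselves subregister extracts.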
+ +--- +name: copy_vreg_64_subreg_from_vgpr_reg_sequence_with_sub0_compose +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GCN-LABEL: name: copy_vreg_64_subreg_from_vgpr_reg_sequence_with_sub0_compose + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 + ; GCN-NEXT: $vgpr0 = COPY [[COPY2]] + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr1 + %2:vreg_64 = REG_SEQUENCE %0.sub0, %subreg.sub0, %1, %subreg.sub1 + %3:vgpr_32 = COPY %2.sub0 + $vgpr0 = COPY %3 +... + +--- +name: copy_vreg_64_subreg_from_vgpr_reg_sequence_with_sub1_compose +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GCN-LABEL: name: copy_vreg_64_subreg_from_vgpr_reg_sequence_with_sub1_compose + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub1, %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 + ; GCN-NEXT: $vgpr0 = COPY [[COPY2]] + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vgpr_32 = COPY $vgpr1 + %2:vreg_64 = REG_SEQUENCE %0.sub1, %subreg.sub0, %1, %subreg.sub1 + %3:vgpr_32 = COPY %2.sub0 + $vgpr0 = COPY %3 +... + +--- +name: copy_vreg_64_subreg_from_multiple_vgpr_reg_sequence +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: copy_vreg_64_subreg_from_multiple_vgpr_reg_sequence + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE]], %subreg.sub0_sub1, [[REG_SEQUENCE1]], %subreg.sub2_sub3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE2]].sub1_sub2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; GCN-NEXT: $vgpr0_vgpr1 = COPY [[COPY4]] + ; GCN-NEXT: $vgpr2_vgpr3 = COPY [[COPY5]] + ; GCN-NEXT: $vgpr4_vgpr5 = COPY [[COPY6]] + ; GCN-NEXT: $vgpr6 = COPY [[COPY7]] + ; GCN-NEXT: $vgpr6 = COPY [[COPY8]] + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1 + %5:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %6:vreg_128 = REG_SEQUENCE %4, %subreg.sub0_sub1, %5, %subreg.sub2_sub3 + %7:vreg_64 = COPY %6.sub0_sub1 + %8:vreg_64 = COPY %6.sub1_sub2 + %9:vreg_64 = COPY %6.sub2_sub3 + %10:vgpr_32 = COPY %6.sub2 + %11:vgpr_32 = COPY %6.sub0 + $vgpr0_vgpr1 = COPY %7 + $vgpr2_vgpr3 = COPY %8 + $vgpr4_vgpr5 = COPY %9 + $vgpr6 = COPY %10 + $vgpr6 = COPY %11 +... 
From 2e5a5fdd329073e3d4b174743dfc7fca655e9491 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 1 Oct 2025 14:55:04 +0100 Subject: [PATCH 373/878] [X86] Add test showing failure to remove sign splats from PACKSS intrinsics (#161518) PACKSS intrinsic calls are only expanded to X86ISD::PACKSS nodes during legalisation, after which time we fail to handle cases where ASHR sign splats (now lowered to X86ISD::VSRAI) are unnecessary. Add additional example of FREEZE(PACKSS()) as that's an issue as well. --- llvm/test/CodeGen/X86/combine-pack.ll | 55 +++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 llvm/test/CodeGen/X86/combine-pack.ll diff --git a/llvm/test/CodeGen/X86/combine-pack.ll b/llvm/test/CodeGen/X86/combine-pack.ll new file mode 100644 index 0000000000000..9e740b04073e0 --- /dev/null +++ b/llvm/test/CodeGen/X86/combine-pack.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX + +declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) + +; TODO: Failure to remove unnecessary signsplat +define <8 x i16> @combine_packss_v4i32_signsplat(<4 x i32> %a0, <4 x i32> %a1) { +; SSE-LABEL: combine_packss_v4i32_signsplat: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_packss_v4i32_signsplat: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX-NEXT: retq + %cmp = icmp sgt <4 x i32> %a0, %a1 + %ext = sext <4 x i1> %cmp to <4 x i32> + %pack = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %ext, <4 x i32> splat (i32 -1)) + %signsplat = ashr <8 x i16> %pack, splat (i16 15) + ret <8 x i16> %signsplat +} + +; TODO: Failure to remove unnecessary signsplat through freeze +define <8 x i16> @combine_packss_v4i32_freeze_signsplat(<4 x i32> %a0, <4 x i32> %a1) { +; SSE-LABEL: combine_packss_v4i32_freeze_signsplat: +; SSE: # %bb.0: +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: psraw $15, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_packss_v4i32_freeze_signsplat: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 +; AVX-NEXT: retq + %cmp = icmp sgt <4 x i32> %a0, %a1 + %ext = sext <4 x i1> %cmp to <4 x i32> + %pack = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %ext, <4 x i32> splat (i32 -1)) + %freeze = freeze <8 x i16> %pack + %signsplat = ashr <8 x i16> %freeze, splat (i16 15) + ret <8 x i16> %signsplat +} From c09054866a1ad6c250e6c972d369b9023abb4b3b Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 1 Oct 2025 22:02:08 +0800 Subject: [PATCH 374/878] [InstCombine] Drop poison-generating flags when reusing existing or instruction (#161504) Closes https://github.com/llvm/llvm-project/issues/161493. 
---
 .../InstCombine/InstCombineAndOrXor.cpp       |  7 +++++
 llvm/test/Transforms/InstCombine/funnel.ll    | 26 +++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 4b7793f6e010b..9b272c4721cbd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3080,6 +3080,13 @@ InstCombinerImpl::convertOrOfShiftsToFunnelShift(Instruction &Or) {
       assert(ZextLowShlAmt->uge(HighSize) &&
              ZextLowShlAmt->ule(Width - LowSize) && "Invalid concat");
 
+      // We cannot reuse the result if it may produce poison.
+      // Drop poison generating flags in the expression tree.
+      // Or
+      cast<Instruction>(U)->dropPoisonGeneratingFlags();
+      // Shl
+      cast<Instruction>(X)->dropPoisonGeneratingFlags();
+
       FShiftArgs = {U, U, ConstantInt::get(Or0->getType(), *ZextHighShlAmt)};
       break;
     }
diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll
index 0e5f0469264c7..e5731080fba44 100644
--- a/llvm/test/Transforms/InstCombine/funnel.ll
+++ b/llvm/test/Transforms/InstCombine/funnel.ll
@@ -635,3 +635,29 @@ define i32 @test_rotl_and_neg_wrong_mask(i32 %x, i32 %shamt) {
   %or = or i32 %shl, %shr
   ret i32 %or
 }
+
+declare void @use(i16)
+
+; Make sure the reused result does not produce poison.
+
+define i16 @fshl_concat_vector_may_produce_poison(i4 %x, i12 %y) {
+; CHECK-LABEL: @fshl_concat_vector_may_produce_poison(
+; CHECK-NEXT:    [[X_FR:%.*]] = freeze i4 [[X:%.*]]
+; CHECK-NEXT:    [[ZEXT_X:%.*]] = zext i4 [[X_FR]] to i16
+; CHECK-NEXT:    [[SLX:%.*]] = shl nuw i16 [[ZEXT_X]], 12
+; CHECK-NEXT:    [[ZEXT_Y:%.*]] = zext i12 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[XY:%.*]] = or disjoint i16 [[SLX]], [[ZEXT_Y]]
+; CHECK-NEXT:    call void @use(i16 [[XY]])
+; CHECK-NEXT:    [[YX:%.*]] = call i16 @llvm.fshl.i16(i16 [[XY]], i16 [[XY]], i16 4)
+; CHECK-NEXT:    ret i16 [[YX]]
+;
+  %x.fr = freeze i4 %x
+  %zext.x = zext i4 %x.fr to i16
+  %slx = shl nuw nsw i16 %zext.x, 12
+  %zext.y = zext i12 %y to i16
+  %xy = or disjoint i16 %slx, %zext.y
+  call void @use(i16 %xy)
+  %sly = shl nuw i16 %zext.y, 4
+  %yx = or disjoint i16 %sly, %zext.x
+  ret i16 %yx
+}

From 73d9974c91413f5a6dbe6f76f4b73ad226b6276b Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Wed, 1 Oct 2025 22:02:30 +0800
Subject: [PATCH 375/878] [InstCombine] Avoid self-replacing in
 `getUndefReplacement` (#161500)

In InstCombine, replacing an instruction with itself has a special
meaning: it replaces all uses of that instruction with poison.
Closes https://github.com/llvm/llvm-project/issues/161492.
---
 .../InstCombine/InstructionCombining.cpp      |  3 ++-
 llvm/test/Transforms/InstCombine/freeze.ll    | 21 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index ff063f929347f..5d2d79e420931 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5212,7 +5212,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
     else if (match(U, m_Select(m_Specific(&I), m_Constant(), m_Value())))
       V = ConstantInt::getTrue(Ty);
     else if (match(U, m_c_Select(m_Specific(&I), m_Value(V)))) {
-      if (!isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT))
+      if (V == &I || !isGuaranteedNotToBeUndefOrPoison(V, &AC, &I, &DT))
         V = NullValue;
     } else if (auto *PHI = dyn_cast<PHINode>(U)) {
       if (Value *MaybeV = pickCommonConstantFromPHI(*PHI))
@@ -5225,6 +5225,7 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
       BestValue = NullValue;
   }
   assert(BestValue && "Must have at least one use");
+  assert(BestValue != &I && "Cannot replace with itself");
   return BestValue;
 };
diff --git a/llvm/test/Transforms/InstCombine/freeze.ll b/llvm/test/Transforms/InstCombine/freeze.ll
index af5cb0c75537b..ac7d65c2a3c6a 100644
--- a/llvm/test/Transforms/InstCombine/freeze.ll
+++ b/llvm/test/Transforms/InstCombine/freeze.ll
@@ -1464,6 +1464,27 @@ define ptr @freeze_ptrmask_nonnull(ptr %p, i64 noundef %m) {
   ret ptr %fr
 }
 
+define i64 @pr161492_1(i1 %cond) {
+; CHECK-LABEL: define i64 @pr161492_1(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:    ret i64 0
+;
+  %fr1 = freeze i64 poison
+  %fr2 = freeze i64 poison
+  %ret = select i1 %cond, i64 %fr1, i64 %fr2
+  ret i64 %ret
+}
+
+define i64 @pr161492_2(i1 %cond) {
+; CHECK-LABEL: define i64 @pr161492_2(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:    ret i64 0
+;
+  %fr = freeze i64 poison
+  %ret = select i1 %cond, i64 %fr, i64 %fr
+  ret i64 %ret
+}
+
 !0 = !{}
 !1 = !{i64 4}
 !2 = !{i32 0, i32 100}

From a05e004b285af92f9bcef12d5ab5537c36002c13 Mon Sep 17 00:00:00 2001
From: Hongyu Chen
Date: Wed, 1 Oct 2025 22:06:08 +0800
Subject: [PATCH 376/878] [DFAJumpThreading] Unfold select to the incoming
 block of phi user (#160987)

Fixes #160250
We previously assumed that the select to unfold is defined in the
incoming block of its phi user, since `isValidSelectInst` filters out
other cases at the initial stage. However, selects that are not defined
in the incoming block can appear after unfolding the arms of an
already-unfolded select. This patch sinks such a select into the
incoming block of its phi user and unfolds it there.
---
 llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 10 ++-
 .../dfa-jump-threading-transform.ll             | 63 +++++++-----
 .../DFAJumpThreading/dfa-unfold-select.ll       | 84 +++++++++++++++++++
 3 files changed, 132 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 944b253e0f5e7..e9a3e983bc1e2 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -190,12 +190,12 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold,
             std::vector<BasicBlock *> *NewBBs) {
   SelectInst *SI = SIToUnfold.getInst();
   PHINode *SIUse = SIToUnfold.getUse();
-  BasicBlock *StartBlock = SI->getParent();
 
+  assert(SI->hasOneUse());
+  // The select may come indirectly, instead of from where it is defined.
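+  // This can happen when unfolding one select exposes another select whose
+  // definition lies outside the incoming block of the PHI, so anchor the
+  // unfolding at the incoming block of the PHI use rather than at
+  // SI->getParent().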
+ BasicBlock *StartBlock = SIUse->getIncomingBlock(*SI->use_begin()); BranchInst *StartBlockTerm = dyn_cast(StartBlock->getTerminator()); - assert(StartBlockTerm); - assert(SI->hasOneUse()); if (StartBlockTerm->isUnconditional()) { BasicBlock *EndBlock = StartBlock->getUniqueSuccessor(); @@ -332,7 +332,7 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, } // Preserve loop info - if (Loop *L = LI->getLoopFor(SI->getParent())) { + if (Loop *L = LI->getLoopFor(StartBlock)) { for (BasicBlock *NewBB : *NewBBs) L->addBasicBlockToLoop(NewBB, *LI); } @@ -533,6 +533,8 @@ struct MainSwitch { return false; // Only fold the select coming from directly where it is defined. + // TODO: We have dealt with the select coming indirectly now. This + // constraint can be relaxed. PHINode *PHIUser = dyn_cast(SIUse); if (PHIUser && PHIUser->getIncomingBlock(*SI->use_begin()) != SIBB) return false; diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll index cba1ba8dde768..ad0568486396f 100644 --- a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll @@ -304,32 +304,43 @@ end: define void @pr106083_invalidBBarg_fold(i1 %cmp1, i1 %cmp2, i1 %not, ptr %d) { ; CHECK-LABEL: @pr106083_invalidBBarg_fold( ; CHECK-NEXT: bb: -; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[SEL_SI_UNFOLD_FALSE:%.*]] -; CHECK: sel.si.unfold.false: -; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 1, [[BB:%.*]] ] -; CHECK-NEXT: br label [[BB1]] +; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: BB1: -; CHECK-NEXT: [[I:%.*]] = phi i16 [ 0, [[BB1_BACKEDGE:%.*]] ], [ 0, [[BB]] ], [ 1, [[BB7:%.*]] ], [ 0, [[SEL_SI_UNFOLD_FALSE]] ], [ 1, [[BB7_JT0:%.*]] ] -; CHECK-NEXT: [[SEL_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[SEL_SI_UNFOLD_PHI]], [[BB1_BACKEDGE]] ], [ [[SEL_SI_UNFOLD_PHI]], [[BB7]] ], [ 0, [[BB]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SEL_SI_UNFOLD_FALSE]] ], [ [[SEL_SI_UNFOLD_PHI]], [[BB7_JT0]] ] +; CHECK-NEXT: [[I:%.*]] = phi i16 [ 0, [[BB1_BACKEDGE:%.*]] ], [ 0, [[BB:%.*]] ], [ 1, [[BB9:%.*]] ], [ 1, [[BB7_JT0:%.*]] ] ; CHECK-NEXT: br i1 [[NOT:%.*]], label [[BB7_JT0]], label [[BB2:%.*]] ; CHECK: BB2: ; CHECK-NEXT: store i16 0, ptr [[D:%.*]], align 2 -; CHECK-NEXT: br i1 [[CMP2:%.*]], label [[BB7]], label [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK-NEXT: br i1 [[CMP2:%.*]], label [[BB7:%.*]], label [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0:%.*]] ; CHECK: spec.select.si.unfold.false: -; CHECK-NEXT: br label [[BB7]] +; CHECK-NEXT: br label [[BB9]] ; CHECK: spec.select.si.unfold.false.jt0: ; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[BB2]] ] ; CHECK-NEXT: br label [[BB7_JT0]] +; CHECK: sel.si.unfold.true: +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB9]], label [[SEL_SI_UNFOLD_FALSE_JT1:%.*]] +; CHECK: sel.si.unfold.true.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 0, [[BB2]] ] +; CHECK-NEXT: br i1 [[CMP1]], label [[BB7_JT0]], label [[SEL_SI_UNFOLD_FALSE:%.*]] +; CHECK: sel.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2:%.*]] = phi i32 [ 1, [[BB7]] ] +; CHECK-NEXT: br label [[BB9]] +; CHECK: sel.si.unfold.false.jt1: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2_JT1:%.*]] = phi i32 [ 1, [[SEL_SI_UNFOLD_TRUE:%.*]] ] +; CHECK-NEXT: br label [[BB7_JT1:%.*]] ; CHECK: BB7: -; CHECK-NEXT: [[D_PROMOTED4:%.*]] = phi i16 [ 1, [[BB2]] ], [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] ] -; CHECK-NEXT: [[_3:%.*]] = phi i32 [ 
[[SEL_SI_UNFOLD_PHI]], [[BB2]] ], [ poison, [[SPEC_SELECT_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[D_PROMOTED4:%.*]] = phi i16 [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] ], [ 1, [[SEL_SI_UNFOLD_TRUE]] ], [ 1, [[SEL_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: [[_3:%.*]] = phi i32 [ poison, [[SPEC_SELECT_SI_UNFOLD_FALSE]] ], [ poison, [[SEL_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI2]], [[SEL_SI_UNFOLD_FALSE]] ] ; CHECK-NEXT: switch i32 [[_3]], label [[BB1_BACKEDGE]] [ ; CHECK-NEXT: i32 0, label [[BB1]] ; CHECK-NEXT: i32 1, label [[BB8:%.*]] ; CHECK-NEXT: ] +; CHECK: BB7.jt1: +; CHECK-NEXT: [[D_PROMOTED4_JT1:%.*]] = phi i16 [ 1, [[SEL_SI_UNFOLD_FALSE_JT1]] ] +; CHECK-NEXT: [[_3_JT1:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI2_JT1]], [[SEL_SI_UNFOLD_FALSE_JT1]] ] +; CHECK-NEXT: br label [[BB8]] ; CHECK: BB7.jt0: -; CHECK-NEXT: [[D_PROMOTED4_JT0:%.*]] = phi i16 [ 0, [[BB1]] ], [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ] -; CHECK-NEXT: [[_3_JT0:%.*]] = phi i32 [ 0, [[BB1]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: [[D_PROMOTED4_JT0:%.*]] = phi i16 [ 0, [[BB1]] ], [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ], [ 1, [[BB7]] ] +; CHECK-NEXT: [[_3_JT0:%.*]] = phi i32 [ 0, [[BB1]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ], [ [[DOTSI_UNFOLD_PHI1]], [[BB7]] ] ; CHECK-NEXT: br label [[BB1]] ; CHECK: BB1.backedge: ; CHECK-NEXT: br label [[BB1]] @@ -367,30 +378,40 @@ BB8: ; preds = %BB7 define void @pr106083_select_dead_uses(i1 %cmp1, i1 %not, ptr %p) { ; CHECK-LABEL: @pr106083_select_dead_uses( ; CHECK-NEXT: bb: -; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[DOTLOOPEXIT6:%.*]], label [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] -; CHECK: spec.select.si.unfold.false: -; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 1, [[BB:%.*]] ] -; CHECK-NEXT: br label [[DOTLOOPEXIT6]] +; CHECK-NEXT: br label [[DOTLOOPEXIT6:%.*]] ; CHECK: .loopexit6: -; CHECK-NEXT: [[SPEC_SELECT_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[SPEC_SELECT_SI_UNFOLD_PHI]], [[SELECT_UNFOLD:%.*]] ], [ 0, [[BB]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SPEC_SELECT_SI_UNFOLD_FALSE]] ] ; CHECK-NEXT: br i1 [[NOT:%.*]], label [[SELECT_UNFOLD_JT0:%.*]], label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 ; CHECK-NEXT: [[NOT2:%.*]] = icmp eq i32 0, 0 -; CHECK-NEXT: br i1 [[NOT2]], label [[SELECT_UNFOLD]], label [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK-NEXT: br i1 [[NOT2]], label [[SELECT_UNFOLD:%.*]], label [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0:%.*]] ; CHECK: spec.select7.si.unfold.false: -; CHECK-NEXT: br label [[SELECT_UNFOLD]] +; CHECK-NEXT: br label [[SELECT_UNFOLD1:%.*]] ; CHECK: spec.select7.si.unfold.false.jt0: ; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[BB1]] ] ; CHECK-NEXT: br label [[SELECT_UNFOLD_JT0]] +; CHECK: spec.select.si.unfold.true: +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[SELECT_UNFOLD1]], label [[SPEC_SELECT_SI_UNFOLD_FALSE_JT1:%.*]] +; CHECK: spec.select.si.unfold.true.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 0, [[BB1]] ] +; CHECK-NEXT: br i1 [[CMP1]], label [[SELECT_UNFOLD_JT0]], label [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] +; CHECK: spec.select.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2:%.*]] = phi i32 [ 1, [[SELECT_UNFOLD]] ] +; CHECK-NEXT: br label [[SELECT_UNFOLD1]] +; CHECK: spec.select.si.unfold.false.jt1: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2_JT1:%.*]] = phi i32 [ 1, [[SPEC_SELECT_SI_UNFOLD_TRUE:%.*]] ] +; CHECK-NEXT: br label [[SELECT_UNFOLD_JT1:%.*]] ; CHECK: select.unfold: -; CHECK-NEXT: [[_2:%.*]] = phi i32 [ 
[[SPEC_SELECT_SI_UNFOLD_PHI]], [[BB1]] ], [ poison, [[SPEC_SELECT7_SI_UNFOLD_FALSE:%.*]] ] +; CHECK-NEXT: [[_2:%.*]] = phi i32 [ poison, [[SPEC_SELECT7_SI_UNFOLD_FALSE:%.*]] ], [ poison, [[SPEC_SELECT_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI2]], [[SPEC_SELECT_SI_UNFOLD_FALSE]] ] ; CHECK-NEXT: switch i32 [[_2]], label [[BB2:%.*]] [ ; CHECK-NEXT: i32 0, label [[DOTPREHEADER_PREHEADER:%.*]] ; CHECK-NEXT: i32 1, label [[DOTLOOPEXIT6]] ; CHECK-NEXT: ] +; CHECK: select.unfold.jt1: +; CHECK-NEXT: [[_2_JT1:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI2_JT1]], [[SPEC_SELECT_SI_UNFOLD_FALSE_JT1]] ] +; CHECK-NEXT: br label [[DOTLOOPEXIT6]] ; CHECK: select.unfold.jt0: -; CHECK-NEXT: [[_2_JT0:%.*]] = phi i32 [ 0, [[DOTLOOPEXIT6]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: [[_2_JT0:%.*]] = phi i32 [ 0, [[DOTLOOPEXIT6]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SELECT_UNFOLD]] ] ; CHECK-NEXT: br label [[DOTPREHEADER_PREHEADER]] ; CHECK: .preheader.preheader: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll index 93872c3938768..663f459e23084 100644 --- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll @@ -463,3 +463,87 @@ unreachable: sw.bb: ; preds = %if.end br label %while.cond } + +define i16 @pr160250() { +; CHECK-LABEL: @pr160250( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND48:%.*]] +; CHECK: for.cond48: +; CHECK-NEXT: br i1 false, label [[CLEANUP87_JT0:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.else: +; CHECK-NEXT: br i1 false, label [[DOT6_SI_UNFOLD_TRUE:%.*]], label [[DOT5_SI_UNFOLD_TRUE:%.*]] +; CHECK: .5.si.unfold.true: +; CHECK-NEXT: br i1 false, label [[SPEC_SELECT1_SI_UNFOLD_TRUE1:%.*]], label [[DOT5_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: .5.si.unfold.true.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 0, [[IF_ELSE]] ] +; CHECK-NEXT: br i1 false, label [[SPEC_SELECT1_SI_UNFOLD_TRUE:%.*]], label [[DOT5_SI_UNFOLD_FALSE:%.*]] +; CHECK: .5.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2:%.*]] = phi i32 [ 0, [[DOT5_SI_UNFOLD_TRUE]] ] +; CHECK-NEXT: br label [[SPEC_SELECT1_SI_UNFOLD_TRUE1]] +; CHECK: .5.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2_JT0:%.*]] = phi i32 [ 0, [[DOT5_SI_UNFOLD_TRUE1:%.*]] ] +; CHECK-NEXT: br label [[SPEC_SELECT1_SI_UNFOLD_TRUE]] +; CHECK: spec.select1.si.unfold.true: +; CHECK-NEXT: [[DOT5_SI_UNFOLD_PHI:%.*]] = phi i32 [ poison, [[DOT5_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI2]], [[DOT5_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: br i1 false, label [[SPEC_SELECT_SI_UNFOLD_FALSE1:%.*]], label [[SPEC_SELECT1_SI_UNFOLD_FALSE_JT1:%.*]] +; CHECK: spec.select1.si.unfold.true.jt0: +; CHECK-NEXT: [[DOT5_SI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI1]], [[DOT5_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI2_JT0]], [[DOT5_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br i1 false, label [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]], label [[SPEC_SELECT1_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: spec.select1.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI:%.*]] = phi i32 [ 0, [[SPEC_SELECT1_SI_UNFOLD_TRUE]] ] +; CHECK-NEXT: br label [[SPEC_SELECT_SI_UNFOLD_FALSE1]] +; CHECK: spec.select1.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[SPEC_SELECT1_SI_UNFOLD_TRUE1]] ] +; CHECK-NEXT: br label [[SPEC_SELECT_SI_UNFOLD_FALSE]] +; CHECK: spec.select.si.unfold.false: +; 
CHECK-NEXT: [[SPEC_SELECT1_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[DOT5_SI_UNFOLD_PHI]], [[SPEC_SELECT1_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI]], [[SPEC_SELECT1_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[CLEANUP87:%.*]] +; CHECK: spec.select.si.unfold.false.jt0: +; CHECK-NEXT: [[SPEC_SELECT1_SI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ [[DOT5_SI_UNFOLD_PHI_JT0]], [[SPEC_SELECT1_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT1_SI_UNFOLD_FALSE_JT1]] ] +; CHECK-NEXT: br label [[CLEANUP87_JT0]] +; CHECK: .6.si.unfold.true: +; CHECK-NEXT: br i1 false, label [[CLEANUP87]], label [[DOT6_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: .6.si.unfold.true.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI3:%.*]] = phi i32 [ 0, [[IF_ELSE]] ] +; CHECK-NEXT: br i1 false, label [[CLEANUP87_JT0]], label [[DOT6_SI_UNFOLD_FALSE:%.*]] +; CHECK: .6.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI4:%.*]] = phi i32 [ 0, [[DOT6_SI_UNFOLD_TRUE]] ] +; CHECK-NEXT: br label [[CLEANUP87]] +; CHECK: .6.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI4_JT0:%.*]] = phi i32 [ 0, [[DOT6_SI_UNFOLD_TRUE1:%.*]] ] +; CHECK-NEXT: br label [[CLEANUP87_JT0]] +; CHECK: cleanup87: +; CHECK-NEXT: [[CLEANUP_DEST_SLOT_3:%.*]] = phi i32 [ [[SPEC_SELECT1_SI_UNFOLD_PHI]], [[SPEC_SELECT_SI_UNFOLD_FALSE1]] ], [ poison, [[DOT6_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI4]], [[DOT6_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: switch i32 [[CLEANUP_DEST_SLOT_3]], label [[FOR_COND48_BACKEDGE:%.*]] [ +; CHECK-NEXT: i32 0, label [[FOR_COND48_BACKEDGE]] +; CHECK-NEXT: i32 1, label [[FOR_COND48_BACKEDGE]] +; CHECK-NEXT: ] +; CHECK: cleanup87.jt0: +; CHECK-NEXT: [[CLEANUP_DEST_SLOT_3_JT0:%.*]] = phi i32 [ 0, [[FOR_COND48]] ], [ [[SPEC_SELECT1_SI_UNFOLD_PHI_JT0]], [[SPEC_SELECT_SI_UNFOLD_FALSE]] ], [ [[DOTSI_UNFOLD_PHI3]], [[DOT6_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI4_JT0]], [[DOT6_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[FOR_COND48_BACKEDGE]] +; CHECK: for.cond48.backedge: +; CHECK-NEXT: br label [[FOR_COND48]] +; +entry: + %.5 = select i1 false, i32 0, i32 0 + %.6 = select i1 false, i32 0, i32 0 + br label %for.cond48 + +for.cond48: ; preds = %for.cond48.backedge, %entry + br i1 false, label %cleanup87, label %if.else + +if.else: ; preds = %for.cond48 + %spec.select1 = select i1 false, i32 %.5, i32 0 + %spec.select = select i1 false, i32 %.6, i32 %spec.select1 + br label %cleanup87 + +cleanup87: ; preds = %if.else, %for.cond48 + %cleanup.dest.slot.3 = phi i32 [ 0, %for.cond48 ], [ %spec.select, %if.else ] + switch i32 %cleanup.dest.slot.3, label %for.cond48.backedge [ + i32 0, label %for.cond48.backedge + i32 1, label %for.cond48.backedge + ] + +for.cond48.backedge: ; preds = %cleanup87, %cleanup87, %cleanup87 + br label %for.cond48 +} From ca1ff80a16f135306282cb8ef33d0af6ce20fa69 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Wed, 1 Oct 2025 07:15:42 -0700 Subject: [PATCH 377/878] [MLIR] Add sincos fusion pass (#161413) We see performance improvements from using sincos to reuse calculations in hot loops that compute sin() and cos() of the same operand. Add a pass to identify sin() and cos() calls in the same block with the same operand and fast-math flags, and fuse them into a sincos op. 
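
For example, given a sin and a cos of the same operand (a minimal sketch;
the exact syntax follows the new sincos-fusion.mlir tests added below):

  %0 = math.sin %arg0 : f32
  %1 = math.cos %arg0 : f32

the pass rewrites the pair into one fused op whose two results replace both
original values:

  %0, %1 = math.sincos %arg0 : f32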
Follow-up to:
* #160561
* #160772
---
 .../mlir/Dialect/Math/Transforms/Passes.td    |  8 ++
 .../Dialect/Math/Transforms/CMakeLists.txt    |  1 +
 .../Dialect/Math/Transforms/SincosFusion.cpp  | 80 +++++++++++++++++
 mlir/test/Dialect/Math/sincos-fusion.mlir     | 86 +++++++++++++++++++
 4 files changed, 175 insertions(+)
 create mode 100644 mlir/lib/Dialect/Math/Transforms/SincosFusion.cpp
 create mode 100644 mlir/test/Dialect/Math/sincos-fusion.mlir

diff --git a/mlir/include/mlir/Dialect/Math/Transforms/Passes.td b/mlir/include/mlir/Dialect/Math/Transforms/Passes.td
index 4d415aeac8f58..48346abd84285 100644
--- a/mlir/include/mlir/Dialect/Math/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Math/Transforms/Passes.td
@@ -64,4 +64,12 @@ def MathExpandOpsPass : Pass<"math-expand-ops"> {
   ];
 }
 
+def MathSincosFusionPass : Pass<"math-sincos-fusion"> {
+  let summary = "Fuse sin and cos operations.";
+  let description = [{
+    Fuse sin and cos operations into a sincos operation.
+  }];
+  let dependentDialects = ["math::MathDialect"];
+}
+
 #endif // MLIR_DIALECT_MATH_TRANSFORMS_PASSES
diff --git a/mlir/lib/Dialect/Math/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Math/Transforms/CMakeLists.txt
index ff62b515533c3..8899c3a1d1a42 100644
--- a/mlir/lib/Dialect/Math/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Math/Transforms/CMakeLists.txt
@@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRMathTransforms
   ExpandOps.cpp
   ExtendToSupportedTypes.cpp
   PolynomialApproximation.cpp
+  SincosFusion.cpp
   UpliftToFMA.cpp
 
   ADDITIONAL_HEADER_DIRS
diff --git a/mlir/lib/Dialect/Math/Transforms/SincosFusion.cpp b/mlir/lib/Dialect/Math/Transforms/SincosFusion.cpp
new file mode 100644
index 0000000000000..69407df201cfa
--- /dev/null
+++ b/mlir/lib/Dialect/Math/Transforms/SincosFusion.cpp
@@ -0,0 +1,80 @@
+//===- SincosFusion.cpp - Fuse sin/cos into sincos -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/Math/Transforms/Passes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::math;
+
+namespace {
+
+/// Fuse a math.sin and math.cos in the same block that use the same operand and
+/// have identical fastmath flags into a single math.sincos.
+struct SincosFusionPattern : OpRewritePattern<math::SinOp> {
+  using Base::Base;
+
+  LogicalResult matchAndRewrite(math::SinOp sinOp,
+                                PatternRewriter &rewriter) const override {
+    Value operand = sinOp.getOperand();
+    mlir::arith::FastMathFlags sinFastMathFlags = sinOp.getFastmath();
+
+    math::CosOp cosOp = nullptr;
+    sinOp->getBlock()->walk([&](math::CosOp op) {
+      if (op.getOperand() == operand && op.getFastmath() == sinFastMathFlags) {
+        cosOp = op;
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+
+    if (!cosOp)
+      return failure();
+
+    Operation *firstOp = sinOp->isBeforeInBlock(cosOp) ? sinOp.getOperation()
+                                                       : cosOp.getOperation();
+    rewriter.setInsertionPoint(firstOp);
+
+    Type elemType = sinOp.getType();
+    auto sincos = math::SincosOp::create(rewriter, firstOp->getLoc(),
+                                         TypeRange{elemType, elemType}, operand,
+                                         sinOp.getFastmathAttr());
+
+    rewriter.replaceOp(sinOp, sincos.getSin());
+    rewriter.replaceOp(cosOp, sincos.getCos());
+    return success();
+  }
+};
+
+} // namespace
+
+namespace mlir::math {
+#define GEN_PASS_DEF_MATHSINCOSFUSIONPASS
+#include "mlir/Dialect/Math/Transforms/Passes.h.inc"
+} // namespace mlir::math
+
+namespace {
+
+struct MathSincosFusionPass final
+    : math::impl::MathSincosFusionPassBase<MathSincosFusionPass> {
+  using MathSincosFusionPassBase::MathSincosFusionPassBase;
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<SincosFusionPattern>(&getContext());
+
+    GreedyRewriteConfig config;
+    if (failed(
+            applyPatternsGreedily(getOperation(), std::move(patterns), config)))
+      return signalPassFailure();
+  }
+};
+
+} // namespace
diff --git a/mlir/test/Dialect/Math/sincos-fusion.mlir b/mlir/test/Dialect/Math/sincos-fusion.mlir
new file mode 100644
index 0000000000000..29fb9f12475b8
--- /dev/null
+++ b/mlir/test/Dialect/Math/sincos-fusion.mlir
@@ -0,0 +1,86 @@
+// RUN: mlir-opt -math-sincos-fusion %s | FileCheck %s
+
+// CHECK-LABEL: func.func @sincos_fusion(
+// CHECK-SAME:    %[[ARG0:.*]]: f32,
+// CHECK-SAME:    %[[ARG1:.*]]: f32) -> (f32, f32, f32, f32) {
+// CHECK:         %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] : f32
+// CHECK:         %[[VAL_2:.*]], %[[VAL_3:.*]] = math.sincos %[[ARG1]] : f32
+// CHECK:         return %[[VAL_0]], %[[VAL_1]], %[[VAL_3]], %[[VAL_2]] : f32, f32, f32, f32
+// CHECK:       }
+func.func @sincos_fusion(%arg0 : f32, %arg1 : f32) -> (f32, f32, f32, f32) {
+  %0 = math.sin %arg0 : f32
+  %1 = math.cos %arg0 : f32
+
+  %2 = math.cos %arg1 : f32
+  %3 = math.sin %arg1 : f32
+
+  func.return %0, %1, %2, %3 : f32, f32, f32, f32
+}
+
+func.func private @sink(%arg0 : f32)
+
+// CHECK:       func.func private @sink(f32)
+// CHECK-LABEL: func.func @sincos_ensure_ssa_dominance(
+// CHECK-SAME:    %[[ARG0:.*]]: f32,
+// CHECK-SAME:    %[[ARG1:.*]]: f32) -> (f32, f32, f32, f32) {
+// CHECK:         %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] : f32
+// CHECK:         call @sink(%[[VAL_0]]) : (f32) -> ()
+// CHECK:         %[[VAL_2:.*]], %[[VAL_3:.*]] = math.sincos %[[ARG1]] : f32
+// CHECK:         call @sink(%[[VAL_3]]) : (f32) -> ()
+// CHECK:         return %[[VAL_0]], %[[VAL_1]], %[[VAL_3]], %[[VAL_2]] : f32, f32, f32, f32
+// CHECK:       }
+func.func @sincos_ensure_ssa_dominance(%arg0 : f32, %arg1 : f32) -> (f32, f32, f32, f32) {
+  %0 = math.sin %arg0 : f32
+  func.call @sink(%0) : (f32) -> ()
+  %1 = math.cos %arg0 : f32
+  %2 = math.cos %arg1 : f32
+  func.call @sink(%2) : (f32) -> ()
+  %3 = math.sin %arg1 : f32
+  func.return %0, %1, %2, %3 : f32, f32, f32, f32
+}
+
+// CHECK-LABEL: func.func @sincos_fusion_no_match_fmf(
+// CHECK-SAME:    %[[ARG0:.*]]: f32) -> (f32, f32) {
+// CHECK:         %[[VAL_0:.*]] = math.sin %[[ARG0]] fastmath : f32
+// CHECK:         %[[VAL_1:.*]] = math.cos %[[ARG0]] : f32
+// CHECK:         return %[[VAL_0]], %[[VAL_1]] : f32, f32
+// CHECK:       }
+func.func @sincos_fusion_no_match_fmf(%arg0 : f32) -> (f32, f32) {
+  %0 = math.sin %arg0 fastmath : f32
+  %1 = math.cos %arg0 : f32
+  func.return %0, %1 : f32, f32
+}
+
+// CHECK-LABEL: func.func @sincos_no_fusion_different_block(
+// CHECK-SAME:    %[[ARG0:.*]]: f32,
+// CHECK-SAME:    %[[ARG1:.*]]: i1) -> f32 {
+// CHECK:         %[[VAL_0:.*]] = scf.if %[[ARG1]] -> (f32) {
+// CHECK:           %[[VAL_1:.*]] = math.sin %[[ARG0]] : f32
+// CHECK:           scf.yield %[[VAL_1]] : f32
+// 
CHECK: } else { +// CHECK: %[[VAL_2:.*]] = math.cos %[[ARG0]] : f32 +// CHECK: scf.yield %[[VAL_2]] : f32 +// CHECK: } +// CHECK: return %[[VAL_0]] : f32 +// CHECK: } +func.func @sincos_no_fusion_different_block(%arg0 : f32, %flag : i1) -> f32 { + %0 = scf.if %flag -> f32 { + %s = math.sin %arg0 : f32 + scf.yield %s : f32 + } else { + %c = math.cos %arg0 : f32 + scf.yield %c : f32 + } + func.return %0 : f32 +} + +// CHECK-LABEL: func.func @sincos_fusion_preserve_fastmath( +// CHECK-SAME: %[[ARG0:.*]]: f32) -> (f32, f32) { +// CHECK: %[[VAL_0:.*]], %[[VAL_1:.*]] = math.sincos %[[ARG0]] fastmath : f32 +// CHECK: return %[[VAL_0]], %[[VAL_1]] : f32, f32 +// CHECK: } +func.func @sincos_fusion_preserve_fastmath(%arg0 : f32) -> (f32, f32) { + %0 = math.sin %arg0 fastmath : f32 + %1 = math.cos %arg0 fastmath : f32 + func.return %0, %1 : f32, f32 +} From 58c959b4f79b0b0ea901c1e7b5051a16f598631e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 1 Oct 2025 07:22:13 -0700 Subject: [PATCH 378/878] [AMDGPU] Use common allUsesAvailableAt implementation [nfc] (#161418) Replace the target specific copy with a call to the generic routine. I don't spot any differences by eye, and there's nothing in the original review discussion (#124327) which makes it clear why this was duplicated. --- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 63 +-------------------- llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 6 -- 2 files changed, 3 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fab78a93aa063..bdc08101c7119 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -29,6 +29,7 @@ #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/ErrorHandling.h" @@ -1633,64 +1634,6 @@ void GCNSchedStage::revertScheduling() { DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); } -bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat, - SlotIndex OriginalIdx, - SlotIndex RematIdx) const { - - LiveIntervals *LIS = DAG.LIS; - MachineRegisterInfo &MRI = DAG.MRI; - OriginalIdx = OriginalIdx.getRegSlot(true); - RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true)); - for (const MachineOperand &MO : InstToRemat->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) - continue; - - if (!MO.getReg().isVirtual()) { - // Do not attempt to reason about PhysRegs - // TODO: better analysis of PhysReg livness - if (!DAG.MRI.isConstantPhysReg(MO.getReg()) && - !DAG.TII->isIgnorableUse(MO)) - return false; - - // Constant PhysRegs and IgnorableUses are okay - continue; - } - - LiveInterval &LI = LIS->getInterval(MO.getReg()); - const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx); - assert(OVNI); - - // Don't allow rematerialization immediately after the original def. - // It would be incorrect if InstToRemat redefines the register. - // See PR14098. - if (SlotIndex::isSameInstr(OriginalIdx, RematIdx)) - return false; - - if (OVNI != LI.getVNInfoAt(RematIdx)) - return false; - - // Check that subrange is live at RematIdx. - if (LI.hasSubRanges()) { - const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - unsigned SubReg = MO.getSubReg(); - LaneBitmask LM = SubReg ? 
TRI->getSubRegIndexLaneMask(SubReg) - : MRI.getMaxLaneMaskForVReg(MO.getReg()); - for (LiveInterval::SubRange &SR : LI.subranges()) { - if ((SR.LaneMask & LM).none()) - continue; - if (!SR.liveAt(RematIdx)) - return false; - - // Early exit if all used lanes are checked. No need to continue. - LM &= ~SR.LaneMask; - if (LM.none()) - break; - } - } - } - return true; -} - bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { const Function &F = MF.getFunction(); @@ -1812,9 +1755,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { // Do not rematerialize an instruction it it uses registers that aren't // available at its use. This ensures that we are not extending any live // range while rematerializing. - SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI); SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true); - if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx)) + if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI, + *DAG.TII)) continue; REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 06b9b64091f00..8ea42677454e4 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -496,12 +496,6 @@ class PreRARematStage : public GCNSchedStage { /// stage to their pre-stage values. void finalizeGCNSchedStage() override; - /// \p Returns true if all the uses in \p InstToRemat defined at \p - /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual - /// reg uses. - bool allUsesAvailableAt(const MachineInstr *InstToRemat, - SlotIndex OriginalIdx, SlotIndex RematIdx) const; - public: bool initGCNSchedStage() override; From cd0f560cc7e88bffedc4c34e3eb3efbf00dcb3ef Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 1 Oct 2025 15:22:45 +0100 Subject: [PATCH 379/878] [AArch64][SME] Precommit tests for LUT4I `Chain` issues (NFC) (#161505) These tests show that `luti4` intrinsics are currently incorrectly CSD'd. --- llvm/test/CodeGen/AArch64/pr161420.ll | 51 +++++++++++++++++++ .../AArch64/sme2-intrinsics-luti4-lane-x4.ll | 18 +++++++ .../CodeGen/AArch64/sme2-intrinsics-luti4.ll | 20 ++++++++ 3 files changed, 89 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/pr161420.ll diff --git a/llvm/test/CodeGen/AArch64/pr161420.ll b/llvm/test/CodeGen/AArch64/pr161420.ll new file mode 100644 index 0000000000000..515a1bf47cc1e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/pr161420.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64-apple-macosx15.0.0" + +; From: https://github.com/llvm/llvm-project/issues/161420. This test checks that +; two `luti4` instructions are emitted. FIXME: This is currently broken! 
+define void @pluto(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) #0 { +; CHECK-LABEL: pluto: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: mov w8, #0 ; =0x0 +; CHECK-NEXT: ldr zt0, [x1] +; CHECK-NEXT: ldr z0, [x3] +; CHECK-NEXT: ptrue pn8.h +; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0] +; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0] +; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z4.h - z7.h }, { z0.h - z3.h } +; CHECK-NEXT: ret +bb: + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg1) + %load = load , ptr %arg3, align 16 + %call = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() + %call4 = tail call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %call, ptr %arg) + %extractvalue = extractvalue { , , , } %call4, 0 + %extractvalue5 = extractvalue { , , , } %call4, 1 + %extractvalue6 = extractvalue { , , , } %call4, 2 + %extractvalue7 = extractvalue { , , , } %call4, 3 + %call8 = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, %load, i32 0) + %extractvalue9 = extractvalue { , , , } %call8, 0 + %extractvalue10 = extractvalue { , , , } %call8, 1 + %extractvalue11 = extractvalue { , , , } %call8, 2 + %extractvalue12 = extractvalue { , , , } %call8, 3 + tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 0, %extractvalue, %extractvalue5, %extractvalue6, %extractvalue7, %extractvalue9, %extractvalue10, %extractvalue11, %extractvalue12) + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg2) + %call13 = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 0, %load, i32 0) + %extractvalue14 = extractvalue { , , , } %call13, 0 + %extractvalue15 = extractvalue { , , , } %call13, 1 + %extractvalue16 = extractvalue { , , , } %call13, 2 + %extractvalue17 = extractvalue { , , , } %call13, 3 + tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32 2, %extractvalue, %extractvalue5, %extractvalue6, %extractvalue7, %extractvalue14, %extractvalue15, %extractvalue16, %extractvalue17) + ret void +} + +declare void @llvm.aarch64.sme.ldr.zt(i32, ptr) +declare target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c16() +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8f16(i32 immarg, , i32 immarg) +declare void @llvm.aarch64.sme.fmla.vg1x4.nxv8f16(i32, , , , , , , , ) + +attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn uwtable(sync) "aarch64_inout_za" "aarch64_inout_zt0" "aarch64_pstate_sm_enabled" "target-cpu"="apple-m1" "target-features"="+fp-armv8,+lse,+neon,+sme,+sme-f16f16,+sme2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a" } diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll index 92d3e1182bf34..cf306e5238018 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll @@ -48,6 +48,24 @@ define {, , , , , , } %res } +; Tests multiple identical luti4 intrinsics with ZT0 loads interspersed, are not CSD'd. +; FIXME: This is currently broken! 
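+; (The interleaved ldr.zt calls reload ZT0 between the two lookups, so
+; reusing the first call's results for the second would read stale table
+; contents.)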
+define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, %x) { +; CHECK-LABEL: test_multiple_luti4_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1] +; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3 +; CHECK-NEXT: ret + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrA) + %res1 = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, %x, i32 1) + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrB) + %res2 = call {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4f32(i32 0, %x, i32 1) + + call void (...) @llvm.fake.use({, , , } %res1) + call void (...) @llvm.fake.use({, , , } %res2) + ret void +} + declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32, , i32) declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv4i32(i32, , i32) declare {, , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8bf16(i32, , i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll index 778f31194baf4..0024b70bd7c8f 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll @@ -14,4 +14,24 @@ define {, , , , , , } %res } +; Tests multiple identical luti4 intrinsics with ZT0 loads interspersed, are not CSD'd. +; FIXME: This is currently broken! +define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, %v0, %v1) #0 { +; CHECK-LABEL: test_multiple_luti4_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 } +; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3 +; CHECK-NEXT: ret + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrA) + %res1 = call {, , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, %v0, %v1) + tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %ptrB) + %res2 = call {, , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, %v0, %v1) + + call void (...) @llvm.fake.use({ , , , } %res1) + call void (...) @llvm.fake.use({ , , , } %res2) + ret void +} + attributes #0 = { "target-features"="+sme2,+sme-lutv2"} From 8f77621574176387f906b8ceef9e1abb90bf22f6 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 1 Oct 2025 16:24:48 +0200 Subject: [PATCH 380/878] [clang] Convert second arg of __builtin_assume_aligned to ConstantExpr (#161314) Since the second argument must be a constant integer, we can as well convert it to a `ConstantExpr` in Sema. Fixes https://github.com/llvm/llvm-project/issues/161272 --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Sema/SemaChecking.cpp | 3 +++ clang/test/SemaCXX/builtin-assume-aligned.cpp | 6 ++++++ 3 files changed, 11 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 97799aeadfc7e..b1ddfa070318b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -246,6 +246,8 @@ Non-comprehensive list of changes in this release - ``__builtin_assume_dereferenceable`` now accepts non-constant size operands. +- Fixed a crash when the second argument to ``__builtin_assume_aligned`` was not constant (#GH161314) + New Compiler Flags ------------------ - New option ``-fno-sanitize-debug-trap-reasons`` added to disable emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``). 
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 39c3aa2243338..7b37e0b8d5430 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5954,6 +5954,9 @@ bool Sema::BuiltinAssumeAligned(CallExpr *TheCall) { if (Result > Sema::MaximumAlignment) Diag(TheCall->getBeginLoc(), diag::warn_assume_aligned_too_great) << SecondArg->getSourceRange() << Sema::MaximumAlignment; + + TheCall->setArg(1, + ConstantExpr::Create(Context, SecondArg, APValue(Result))); } if (NumArgs > 2) { diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp index 48bd8414fc50a..afc11cc694705 100644 --- a/clang/test/SemaCXX/builtin-assume-aligned.cpp +++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -47,3 +47,9 @@ constexpr void *s1 = __builtin_assume_aligned(x, 32); constexpr void *s2 = __builtin_assume_aligned(x, 32, 5); constexpr void *s3 = __builtin_assume_aligned(x, 32, -1); + +constexpr int add(int a, int b) { + return a+b; +} +constexpr void *c1 = __builtin_assume_aligned(p, add(1,1)); +constexpr void *c2 = __builtin_assume_aligned(p, add(2,1)); // expected-error {{not a power of 2}} From 50c8e5d730ac55454ef5c1f58fbc9096e946240c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 1 Oct 2025 15:38:15 +0100 Subject: [PATCH 381/878] [X86] SimplifyDemandedBitsForTargetNode - generalize X86ISD::VSRAI handling when only demanding 'known signbits' (#161523) If we only demand bits that already match the signbit then we don't need to shift. Generalizes an existing pattern that just handled signbit-only demanded bits to match what we do for ISD::SRA. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +++++-- llvm/test/CodeGen/X86/combine-pack.ll | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cd04ff5bc7ef4..a04c3e8f70c89 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44615,8 +44615,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( APInt DemandedMask = OriginalDemandedBits << ShAmt; - // If we just want the sign bit then we don't need to shift it. - if (OriginalDemandedBits.isSignMask()) + // If we only want bits that already match the signbit then we don't need + // to shift. 
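+    // This generalizes the previous special case, which required the
+    // demanded mask to be exactly the sign bit.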
+ unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero(); + if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >= + NumHiDemandedBits) return TLO.CombineTo(Op, Op0); // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 diff --git a/llvm/test/CodeGen/X86/combine-pack.ll b/llvm/test/CodeGen/X86/combine-pack.ll index 9e740b04073e0..1e7c700055e3f 100644 --- a/llvm/test/CodeGen/X86/combine-pack.ll +++ b/llvm/test/CodeGen/X86/combine-pack.ll @@ -5,14 +5,12 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) -; TODO: Failure to remove unnecessary signsplat define <8 x i16> @combine_packss_v4i32_signsplat(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: combine_packss_v4i32_signsplat: ; SSE: # %bb.0: ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: psraw $15, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_packss_v4i32_signsplat: @@ -20,7 +18,6 @@ define <8 x i16> @combine_packss_v4i32_signsplat(<4 x i32> %a0, <4 x i32> %a1) { ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX-NEXT: retq %cmp = icmp sgt <4 x i32> %a0, %a1 %ext = sext <4 x i1> %cmp to <4 x i32> From 13ce5f249ed911971fe899e318d08765399d8ce1 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 1 Oct 2025 07:42:37 -0700 Subject: [PATCH 382/878] [OpenACC] Remove unnecessary uses of `getResult`, fix cast tests (#161526) A previous review comment pointed out that operations with only a single result implicitly convert to `mlir::Value`. This patch removes the explicit use of `getResult` where it is unnecessary in OpenACC lowering. However, there ARE a few cases where it is necessary where the `mlir::ValueRange` implicit constructor from a single value is being used, so those are untouched. Additionally, while the previous patch was being committed (#161382), a second patch (#161431) changed the format of cir.casts, so this patch fixes the additional test lines for that as well. 
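
A minimal sketch of the cleanup (illustrative only; `op` stands in for any
single-result operation):

  mlir::Value v1 = op.getResult(); // before: explicit unwrap
  mlir::Value v2 = op;             // after: implicit conversion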
--- clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp | 18 ++++++------- clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp | 26 +++++++++---------- ...-clause-pointer-array-recipes-CtorDtor.cpp | 8 +++--- ...ate-clause-pointer-array-recipes-NoOps.cpp | 8 +++--- ...ivate-clause-pointer-array-recipes-int.cpp | 8 +++--- 6 files changed, 34 insertions(+), 36 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp index 7f9350a9e4173..a9af753381db3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp @@ -62,7 +62,7 @@ mlir::Value CIRGenFunction::createOpenACCConstantInt(mlir::Location loc, auto constOp = builder.create( loc, builder.getIntegerAttr(ty, value)); - return constOp.getResult(); + return constOp; } CIRGenFunction::OpenACCDataOperandInfo diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 3cf053449458f..f22f3e8845f8a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -110,7 +110,7 @@ class OpenACCClauseCIREmitter final auto constOp = builder.create( loc, builder.getIntegerAttr(ty, value)); - return constOp.getResult(); + return constOp; } mlir::Value createConstantInt(SourceLocation loc, unsigned width, @@ -230,13 +230,13 @@ class OpenACCClauseCIREmitter final std::is_same_v) { // Detach/Delete ops don't have the variable reference here, so they // take 1 fewer argument to their build function. - afterOp = builder.create( - opInfo.beginLoc, beforeOp.getResult(), structured, implicit, - opInfo.name, opInfo.bounds); + afterOp = + builder.create(opInfo.beginLoc, beforeOp, structured, + implicit, opInfo.name, opInfo.bounds); } else { afterOp = builder.create( - opInfo.beginLoc, beforeOp.getResult(), opInfo.varValue, structured, - implicit, opInfo.name, opInfo.bounds); + opInfo.beginLoc, beforeOp, opInfo.varValue, structured, implicit, + opInfo.name, opInfo.bounds); } } @@ -1005,7 +1005,7 @@ class OpenACCClauseCIREmitter final /*temporary=*/nullptr, OpenACCReductionOperator::Invalid, Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, - privateOp.getResult()); + privateOp); // TODO: OpenACC: The dialect is going to change in the near future to // have these be on a different operation, so when that changes, we // probably need to change these here. 
@@ -1053,7 +1053,7 @@ class OpenACCClauseCIREmitter final OpenACCReductionOperator::Invalid, Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, - firstPrivateOp.getResult()); + firstPrivateOp); // TODO: OpenACC: The dialect is going to change in the near future to // have these be on a different operation, so when that changes, we @@ -1101,7 +1101,7 @@ class OpenACCClauseCIREmitter final /*temporary=*/nullptr, clause.getReductionOp(), Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, - reductionOp.getResult()); + reductionOp); operation.addReduction(builder.getContext(), reductionOp, recipe); } diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp index 25cacbb73eb03..e603884d08e3c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -240,7 +240,7 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, if (auto arrayTy = dyn_cast(eltTy)) return builder.getArrayElement(loc, loc, subVal, arrayTy.getElementType(), - idxLoad.getResult(), + idxLoad, /*shouldDecay=*/true); assert(isa(eltTy)); @@ -248,8 +248,8 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, auto eltLoad = cir::LoadOp::create(builder, loc, {subVal}); return cir::PtrStrideOp::create(builder, loc, eltLoad.getType(), eltLoad, - idxLoad.getResult()) - .getResult(); + idxLoad); + }; auto forStmtBuilder = [&]() { @@ -271,12 +271,11 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, if (inverse) { cir::ConstantOp constOne = builder.getConstInt(loc, itrTy, 1); - auto sub = - cir::BinOp::create(builder, loc, itrTy, cir::BinOpKind::Sub, - ubConversion.getResult(0), constOne.getResult()); + auto sub = cir::BinOp::create(builder, loc, itrTy, cir::BinOpKind::Sub, + ubConversion.getResult(0), constOne); // Upperbound is exclusive, so subtract 1. - builder.CIRBaseBuilderTy::createStore(loc, sub.getResult(), itr); + builder.CIRBaseBuilderTy::createStore(loc, sub, itr); } else { // Lowerbound is inclusive, so we can include it. builder.CIRBaseBuilderTy::createStore(loc, lbConversion.getResult(0), @@ -294,8 +293,8 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, auto loadCur = cir::LoadOp::create(builder, loc, {itr}); // Use 'not equal' since we are just doing an increment/decrement. auto cmp = builder.createCompare( - loc, inverse ? cir::CmpOpKind::ge : cir::CmpOpKind::lt, - loadCur.getResult(), endItr.getResult(0)); + loc, inverse ? cir::CmpOpKind::ge : cir::CmpOpKind::lt, loadCur, + endItr.getResult(0)); builder.createCondition(cmp); }, /*bodyBuilder=*/ @@ -309,11 +308,10 @@ OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, /*stepBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { auto load = cir::LoadOp::create(builder, loc, {itr}); - auto unary = cir::UnaryOp::create(builder, loc, load.getType(), - inverse ? cir::UnaryOpKind::Dec - : cir::UnaryOpKind::Inc, - load.getResult()); - builder.CIRBaseBuilderTy::createStore(loc, unary.getResult(), itr); + auto unary = cir::UnaryOp::create( + builder, loc, load.getType(), + inverse ? 
cir::UnaryOpKind::Dec : cir::UnaryOpKind::Inc, load); + builder.CIRBaseBuilderTy::createStore(loc, unary, itr); builder.createYield(loc); }); }; diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp index da56de005d5f2..4d0e481a9d19e 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp @@ -502,7 +502,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -721,7 +721,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> x 5>> -> !cir.ptr>> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> // // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index @@ -882,7 +882,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> x 5>> -> !cir.ptr>> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -1281,7 +1281,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp index f9230dcaf1691..4687320d3fca2 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp @@ -330,7 +330,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[UB2]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -440,7 +440,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> x 5>> -> !cir.ptr>> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> // // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index @@ -521,7 +521,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> x 5>> -> !cir.ptr>> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -767,7 +767,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp index 45d8b78a4a128..db5d5782bcecd 100644 --- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp @@ -327,7 +327,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -437,7 +437,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i // // CHECK-NEXT: 
%[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> x 5>> -> !cir.ptr>> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> // // CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index @@ -518,7 +518,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[TL_ALLOCA]] : !cir.ptr> x 5>>), !cir.ptr>> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[TL_ALLOCA]] : !cir.ptr> x 5>> -> !cir.ptr>> // CHECK-NEXT: %[[TL_DEREF:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>>, %[[ZERO]] : !u64i), !cir.ptr>> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index @@ -765,7 +765,7 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: %[[NUM_ELTS:.*]] = cir.binop(mul, %[[UB2_CAST]], %[[UB3_CAST]]) : !u64i // // CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !u64i -// CHECK-NEXT: %[[DECAY:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_ALLOCA]] : !cir.ptr x 5>>), !cir.ptr> +// CHECK-NEXT: %[[DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR_ALLOCA]] : !cir.ptr x 5>> -> !cir.ptr> // CHECK-NEXT: %[[STRIDE:.*]] = cir.ptr_stride(%[[DECAY]] : !cir.ptr>, %[[ZERO]] : !u64i), !cir.ptr> // // CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index From 9e0c0a09392ff7d3b196b17c77595e921e35d765 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 1 Oct 2025 15:46:12 +0100 Subject: [PATCH 383/878] [LLVM][SCEV] udiv (mul nuw a, vscale), (mul nuw b, vscale) -> udiv a, b (#157836) --- .../Analysis/ScalarEvolutionPatternMatch.h | 23 ++++- llvm/lib/Analysis/ScalarEvolution.cpp | 7 ++ .../ScalarEvolution/mul-udiv-folds.ll | 40 +++++++++ .../LoopVectorize/AArch64/sve-epilog-vect.ll | 88 +++++++------------ 4 files changed, 97 insertions(+), 61 deletions(-) diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h index 7a45ae93b185b..164b46b54890b 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h +++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h @@ -184,6 +184,7 @@ m_scev_PtrToInt(const Op0_t &Op0) { /// Match a binary SCEV. 
template struct SCEVBinaryExpr_match { Op0_t Op0; @@ -192,6 +193,10 @@ struct SCEVBinaryExpr_match { SCEVBinaryExpr_match(Op0_t Op0, Op1_t Op1) : Op0(Op0), Op1(Op1) {} bool match(const SCEV *S) const { + if (auto WrappingS = dyn_cast(S)) + if (WrappingS->getNoWrapFlags(WrapFlags) != WrapFlags) + return false; + auto *E = dyn_cast(S); return E && E->getNumOperands() == 2 && ((Op0.match(E->getOperand(0)) && Op1.match(E->getOperand(1))) || @@ -201,10 +206,12 @@ struct SCEVBinaryExpr_match { }; template -inline SCEVBinaryExpr_match +inline SCEVBinaryExpr_match m_scev_Binary(const Op0_t &Op0, const Op1_t &Op1) { - return SCEVBinaryExpr_match(Op0, Op1); + return SCEVBinaryExpr_match(Op0, + Op1); } template @@ -220,9 +227,17 @@ m_scev_Mul(const Op0_t &Op0, const Op1_t &Op1) { } template -inline SCEVBinaryExpr_match +inline SCEVBinaryExpr_match m_scev_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { - return m_scev_Binary(Op0, Op1); + return m_scev_Binary(Op0, + Op1); +} + +template +inline SCEVBinaryExpr_match +m_scev_c_NUWMul(const Op0_t &Op0, const Op1_t &Op1) { + return m_scev_Binary(Op0, + Op1); } template diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index b08399b381f34..63e1b1462d007 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -3598,6 +3598,13 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS, } } + // TODO: Generalize to handle any common factors. + // udiv (mul nuw a, vscale), (mul nuw b, vscale) --> udiv a, b + const SCEV *NewLHS, *NewRHS; + if (match(LHS, m_scev_c_NUWMul(m_SCEV(NewLHS), m_SCEVVScale())) && + match(RHS, m_scev_c_NUWMul(m_SCEV(NewRHS), m_SCEVVScale()))) + return getUDivExpr(NewLHS, NewRHS); + // The Insertion Point (IP) might be invalid by now (due to UniqueSCEVs // changes). Make sure we get a new one. 
IP = nullptr; diff --git a/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll index 1e21fbf08a92f..e1c62309142d0 100644 --- a/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll +++ b/llvm/test/Analysis/ScalarEvolution/mul-udiv-folds.ll @@ -188,3 +188,43 @@ loop: exit: ret void } + +define noundef i64 @udiv_mul_common_vscale_factor(i64 %a, i64 %b) { +; CHECK-LABEL: 'udiv_mul_common_vscale_factor' +; CHECK-NEXT: Classifying expressions for: @udiv_mul_common_vscale_factor +; CHECK-NEXT: %vs = call i64 @llvm.vscale.i64() +; CHECK-NEXT: --> vscale U: [1,0) S: [1,0) +; CHECK-NEXT: %a.vs = mul i64 %a, %vs +; CHECK-NEXT: --> (vscale * %a) U: full-set S: full-set +; CHECK-NEXT: %b.vs = mul i64 %b, %vs +; CHECK-NEXT: --> (vscale * %b) U: full-set S: full-set +; CHECK-NEXT: %div = udiv i64 %a.vs, %b.vs +; CHECK-NEXT: --> ((vscale * %a) /u (vscale * %b)) U: full-set S: full-set +; CHECK-NEXT: Determining loop execution counts for: @udiv_mul_common_vscale_factor +; + %vs = call i64 @llvm.vscale() + %a.vs = mul i64 %a, %vs + %b.vs = mul i64 %b, %vs + %div = udiv i64 %a.vs, %b.vs + ret i64 %div +} + +define noundef i64 @udiv_mul_nuw_common_vscale_factor(i64 %a, i64 %b) { +; CHECK-LABEL: 'udiv_mul_nuw_common_vscale_factor' +; CHECK-NEXT: Classifying expressions for: @udiv_mul_nuw_common_vscale_factor +; CHECK-NEXT: %vs = call i64 @llvm.vscale.i64() +; CHECK-NEXT: --> vscale U: [1,0) S: [1,0) +; CHECK-NEXT: %a.vs = mul nuw i64 %a, %vs +; CHECK-NEXT: --> (vscale * %a) U: full-set S: full-set +; CHECK-NEXT: %b.vs = mul nuw i64 %b, %vs +; CHECK-NEXT: --> (vscale * %b) U: full-set S: full-set +; CHECK-NEXT: %div = udiv i64 %a.vs, %b.vs +; CHECK-NEXT: --> (%a /u %b) U: full-set S: full-set +; CHECK-NEXT: Determining loop execution counts for: @udiv_mul_nuw_common_vscale_factor +; + %vs = call i64 @llvm.vscale() + %a.vs = mul nuw i64 %a, %vs + %b.vs = mul nuw i64 %b, %vs + %div = udiv i64 %a.vs, %b.vs + ret i64 %div +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll index 8d33ccbf38861..bbc0e33af8c84 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -49,7 +49,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -59,7 +59,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-NEXT: store <8 x i8> splat (i8 1), ptr [[TMP9]], align 1 ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 -; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: 
vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -97,7 +97,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: ; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -107,7 +107,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8-NEXT: store <8 x i8> splat (i8 1), ptr [[TMP9]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 ; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 -; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: @@ -150,7 +150,7 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) { ; CHECK-NEXT: store splat (i64 1), ptr [[TMP5]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] @@ -182,13 +182,13 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: store splat (i64 1), ptr [[TMP7]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-VF8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-VF8-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-VF8: middle.block: ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: ; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] @@ -198,7 +198,7 @@ define void 
@main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP9]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 ; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 -; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: @@ -261,13 +261,13 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) { ; CHECK-NEXT: store splat (i64 1), ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8 @@ -279,7 +279,7 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) { ; CHECK-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP9]], align 1 ; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -313,13 +313,13 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: store splat (i64 1), ptr [[TMP7]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-VF8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-VF8-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-VF8: middle.block: ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: ; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label 
[[VEC_EPILOG_PH]] +; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-VF8-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8 @@ -331,7 +331,7 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP9]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 8 ; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]] -; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-VF8-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -382,14 +382,14 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: store zeroinitializer, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[START]], i64 10000 @@ -400,7 +400,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr [[NEXT_GEP2]], align 1 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 10000 -; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -433,14 +433,14 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP6]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-VF8-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-VF8-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-VF8-NEXT: br i1 
[[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF8: middle.block: ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: ; CHECK-VF8-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] ; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-VF8-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[START]], i64 10000 @@ -451,7 +451,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: store <8 x i8> zeroinitializer, ptr [[NEXT_GEP2]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 8 ; CHECK-VF8-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 10000 -; CHECK-VF8-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: @@ -514,13 +514,13 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1 ; CHECK-NEXT: store [[TMP13]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF15:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[N]], 2 @@ -536,7 +536,7 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1 ; CHECK-NEXT: store <2 x float> [[TMP20]], ptr [[TMP19]], align 4 ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N]], 
[[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -576,7 +576,7 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1 ; CHECK-VF8-NEXT: store [[TMP11]], ptr [[TMP9]], align 4 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-VF8-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-VF8-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-VF8: middle.block: ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] @@ -606,18 +606,12 @@ exit: } ; Loop with vscale-based trip count vscale x 1024. -; TODO: No epilogue vectorizations should remain when choosing VF = vscale x 4. define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 { ; CHECK-LABEL: @trip_count_vscale_no_epilogue_iterations( -; CHECK-NEXT: iter.check: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1024 -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 @@ -625,7 +619,7 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 @@ -644,31 +638,11 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia ; CHECK-NEXT: store [[TMP13]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX4]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP18]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX4]] -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP19]], align 4 -; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x float> [[WIDE_LOAD5]], [[WIDE_LOAD6]] -; CHECK-NEXT: store <2 x float> [[TMP20]], ptr [[TMP19]], align 4 -; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 2 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N]] -; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] +; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; @@ -703,7 +677,7 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia ; CHECK-VF8-NEXT: store [[TMP11]], ptr [[TMP9]], align 4 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-VF8-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-VF8-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-VF8: middle.block: ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] From edb80a8d7fb2e1e29a4cb792148b4ffd441eb114 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 1 Oct 2025 15:53:10 +0100 Subject: [PATCH 384/878] [lldb][test] Fix bf16 test cases on Arm 32-bit (#161528) Fixes #157674 On ARM, the presence of a specific bf16 type in the AST is gated by: ``` bool ARMTargetInfo::hasBFloat16Type() const { // The __bf16 type is generally available so long as we have any fp registers. return HasBFloat16 || (FPU && !SoftFloat); } ``` And the target we use when evaluating symbols (derived from the program file, I think, haven't found it yet) does not enable any of this. This means that we fall back to __fp16. So for parts of the testing we just need to expect __fp16 instead, and others we need to skip because now a class looks like it inherits from itself. There's a proper fix here somewhere but I don't know what it is yet. 
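As a rough illustration of the gating described above (hypothetical names, not the actual clang classes), the expected half-precision type follows directly from the same predicate the quoted snippet uses:

```
// Hedged sketch, not clang's real TargetInfo: models the quoted gate
// HasBFloat16 || (FPU && !SoftFloat) and the resulting type fallback.
struct TargetLike {
  bool HasBFloat16 = false;
  bool HasFPU = false;   // assumed target has no FP registers
  bool SoftFloat = true; // assumed soft-float mode
  bool hasBFloat16Type() const { return HasBFloat16 || (HasFPU && !SoftFloat); }
};

// With none of the features enabled, the tests must expect __fp16.
inline const char *expectedHalfType(const TargetLike &T) {
  return T.hasBFloat16Type() ? "__bf16" : "__fp16";
}
```
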
--- .../TestCppFloatingTypesSpecialization.py | 22 +++++++++++++++---- .../TestCppTemplateArguments.py | 9 ++++++-- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py b/lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py index 9564a0bc31809..f4530cd545046 100644 --- a/lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py +++ b/lldb/test/API/lang/cpp/floating-types-specialization/TestCppFloatingTypesSpecialization.py @@ -1,4 +1,5 @@ import lldb +import lldbsuite.test.lldbplatformutil as lldbplatformutil from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil @@ -11,12 +12,25 @@ def test(self): self, "// break here", lldb.SBFileSpec("main.cpp", False) ) - self.expect_expr("f0", result_type="Foo<__bf16>") - self.expect_expr("f1", result_type="Foo<__fp16>") + # On 32-bit Arm, you have to have the bfloat16 extension, or an FPU while + # not using the soft float mode. The target we assume has none of that + # so instead of __bf16 we get __fp16. + is_arm_32_bit = lldbplatformutil.getArchitecture() == "arm" + + self.expect_expr( + "f0", result_type=("Foo<__fp16>" if is_arm_32_bit else "Foo<__bf16>") + ) + + # When __bf16 is actually __fp16, f1 looks like it inherits from itself. + # Which clang allows but LLDB fails to evaluate. + if not is_arm_32_bit: + self.expect_expr("f1", result_type="Foo<__fp16>") # Test sizeof to ensure while computing layout we don't do # infinite recursion. v = self.frame().EvaluateExpression("sizeof(f0)") self.assertEqual(v.GetValueAsUnsigned() > 0, True) - v = self.frame().EvaluateExpression("sizeof(f1)") - self.assertEqual(v.GetValueAsUnsigned() > 0, True) + + if not is_arm_32_bit: + v = self.frame().EvaluateExpression("sizeof(f1)") + self.assertEqual(v.GetValueAsUnsigned() > 0, True) diff --git a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py index f26d382bf8582..83c057220410a 100644 --- a/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py +++ b/lldb/test/API/lang/cpp/template-arguments/TestCppTemplateArguments.py @@ -1,4 +1,5 @@ import lldb +import lldbsuite.test.lldbplatformutil as lldbplatformutil from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil @@ -82,8 +83,12 @@ def test(self): value = self.expect_expr("temp7", result_type="Foo<__fp16, __fp16>") self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) - value = self.expect_expr("temp8", result_type="Foo<__bf16, __bf16>") - self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) + # The target we use when evaluating these expressions for Arm leads to there + # not being a __bf16 type in the AST so we fall back to __fp16 and evaluating + # this fails. 
+ if lldbplatformutil.getArchitecture() != "arm": + value = self.expect_expr("temp8", result_type="Foo<__bf16, __bf16>") + self.assertFalse(value.GetType().GetTemplateArgumentValue(target, 1)) value = self.expect_expr("temp9", result_type="Bar") template_param_value = value.GetType().GetTemplateArgumentValue(target, 1) From 664b227089a2d3a72b15018018c5e8e4e639f944 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Wed, 1 Oct 2025 16:01:54 +0100 Subject: [PATCH 385/878] [LV] Keep duplicate recipes in VPExpressionRecipe (#156976) The VPExpressionRecipe class uses a set to store its bundled recipes. If repeated recipes are bundled then the duplicates will be lost, causing the following recipes to not be at the expected place in the set. When printing a reduce.add(mul(ext, ext)) bundle, for example, if the extends are the same then the 3rd element of the set will be the reduction, rather than the expected mul, causing a cast error. With this change, the recipes are at the expected index in the set. Fixes #156464 --- llvm/lib/Transforms/Vectorize/VPlan.h | 11 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 17 ++- .../LoopVectorize/reduction-inloop.ll | 123 ++++++++++++++++++ .../vplan-printing-reductions.ll | 47 +++++++ 4 files changed, 189 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 10d704df289c8..c167dd7f65fac 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -29,6 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" @@ -2977,7 +2978,8 @@ class LLVM_ABI_FOR_TEST VPBranchOnMaskRecipe : public VPRecipeBase { /// the expression is elevated to connect the non-expression recipe with the /// VPExpressionRecipe itself. class VPExpressionRecipe : public VPSingleDefRecipe { - /// Recipes included in this VPExpressionRecipe. + /// Recipes included in this VPExpressionRecipe. This could contain + /// duplicates. SmallVector ExpressionRecipes; /// Temporary VPValues used for external operands of the expression, i.e. 
@@ -3039,8 +3041,11 @@ class VPExpressionRecipe : public VPSingleDefRecipe { } ~VPExpressionRecipe() override { - for (auto *R : reverse(ExpressionRecipes)) - delete R; + SmallPtrSet ExpressionRecipesSeen; + for (auto *R : reverse(ExpressionRecipes)) { + if (ExpressionRecipesSeen.insert(R).second) + delete R; + } for (VPValue *T : LiveInPlaceholders) delete T; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3a55710d59b08..46909a53a9547 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2755,10 +2755,7 @@ VPExpressionRecipe::VPExpressionRecipe( ExpressionTypes ExpressionType, ArrayRef ExpressionRecipes) : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}), - ExpressionRecipes(SetVector( - ExpressionRecipes.begin(), ExpressionRecipes.end()) - .takeVector()), - ExpressionType(ExpressionType) { + ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) { assert(!ExpressionRecipes.empty() && "Nothing to combine?"); assert( none_of(ExpressionRecipes, @@ -2802,14 +2799,22 @@ VPExpressionRecipe::VPExpressionRecipe( continue; addOperand(Op); LiveInPlaceholders.push_back(new VPValue()); - R->setOperand(Idx, LiveInPlaceholders.back()); } } + + // Replace each external operand with the first one created for it in + // LiveInPlaceholders. + for (auto *R : ExpressionRecipes) + for (auto const &[LiveIn, Tmp] : zip(operands(), LiveInPlaceholders)) + R->replaceUsesOfWith(LiveIn, Tmp); } void VPExpressionRecipe::decompose() { for (auto *R : ExpressionRecipes) - R->insertBefore(this); + // Since the list could contain duplicates, make sure the recipe hasn't + // already been inserted. + if (!R->getParent()) + R->insertBefore(this); for (const auto &[Idx, Op] : enumerate(operands())) LiveInPlaceholders[Idx]->replaceAllUsesWith(Op); diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index f4d4cca0d4220..f0183047f694b 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -3351,6 +3351,129 @@ for.end: ; preds = %for.body, %entry ret i32 %x.0.lcssa } +; Test that bundling recipes that share an operand into an expression works. +; In this case the two extends are the recipes that share an operand. 
+define i64 @reduction_expression_same_operands(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: define i64 @reduction_expression_same_operands( +; CHECK-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i64> [[TMP3]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]]) +; CHECK-NEXT: [[TMP6]] = add i64 [[VEC_PHI]], [[TMP5]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[IV]] +; CHECK-NEXT: [[LOAD0:%.*]] = load i16, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CONV0:%.*]] = sext i16 [[LOAD0]] to i32 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD0]] to i32 +; CHECK-NEXT: [[MUL1:%.*]] = mul nsw i32 [[CONV0]], [[CONV1]] +; CHECK-NEXT: [[MUL:%.*]] = sext i32 [[MUL1]] to i64 +; CHECK-NEXT: [[RDX_NEXT]] = add nsw i64 [[RDX]], [[MUL]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_0_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define i64 @reduction_expression_same_operands( +; CHECK-INTERLEAVED-SAME: ptr readonly captures(none) [[X:%.*]], ptr readonly captures(none) [[Y:%.*]], i32 [[N:%.*]]) { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 8 +; 
CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP1]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, ptr [[TMP2]], align 4 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul nsw <4 x i64> [[TMP4]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add i64 [[VEC_PHI]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_LOAD2]] to <4 x i64> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nsw <4 x i64> [[TMP9]], [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP10]]) +; CHECK-INTERLEAVED-NEXT: [[TMP12]] = add i64 [[VEC_PHI1]], [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i64 [[TMP12]], [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[LOOP:.*]] +; CHECK-INTERLEAVED: [[LOOP]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[RDX:%.*]] = phi i64 [ [[RDX_NEXT:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD0:%.*]] = load i16, ptr [[ARRAYIDX]], align 4 +; CHECK-INTERLEAVED-NEXT: [[CONV0:%.*]] = sext i16 [[LOAD0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD0]] to i32 +; CHECK-INTERLEAVED-NEXT: [[MUL1:%.*]] = mul nsw i32 [[CONV0]], [[CONV1]] +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = sext i32 [[MUL1]] to i64 +; CHECK-INTERLEAVED-NEXT: [[RDX_NEXT]] = add nsw i64 [[RDX]], [[MUL]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT]], %[[LOOP]] ], [ 
[[BIN_RDX]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret i64 [[R_0_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv + %load0 = load i16, ptr %arrayidx, align 4 + %conv0 = sext i16 %load0 to i32 + %conv1 = sext i16 %load0 to i32 + %mul = mul nsw i32 %conv0, %conv1 + %conv = sext i32 %mul to i64 + %rdx.next = add nsw i64 %rdx, %conv + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} + declare float @llvm.fmuladd.f32(float, float, float) !6 = distinct !{!6, !7, !8} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 5a0c69bf5db1b..06b044872c217 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -753,3 +753,50 @@ exit: %r.0.lcssa = phi i64 [ %rdx.next, %loop ] ret i64 %r.0.lcssa } + +define i64 @print_mulacc_duplicate_extends(ptr nocapture readonly %x, ptr nocapture readonly %y, i32 %n) { +; CHECK-LABEL: 'print_mulacc_duplicate_extends' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<%n> = original trip-count +; CHECK-EMPTY: +; CHECK: vector.ph: +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX:%.+]]> = phi vp<[[RDX_START]]>, vp<[[RDX_NEXT:%.+]]> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; CHECK-NEXT: CLONE ir<[[ARRAYIDX0:%.+]]> = getelementptr inbounds ir<%x>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[ADDR0:%.+]]> = vector-pointer ir<[[ARRAYIDX0]]> +; CHECK-NEXT: WIDEN ir<[[LOAD0:%.+]]> = load vp<[[ADDR0]]> +; CHECK-NEXT: EXPRESSION vp<[[RDX_NEXT:%.+]]> = ir<[[RDX]]> + reduce.sub (mul nsw (ir<[[LOAD0]]> sext to i64), (ir<[[LOAD0]]> sext to i64)) +; CHECK-NEXT: EMIT vp<[[IV_NEXT]]> = add nuw vp<[[IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ] + %rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, ptr %x, i32 %iv + %load0 = load i16, ptr %arrayidx, align 4 + %conv0 = sext i16 %load0 to i32 + %mul = mul nsw i32 %conv0, %conv0 + %conv = sext i32 %mul to i64 + %rdx.next = sub nsw i64 %rdx, %conv + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond = icmp eq i32 %iv.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + %r.0.lcssa = phi i64 [ %rdx.next, %loop ] + ret i64 %r.0.lcssa +} From 7ae3eca6572fe463ed51e82e63890eb7d77096d7 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 1 Oct 2025 16:24:30 +0100 Subject: [PATCH 386/878] [lldb][test] Allow '.c' files to be used as shell tests (#161520) Required for https://github.com/llvm/llvm-project/pull/161521 --- lldb/test/Shell/lit.cfg.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index 505847fb763e0..cdc0cfe51f7c6 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -33,7 +33,7 @@ # suffixes: A list of file extensions to treat as test files. This is overriden # by individual lit.local.cfg files in the test subdirectories. -config.suffixes = [".test", ".cpp", ".s", ".m", ".ll"] +config.suffixes = [".test", ".cpp", ".s", ".m", ".ll", ".c"] # excludes: A list of directories to exclude from the testsuite. The 'Inputs' # subdirectories contain auxiliary inputs for various tests in their parent From f33564b9afaa34ffd92b754db7a5ff8ff5e60897 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 1 Oct 2025 16:26:55 +0100 Subject: [PATCH 387/878] [lldb][NFCI] Remove the non-const reference Mangled::GetMangledName accessor (#161495) We've been seen (very sporadic) lifetime issues around this area. We noticed that `GetMangledName` has two accessors, one of which returns a non-const reference. I audited all the callsites and no users of this overload actually mutate the `ConstString` itself (which is a suspicious thing to do anyway since it's just a wrapper around a `const char*`). This patch removes the redundant overload. rdar://161128180 --- lldb/include/lldb/Core/Mangled.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lldb/include/lldb/Core/Mangled.h b/lldb/include/lldb/Core/Mangled.h index 665accb3119e3..546d7a9b409ed 100644 --- a/lldb/include/lldb/Core/Mangled.h +++ b/lldb/include/lldb/Core/Mangled.h @@ -148,13 +148,7 @@ class Mangled { /// Mangled name get accessor. /// /// \return - /// A reference to the mangled name string object. - ConstString &GetMangledName() { return m_mangled; } - - /// Mangled name get accessor. - /// - /// \return - /// A const reference to the mangled name string object. + /// The mangled name string object. ConstString GetMangledName() const { return m_mangled; } /// Best name get accessor. 
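
To make the rationale of the accessor removal concrete, here is a minimal sketch (hypothetical types, not the LLDB classes) of the design choice: for a pointer-sized wrapper, returning by value costs the same as returning a reference while closing off the lifetime hazards.

```
// Hedged sketch: ConstStringLike stands in for lldb's ConstString,
// which is just a wrapper around a const char *.
struct ConstStringLike {
  const char *Ptr = nullptr;
};

class MangledLike {
  ConstStringLike m_mangled;

public:
  // Returning by value: callers get a copy of the pointer-sized wrapper,
  // so they can no longer mutate the member in place or hold a reference
  // that outlives the owning object.
  ConstStringLike GetMangledName() const { return m_mangled; }
};
```
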
From 0e17fb52da131a21ec5ad4878ee3b54b8deb5b59 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Wed, 1 Oct 2025 17:34:14 +0200 Subject: [PATCH 388/878] [CIR] Implement GenericSelectionExpr for AggregateExpr (#161003) Implement the GenericSelectionExpr for AggregateExpr --- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 3 +- clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 2 ++ clang/test/CIR/CodeGen/struct.cpp | 29 +++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index af42d1d882ae0..1e987f3bedc7e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -133,8 +133,7 @@ class AggExprEmitter : public StmtVisitor { } void VisitParenExpr(ParenExpr *pe) { Visit(pe->getSubExpr()); } void VisitGenericSelectionExpr(GenericSelectionExpr *ge) { - cgf.cgm.errorNYI(ge->getSourceRange(), - "AggExprEmitter: VisitGenericSelectionExpr"); + Visit(ge->getResultExpr()); } void VisitCoawaitExpr(CoawaitExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitCoawaitExpr"); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index a404c0c08d893..b26b4f2500579 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -836,6 +836,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { return emitCallExprLValue(cast(e)); case Expr::ParenExprClass: return emitLValue(cast(e)->getSubExpr()); + case Expr::GenericSelectionExprClass: + return emitLValue(cast(e)->getResultExpr()); case Expr::DeclRefExprClass: return emitDeclRefLValue(cast(e)); case Expr::CStyleCastExprClass: diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index 1dc16f3b79497..96db82a89977c 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -154,3 +154,32 @@ void choose_expr() { // OGCG: %[[B_ADDR:.*]] = alloca %struct.CompleteS, align 4 // OGCG: %[[C_ADDR:.*]] = alloca %struct.CompleteS, align 4 // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[C_ADDR]], ptr align 4 %[[A_ADDR]], i64 8, i1 false) + +void generic_selection() { + CompleteS a; + CompleteS b; + int c; + CompleteS d = _Generic(c, int : a, default: b); +} + +// CIR: cir.func{{.*}} @_Z17generic_selectionv() +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["b"] +// CIR: %[[C_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["c"] +// CIR: %[[D_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["d", init] +// TODO(cir): Call to default copy constructor should be replaced by `cir.copy` op +// CIR: cir.call @_ZN9CompleteSC1ERKS_(%[[D_ADDR]], %[[A_ADDR]]) nothrow : (!cir.ptr, !cir.ptr) -> () + +// LLVM: define{{.*}} void @_Z17generic_selectionv() +// LLVM: %1 = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: %2 = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: %3 = alloca i32, i64 1, align 4 +// LLVM: %4 = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: call void @_ZN9CompleteSC1ERKS_(ptr %4, ptr %1) + +// OGCG: define{{.*}} void @_Z17generic_selectionv() +// OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: %[[C_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[D_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[D_ADDR]], ptr align 4 
%[[A_ADDR]], i64 8, i1 false) From 1c7f40bf832222ed6b5435a6a19b4ef3c652bd9f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 1 Oct 2025 16:39:03 +0100 Subject: [PATCH 389/878] [X86] X86ISD::PACKSS/US do not create undef/poison (#161534) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 16 ++++++++++++++++ llvm/test/CodeGen/X86/combine-pack.ll | 3 --- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a04c3e8f70c89..34854e4d8b6c0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45172,6 +45172,18 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::Wrapper: case X86ISD::WrapperRIP: return true; + case X86ISD::PACKSS: + case X86ISD::PACKUS: { + APInt DemandedLHS, DemandedRHS; + getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS, + DemandedRHS); + return (!DemandedLHS || + DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS, + PoisonOnly, Depth + 1)) && + (!DemandedRHS || + DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS, + PoisonOnly, Depth + 1)); + } case X86ISD::INSERTPS: case X86ISD::BLENDI: case X86ISD::PSHUFB: @@ -45242,6 +45254,10 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( case X86ISD::BLENDI: case X86ISD::BLENDV: return false; + // SSE packs. + case X86ISD::PACKSS: + case X86ISD::PACKUS: + return false; // SSE target shuffles. case X86ISD::INSERTPS: case X86ISD::PSHUFB: diff --git a/llvm/test/CodeGen/X86/combine-pack.ll b/llvm/test/CodeGen/X86/combine-pack.ll index 1e7c700055e3f..2f5454dc2c3ec 100644 --- a/llvm/test/CodeGen/X86/combine-pack.ll +++ b/llvm/test/CodeGen/X86/combine-pack.ll @@ -26,14 +26,12 @@ define <8 x i16> @combine_packss_v4i32_signsplat(<4 x i32> %a0, <4 x i32> %a1) { ret <8 x i16> %signsplat } -; TODO: Failure to remove unnecessary signsplat through freeze define <8 x i16> @combine_packss_v4i32_freeze_signsplat(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: combine_packss_v4i32_freeze_signsplat: ; SSE: # %bb.0: ; SSE-NEXT: pcmpgtd %xmm1, %xmm0 ; SSE-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: psraw $15, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_packss_v4i32_freeze_signsplat: @@ -41,7 +39,6 @@ define <8 x i16> @combine_packss_v4i32_freeze_signsplat(<4 x i32> %a0, <4 x i32> ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX-NEXT: retq %cmp = icmp sgt <4 x i32> %a0, %a1 %ext = sext <4 x i1> %cmp to <4 x i32> From 825d82d0515d7d40c280547f6dcbf0aba9255289 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 1 Oct 2025 08:46:19 -0700 Subject: [PATCH 390/878] [NFC][LLVM] Misc code cleanup in AsmWriter.cpp (#161522) - Change function/variable names to follow LLVM coding standard. - Use `auto` for variables initialized using `dyn_cast`. - Use `ArrayRef` instead of const vector references for function arguments. - Use `interleaved` to print comma separated lists of integers. - Inline some instances of `incorporateFunction` lambda that have a single use. 
--- llvm/lib/IR/AsmWriter.cpp | 407 ++++++++++++++++++-------------------- 1 file changed, 195 insertions(+), 212 deletions(-) diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index e29179b8f9955..245129f3f791f 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -127,7 +127,7 @@ static void orderValue(const Value *V, OrderMap &OM) { if (OM.lookup(V)) return; - if (const Constant *C = dyn_cast(V)) { + if (const auto *C = dyn_cast(V)) { if (isa(C)) return; @@ -146,17 +146,17 @@ static void orderValue(const Value *V, OrderMap &OM) { static OrderMap orderModule(const Module *M) { OrderMap OM; - auto orderConstantValue = [&OM](const Value *V) { + auto OrderConstantValue = [&OM](const Value *V) { if (isa(V) || isa(V)) orderValue(V, OM); }; auto OrderConstantFromMetadata = [&](Metadata *MD) { if (const auto *VAM = dyn_cast(MD)) { - orderConstantValue(VAM->getValue()); + OrderConstantValue(VAM->getValue()); } else if (const auto *AL = dyn_cast(MD)) { for (const auto *VAM : AL->getArgs()) - orderConstantValue(VAM->getValue()); + OrderConstantValue(VAM->getValue()); } }; @@ -302,18 +302,18 @@ static UseListOrderMap predictUseListOrder(const Module *M) { } static const Module *getModuleFromVal(const Value *V) { - if (const Argument *MA = dyn_cast(V)) + if (const auto *MA = dyn_cast(V)) return MA->getParent() ? MA->getParent()->getParent() : nullptr; - if (const BasicBlock *BB = dyn_cast(V)) + if (const auto *BB = dyn_cast(V)) return BB->getParent() ? BB->getParent()->getParent() : nullptr; - if (const Instruction *I = dyn_cast(V)) { + if (const auto *I = dyn_cast(V)) { const Function *M = I->getParent() ? I->getParent()->getParent() : nullptr; return M ? M->getParent() : nullptr; } - if (const GlobalValue *GV = dyn_cast(V)) + if (const auto *GV = dyn_cast(V)) return GV->getParent(); if (const auto *MAV = dyn_cast(V)) { @@ -337,7 +337,7 @@ static const Module *getModuleFromDPI(const DbgRecord *DR) { return DR->getMarker() ? getModuleFromDPI(DR->getMarker()) : nullptr; } -static void PrintCallingConv(unsigned cc, raw_ostream &Out) { +static void printCallingConv(unsigned cc, raw_ostream &Out) { switch (cc) { default: Out << "cc" << cc; break; case CallingConv::Fast: Out << "fastcc"; break; @@ -484,7 +484,7 @@ void llvm::printLLVMNameWithoutPrefix(raw_ostream &OS, StringRef Name) { /// Turn the specified name into an 'LLVM name', which is either prefixed with % /// (if the string only contains simple characters) or is surrounded with ""'s /// (if it has special chars in it). Print it out. -static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { +static void printLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { switch (Prefix) { case NoPrefix: break; @@ -506,12 +506,12 @@ static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) { /// Turn the specified name into an 'LLVM name', which is either prefixed with % /// (if the string only contains simple characters) or is surrounded with ""'s /// (if it has special chars in it). Print it out. -static void PrintLLVMName(raw_ostream &OS, const Value *V) { - PrintLLVMName(OS, V->getName(), +static void printLLVMName(raw_ostream &OS, const Value *V) { + printLLVMName(OS, V->getName(), isa(V) ? 
GlobalPrefix : LocalPrefix); } -static void PrintShuffleMask(raw_ostream &Out, Type *Ty, ArrayRef Mask) { +static void printShuffleMask(raw_ostream &Out, Type *Ty, ArrayRef Mask) { Out << ", <"; if (isa(Ty)) Out << "vscale x "; @@ -668,7 +668,7 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) { return printStructBody(STy, OS); if (!STy->getName().empty()) - return PrintLLVMName(OS, STy->getName(), LocalPrefix); + return printLLVMName(OS, STy->getName(), LocalPrefix); incorporateTypes(); const auto I = Type2Number.find(STy); @@ -999,26 +999,26 @@ void ModuleSlotTracker::setProcessHook( } static SlotTracker *createSlotTracker(const Value *V) { - if (const Argument *FA = dyn_cast(V)) + if (const auto *FA = dyn_cast(V)) return new SlotTracker(FA->getParent()); - if (const Instruction *I = dyn_cast(V)) + if (const auto *I = dyn_cast(V)) if (I->getParent()) return new SlotTracker(I->getParent()->getParent()); - if (const BasicBlock *BB = dyn_cast(V)) + if (const auto *BB = dyn_cast(V)) return new SlotTracker(BB->getParent()); - if (const GlobalVariable *GV = dyn_cast(V)) + if (const auto *GV = dyn_cast(V)) return new SlotTracker(GV->getParent()); - if (const GlobalAlias *GA = dyn_cast(V)) + if (const auto *GA = dyn_cast(V)) return new SlotTracker(GA->getParent()); - if (const GlobalIFunc *GIF = dyn_cast(V)) + if (const auto *GIF = dyn_cast(V)) return new SlotTracker(GIF->getParent()); - if (const Function *Func = dyn_cast(V)) + if (const auto *Func = dyn_cast(V)) return new SlotTracker(Func); return nullptr; @@ -1218,7 +1218,7 @@ void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { // but we can have faulty metadata from debug-intrinsic days being // autoupgraded into debug records. This gets caught by the verifier, which // then will print the faulty IR, hitting this code path. - if (const DbgVariableRecord *DVR = dyn_cast(&DR)) { + if (const auto *DVR = dyn_cast(&DR)) { // Process metadata used by DbgRecords; we only specifically care about the // DILocalVariable, DILocation, and DIAssignID fields, as the Value and // Expression fields should only be printed inline and so do not use a slot. @@ -1233,7 +1233,7 @@ void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { if (auto *Empty = dyn_cast_if_present(DVR->getRawAddress())) CreateMetadataSlot(Empty); } - } else if (const DbgLabelRecord *DLR = dyn_cast(&DR)) { + } else if (const auto *DLR = dyn_cast(&DR)) { CreateMetadataSlot(DLR->getRawLabel()); } else { llvm_unreachable("unsupported DbgRecord kind"); @@ -1244,12 +1244,12 @@ void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { void SlotTracker::processInstructionMetadata(const Instruction &I) { // Process metadata used directly by intrinsics. - if (const CallInst *CI = dyn_cast(&I)) + if (const auto *CI = dyn_cast(&I)) if (Function *F = CI->getCalledFunction()) if (F->isIntrinsic()) for (auto &Op : I.operands()) if (auto *V = dyn_cast_or_null(Op)) - if (MDNode *N = dyn_cast(V->getMetadata())) + if (auto *N = dyn_cast(V->getMetadata())) CreateMetadataSlot(N); // Process metadata attached to this instruction. @@ -1406,7 +1406,7 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) { // Recursively add any MDNodes referenced by operands. 
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) - if (const MDNode *Op = dyn_cast_or_null(N->getOperand(i))) + if (const auto *Op = dyn_cast_or_null(N->getOperand(i))) CreateMetadataSlot(Op); } @@ -1464,33 +1464,30 @@ struct AsmWriterContext { // AsmWriter Implementation //===----------------------------------------------------------------------===// -static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, +static void writeAsOperandInternal(raw_ostream &Out, const Value *V, AsmWriterContext &WriterCtx, bool PrintType = false); -static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, +static void writeAsOperandInternal(raw_ostream &Out, const Metadata *MD, AsmWriterContext &WriterCtx, bool FromValue = false); -static void WriteOptimizationInfo(raw_ostream &Out, const User *U) { - if (const FPMathOperator *FPO = dyn_cast(U)) +static void writeOptimizationInfo(raw_ostream &Out, const User *U) { + if (const auto *FPO = dyn_cast(U)) Out << FPO->getFastMathFlags(); - if (const OverflowingBinaryOperator *OBO = - dyn_cast(U)) { + if (const auto *OBO = dyn_cast(U)) { if (OBO->hasNoUnsignedWrap()) Out << " nuw"; if (OBO->hasNoSignedWrap()) Out << " nsw"; - } else if (const PossiblyExactOperator *Div = - dyn_cast(U)) { + } else if (const auto *Div = dyn_cast(U)) { if (Div->isExact()) Out << " exact"; - } else if (const PossiblyDisjointInst *PDI = - dyn_cast(U)) { + } else if (const auto *PDI = dyn_cast(U)) { if (PDI->isDisjoint()) Out << " disjoint"; - } else if (const GEPOperator *GEP = dyn_cast(U)) { + } else if (const auto *GEP = dyn_cast(U)) { if (GEP->isInBounds()) Out << " inbounds"; else if (GEP->hasNoUnsignedSignedWrap()) @@ -1515,7 +1512,7 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) { } } -static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) { +static void writeAPFloatInternal(raw_ostream &Out, const APFloat &APF) { if (&APF.getSemantics() == &APFloat::IEEEsingle() || &APF.getSemantics() == &APFloat::IEEEdouble()) { // We would like to output the FP constant value in exponential notation, @@ -1608,9 +1605,9 @@ static void WriteAPFloatInternal(raw_ostream &Out, const APFloat &APF) { llvm_unreachable("Unsupported floating point type"); } -static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, +static void writeConstantInternal(raw_ostream &Out, const Constant *CV, AsmWriterContext &WriterCtx) { - if (const ConstantInt *CI = dyn_cast(CV)) { + if (const auto *CI = dyn_cast(CV)) { Type *Ty = CI->getType(); if (Ty->isVectorTy()) { @@ -1630,7 +1627,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } - if (const ConstantFP *CFP = dyn_cast(CV)) { + if (const auto *CFP = dyn_cast(CV)) { Type *Ty = CFP->getType(); if (Ty->isVectorTy()) { @@ -1639,7 +1636,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, Out << " "; } - WriteAPFloatInternal(Out, CFP->getValueAPF()); + writeAPFloatInternal(Out, CFP->getValueAPF()); if (Ty->isVectorTy()) Out << ")"; @@ -1652,28 +1649,28 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } - if (const BlockAddress *BA = dyn_cast(CV)) { + if (const auto *BA = dyn_cast(CV)) { Out << "blockaddress("; - WriteAsOperandInternal(Out, BA->getFunction(), WriterCtx); + writeAsOperandInternal(Out, BA->getFunction(), WriterCtx); Out << ", "; - WriteAsOperandInternal(Out, BA->getBasicBlock(), WriterCtx); + writeAsOperandInternal(Out, BA->getBasicBlock(), WriterCtx); Out << ")"; return; } if 
(const auto *Equiv = dyn_cast(CV)) { Out << "dso_local_equivalent "; - WriteAsOperandInternal(Out, Equiv->getGlobalValue(), WriterCtx); + writeAsOperandInternal(Out, Equiv->getGlobalValue(), WriterCtx); return; } if (const auto *NC = dyn_cast(CV)) { Out << "no_cfi "; - WriteAsOperandInternal(Out, NC->getGlobalValue(), WriterCtx); + writeAsOperandInternal(Out, NC->getGlobalValue(), WriterCtx); return; } - if (const ConstantPtrAuth *CPA = dyn_cast(CV)) { + if (const auto *CPA = dyn_cast(CV)) { Out << "ptrauth ("; // ptrauth (ptr CST, i32 KEY[, i64 DISC[, ptr ADDRDISC]?]?) @@ -1686,25 +1683,25 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (unsigned i = 0, e = NumOpsToWrite; i != e; ++i) { Out << LS; - WriteAsOperandInternal(Out, CPA->getOperand(i), WriterCtx, + writeAsOperandInternal(Out, CPA->getOperand(i), WriterCtx, /*PrintType=*/true); } Out << ')'; return; } - if (const ConstantArray *CA = dyn_cast(CV)) { + if (const auto *CA = dyn_cast(CV)) { Out << '['; ListSeparator LS; for (const Value *Op : CA->operands()) { Out << LS; - WriteAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); } Out << ']'; return; } - if (const ConstantDataArray *CA = dyn_cast(CV)) { + if (const auto *CA = dyn_cast(CV)) { // As a special case, print the array as a string if it is an array of // i8 with ConstantInt values. if (CA->isString()) { @@ -1718,14 +1715,14 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (uint64_t i = 0, e = CA->getNumElements(); i != e; ++i) { Out << LS; - WriteAsOperandInternal(Out, CA->getElementAsConstant(i), WriterCtx, + writeAsOperandInternal(Out, CA->getElementAsConstant(i), WriterCtx, /*PrintType=*/true); } Out << ']'; return; } - if (const ConstantStruct *CS = dyn_cast(CV)) { + if (const auto *CS = dyn_cast(CV)) { if (CS->getType()->isPacked()) Out << '<'; Out << '{'; @@ -1734,7 +1731,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (const Value *Op : CS->operands()) { Out << LS; - WriteAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); } Out << ' '; } @@ -1755,7 +1752,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, if (auto *SplatVal = CV->getSplatValue()) { if (isa(SplatVal) || isa(SplatVal)) { Out << "splat ("; - WriteAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true); Out << ')'; return; } @@ -1765,7 +1762,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (unsigned i = 0, e = CVVTy->getNumElements(); i != e; ++i) { Out << LS; - WriteAsOperandInternal(Out, CV->getAggregateElement(i), WriterCtx, + writeAsOperandInternal(Out, CV->getAggregateElement(i), WriterCtx, /*PrintType=*/true); } Out << '>'; @@ -1792,7 +1789,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } - if (const ConstantExpr *CE = dyn_cast(CV)) { + if (const auto *CE = dyn_cast(CV)) { // Use the same shorthand for splat vector (i.e. "splat(Ty val)") as is // permitted on IR input to reduce the output changes when enabling // UseConstant{Int,FP}ForScalableSplat. 
@@ -1802,7 +1799,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, if (auto *SplatVal = CE->getSplatValue()) { if (isa(SplatVal) || isa(SplatVal)) { Out << "splat ("; - WriteAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true); Out << ')'; return; } @@ -1810,10 +1807,10 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } Out << CE->getOpcodeName(); - WriteOptimizationInfo(Out, CE); + writeOptimizationInfo(Out, CE); Out << " ("; - if (const GEPOperator *GEP = dyn_cast(CE)) { + if (const auto *GEP = dyn_cast(CE)) { WriterCtx.TypePrinter->print(GEP->getSourceElementType(), Out); Out << ", "; } @@ -1821,7 +1818,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, ListSeparator LS; for (const Value *Op : CE->operands()) { Out << LS; - WriteAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, Op, WriterCtx, /*PrintType=*/true); } if (CE->isCast()) { @@ -1830,7 +1827,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, } if (CE->getOpcode() == Instruction::ShuffleVector) - PrintShuffleMask(Out, CE->getType(), CE->getShuffleMask()); + printShuffleMask(Out, CE->getType(), CE->getShuffleMask()); Out << ')'; return; @@ -1849,9 +1846,9 @@ static void writeMDTuple(raw_ostream &Out, const MDTuple *Node, Out << "null"; } else if (auto *MDV = dyn_cast(MD)) { Value *V = MDV->getValue(); - WriteAsOperandInternal(Out, V, WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, V, WriterCtx, /*PrintType=*/true); } else { - WriteAsOperandInternal(Out, MD, WriterCtx); + writeAsOperandInternal(Out, MD, WriterCtx); WriterCtx.onWriteMetadataAsOperand(MD); } } @@ -1939,7 +1936,7 @@ static void writeMetadataAsOperand(raw_ostream &Out, const Metadata *MD, Out << "null"; return; } - WriteAsOperandInternal(Out, MD, WriterCtx); + writeAsOperandInternal(Out, MD, WriterCtx); WriterCtx.onWriteMetadataAsOperand(MD); } @@ -2619,7 +2616,7 @@ static void writeDIArgList(raw_ostream &Out, const DIArgList *N, MDFieldPrinter Printer(Out, WriterCtx); for (const Metadata *Arg : N->getArgs()) { Out << FS; - WriteAsOperandInternal(Out, Arg, WriterCtx, true); + writeAsOperandInternal(Out, Arg, WriterCtx, true); } Out << ")"; } @@ -2662,7 +2659,7 @@ static void writeDIImportedEntity(raw_ostream &Out, const DIImportedEntity *N, Out << ")"; } -static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node, +static void writeMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node, AsmWriterContext &Ctx) { if (Node->isDistinct()) Out << "distinct "; @@ -2682,7 +2679,7 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node, // Full implementation of printing a Value as an operand with support for // TypePrinting, etc. 
-static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, +static void writeAsOperandInternal(raw_ostream &Out, const Value *V, AsmWriterContext &WriterCtx, bool PrintType) { if (PrintType) { @@ -2691,18 +2688,18 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, } if (V->hasName()) { - PrintLLVMName(Out, V); + printLLVMName(Out, V); return; } - const Constant *CV = dyn_cast(V); + const auto *CV = dyn_cast(V); if (CV && !isa(CV)) { assert(WriterCtx.TypePrinter && "Constants require TypePrinting!"); - WriteConstantInternal(Out, CV, WriterCtx); + writeConstantInternal(Out, CV, WriterCtx); return; } - if (const InlineAsm *IA = dyn_cast(V)) { + if (const auto *IA = dyn_cast(V)) { Out << "asm "; if (IA->hasSideEffects()) Out << "sideeffect "; @@ -2722,7 +2719,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, } if (auto *MD = dyn_cast(V)) { - WriteAsOperandInternal(Out, MD->getMetadata(), WriterCtx, + writeAsOperandInternal(Out, MD->getMetadata(), WriterCtx, /* FromValue */ true); return; } @@ -2732,7 +2729,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, auto *Machine = WriterCtx.Machine; // If we have a SlotTracker, use it. if (Machine) { - if (const GlobalValue *GV = dyn_cast(V)) { + if (const auto *GV = dyn_cast(V)) { Slot = Machine->getGlobalSlot(GV); Prefix = '@'; } else { @@ -2749,7 +2746,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, } } else if ((Machine = createSlotTracker(V))) { // Otherwise, create one to get the # and then destroy it. - if (const GlobalValue *GV = dyn_cast(V)) { + if (const auto *GV = dyn_cast(V)) { Slot = Machine->getGlobalSlot(GV); Prefix = '@'; } else { @@ -2767,21 +2764,21 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V, Out << ""; } -static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, +static void writeAsOperandInternal(raw_ostream &Out, const Metadata *MD, AsmWriterContext &WriterCtx, bool FromValue) { // Write DIExpressions and DIArgLists inline when used as a value. Improves // readability of debug info intrinsics. 
- if (const DIExpression *Expr = dyn_cast(MD)) { + if (const auto *Expr = dyn_cast(MD)) { writeDIExpression(Out, Expr, WriterCtx); return; } - if (const DIArgList *ArgList = dyn_cast(MD)) { + if (const auto *ArgList = dyn_cast(MD)) { writeDIArgList(Out, ArgList, WriterCtx, FromValue); return; } - if (const MDNode *N = dyn_cast(MD)) { + if (const auto *N = dyn_cast(MD)) { std::unique_ptr MachineStorage; SaveAndRestore SARMachine(WriterCtx.Machine); if (!WriterCtx.Machine) { @@ -2790,7 +2787,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, } int Slot = WriterCtx.Machine->getMetadataSlot(N); if (Slot == -1) { - if (const DILocation *Loc = dyn_cast(N)) { + if (const auto *Loc = dyn_cast(N)) { writeDILocation(Out, Loc, WriterCtx); return; } @@ -2802,7 +2799,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, return; } - if (const MDString *MDS = dyn_cast(MD)) { + if (const auto *MDS = dyn_cast(MD)) { Out << "!\""; printEscapedString(MDS->getString(), Out); Out << '"'; @@ -2814,7 +2811,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD, assert((FromValue || !isa(V)) && "Unexpected function-local metadata outside of value argument"); - WriteAsOperandInternal(Out, V->getValue(), WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, V->getValue(), WriterCtx, /*PrintType=*/true); } namespace { @@ -2889,7 +2886,7 @@ class AssemblyWriter { void printDbgRecord(const DbgRecord &DR); void printDbgRecordLine(const DbgRecord &DR); - void printUseListOrder(const Value *V, const std::vector &Shuffle); + void printUseListOrder(const Value *V, ArrayRef Shuffle); void printUseLists(const Function *F); void printModuleSummaryIndex(); @@ -2901,16 +2898,14 @@ class AssemblyWriter { void printTypeIdSummary(const TypeIdSummary &TIS); void printTypeIdCompatibleVtableSummary(const TypeIdCompatibleVtableInfo &TI); void printTypeTestResolution(const TypeTestResolution &TTRes); - void printArgs(const std::vector &Args); + void printArgs(ArrayRef Args); void printWPDRes(const WholeProgramDevirtResolution &WPDRes); void printTypeIdInfo(const FunctionSummary::TypeIdInfo &TIDInfo); void printVFuncId(const FunctionSummary::VFuncId VFId); - void - printNonConstVCalls(const std::vector &VCallList, - const char *Tag); - void - printConstVCalls(const std::vector &VCallList, - const char *Tag); + void printNonConstVCalls(ArrayRef VCallList, + const char *Tag); + void printConstVCalls(ArrayRef VCallList, + const char *Tag); private: /// Print out metadata attachments. 
@@ -2953,7 +2948,7 @@ void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) { return; } auto WriteCtx = getContext(); - WriteAsOperandInternal(Out, Operand, WriteCtx, PrintType); + writeAsOperandInternal(Out, Operand, WriteCtx, PrintType); } void AssemblyWriter::writeSyncScope(const LLVMContext &Context, @@ -3013,7 +3008,7 @@ void AssemblyWriter::writeParamOperand(const Value *Operand, Out << ' '; // Print the operand auto WriterCtx = getContext(); - WriteAsOperandInternal(Out, Operand, WriterCtx); + writeAsOperandInternal(Out, Operand, WriterCtx); } void AssemblyWriter::writeOperandBundles(const CallBase *Call) { @@ -3039,7 +3034,7 @@ void AssemblyWriter::writeOperandBundles(const CallBase *Call) { if (Input == nullptr) Out << ""; else - WriteAsOperandInternal(Out, Input, WriterCtx, /*PrintType=*/true); + writeAsOperandInternal(Out, Input, WriterCtx, /*PrintType=*/true); } Out << ')'; @@ -3311,14 +3306,8 @@ void AssemblyWriter::printTypeIdCompatibleVtableSummary( Out << ")"; } -void AssemblyWriter::printArgs(const std::vector &Args) { - Out << "args: ("; - ListSeparator FS; - for (auto arg : Args) { - Out << FS; - Out << arg; - } - Out << ")"; +void AssemblyWriter::printArgs(ArrayRef Args) { + Out << "args: (" << llvm::interleaved(Args) << ')'; } void AssemblyWriter::printWPDRes(const WholeProgramDevirtResolution &WPDRes) { @@ -3658,7 +3647,7 @@ void AssemblyWriter::printVFuncId(const FunctionSummary::VFuncId VFId) { } void AssemblyWriter::printNonConstVCalls( - const std::vector &VCallList, const char *Tag) { + ArrayRef VCallList, const char *Tag) { Out << Tag << ": ("; ListSeparator FS; for (auto &VFuncId : VCallList) { @@ -3669,8 +3658,7 @@ void AssemblyWriter::printNonConstVCalls( } void AssemblyWriter::printConstVCalls( - const std::vector &VCallList, - const char *Tag) { + ArrayRef VCallList, const char *Tag) { Out << Tag << ": ("; ListSeparator FS; for (auto &ConstVCall : VCallList) { @@ -3793,7 +3781,7 @@ void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) { Out << "}\n"; } -static void PrintVisibility(GlobalValue::VisibilityTypes Vis, +static void printVisibility(GlobalValue::VisibilityTypes Vis, formatted_raw_ostream &Out) { switch (Vis) { case GlobalValue::DefaultVisibility: break; @@ -3802,13 +3790,13 @@ static void PrintVisibility(GlobalValue::VisibilityTypes Vis, } } -static void PrintDSOLocation(const GlobalValue &GV, +static void printDSOLocation(const GlobalValue &GV, formatted_raw_ostream &Out) { if (GV.isDSOLocal() && !GV.isImplicitDSOLocal()) Out << "dso_local "; } -static void PrintDLLStorageClass(GlobalValue::DLLStorageClassTypes SCT, +static void printDLLStorageClass(GlobalValue::DLLStorageClassTypes SCT, formatted_raw_ostream &Out) { switch (SCT) { case GlobalValue::DefaultStorageClass: break; @@ -3817,7 +3805,7 @@ static void PrintDLLStorageClass(GlobalValue::DLLStorageClassTypes SCT, } } -static void PrintThreadLocalModel(GlobalVariable::ThreadLocalMode TLM, +static void printThreadLocalModel(GlobalVariable::ThreadLocalMode TLM, formatted_raw_ostream &Out) { switch (TLM) { case GlobalVariable::NotThreadLocal: @@ -3863,7 +3851,7 @@ static void maybePrintComdat(formatted_raw_ostream &Out, return; Out << '('; - PrintLLVMName(Out, C->getName(), ComdatPrefix); + printLLVMName(Out, C->getName(), ComdatPrefix); Out << ')'; } @@ -3872,17 +3860,17 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { Out << "; Materializable\n"; AsmWriterContext WriterCtx(&TypePrinter, &Machine, GV->getParent()); - WriteAsOperandInternal(Out, GV, 
WriterCtx); + writeAsOperandInternal(Out, GV, WriterCtx); Out << " = "; if (!GV->hasInitializer() && GV->hasExternalLinkage()) Out << "external "; Out << getLinkageNameWithSpace(GV->getLinkage()); - PrintDSOLocation(*GV, Out); - PrintVisibility(GV->getVisibility(), Out); - PrintDLLStorageClass(GV->getDLLStorageClass(), Out); - PrintThreadLocalModel(GV->getThreadLocalMode(), Out); + printDSOLocation(*GV, Out); + printVisibility(GV->getVisibility(), Out); + printDLLStorageClass(GV->getDLLStorageClass(), Out); + printThreadLocalModel(GV->getThreadLocalMode(), Out); StringRef UA = getUnnamedAddrEncoding(GV->getUnnamedAddr()); if (!UA.empty()) Out << UA << ' '; @@ -3963,14 +3951,14 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) { Out << "; Materializable\n"; AsmWriterContext WriterCtx(&TypePrinter, &Machine, GA->getParent()); - WriteAsOperandInternal(Out, GA, WriterCtx); + writeAsOperandInternal(Out, GA, WriterCtx); Out << " = "; Out << getLinkageNameWithSpace(GA->getLinkage()); - PrintDSOLocation(*GA, Out); - PrintVisibility(GA->getVisibility(), Out); - PrintDLLStorageClass(GA->getDLLStorageClass(), Out); - PrintThreadLocalModel(GA->getThreadLocalMode(), Out); + printDSOLocation(*GA, Out); + printVisibility(GA->getVisibility(), Out); + printDLLStorageClass(GA->getDLLStorageClass(), Out); + printThreadLocalModel(GA->getThreadLocalMode(), Out); StringRef UA = getUnnamedAddrEncoding(GA->getUnnamedAddr()); if (!UA.empty()) Out << UA << ' '; @@ -4002,12 +3990,12 @@ void AssemblyWriter::printIFunc(const GlobalIFunc *GI) { Out << "; Materializable\n"; AsmWriterContext WriterCtx(&TypePrinter, &Machine, GI->getParent()); - WriteAsOperandInternal(Out, GI, WriterCtx); + writeAsOperandInternal(Out, GI, WriterCtx); Out << " = "; Out << getLinkageNameWithSpace(GI->getLinkage()); - PrintDSOLocation(*GI, Out); - PrintVisibility(GI->getVisibility(), Out); + printDSOLocation(*GI, Out); + printVisibility(GI->getVisibility(), Out); Out << "ifunc "; @@ -4059,7 +4047,7 @@ void AssemblyWriter::printTypeIdentities() { auto &NamedTypes = TypePrinter.getNamedTypes(); for (StructType *NamedType : NamedTypes) { - PrintLLVMName(Out, NamedType->getName(), LocalPrefix); + printLLVMName(Out, NamedType->getName(), LocalPrefix); Out << " = type "; // Make sure we print out at least one level of the type structure, so @@ -4107,13 +4095,13 @@ void AssemblyWriter::printFunction(const Function *F) { Out << "define "; Out << getLinkageNameWithSpace(F->getLinkage()); - PrintDSOLocation(*F, Out); - PrintVisibility(F->getVisibility(), Out); - PrintDLLStorageClass(F->getDLLStorageClass(), Out); + printDSOLocation(*F, Out); + printVisibility(F->getVisibility(), Out); + printDLLStorageClass(F->getDLLStorageClass(), Out); // Print the calling convention. if (F->getCallingConv() != CallingConv::C) { - PrintCallingConv(F->getCallingConv(), Out); + printCallingConv(F->getCallingConv(), Out); Out << " "; } @@ -4123,7 +4111,7 @@ void AssemblyWriter::printFunction(const Function *F) { TypePrinter.print(F->getReturnType(), Out); AsmWriterContext WriterCtx(&TypePrinter, &Machine, F->getParent()); Out << ' '; - WriteAsOperandInternal(Out, F, WriterCtx); + writeAsOperandInternal(Out, F, WriterCtx); Out << '('; // Loop over the arguments, printing them... @@ -4239,7 +4227,7 @@ void AssemblyWriter::printArgument(const Argument *Arg, AttributeSet Attrs) { // Output name, if available... 
if (Arg->hasName()) { Out << ' '; - PrintLLVMName(Out, Arg); + printLLVMName(Out, Arg); } else { int Slot = Machine.getLocalSlot(Arg); assert(Slot != -1 && "expect argument in function here"); @@ -4252,7 +4240,7 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) { bool IsEntryBlock = BB->getParent() && BB->isEntryBlock(); if (BB->hasName()) { // Print out the label if it exists... Out << "\n"; - PrintLLVMName(Out, BB->getName(), LabelPrefix); + printLLVMName(Out, BB->getName(), LabelPrefix); Out << ':'; } else if (!IsEntryBlock) { Out << "\n"; @@ -4370,7 +4358,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { // Print out name if it exists... if (I.hasName()) { - PrintLLVMName(Out, &I); + printLLVMName(Out, &I); Out << " = "; } else if (!I.getType()->isVoidTy()) { // Print out the def slot taken. @@ -4381,7 +4369,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << '%' << SlotNum << " = "; } - if (const CallInst *CI = dyn_cast(&I)) { + if (const auto *CI = dyn_cast(&I)) { if (CI->isMustTailCall()) Out << "musttail "; else if (CI->isTailCall()) @@ -4409,14 +4397,14 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << " volatile"; // Print out optimization information. - WriteOptimizationInfo(Out, &I); + writeOptimizationInfo(Out, &I); // Print out the compare instruction predicates - if (const CmpInst *CI = dyn_cast(&I)) + if (const auto *CI = dyn_cast(&I)) Out << ' ' << CI->getPredicate(); // Print out the atomicrmw operation - if (const AtomicRMWInst *RMWI = dyn_cast(&I)) + if (const auto *RMWI = dyn_cast(&I)) Out << ' ' << AtomicRMWInst::getOperationName(RMWI->getOperation()); // Print out the type of the operands... @@ -4459,29 +4447,32 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(I.getOperand(i), true); } Out << ']'; - } else if (const PHINode *PN = dyn_cast(&I)) { + } else if (const auto *PN = dyn_cast(&I)) { Out << ' '; TypePrinter.print(I.getType(), Out); Out << ' '; ListSeparator LS; - for (unsigned op = 0, Eop = PN->getNumIncomingValues(); op < Eop; ++op) { + for (const auto &[V, Block] : + zip_equal(PN->incoming_values(), PN->blocks())) { Out << LS << "[ "; - writeOperand(PN->getIncomingValue(op), false); Out << ", "; - writeOperand(PN->getIncomingBlock(op), false); Out << " ]"; + writeOperand(V, false); + Out << ", "; + writeOperand(Block, false); + Out << " ]"; } - } else if (const ExtractValueInst *EVI = dyn_cast(&I)) { + } else if (const auto *EVI = dyn_cast(&I)) { Out << ' '; writeOperand(I.getOperand(0), true); - for (unsigned i : EVI->indices()) - Out << ", " << i; - } else if (const InsertValueInst *IVI = dyn_cast(&I)) { + Out << ", "; + Out << llvm::interleaved(EVI->indices()); + } else if (const auto *IVI = dyn_cast(&I)) { Out << ' '; writeOperand(I.getOperand(0), true); Out << ", "; writeOperand(I.getOperand(1), true); - for (unsigned i : IVI->indices()) - Out << ", " << i; - } else if (const LandingPadInst *LPI = dyn_cast(&I)) { + Out << ", "; + Out << llvm::interleaved(IVI->indices()); + } else if (const auto *LPI = dyn_cast(&I)) { Out << ' '; TypePrinter.print(I.getType(), Out); if (LPI->isCleanup() || LPI->getNumClauses() != 0) @@ -4540,11 +4531,11 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(CRI->getOperand(1), /*PrintType=*/true); else Out << "to caller"; - } else if (const CallInst *CI = dyn_cast(&I)) { + } else if (const auto *CI = dyn_cast(&I)) { // Print the calling convention being used. 
if (CI->getCallingConv() != CallingConv::C) { Out << " "; - PrintCallingConv(CI->getCallingConv(), Out); + printCallingConv(CI->getCallingConv(), Out); } Operand = CI->getCalledOperand(); @@ -4587,7 +4578,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttrs()); writeOperandBundles(CI); - } else if (const InvokeInst *II = dyn_cast(&I)) { + } else if (const auto *II = dyn_cast(&I)) { Operand = II->getCalledOperand(); FunctionType *FTy = II->getFunctionType(); Type *RetTy = FTy->getReturnType(); @@ -4596,7 +4587,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { // Print the calling convention being used. if (II->getCallingConv() != CallingConv::C) { Out << " "; - PrintCallingConv(II->getCallingConv(), Out); + printCallingConv(II->getCallingConv(), Out); } if (PAL.hasRetAttrs()) @@ -4630,7 +4621,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(II->getNormalDest(), true); Out << " unwind "; writeOperand(II->getUnwindDest(), true); - } else if (const CallBrInst *CBI = dyn_cast(&I)) { + } else if (const auto *CBI = dyn_cast(&I)) { Operand = CBI->getCalledOperand(); FunctionType *FTy = CBI->getFunctionType(); Type *RetTy = FTy->getReturnType(); @@ -4639,7 +4630,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { // Print the calling convention being used. if (CBI->getCallingConv() != CallingConv::C) { Out << " "; - PrintCallingConv(CBI->getCallingConv(), Out); + printCallingConv(CBI->getCallingConv(), Out); } if (PAL.hasRetAttrs()) @@ -4675,7 +4666,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(Dest, true); } Out << ']'; - } else if (const AllocaInst *AI = dyn_cast(&I)) { + } else if (const auto *AI = dyn_cast(&I)) { Out << ' '; if (AI->isUsedWithInAlloca()) Out << "inalloca "; @@ -4697,9 +4688,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } unsigned AddrSpace = AI->getAddressSpace(); - if (AddrSpace != 0) { + if (AddrSpace != 0) Out << ", addrspace(" << AddrSpace << ')'; - } } else if (isa(I)) { if (Operand) { Out << ' '; @@ -4714,7 +4704,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } Out << ", "; TypePrinter.print(I.getType(), Out); - } else if (Operand) { // Print the normal way. + } else if (Operand) { // Print the normal way. 
if (const auto *GEP = dyn_cast(&I)) { Out << ' '; TypePrinter.print(GEP->getSourceElementType(), Out); @@ -4763,28 +4753,28 @@ void AssemblyWriter::printInstruction(const Instruction &I) { } // Print atomic ordering/alignment for memory operations - if (const LoadInst *LI = dyn_cast(&I)) { + if (const auto *LI = dyn_cast(&I)) { if (LI->isAtomic()) writeAtomic(LI->getContext(), LI->getOrdering(), LI->getSyncScopeID()); if (MaybeAlign A = LI->getAlign()) Out << ", align " << A->value(); - } else if (const StoreInst *SI = dyn_cast(&I)) { + } else if (const auto *SI = dyn_cast(&I)) { if (SI->isAtomic()) writeAtomic(SI->getContext(), SI->getOrdering(), SI->getSyncScopeID()); if (MaybeAlign A = SI->getAlign()) Out << ", align " << A->value(); - } else if (const AtomicCmpXchgInst *CXI = dyn_cast(&I)) { + } else if (const auto *CXI = dyn_cast(&I)) { writeAtomicCmpXchg(CXI->getContext(), CXI->getSuccessOrdering(), CXI->getFailureOrdering(), CXI->getSyncScopeID()); Out << ", align " << CXI->getAlign().value(); - } else if (const AtomicRMWInst *RMWI = dyn_cast(&I)) { + } else if (const auto *RMWI = dyn_cast(&I)) { writeAtomic(RMWI->getContext(), RMWI->getOrdering(), RMWI->getSyncScopeID()); Out << ", align " << RMWI->getAlign().value(); - } else if (const FenceInst *FI = dyn_cast(&I)) { + } else if (const auto *FI = dyn_cast(&I)) { writeAtomic(FI->getContext(), FI->getOrdering(), FI->getSyncScopeID()); - } else if (const ShuffleVectorInst *SVI = dyn_cast(&I)) { - PrintShuffleMask(Out, SVI->getType(), SVI->getShuffleMask()); + } else if (const auto *SVI = dyn_cast(&I)) { + printShuffleMask(Out, SVI->getType(), SVI->getShuffleMask()); } // Print Metadata info. @@ -4840,7 +4830,7 @@ void AssemblyWriter::printDbgVariableRecord(const DbgVariableRecord &DVR) { if (!M) Out << "(null)"; else - WriteAsOperandInternal(Out, M, WriterCtx, true); + writeAsOperandInternal(Out, M, WriterCtx, true); }; Out << "("; @@ -4874,9 +4864,9 @@ void AssemblyWriter::printDbgRecordLine(const DbgRecord &DR) { void AssemblyWriter::printDbgLabelRecord(const DbgLabelRecord &Label) { auto WriterCtx = getContext(); Out << "#dbg_label("; - WriteAsOperandInternal(Out, Label.getRawLabel(), WriterCtx, true); + writeAsOperandInternal(Out, Label.getRawLabel(), WriterCtx, true); Out << ", "; - WriteAsOperandInternal(Out, Label.getDebugLoc(), WriterCtx, true); + writeAsOperandInternal(Out, Label.getDebugLoc(), WriterCtx, true); Out << ")"; } @@ -4899,7 +4889,7 @@ void AssemblyWriter::printMetadataAttachments( } else Out << "!"; Out << ' '; - WriteAsOperandInternal(Out, I.second, WriterCtx); + writeAsOperandInternal(Out, I.second, WriterCtx); } } @@ -4922,7 +4912,7 @@ void AssemblyWriter::writeAllMDNodes() { void AssemblyWriter::printMDNodeBody(const MDNode *Node) { auto WriterCtx = getContext(); - WriteMDNodeBodyInternal(Out, Node, WriterCtx); + writeMDNodeBodyInternal(Out, Node, WriterCtx); } void AssemblyWriter::writeAttribute(const Attribute &Attr, bool InAttrGroup) { @@ -4941,12 +4931,10 @@ void AssemblyWriter::writeAttribute(const Attribute &Attr, bool InAttrGroup) { void AssemblyWriter::writeAttributeSet(const AttributeSet &AttrSet, bool InAttrGroup) { - bool FirstAttr = true; + ListSeparator LS(" "); for (const auto &Attr : AttrSet) { - if (!FirstAttr) - Out << ' '; + Out << LS; writeAttribute(Attr, InAttrGroup); - FirstAttr = false; } } @@ -4963,7 +4951,7 @@ void AssemblyWriter::writeAllAttributeGroups() { } void AssemblyWriter::printUseListOrder(const Value *V, - const std::vector &Shuffle) { + ArrayRef Shuffle) { bool IsInFunction 
= Machine.getFunction(); if (IsInFunction) Out << " "; @@ -5052,7 +5040,7 @@ void NamedMDNode::print(raw_ostream &ROS, ModuleSlotTracker &MST, } void Comdat::print(raw_ostream &ROS, bool /*IsForDebug*/) const { - PrintLLVMName(ROS, getName(), ComdatPrefix); + printLLVMName(ROS, getName(), ComdatPrefix); ROS << " = comdat "; switch (getSelectionKind()) { @@ -5084,7 +5072,7 @@ void Type::print(raw_ostream &OS, bool /*IsForDebug*/, bool NoDetails) const { return; // If the type is a named struct type, print the body as well. - if (StructType *STy = dyn_cast(const_cast(this))) + if (auto *STy = dyn_cast(const_cast(this))) if (!STy->isLiteral()) { OS << " = type "; TP.printStructBody(STy, OS); @@ -5120,11 +5108,9 @@ void DbgMarker::print(raw_ostream &ROS, ModuleSlotTracker &MST, SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = MST.getMachine() ? *MST.getMachine() : EmptySlotTable; - auto incorporateFunction = [&](const Function *F) { - if (F) - MST.incorporateFunction(*F); - }; - incorporateFunction(getParent() ? getParent()->getParent() : nullptr); + const Function *F = getParent() ? getParent()->getParent() : nullptr; + if (F) + MST.incorporateFunction(*F); AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug); W.printDbgMarker(*this); } @@ -5141,13 +5127,11 @@ void DbgVariableRecord::print(raw_ostream &ROS, ModuleSlotTracker &MST, SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = MST.getMachine() ? *MST.getMachine() : EmptySlotTable; - auto incorporateFunction = [&](const Function *F) { - if (F) - MST.incorporateFunction(*F); - }; - incorporateFunction(Marker && Marker->getParent() + const Function *F = Marker && Marker->getParent() ? Marker->getParent()->getParent() - : nullptr); + : nullptr; + if (F) + MST.incorporateFunction(*F); AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug); W.printDbgVariableRecord(*this); } @@ -5158,12 +5142,11 @@ void DbgLabelRecord::print(raw_ostream &ROS, ModuleSlotTracker &MST, SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = MST.getMachine() ? *MST.getMachine() : EmptySlotTable; - auto incorporateFunction = [&](const Function *F) { - if (F) - MST.incorporateFunction(*F); - }; - incorporateFunction(Marker->getParent() ? Marker->getParent()->getParent() - : nullptr); + const Function *F = + Marker->getParent() ? Marker->getParent()->getParent() : nullptr; + if (F) + MST.incorporateFunction(*F); + AssemblyWriter W(OS, SlotTable, getModuleFromDPI(this), nullptr, IsForDebug); W.printDbgLabelRecord(*this); } @@ -5185,39 +5168,39 @@ void Value::print(raw_ostream &ROS, ModuleSlotTracker &MST, SlotTracker EmptySlotTable(static_cast(nullptr)); SlotTracker &SlotTable = MST.getMachine() ? *MST.getMachine() : EmptySlotTable; - auto incorporateFunction = [&](const Function *F) { + auto IncorporateFunction = [&](const Function *F) { if (F) MST.incorporateFunction(*F); }; - if (const Instruction *I = dyn_cast(this)) { - incorporateFunction(I->getParent() ? I->getParent()->getParent() : nullptr); + if (const auto *I = dyn_cast(this)) { + IncorporateFunction(I->getParent() ? 
I->getParent()->getParent() : nullptr); AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), nullptr, IsForDebug); W.printInstruction(*I); - } else if (const BasicBlock *BB = dyn_cast(this)) { - incorporateFunction(BB->getParent()); + } else if (const auto *BB = dyn_cast(this)) { + IncorporateFunction(BB->getParent()); AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), nullptr, IsForDebug); W.printBasicBlock(BB); - } else if (const GlobalValue *GV = dyn_cast(this)) { + } else if (const auto *GV = dyn_cast(this)) { AssemblyWriter W(OS, SlotTable, GV->getParent(), nullptr, IsForDebug); - if (const GlobalVariable *V = dyn_cast(GV)) + if (const auto *V = dyn_cast(GV)) W.printGlobal(V); - else if (const Function *F = dyn_cast(GV)) + else if (const auto *F = dyn_cast(GV)) W.printFunction(F); - else if (const GlobalAlias *A = dyn_cast(GV)) + else if (const auto *A = dyn_cast(GV)) W.printAlias(A); - else if (const GlobalIFunc *I = dyn_cast(GV)) + else if (const auto *I = dyn_cast(GV)) W.printIFunc(I); else llvm_unreachable("Unknown GlobalValue to print out!"); - } else if (const MetadataAsValue *V = dyn_cast(this)) { + } else if (const auto *V = dyn_cast(this)) { V->getMetadata()->print(ROS, MST, getModuleFromVal(V)); - } else if (const Constant *C = dyn_cast(this)) { + } else if (const auto *C = dyn_cast(this)) { TypePrinting TypePrinter; TypePrinter.print(C->getType(), OS); OS << ' '; AsmWriterContext WriterCtx(&TypePrinter, MST.getMachine()); - WriteConstantInternal(OS, C, WriterCtx); + writeConstantInternal(OS, C, WriterCtx); } else if (isa(this) || isa(this)) { this->printAsOperand(OS, /* PrintType */ true, MST); } else { @@ -5233,7 +5216,7 @@ static bool printWithoutType(const Value &V, raw_ostream &O, if (V.hasName() || isa(V) || (!isa(V) && !isa(V))) { AsmWriterContext WriterCtx(nullptr, Machine, M); - WriteAsOperandInternal(O, &V, WriterCtx); + writeAsOperandInternal(O, &V, WriterCtx); return true; } return false; @@ -5243,7 +5226,7 @@ static void printAsOperandImpl(const Value &V, raw_ostream &O, bool PrintType, ModuleSlotTracker &MST) { TypePrinting TypePrinter(MST.getModule()); AsmWriterContext WriterCtx(&TypePrinter, MST.getMachine(), MST.getModule()); - WriteAsOperandInternal(O, &V, WriterCtx, PrintType); + writeAsOperandInternal(O, &V, WriterCtx, PrintType); } void Value::printAsOperand(raw_ostream &O, bool PrintType, @@ -5274,14 +5257,14 @@ void Value::printAsOperand(raw_ostream &O, bool PrintType, static void printMetadataImplRec(raw_ostream &ROS, const Metadata &MD, AsmWriterContext &WriterCtx) { formatted_raw_ostream OS(ROS); - WriteAsOperandInternal(OS, &MD, WriterCtx, /* FromValue */ true); + writeAsOperandInternal(OS, &MD, WriterCtx, /* FromValue */ true); auto *N = dyn_cast(&MD); if (!N || isa(MD)) return; OS << " = "; - WriteMDNodeBodyInternal(OS, N, WriterCtx); + writeMDNodeBodyInternal(OS, N, WriterCtx); } namespace { @@ -5342,14 +5325,14 @@ static void printMetadataImpl(raw_ostream &ROS, const Metadata &MD, WriterCtx = std::make_unique(&TypePrinter, MST.getMachine(), M); - WriteAsOperandInternal(OS, &MD, *WriterCtx, /* FromValue */ true); + writeAsOperandInternal(OS, &MD, *WriterCtx, /* FromValue */ true); auto *N = dyn_cast(&MD); if (OnlyAsOperand || !N || isa(MD)) return; OS << " = "; - WriteMDNodeBodyInternal(OS, N, *WriterCtx); + writeMDNodeBodyInternal(OS, N, *WriterCtx); } void Metadata::printAsOperand(raw_ostream &OS, const Module *M) const { From 9f7e7f7d9c9f46cfe83481b1e79d7c8a3e11580e Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 1 Oct 2025 16:47:15 
+0100
Subject: [PATCH 391/878] [lldb][MachO] Fix inspection of global variables
 that start with 'O' (#161521)

On Darwin, C symbols are prefixed with a '_'. The LLDB Mach-O parser
handles Objective-C metadata symbols starting with '_OBJC' specially.
Previously, global symbols starting with a '_O' prefix were lost because
of incorrectly scoped if-guards. This patch removes those checks.

There is more cleanup that can be done in this file because there's a
bunch of duplicated checks for these ObjC symbols. I decided to leave
that for an NFC follow-up.

Depends on https://github.com/llvm/llvm-project/pull/161520

rdar://158159242
---
 .../ObjectFile/Mach-O/ObjectFileMachO.cpp     | 90 +++++++++----------
 .../Shell/Expr/TestGlobalSymbolObjCConflict.c | 33 +++++++
 2 files changed, 75 insertions(+), 48 deletions(-)
 create mode 100644 lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c

diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index fada1fda2b4bc..91c93be1b8cfd 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -2805,32 +2805,29 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) {
             is_gsym = true;
             sym[sym_idx].SetExternal(true);
-            if (symbol_name && symbol_name[0] == '_' &&
-                symbol_name[1] == 'O') {
-              llvm::StringRef symbol_name_ref(symbol_name);
-              if (symbol_name_ref.starts_with(
-                      g_objc_v2_prefix_class)) {
-                symbol_name_non_abi_mangled = symbol_name + 1;
-                symbol_name =
-                    symbol_name + g_objc_v2_prefix_class.size();
-                type = eSymbolTypeObjCClass;
-                demangled_is_synthesized = true;
-
-              } else if (symbol_name_ref.starts_with(
-                             g_objc_v2_prefix_metaclass)) {
-                symbol_name_non_abi_mangled = symbol_name + 1;
-                symbol_name =
-                    symbol_name + g_objc_v2_prefix_metaclass.size();
-                type = eSymbolTypeObjCMetaClass;
-                demangled_is_synthesized = true;
-              } else if (symbol_name_ref.starts_with(
-                             g_objc_v2_prefix_ivar)) {
-                symbol_name_non_abi_mangled = symbol_name + 1;
-                symbol_name =
-                    symbol_name + g_objc_v2_prefix_ivar.size();
-                type = eSymbolTypeObjCIVar;
-                demangled_is_synthesized = true;
-              }
+            llvm::StringRef symbol_name_ref(symbol_name);
+            if (symbol_name_ref.starts_with(
+                    g_objc_v2_prefix_class)) {
+              symbol_name_non_abi_mangled = symbol_name + 1;
+              symbol_name =
+                  symbol_name + g_objc_v2_prefix_class.size();
+              type = eSymbolTypeObjCClass;
+              demangled_is_synthesized = true;
+
+            } else if (symbol_name_ref.starts_with(
+                           g_objc_v2_prefix_metaclass)) {
+              symbol_name_non_abi_mangled = symbol_name + 1;
+              symbol_name =
+                  symbol_name + g_objc_v2_prefix_metaclass.size();
+              type = eSymbolTypeObjCMetaClass;
+              demangled_is_synthesized = true;
+            } else if (symbol_name_ref.starts_with(
+                           g_objc_v2_prefix_ivar)) {
+              symbol_name_non_abi_mangled = symbol_name + 1;
+              symbol_name =
+                  symbol_name + g_objc_v2_prefix_ivar.size();
+              type = eSymbolTypeObjCIVar;
+              demangled_is_synthesized = true;
             } else {
               if (nlist.n_value != 0)
                 symbol_section = section_info.GetSection(
                     nlist.n_sect, nlist.n_value);
@@ -3652,7 +3649,7 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) {
 
         if (is_debug) {
           switch (nlist.n_type) {
-          case N_GSYM:
+          case N_GSYM: {
             // global symbol: name,,NO_SECT,type,0
             // Sometimes the N_GSYM value contains the address.
@@ -3668,33 +3665,30 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) {
           is_gsym = true;
           sym[sym_idx].SetExternal(true);
-          if (symbol_name && symbol_name[0] == '_' && symbol_name[1] == 'O') {
-            llvm::StringRef symbol_name_ref(symbol_name);
-            if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) {
-              symbol_name_non_abi_mangled = symbol_name + 1;
-              symbol_name = symbol_name + g_objc_v2_prefix_class.size();
-              type = eSymbolTypeObjCClass;
-              demangled_is_synthesized = true;
-
-            } else if (symbol_name_ref.starts_with(
-                           g_objc_v2_prefix_metaclass)) {
-              symbol_name_non_abi_mangled = symbol_name + 1;
-              symbol_name = symbol_name + g_objc_v2_prefix_metaclass.size();
-              type = eSymbolTypeObjCMetaClass;
-              demangled_is_synthesized = true;
-            } else if (symbol_name_ref.starts_with(g_objc_v2_prefix_ivar)) {
-              symbol_name_non_abi_mangled = symbol_name + 1;
-              symbol_name = symbol_name + g_objc_v2_prefix_ivar.size();
-              type = eSymbolTypeObjCIVar;
-              demangled_is_synthesized = true;
-            }
+          llvm::StringRef symbol_name_ref(symbol_name);
+          if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) {
+            symbol_name_non_abi_mangled = symbol_name + 1;
+            symbol_name = symbol_name + g_objc_v2_prefix_class.size();
+            type = eSymbolTypeObjCClass;
+            demangled_is_synthesized = true;
+
+          } else if (symbol_name_ref.starts_with(g_objc_v2_prefix_metaclass)) {
+            symbol_name_non_abi_mangled = symbol_name + 1;
+            symbol_name = symbol_name + g_objc_v2_prefix_metaclass.size();
+            type = eSymbolTypeObjCMetaClass;
+            demangled_is_synthesized = true;
+          } else if (symbol_name_ref.starts_with(g_objc_v2_prefix_ivar)) {
+            symbol_name_non_abi_mangled = symbol_name + 1;
+            symbol_name = symbol_name + g_objc_v2_prefix_ivar.size();
+            type = eSymbolTypeObjCIVar;
+            demangled_is_synthesized = true;
           } else {
             if (nlist.n_value != 0)
               symbol_section =
                   section_info.GetSection(nlist.n_sect, nlist.n_value);
             type = eSymbolTypeData;
           }
-          break;
+        } break;
 
         case N_FNAME:
           // procedure name (f77 kludge): name,,NO_SECT,0,0
diff --git a/lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c b/lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c
new file mode 100644
index 0000000000000..62c0162863337
--- /dev/null
+++ b/lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c
@@ -0,0 +1,33 @@
+// Tests that LLDB correctly parses global symbols
+// starting with 'O'. On some platforms (e.g., Darwin)
+// C symbols are prefixed with a '_'. The LLDB Mach-O
+// parser handles Objective-C metadata symbols starting
+// with '_OBJC' specially. This test ensures that we don't
+// lose track of regular global symbols with a '_O' prefix
+// in the process.
+
+// RUN: %clang_host -c -g -fno-common %s -o %t.o
+// RUN: %clang_host %t.o -o %t.out
+// RUN: %lldb -b -x %t.out \
+// RUN:       -o "b 27" \
+// RUN:       -o "run" \
+// RUN:       -o "p OglobalVar" \
+// RUN:       -o "p Oabc" | FileCheck %s
+
+typedef struct {
+  int a;
+} Oabc_t;
+
+Oabc_t Oabc;
+int OglobalVar;
+
+int main(int argc, const char *argv[]) {
+  Oabc.a = 15;
+  OglobalVar = 10;
+  return OglobalVar + Oabc.a;
+}
+
+// CHECK: (lldb) p OglobalVar
+// CHECK: (int) 10
+// CHECK: (lldb) p Oabc
+// CHECK: (Oabc_t) (a = 15)

From aee99e8015daa9f53ab1fd4e5b24cc4c694bdc4a Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Wed, 1 Oct 2025 08:55:54 -0700
Subject: [PATCH 392/878] [WebAssembly] Define llvm-internal WasmEH tags in
 compiler-rt (#160959)

The `__c_longjmp` and `__cpp_exception` tags are used internally by LLVM
to implement setjmp/longjmp and C++ exception handling respectively.
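(Illustration, not part of this patch: the snippet below is the kind of
C++ that ends up referencing these tags. When a translation unit like
this is built for a Wasm target with Wasm EH enabled, for example with
-fwasm-exceptions, the compiler lowers throw/catch onto the
__cpp_exception tag, which the linker must then find defined somewhere;
the file name and exact flags here are assumptions.)

    // wasm_eh_example.cpp: hypothetical example; any try/throw pair will do.
    #include <stdexcept>

    int parse_digit(const char *s) {
      try {
        if (!s)
          throw std::invalid_argument("null input"); // lowered onto __cpp_exception
        return s[0] - '0';
      } catch (const std::invalid_argument &) { // the catch clause uses the same tag
        return -1;
      }
    }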
These symbols were previously defined weakly in each object file but
were recently converted to external references in #159143. They now
need to be defined somewhere in the runtime libraries. I think
compiler-rt is likely the most sensible place for them.
---
 compiler-rt/lib/builtins/CMakeLists.txt     | 11 ++++----
 compiler-rt/lib/builtins/wasm/__c_longjmp.S | 26 +++++++++++++++++++
 .../lib/builtins/wasm/__cpp_exception.S     | 26 +++++++++++++++++++
 3 files changed, 58 insertions(+), 5 deletions(-)
 create mode 100644 compiler-rt/lib/builtins/wasm/__c_longjmp.S
 create mode 100644 compiler-rt/lib/builtins/wasm/__cpp_exception.S

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 0d7fc65cfd3e9..9095b056ae782 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -816,14 +816,15 @@ set(s390x_SOURCES
   ${GENERIC_TF_SOURCES}
 )
 
-set(wasm32_SOURCES
-  ${GENERIC_TF_SOURCES}
-  ${GENERIC_SOURCES}
-)
-set(wasm64_SOURCES
+
+set(wasm_SOURCES
+  wasm/__c_longjmp.S
+  wasm/__cpp_exception.S
   ${GENERIC_TF_SOURCES}
   ${GENERIC_SOURCES}
 )
+set(wasm32_SOURCES ${wasm_SOURCES})
+set(wasm64_SOURCES ${wasm_SOURCES})
 
 set(ve_SOURCES
   ve/grow_stack.S
diff --git a/compiler-rt/lib/builtins/wasm/__c_longjmp.S b/compiler-rt/lib/builtins/wasm/__c_longjmp.S
new file mode 100644
index 0000000000000..d130862fd5c41
--- /dev/null
+++ b/compiler-rt/lib/builtins/wasm/__c_longjmp.S
@@ -0,0 +1,26 @@
+//===-- __c_longjmp.S - Implement __c_longjmp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __c_longjmp which LLVM uses to implement setjmp/longjmp
+// when Wasm EH is enabled.
+//
+//===----------------------------------------------------------------------===//

+#ifdef __wasm_exception_handling__
+
+#ifdef __wasm64__
+#define PTR i64
+#else
+#define PTR i32
+#endif
+
+.globl __c_longjmp
+.tagtype __c_longjmp PTR
+__c_longjmp:
+
+#endif // !__wasm_exception_handling__
diff --git a/compiler-rt/lib/builtins/wasm/__cpp_exception.S b/compiler-rt/lib/builtins/wasm/__cpp_exception.S
new file mode 100644
index 0000000000000..0496e1dbf6158
--- /dev/null
+++ b/compiler-rt/lib/builtins/wasm/__cpp_exception.S
@@ -0,0 +1,26 @@
+//===-- __cpp_exception.S - Implement __cpp_exception --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements __cpp_exception which LLVM uses to implement exception
+// handling when Wasm EH is enabled.
+// +//===----------------------------------------------------------------------===// + +#ifdef __wasm_exception_handling__ + +#ifdef __wasm64__ +#define PTR i64 +#else +#define PTR i32 +#endif + +.globl __cpp_exception +.tagtype __cpp_exception PTR +__cpp_exception: + +#endif // !__wasm_exception_handling__ From 23e081524fd9f64fb3430822e879b6dc36a1d3f1 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 1 Oct 2025 16:58:09 +0100 Subject: [PATCH 393/878] [lldb][MachO][NFC] Extract ObjC metadata symbol parsing into helper function (#161536) Just a simple de-duplication of the same code. We saw a bug here recently (https://github.com/llvm/llvm-project/pull/161521). Might as well isolate this all in one place. rdar://158159242 --- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 178 ++++++------------ 1 file changed, 54 insertions(+), 124 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 91c93be1b8cfd..e8bbefe27f47f 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -2067,6 +2067,43 @@ static bool ParseTrieEntries(DataExtractor &data, lldb::offset_t offset, return true; } +static bool +TryParseV2ObjCMetadataSymbol(const char *&symbol_name, + const char *&symbol_name_non_abi_mangled, + SymbolType &type) { + static constexpr llvm::StringLiteral g_objc_v2_prefix_class("_OBJC_CLASS_$_"); + static constexpr llvm::StringLiteral g_objc_v2_prefix_metaclass( + "_OBJC_METACLASS_$_"); + static constexpr llvm::StringLiteral g_objc_v2_prefix_ivar("_OBJC_IVAR_$_"); + + llvm::StringRef symbol_name_ref(symbol_name); + if (symbol_name_ref.empty()) + return false; + + if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = symbol_name + g_objc_v2_prefix_class.size(); + type = eSymbolTypeObjCClass; + return true; + } + + if (symbol_name_ref.starts_with(g_objc_v2_prefix_metaclass)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = symbol_name + g_objc_v2_prefix_metaclass.size(); + type = eSymbolTypeObjCMetaClass; + return true; + } + + if (symbol_name_ref.starts_with(g_objc_v2_prefix_ivar)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = symbol_name + g_objc_v2_prefix_ivar.size(); + type = eSymbolTypeObjCIVar; + return true; + } + + return false; +} + static SymbolType GetSymbolType(const char *&symbol_name, bool &demangled_is_synthesized, const SectionSP &text_section_sp, @@ -2183,9 +2220,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { lldb::offset_t offset = MachHeaderSizeFromMagic(m_header.magic); uint32_t i; FileSpecList dylib_files; - llvm::StringRef g_objc_v2_prefix_class("_OBJC_CLASS_$_"); - llvm::StringRef g_objc_v2_prefix_metaclass("_OBJC_METACLASS_$_"); - llvm::StringRef g_objc_v2_prefix_ivar("_OBJC_IVAR_$_"); UUID image_uuid; for (i = 0; i < m_header.ncmds; ++i) { @@ -2805,33 +2839,13 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { is_gsym = true; sym[sym_idx].SetExternal(true); - llvm::StringRef symbol_name_ref(symbol_name); - if (symbol_name_ref.starts_with( - g_objc_v2_prefix_class)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = - symbol_name + g_objc_v2_prefix_class.size(); - type = eSymbolTypeObjCClass; - demangled_is_synthesized = true; - - } else if (symbol_name_ref.starts_with( - g_objc_v2_prefix_metaclass)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = 
- symbol_name + g_objc_v2_prefix_metaclass.size(); - type = eSymbolTypeObjCMetaClass; + if (TryParseV2ObjCMetadataSymbol( + symbol_name, symbol_name_non_abi_mangled, + type)) { demangled_is_synthesized = true; - } else if (symbol_name_ref.starts_with( - g_objc_v2_prefix_ivar)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = - symbol_name + g_objc_v2_prefix_ivar.size(); - type = eSymbolTypeObjCIVar; - demangled_is_synthesized = true; - } else { - if (nlist.n_value != 0) - symbol_section = section_info.GetSection( - nlist.n_sect, nlist.n_value); + } else if (nlist.n_value != 0) { + symbol_section = section_info.GetSection( + nlist.n_sect, nlist.n_value); type = eSymbolTypeData; } break; @@ -3316,49 +3330,10 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { ::strstr(symbol_sect_name, "__objc") == symbol_sect_name) { type = eSymbolTypeRuntime; - - if (symbol_name) { - llvm::StringRef symbol_name_ref(symbol_name); - if (symbol_name_ref.starts_with("_OBJC_")) { - llvm::StringRef - g_objc_v2_prefix_class( - "_OBJC_CLASS_$_"); - llvm::StringRef - g_objc_v2_prefix_metaclass( - "_OBJC_METACLASS_$_"); - llvm::StringRef - g_objc_v2_prefix_ivar("_OBJC_IVAR_$_"); - if (symbol_name_ref.starts_with( - g_objc_v2_prefix_class)) { - symbol_name_non_abi_mangled = - symbol_name + 1; - symbol_name = - symbol_name + - g_objc_v2_prefix_class.size(); - type = eSymbolTypeObjCClass; - demangled_is_synthesized = true; - } else if ( - symbol_name_ref.starts_with( - g_objc_v2_prefix_metaclass)) { - symbol_name_non_abi_mangled = - symbol_name + 1; - symbol_name = - symbol_name + - g_objc_v2_prefix_metaclass.size(); - type = eSymbolTypeObjCMetaClass; - demangled_is_synthesized = true; - } else if (symbol_name_ref.starts_with( - g_objc_v2_prefix_ivar)) { - symbol_name_non_abi_mangled = - symbol_name + 1; - symbol_name = - symbol_name + - g_objc_v2_prefix_ivar.size(); - type = eSymbolTypeObjCIVar; - demangled_is_synthesized = true; - } - } - } + demangled_is_synthesized = + TryParseV2ObjCMetadataSymbol( + symbol_name, + symbol_name_non_abi_mangled, type); } else if (symbol_sect_name && ::strstr(symbol_sect_name, "__gcc_except_tab") == @@ -3665,27 +3640,12 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { is_gsym = true; sym[sym_idx].SetExternal(true); - llvm::StringRef symbol_name_ref(symbol_name); - if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = symbol_name + g_objc_v2_prefix_class.size(); - type = eSymbolTypeObjCClass; - demangled_is_synthesized = true; - - } else if (symbol_name_ref.starts_with(g_objc_v2_prefix_metaclass)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = symbol_name + g_objc_v2_prefix_metaclass.size(); - type = eSymbolTypeObjCMetaClass; + if (TryParseV2ObjCMetadataSymbol(symbol_name, + symbol_name_non_abi_mangled, type)) { demangled_is_synthesized = true; - } else if (symbol_name_ref.starts_with(g_objc_v2_prefix_ivar)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = symbol_name + g_objc_v2_prefix_ivar.size(); - type = eSymbolTypeObjCIVar; - demangled_is_synthesized = true; - } else { - if (nlist.n_value != 0) - symbol_section = - section_info.GetSection(nlist.n_sect, nlist.n_value); + } else if (nlist.n_value != 0) { + symbol_section = + section_info.GetSection(nlist.n_sect, nlist.n_value); type = eSymbolTypeData; } } break; @@ -4123,39 +4083,9 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (symbol_sect_name && ::strstr(symbol_sect_name, "__objc") == 
symbol_sect_name) { type = eSymbolTypeRuntime; + demangled_is_synthesized = TryParseV2ObjCMetadataSymbol( + symbol_name, symbol_name_non_abi_mangled, type); - if (symbol_name) { - llvm::StringRef symbol_name_ref(symbol_name); - if (symbol_name_ref.starts_with("_OBJC_")) { - llvm::StringRef g_objc_v2_prefix_class( - "_OBJC_CLASS_$_"); - llvm::StringRef g_objc_v2_prefix_metaclass( - "_OBJC_METACLASS_$_"); - llvm::StringRef g_objc_v2_prefix_ivar( - "_OBJC_IVAR_$_"); - if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = - symbol_name + g_objc_v2_prefix_class.size(); - type = eSymbolTypeObjCClass; - demangled_is_synthesized = true; - } else if (symbol_name_ref.starts_with( - g_objc_v2_prefix_metaclass)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = - symbol_name + g_objc_v2_prefix_metaclass.size(); - type = eSymbolTypeObjCMetaClass; - demangled_is_synthesized = true; - } else if (symbol_name_ref.starts_with( - g_objc_v2_prefix_ivar)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = - symbol_name + g_objc_v2_prefix_ivar.size(); - type = eSymbolTypeObjCIVar; - demangled_is_synthesized = true; - } - } - } } else if (symbol_sect_name && ::strstr(symbol_sect_name, "__gcc_except_tab") == symbol_sect_name) { From 59fe8401f473dc77825b7bdfc82dc1628ddb82cc Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 1 Oct 2025 08:58:30 -0700 Subject: [PATCH 394/878] [LV] Use StringRef::consume_front. NFC (#161454) --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index ff35db14f7094..7d376c370bb1c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -293,9 +293,8 @@ void LoopVectorizeHints::getHintsFromMetadata() { } void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { - if (!Name.starts_with(Prefix())) + if (!Name.consume_front(Prefix())) return; - Name = Name.substr(Prefix().size(), StringRef::npos); const ConstantInt *C = mdconst::dyn_extract(Arg); if (!C) From 0e124b9586b862e749a3c9ee8b43cf18ad9c2812 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 1 Oct 2025 08:59:23 -0700 Subject: [PATCH 395/878] [ADT] Use "if constexpr" with shouldReverseIterate (#161477) This patch uses "if constexpr" whenever we call shouldReverseIterate in "if" conditions. Note that shouldReverseIterate is a constexpr function. --- llvm/include/llvm/ADT/SmallPtrSet.h | 10 ++++++---- llvm/lib/Support/StringMap.cpp | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h index e24cd6415b687..f588a77a53b2a 100644 --- a/llvm/include/llvm/ADT/SmallPtrSet.h +++ b/llvm/include/llvm/ADT/SmallPtrSet.h @@ -476,18 +476,20 @@ template class SmallPtrSetImpl : public SmallPtrSetImplBase { } [[nodiscard]] iterator begin() const { - if (shouldReverseIterate()) + if constexpr (shouldReverseIterate()) return makeIterator(EndPointer() - 1); - return makeIterator(CurArray); + else + return makeIterator(CurArray); } [[nodiscard]] iterator end() const { return makeIterator(EndPointer()); } private: /// Create an iterator that dereferences to same place as the given pointer. 
iterator makeIterator(const void *const *P) const { - if (shouldReverseIterate()) + if constexpr (shouldReverseIterate()) return iterator(P == EndPointer() ? CurArray : P + 1, CurArray, *this); - return iterator(P, EndPointer(), *this); + else + return iterator(P, EndPointer(), *this); } }; diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp index 3432dc15ceef2..4aee30cd484e0 100644 --- a/llvm/lib/Support/StringMap.cpp +++ b/llvm/lib/Support/StringMap.cpp @@ -83,7 +83,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name, // Hash table unallocated so far? if (NumBuckets == 0) init(16); - if (shouldReverseIterate()) + if constexpr (shouldReverseIterate()) FullHashValue = ~FullHashValue; unsigned BucketNo = FullHashValue & (NumBuckets - 1); unsigned *HashTable = getHashTable(TheTable, NumBuckets); @@ -142,7 +142,7 @@ int StringMapImpl::FindKey(StringRef Key, uint32_t FullHashValue) const { #ifdef EXPENSIVE_CHECKS assert(FullHashValue == hash(Key)); #endif - if (shouldReverseIterate()) + if constexpr (shouldReverseIterate()) FullHashValue = ~FullHashValue; unsigned BucketNo = FullHashValue & (NumBuckets - 1); unsigned *HashTable = getHashTable(TheTable, NumBuckets); From 190826c5df1ee34bf04823ba3b9d448e18b877ed Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 1 Oct 2025 08:59:31 -0700 Subject: [PATCH 396/878] [ADT, Support] Drop extraneous std::bool_constant (NFC) (#161478) This patch drops extraneous std::bool_constant, replacing: std::bool_constant> with: std::is_foo<...> --- llvm/include/llvm/ADT/IntervalTree.h | 3 +-- llvm/include/llvm/Support/FormatProviders.h | 19 +++++++------------ .../llvm/Support/FormatVariadicDetails.h | 3 +-- llvm/include/llvm/Support/HashBuilder.h | 3 +-- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/ADT/IntervalTree.h b/llvm/include/llvm/ADT/IntervalTree.h index 918c86227576e..d14de06f26dc3 100644 --- a/llvm/include/llvm/ADT/IntervalTree.h +++ b/llvm/include/llvm/ADT/IntervalTree.h @@ -236,8 +236,7 @@ template class IntervalData { //===----------------------------------------------------------------------===// // Helper class template that is used by the IntervalTree to ensure that one // does instantiate using only fundamental and/or pointer types. 
-template -using PointTypeIsValid = std::bool_constant::value>; +template using PointTypeIsValid = std::is_fundamental; template using ValueTypeIsValid = std::bool_constant::value || diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h index 9147782055574..8eaa5e382c73e 100644 --- a/llvm/include/llvm/Support/FormatProviders.h +++ b/llvm/include/llvm/Support/FormatProviders.h @@ -29,22 +29,18 @@ namespace support { namespace detail { template struct use_integral_formatter - : public std::bool_constant< - is_one_of::value> {}; + : public is_one_of {}; template -struct use_char_formatter : public std::bool_constant> { -}; +struct use_char_formatter : public std::is_same {}; template -struct is_cstring - : public std::bool_constant::value> {}; +struct is_cstring : public is_one_of {}; template -struct use_string_formatter - : public std::bool_constant> {}; +struct use_string_formatter : public std::is_convertible {}; template struct use_pointer_formatter @@ -52,8 +48,7 @@ struct use_pointer_formatter }; template -struct use_double_formatter - : public std::bool_constant> {}; +struct use_double_formatter : public std::is_floating_point {}; class HelperFunctions { protected: diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index 4002caf76675c..0fdc7b6f94da7 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -92,8 +92,7 @@ template class has_StreamOperator { // based format() invocation. template struct uses_format_member - : public std::bool_constant< - std::is_base_of_v>> {}; + : public std::is_base_of> {}; // Simple template that decides whether a type T should use the format_provider // based format() invocation. The member function takes priority, so this test diff --git a/llvm/include/llvm/Support/HashBuilder.h b/llvm/include/llvm/Support/HashBuilder.h index ae266d3f19a1a..d0130d61af59b 100644 --- a/llvm/include/llvm/Support/HashBuilder.h +++ b/llvm/include/llvm/Support/HashBuilder.h @@ -31,8 +31,7 @@ namespace llvm { namespace hashbuilder_detail { /// Trait to indicate whether a type's bits can be hashed directly (after /// endianness correction). -template -struct IsHashableData : std::bool_constant::value> {}; +template struct IsHashableData : is_integral_or_enum {}; } // namespace hashbuilder_detail From 09dbb3e25444ec93b45320061932a1328f0d1ed8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 1 Oct 2025 08:59:40 -0700 Subject: [PATCH 397/878] [Support] Use a C++17 fold expression in a static_assert (NFC) (#161479) This patch simplifies a recursive use of a type trait to a C++17 fold expression. --- llvm/include/llvm/Support/Format.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/Support/Format.h b/llvm/include/llvm/Support/Format.h index 2553002b37899..34b224dba5407 100644 --- a/llvm/include/llvm/Support/Format.h +++ b/llvm/include/llvm/Support/Format.h @@ -78,16 +78,6 @@ class LLVM_ABI format_object_base { /// printed, this synthesizes the string into a temporary buffer provided and /// returns whether or not it is big enough. -// Helper to validate that format() parameters are scalars or pointers. 
-template struct validate_format_parameters; -template -struct validate_format_parameters { - static_assert(std::is_scalar_v, - "format can't be used with non fundamental / non pointer type"); - validate_format_parameters() { validate_format_parameters(); } -}; -template <> struct validate_format_parameters<> {}; - template class format_object final : public format_object_base { std::tuple Vals; @@ -105,7 +95,9 @@ class format_object final : public format_object_base { public: format_object(const char *fmt, const Ts &... vals) : format_object_base(fmt), Vals(vals...) { - validate_format_parameters(); + static_assert( + (std::is_scalar_v && ...), + "format can't be used with non fundamental / non pointer type"); } int snprint(char *Buffer, unsigned BufferSize) const override { From 515660d843b04d62eed816e9fb191f31af805e4b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 1 Oct 2025 08:59:49 -0700 Subject: [PATCH 398/878] [Support] Make getMaxValue and getMinValue constexpr variables (NFC) (#161480) This patch makes getMaxValue and getMinValue constexpr variables and "inlines" the calls to getMaxValue and getMinValue. We could probably make InstructionCost constexpr also, but that's left for another day. --- llvm/include/llvm/Support/InstructionCost.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Support/InstructionCost.h b/llvm/include/llvm/Support/InstructionCost.h index ab1c8ebc8c95e..507c16666b958 100644 --- a/llvm/include/llvm/Support/InstructionCost.h +++ b/llvm/include/llvm/Support/InstructionCost.h @@ -59,8 +59,8 @@ class InstructionCost { State = Invalid; } - static CostType getMaxValue() { return std::numeric_limits::max(); } - static CostType getMinValue() { return std::numeric_limits::min(); } + static constexpr CostType MaxValue = std::numeric_limits::max(); + static constexpr CostType MinValue = std::numeric_limits::min(); public: // A default constructed InstructionCost is a valid zero cost @@ -69,8 +69,8 @@ class InstructionCost { InstructionCost(CostState) = delete; InstructionCost(CostType Val) : Value(Val), State(Valid) {} - static InstructionCost getMax() { return getMaxValue(); } - static InstructionCost getMin() { return getMinValue(); } + static InstructionCost getMax() { return MaxValue; } + static InstructionCost getMin() { return MinValue; } static InstructionCost getInvalid(CostType Val = 0) { InstructionCost Tmp(Val); Tmp.setInvalid(); @@ -102,7 +102,7 @@ class InstructionCost { // Saturating addition. InstructionCost::CostType Result; if (AddOverflow(Value, RHS.Value, Result)) - Result = RHS.Value > 0 ? getMaxValue() : getMinValue(); + Result = RHS.Value > 0 ? MaxValue : MinValue; Value = Result; return *this; @@ -120,7 +120,7 @@ class InstructionCost { // Saturating subtract. InstructionCost::CostType Result; if (SubOverflow(Value, RHS.Value, Result)) - Result = RHS.Value > 0 ? getMinValue() : getMaxValue(); + Result = RHS.Value > 0 ? 
MinValue : MaxValue; Value = Result; return *this; } @@ -138,9 +138,9 @@ class InstructionCost { InstructionCost::CostType Result; if (MulOverflow(Value, RHS.Value, Result)) { if ((Value > 0 && RHS.Value > 0) || (Value < 0 && RHS.Value < 0)) - Result = getMaxValue(); + Result = MaxValue; else - Result = getMinValue(); + Result = MinValue; } Value = Result; From a6bf271ffec9b4b68e5b1f5270b8d8d6f28dac79 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 1 Oct 2025 08:59:58 -0700 Subject: [PATCH 399/878] [llvm] Proofread DirectXUsage.rst (#161481) --- llvm/docs/DirectXUsage.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/docs/DirectXUsage.rst b/llvm/docs/DirectXUsage.rst index 1d964e6d54dae..78f27d89c1f8a 100644 --- a/llvm/docs/DirectXUsage.rst +++ b/llvm/docs/DirectXUsage.rst @@ -29,7 +29,7 @@ Initially the backend is aimed at supporting DirectX 12, and support for DirectX 11 is planned at a later date. The DirectX backend is currently experimental and is not shipped with any -release builds of LLVM tools. To enable building the DirectX backend locally add +release builds of LLVM tools. To build the DirectX backend locally, add ``DirectX`` to the ``LLVM_EXPERIMENTAL_TARGETS_TO_BUILD`` CMake option. For more information on building LLVM see the :doc:`CMake` documentation. @@ -38,7 +38,7 @@ information on building LLVM see the :doc:`CMake` documentation. Target Triples ============== -At present the DirectX target only supports the ``dxil`` architecture, which +At present, the DirectX target only supports the ``dxil`` architecture, which generates code for the `DirectX Intermediate Language. `_ @@ -46,8 +46,8 @@ In addition to target architecture, the DirectX backend also needs to know the target runtime version and pipeline stage. These are expressed using the OS and Environment triple component. -Presently the DirectX backend requires targeting the ``shadermodel`` OS, and -supports versions 6.0+ (at time of writing the latest announced version is 6.7). +Presently, the DirectX backend requires targeting the ``shadermodel`` OS, and +supports versions 6.0+ (as of writing, the latest announced version is 6.7). .. table:: DirectX Environments From bb16c56019f16998235e8a8c43dc072b03fe1dc0 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 1 Oct 2025 17:14:25 +0100 Subject: [PATCH 400/878] [LoopIdiom] Fix a DL-related crash in optimizeCRCLoop (#161509) --- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 7 +-- .../LoopIdiom/cyclic-redundancy-check-dl.ll | 50 +++++++++++++++++++ .../LoopIdiom/cyclic-redundancy-check.ll | 24 ++++----- 3 files changed, 64 insertions(+), 17 deletions(-) create mode 100644 llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check-dl.ll diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 0874b29ab7d22..019536ca91ae0 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1598,11 +1598,8 @@ bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) { // crc = (crc << 8) ^ tbl[(iv'th byte of data) ^ (top byte of crc)] { auto LoByte = [](IRBuilderBase &Builder, Value *Op, const Twine &Name) { - Type *OpTy = Op->getType(); - unsigned OpBW = OpTy->getIntegerBitWidth(); - return OpBW > 8 - ? 
Builder.CreateAnd(Op, ConstantInt::get(OpTy, 0XFF), Name) - : Op; + return Builder.CreateZExtOrTrunc( + Op, IntegerType::getInt8Ty(Op->getContext()), Name); }; auto HiIdx = [LoByte, CRCBW](IRBuilderBase &Builder, Value *Op, const Twine &Name) { diff --git a/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check-dl.ll b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check-dl.ll new file mode 100644 index 0000000000000..14a4c952d3510 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check-dl.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -passes=loop-idiom -S %s | FileCheck %s + +target datalayout = "p:16:16" + +;. +; CHECK: @.crctable = private constant [256 x i32] zeroinitializer +;. +define void @test_with_dl() { +; CHECK-LABEL: define void @test_with_dl() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[PH:.*]] +; CHECK: [[PH_LOOPEXIT:.*]]: +; CHECK-NEXT: [[CRC_NEXT_LCSSA:%.*]] = phi i32 [ [[CRC_NEXT3:%.*]], %[[LOOP:.*]] ] +; CHECK-NEXT: br label %[[PH]] +; CHECK: [[PH]]: +; CHECK-NEXT: [[CRC_USE:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[CRC_NEXT_LCSSA]], %[[PH_LOOPEXIT]] ] +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[CRC2:%.*]] = phi i32 [ 0, %[[PH]] ], [ [[CRC_NEXT3]], %[[LOOP]] ] +; CHECK-NEXT: [[INDEXER_LO:%.*]] = trunc i32 [[CRC2]] to i8 +; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i8 [[INDEXER_LO]] to i16 +; CHECK-NEXT: [[TBL_PTRADD:%.*]] = getelementptr inbounds i32, ptr @.crctable, i16 [[INDEXER_EXT]] +; CHECK-NEXT: [[TBL_LD:%.*]] = load i32, ptr [[TBL_PTRADD]], align 4 +; CHECK-NEXT: [[CRC_LE_SHIFT:%.*]] = lshr i32 [[CRC2]], 8 +; CHECK-NEXT: [[CRC_NEXT3]] = xor i32 [[CRC_LE_SHIFT]], [[TBL_LD]] +; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND1:%.*]] = icmp ne i16 [[IV]], 0 +; CHECK-NEXT: br i1 [[EXIT_COND1]], label %[[LOOP]], label %[[PH_LOOPEXIT]] +; +entry: + br label %ph + +ph: + %crc.use = phi i32 [ 0, %entry ], [ %crc.next, %loop ] + br label %loop + +loop: + %iv = phi i16 [ 0, %ph ], [ %iv.next, %loop ] + %crc = phi i32 [ 0, %ph ], [ %crc.next, %loop ] + %lshr.crc.1 = lshr i32 %crc, 1 + %crc.and.1 = and i32 %crc, 1 + %sb.check = icmp eq i32 %crc.and.1, 0 + %xor = xor i32 %lshr.crc.1, 0 + %crc.next = select i1 %sb.check, i32 %lshr.crc.1, i32 %xor + %iv.next = add i16 %iv, 1 + %exit.cond = icmp ult i16 %iv, 7 + br i1 %exit.cond, label %loop, label %ph +} diff --git a/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll index 51dc142200d78..b2ec53ca405d4 100644 --- a/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll +++ b/llvm/test/Transforms/LoopIdiom/cyclic-redundancy-check.ll @@ -118,8 +118,8 @@ define i16 @crc16.le.tc16(i16 %msg, i16 %checksum) { ; CHECK-NEXT: [[IV_INDEXER:%.*]] = zext i8 [[IV_BITS]] to i16 ; CHECK-NEXT: [[DATA_INDEXER:%.*]] = lshr i16 [[MSG]], [[IV_INDEXER]] ; CHECK-NEXT: [[CRC_DATA_INDEXER:%.*]] = xor i16 [[DATA_INDEXER]], [[CRC2]] -; CHECK-NEXT: [[INDEXER_LO:%.*]] = and i16 [[CRC_DATA_INDEXER]], 255 -; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i16 [[INDEXER_LO]] to i64 +; CHECK-NEXT: [[INDEXER_LO:%.*]] = trunc i16 [[CRC_DATA_INDEXER]] to i8 +; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i8 [[INDEXER_LO]] to i64 ; CHECK-NEXT: [[TBL_PTRADD:%.*]] = getelementptr inbounds i16, ptr @.crctable.2, i64 [[INDEXER_EXT]] ; CHECK-NEXT: 
[[TBL_LD:%.*]] = load i16, ptr [[TBL_PTRADD]], align 2 ; CHECK-NEXT: [[CRC_LE_SHIFT:%.*]] = lshr i16 [[CRC2]], 8 @@ -166,8 +166,8 @@ define i8 @crc8.le.tc16(i16 %msg, i8 %checksum) { ; CHECK-NEXT: [[DATA_INDEXER:%.*]] = lshr i16 [[MSG]], [[IV_INDEXER]] ; CHECK-NEXT: [[CRC_INDEXER_CAST:%.*]] = zext i8 [[CRC2]] to i16 ; CHECK-NEXT: [[CRC_DATA_INDEXER:%.*]] = xor i16 [[DATA_INDEXER]], [[CRC_INDEXER_CAST]] -; CHECK-NEXT: [[INDEXER_LO:%.*]] = and i16 [[CRC_DATA_INDEXER]], 255 -; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i16 [[INDEXER_LO]] to i64 +; CHECK-NEXT: [[INDEXER_LO:%.*]] = trunc i16 [[CRC_DATA_INDEXER]] to i8 +; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i8 [[INDEXER_LO]] to i64 ; CHECK-NEXT: [[TBL_PTRADD:%.*]] = getelementptr inbounds i8, ptr @.crctable.3, i64 [[INDEXER_EXT]] ; CHECK-NEXT: [[TBL_LD]] = load i8, ptr [[TBL_PTRADD]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1 @@ -212,8 +212,8 @@ define i16 @crc16.be.tc8.crc.init.li(i16 %checksum, i8 %msg) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[CRC2:%.*]] = phi i16 [ [[CRC_INIT]], %[[ENTRY]] ], [ [[CRC_NEXT3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[INDEXER_HI:%.*]] = lshr i16 [[CRC2]], 8 -; CHECK-NEXT: [[INDEXER_HI_LO_BYTE:%.*]] = and i16 [[INDEXER_HI]], 255 -; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i16 [[INDEXER_HI_LO_BYTE]] to i64 +; CHECK-NEXT: [[INDEXER_HI_LO_BYTE:%.*]] = trunc i16 [[INDEXER_HI]] to i8 +; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i8 [[INDEXER_HI_LO_BYTE]] to i64 ; CHECK-NEXT: [[TBL_PTRADD:%.*]] = getelementptr inbounds i16, ptr @.crctable.4, i64 [[INDEXER_EXT]] ; CHECK-NEXT: [[TBL_LD:%.*]] = load i16, ptr [[TBL_PTRADD]], align 2 ; CHECK-NEXT: [[CRC_BE_SHIFT:%.*]] = shl i16 [[CRC2]], 8 @@ -255,8 +255,8 @@ define i16 @crc16.be.tc8.crc.init.arg(i16 %crc.init) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[CRC2:%.*]] = phi i16 [ [[CRC_INIT]], %[[ENTRY]] ], [ [[CRC_NEXT3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[INDEXER_HI:%.*]] = lshr i16 [[CRC2]], 8 -; CHECK-NEXT: [[INDEXER_HI_LO_BYTE:%.*]] = and i16 [[INDEXER_HI]], 255 -; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i16 [[INDEXER_HI_LO_BYTE]] to i64 +; CHECK-NEXT: [[INDEXER_HI_LO_BYTE:%.*]] = trunc i16 [[INDEXER_HI]] to i8 +; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i8 [[INDEXER_HI_LO_BYTE]] to i64 ; CHECK-NEXT: [[TBL_PTRADD:%.*]] = getelementptr inbounds i16, ptr @.crctable.5, i64 [[INDEXER_EXT]] ; CHECK-NEXT: [[TBL_LD:%.*]] = load i16, ptr [[TBL_PTRADD]], align 2 ; CHECK-NEXT: [[CRC_BE_SHIFT:%.*]] = shl i16 [[CRC2]], 8 @@ -295,8 +295,8 @@ define i16 @crc16.be.tc8.crc.init.arg.flipped.sb.check(i16 %crc.init) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[CRC2:%.*]] = phi i16 [ [[CRC_INIT]], %[[ENTRY]] ], [ [[CRC_NEXT3:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[INDEXER_HI:%.*]] = lshr i16 [[CRC2]], 8 -; CHECK-NEXT: [[INDEXER_HI_LO_BYTE:%.*]] = and i16 [[INDEXER_HI]], 255 -; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i16 [[INDEXER_HI_LO_BYTE]] to i64 +; CHECK-NEXT: [[INDEXER_HI_LO_BYTE:%.*]] = trunc i16 [[INDEXER_HI]] to i8 +; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i8 [[INDEXER_HI_LO_BYTE]] to i64 ; CHECK-NEXT: [[TBL_PTRADD:%.*]] = getelementptr inbounds i16, ptr @.crctable.6, i64 [[INDEXER_EXT]] ; CHECK-NEXT: [[TBL_LD:%.*]] = load i16, ptr [[TBL_PTRADD]], align 2 ; CHECK-NEXT: [[CRC_BE_SHIFT:%.*]] = shl i16 [[CRC2]], 8 @@ -406,8 +406,8 @@ define i32 @crc32.le.tc8.data32(i32 %checksum, i32 %msg) { ; CHECK-NEXT: 
[[IV_INDEXER:%.*]] = zext i8 [[IV_BITS]] to i32 ; CHECK-NEXT: [[DATA_INDEXER:%.*]] = lshr i32 [[MSG]], [[IV_INDEXER]] ; CHECK-NEXT: [[CRC_DATA_INDEXER:%.*]] = xor i32 [[DATA_INDEXER]], [[CRC2]] -; CHECK-NEXT: [[INDEXER_LO:%.*]] = and i32 [[CRC_DATA_INDEXER]], 255 -; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i32 [[INDEXER_LO]] to i64 +; CHECK-NEXT: [[INDEXER_LO:%.*]] = trunc i32 [[CRC_DATA_INDEXER]] to i8 +; CHECK-NEXT: [[INDEXER_EXT:%.*]] = zext i8 [[INDEXER_LO]] to i64 ; CHECK-NEXT: [[TBL_PTRADD:%.*]] = getelementptr inbounds i32, ptr @.crctable.8, i64 [[INDEXER_EXT]] ; CHECK-NEXT: [[TBL_LD:%.*]] = load i32, ptr [[TBL_PTRADD]], align 4 ; CHECK-NEXT: [[CRC_LE_SHIFT:%.*]] = lshr i32 [[CRC2]], 8 From cbaf3c60184da7e89e0c6c342d04e94746bf72f0 Mon Sep 17 00:00:00 2001 From: carlobertolli Date: Wed, 1 Oct 2025 11:17:13 -0500 Subject: [PATCH 401/878] Add test to show save-temps is broken for amdgcn target. (#161472) In response to request here https://github.com/llvm/llvm-project/pull/160935 by @jansvoboda11 --- clang/test/OpenMP/amdgcn_save_temps.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 clang/test/OpenMP/amdgcn_save_temps.c diff --git a/clang/test/OpenMP/amdgcn_save_temps.c b/clang/test/OpenMP/amdgcn_save_temps.c new file mode 100644 index 0000000000000..ebf0d6031ee82 --- /dev/null +++ b/clang/test/OpenMP/amdgcn_save_temps.c @@ -0,0 +1,25 @@ + +// REQUIRES: amdgpu-registered-target + +// XFAIL: * + +// RUN: %clang_cc1 -E -fopenmp -x c -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd %s -o %t-openmp-amdgcn-amd-amdhsa-gfx90a.i +// RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd -emit-llvm-bc %s -o %t-x86_64-unknown-unknown.bc +// RUN: %clang_cc1 -fopenmp -x c -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd -emit-llvm -fopenmp-is-target-device -x cpp-output %t-openmp-amdgcn-amd-amdhsa-gfx90a.i -fopenmp-host-ir-file-path %t-x86_64-unknown-unknown.bc -o - | FileCheck %s +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +#define N 1000 + +int test_amdgcn_save_temps() { + int arr[N]; +#pragma omp target + for (int i = 0; i < N; i++) { + arr[i] = 1; + } + return arr[0]; +} +#endif + +// CHECK: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_test_amdgcn_save_temps From 91c35d6378910ae76b8e8644825d8458f5d44ca1 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 1 Oct 2025 09:26:31 -0700 Subject: [PATCH 402/878] [gn] port aee99e8015da --- .../gn/secondary/compiler-rt/lib/builtins/sources.gni | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni index ac48b940bce20..2ab2a0eb2783a 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni @@ -526,6 +526,13 @@ if (current_cpu == "ve") { ] } +if (current_cpu == "wasm") { + builtins_sources += [ + "wasm/__c_longjmp.S", + "wasm/__cpp_exceptions.S", + ] +} + if (!compiler_rt_exclude_atomic_builtin) { builtins_sources += [ "atomic.c" ] } From a9b8dfe7b5f224e2d442352979cf2e0c1c0b539b Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 1 Oct 2025 08:40:46 -0400 Subject: [PATCH 403/878] [libc++] Add a script to find outliers and re-run candidates in LNT results This allows selectively re-running benchmarks that are suspected to contain a lot of noise. 
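
For example, given a directory of per-commit results in LNT format (one
<commit>.lnt file per commit, as produced by the historical benchmarking
utilities), an invocation like the following flags benchmarks whose
execution_time moved by more than 20% relative to the previous commit and
that have fewer than 5 samples so far. The paths and threshold values here
are illustrative, not prescriptive:

  $ libcxx/utils/find-rerun-candidates path/to/lnt-results \
        --metric execution_time \
        --outlier-threshold 0.2 \
        --data-points-threshold 5 \
        --git-repo path/to/llvm-project

The script prints the short revision of each flagged commit per benchmark,
plus a summary line, which can then be fed back into the benchmarking
harness for additional runs.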
---
 libcxx/utils/find-rerun-candidates | 242 +++++++++++++++++++++++++++++
 libcxx/utils/visualize-historical  |  17 --
 2 files changed, 242 insertions(+), 17 deletions(-)
 create mode 100755 libcxx/utils/find-rerun-candidates

diff --git a/libcxx/utils/find-rerun-candidates b/libcxx/utils/find-rerun-candidates
new file mode 100755
index 0000000000000..5ac2644005aac
--- /dev/null
+++ b/libcxx/utils/find-rerun-candidates
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+
+import argparse
+import datetime
+import functools
+import os
+import pathlib
+import re
+import statistics
+import subprocess
+import sys
+
+import git
+import pandas
+import tqdm
+
+@functools.total_ordering
+class Commit:
+    """
+    This class represents a commit inside a given Git repository.
+    """
+
+    def __init__(self, git_repo, sha):
+        self._git_repo = git_repo
+        self._sha = sha
+
+    def __eq__(self, other):
+        """
+        Return whether two commits refer to the same commit.
+
+        This doesn't take into account the content of the Git tree at those commits, only the
+        'identity' of the commits themselves.
+        """
+        return self.fullrev == other.fullrev
+
+    def __lt__(self, other):
+        """
+        Return whether a commit is an ancestor of another commit in the Git repository.
+        """
+        # Is self._sha an ancestor of other._sha?
+        res = subprocess.run(['git', '-C', self._git_repo, 'merge-base', '--is-ancestor', self._sha, other._sha])
+        if res.returncode not in (0, 1):
+            raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}')
+        return res.returncode == 0
+
+    def __hash__(self):
+        """
+        Return a hash based on the full revision of this commit.
+        """
+        return hash(self.fullrev)
+
+    @functools.cache
+    def show(self, include_diff=False):
+        """
+        Return the commit information equivalent to `git show` associated to this commit.
+        """
+        cmd = ['git', '-C', self._git_repo, 'show', self._sha]
+        if not include_diff:
+            cmd.append('--no-patch')
+        return subprocess.check_output(cmd, text=True)
+
+    @functools.cached_property
+    def shortrev(self):
+        """
+        Return the shortened version of the given SHA.
+        """
+        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', '--short', self._sha], text=True).strip()
+
+    @functools.cached_property
+    def fullrev(self):
+        """
+        Return the full SHA associated to this commit.
+        """
+        return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', self._sha], text=True).strip()
+
+    @functools.cached_property
+    def commit_date(self):
+        """
+        Return the date of the commit as a `datetime.datetime` object.
+        """
+        repo = git.Repo(self._git_repo)
+        return datetime.datetime.fromtimestamp(repo.commit(self._sha).committed_date)
+
+    def prefetch(self):
+        """
+        Prefetch cached properties associated to this commit object.
+
+        This makes it possible to control when time is spent recovering that information from Git for
+        e.g. better reporting to the user.
+        """
+        self.commit_date
+        self.fullrev
+        self.shortrev
+        self.show()
+
+    def __str__(self):
+        return self._sha
+
+def directory_path(string):
+    if os.path.isdir(string):
+        return pathlib.Path(string)
+    else:
+        raise NotADirectoryError(string)
+
+def parse_lnt(lines, aggregate=statistics.median):
+    """
+    Parse lines in LNT format and return a list of dictionaries of the form:
+
+        [
+            {
+                'benchmark': <benchmark1>,
+                <metric1>: [float],
+                <metric2>: [float],
+                'data_points': int,
+                ...
+            },
+            {
+                'benchmark': <benchmark2>,
+                <metric1>: [float],
+                <metric2>: [float],
+                'data_points': int,
+                ...
+            },
+            ...
+        ]
+
+    If a metric has multiple values associated to it, they are aggregated into a single
+    value using the provided aggregation function.
+    """
+    results = {}
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        (identifier, value) = line.split(' ')
+        (benchmark, metric) = identifier.split('.')
+        if benchmark not in results:
+            results[benchmark] = {'benchmark': benchmark}
+
+        entry = results[benchmark]
+        if metric not in entry:
+            entry[metric] = []
+        entry[metric].append(float(value))
+
+    for (bm, entry) in results.items():
+        metrics = [key for key in entry if isinstance(entry[key], list)]
+        min_data_points = min(len(entry[metric]) for metric in metrics)
+        for metric in metrics:
+            entry[metric] = aggregate(entry[metric])
+        entry['data_points'] = min_data_points
+
+    return list(results.values())
+
+def sorted_revlist(git_repo, commits):
+    """
+    Return the list of commits sorted by their chronological order (from oldest to newest) in the
+    provided Git repository. Items earlier in the list are older than items later in the list.
+    """
+    revlist_cmd = ['git', '-C', git_repo, 'rev-list', '--no-walk'] + list(commits)
+    revlist = subprocess.check_output(revlist_cmd, text=True).strip().splitlines()
+    return list(reversed(revlist))
+
+def main(argv):
+    parser = argparse.ArgumentParser(
+        prog='find-rerun-candidates',
+        description='Find benchmarking data points that are good candidates for additional runs, to reduce noise.')
+    parser.add_argument('directory', type=directory_path,
+                        help='Path to a valid directory containing benchmark data in LNT format, each file being named <commit>.lnt. '
+                             'This is also the format generated by the `benchmark-historical` utility.')
+    parser.add_argument('--metric', type=str, default='execution_time',
+                        help='The metric to analyze. LNT data may contain multiple metrics (e.g. code size, execution time, etc) -- '
+                             'this option allows selecting which metric is analyzed for rerun candidates. The default is "execution_time".')
+    parser.add_argument('--filter', type=str, required=False,
+                        help='An optional regular expression used to filter the benchmarks included in the analysis. '
+                             'Only benchmarks whose names match the regular expression will be analyzed.')
+    parser.add_argument('--outlier-threshold', metavar='FLOAT', type=float, default=0.1,
+                        help='Relative difference from the previous point for considering a data point as an outlier. This threshold is '
+                             'expressed as a floating point number, e.g. 0.25 will detect points that differ by more than 25%% from their '
+                             'previous result.')
+    parser.add_argument('--data-points-threshold', type=int, required=False,
+                        help='Number of data points above which an outlier is not considered an outlier. If an outlier has more than '
+                             'that number of data points yet its relative difference is above the threshold, it is not considered an '
+                             'outlier. This can be used to re-run noisy data points until we have at least N samples, at which point '
+                             'we consider the data to be accurate, even if the result is beyond the threshold. By default, there is '
+                             'no limit on the number of data points.')
+    parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
+                        help='Path to the git repository to use for ordering commits in time. '
+                             'By default, the current working directory is used.')
+    args = parser.parse_args(argv)
+
+    # Extract benchmark data from the directory.
+    data = {}
+    files = list(args.directory.glob('*.lnt'))
+    for file in tqdm.tqdm(files, desc='Parsing LNT files'):
+        rows = parse_lnt(file.read_text().splitlines())
+        (commit, _) = os.path.splitext(os.path.basename(file))
+        commit = Commit(args.git_repo, commit)
+        data[commit] = rows
+
+    # Obtain commit information which is then cached throughout the program. Do this
+    # eagerly so we can provide a progress bar.
+    for commit in tqdm.tqdm(data.keys(), desc='Prefetching Git information'):
+        commit.prefetch()
+
+    # Create a dataframe from the raw data and add some columns to it:
+    # - 'commit' represents the Commit object associated to the results in that row
+    # - `revlist_order` represents the order of the commit within the Git repository.
+    revlist = sorted_revlist(args.git_repo, [c.fullrev for c in data.keys()])
+    data = pandas.DataFrame([row | {'commit': c} for (c, rows) in data.items() for row in rows])
+    data = data.join(pandas.DataFrame([{'revlist_order': revlist.index(c.fullrev)} for c in data['commit']]))
+
+    # Filter the benchmarks if needed.
+    if args.filter is not None:
+        keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None]
+        data = data[data['benchmark'].isin(keeplist)]
+
+    # Detect outliers by selecting all benchmarks whose change percentage is beyond the threshold.
+    # If we have a max number of points, also take that into account.
+    if args.data_points_threshold is not None:
+        print(f'Generating outliers with more than {args.outlier_threshold * 100}% relative difference and less than {args.data_points_threshold} data points')
+    else:
+        print(f'Generating outliers with more than {args.outlier_threshold * 100}% relative difference')
+
+    overall = set()
+    for (benchmark, series) in data.sort_values(by='revlist_order').groupby('benchmark'):
+        pct_change = series[args.metric].pct_change()
+        outliers = series[pct_change.abs() > args.outlier_threshold]
+        if args.data_points_threshold is not None:
+            outliers = outliers[outliers['data_points'] < args.data_points_threshold]
+        outliers = set(outliers['commit'])
+        overall |= outliers
+        if len(outliers) > 0:
+            print(f'{benchmark}: {" ".join(c.shortrev for c in outliers)}')
+
+    if len(overall) > 0:
+        print(f'Summary: {" ".join(c.shortrev for c in overall)}')
+    else:
+        print('No outliers')
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff --git a/libcxx/utils/visualize-historical b/libcxx/utils/visualize-historical
index ef28e8b17ca4b..114c7e81f29e7 100755
--- a/libcxx/utils/visualize-historical
+++ b/libcxx/utils/visualize-historical
@@ -213,13 +213,6 @@ def main(argv):
                         'Since the chart is interactive, it generally makes most sense to include all the benchmarks '
                         'and to then filter them in the browser, but in some cases producing a chart with a reduced '
                         'number of data series is useful.')
-    parser.add_argument('--find-outliers', metavar='FLOAT', type=float, required=False,
-                        help='Instead of building a chart, detect commits that show a large spike (more than the given relative threshold) '
-                             'with the previous result and print those to standard output. This can be used to generate a list of '
-                             'potential outliers that we might want to re-generate the data for. The threshold is expressed as a '
-                             'floating point number, e.g. 0.25 will detect points that differ by more than 25%% from their previous '
-                             'result. This option respects --filter, i.e.
only benchmarks that match the filter will be analyzed for ' - 'outliers.') parser.add_argument('--subtitle', type=str, required=False, help='Optional subtitle for the chart. This can be used to help identify the contents of the chart.') parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()), @@ -258,16 +251,6 @@ def main(argv): keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None] data = data[data['benchmark'].isin(keeplist)] - # If requested, perform a basic pass to detect outliers. - # Note that we consider a commit to be an outlier if any of the benchmarks for that commit is an outlier. - if args.find_outliers is not None: - threshold = args.find_outliers - outliers = set() - for (benchmark, series) in data.sort_values(by='revlist_order').groupby('benchmark'): - outliers |= set(series[series[args.metric].pct_change() > threshold]['commit']) - print(f'Outliers (more than {threshold * 100}%): {" ".join(c.shortrev for c in outliers)}') - return - # Plot the data for all the required benchmarks. figure = create_plot(data, args.metric, subtitle=args.subtitle) do_open = args.output is None or args.open From 8907b6d39371d439461cdd3475d5590f87821377 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 1 Oct 2025 17:53:59 +0100 Subject: [PATCH 404/878] [VPlan] Remove original loop blocks if dead. (#155497) Build on top of https://github.com/llvm/llvm-project/pull/154510 to completely remove the blocks of dead scalar loops. Depends on https://github.com/llvm/llvm-project/pull/154510. PR: https://github.com/llvm/llvm-project/pull/155497 --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 20 +- .../LoopVectorize/AArch64/call-costs.ll | 11 - .../AArch64/clamped-trip-count.ll | 26 - .../AArch64/conditional-branches-cost.ll | 30 +- .../AArch64/drop-poison-generating-flags.ll | 30 - .../first-order-recurrence-fold-tail.ll | 13 - .../AArch64/force-target-instruction-cost.ll | 15 +- .../LoopVectorize/AArch64/induction-costs.ll | 15 +- .../AArch64/invariant-replicate-region.ll | 16 +- .../AArch64/low_trip_count_predicates.ll | 14 +- .../AArch64/mul-simplification.ll | 12 +- .../AArch64/multiple-result-intrinsics.ll | 51 - .../LoopVectorize/AArch64/optsize_minsize.ll | 203 ---- .../partial-reduce-dot-product-epilogue.ll | 14 +- .../partial-reduce-dot-product-mixed.ll | 16 +- .../partial-reduce-dot-product-neon.ll | 93 +- .../AArch64/partial-reduce-dot-product.ll | 80 +- .../LoopVectorize/AArch64/partial-reduce.ll | 33 +- .../LoopVectorize/AArch64/pr73894.ll | 15 +- .../AArch64/reduction-recurrence-costs-sve.ll | 51 +- .../AArch64/scalable-strict-fadd.ll | 135 +-- .../AArch64/simple_early_exit.ll | 45 +- .../LoopVectorize/AArch64/store-costs-sve.ll | 9 - .../LoopVectorize/AArch64/strict-fadd.ll | 9 +- .../AArch64/struct-return-cost.ll | 12 +- .../sve-interleaved-masked-accesses.ll | 18 +- .../AArch64/sve-tail-folding-forced.ll | 3 +- .../AArch64/sve-tail-folding-optsize.ll | 3 +- .../sve-tail-folding-overflow-checks.ll | 12 +- .../AArch64/sve-tail-folding-reductions.ll | 112 +-- .../AArch64/sve-tail-folding-unroll.ll | 8 +- .../LoopVectorize/AArch64/sve-tail-folding.ll | 50 +- .../LoopVectorize/AArch64/sve2-histcnt.ll | 7 +- .../AArch64/synthesize-mask-for-call.ll | 36 - .../AArch64/tail-folding-styles.ll | 36 - ...eave-to-widen-memory-remove-loop-region.ll | 86 +- ...arrow-interleave-to-widen-memory-unroll.ll | 34 - .../AArch64/type-shrinkage-insertelt.ll | 35 - .../widen-call-with-intrinsic-or-libfunc.ll | 3 +- 
.../LoopVectorize/AMDGPU/packed-math.ll | 8 - .../LoopVectorize/ARM/active-lane-mask.ll | 15 - .../ARM/mve-gather-scatter-tailpred.ll | 42 - .../ARM/mve-reduction-predselect.ll | 32 - .../LoopVectorize/ARM/mve-reduction-types.ll | 166 +--- .../LoopVectorize/ARM/optsize_minsize.ll | 152 --- .../ARM/tail-folding-loop-hint.ll | 5 +- .../LoopVectorize/LoongArch/defaults.ll | 11 - .../Transforms/LoopVectorize/RISCV/bf16.ll | 30 - .../RISCV/blocks-with-dead-instructions.ll | 166 ---- .../LoopVectorize/RISCV/dead-ops-cost.ll | 40 - .../LoopVectorize/RISCV/defaults.ll | 25 +- .../Transforms/LoopVectorize/RISCV/divrem.ll | 321 +------ .../RISCV/evl-compatible-loops.ll | 20 - .../Transforms/LoopVectorize/RISCV/f16.ll | 13 - .../RISCV/gather-scatter-cost.ll | 34 - .../LoopVectorize/RISCV/induction-costs.ll | 36 - .../LoopVectorize/RISCV/inloop-reduction.ll | 60 +- .../RISCV/interleaved-accesses.ll | 885 ------------------ .../RISCV/interleaved-masked-access.ll | 20 +- .../Transforms/LoopVectorize/RISCV/lmul.ll | 33 - .../LoopVectorize/RISCV/low-trip-count.ll | 56 -- .../LoopVectorize/RISCV/mask-index-type.ll | 20 +- .../RISCV/partial-reduce-dot-product.ll | 36 +- ...ruction-or-drop-poison-generating-flags.ll | 25 +- .../Transforms/LoopVectorize/RISCV/pr88802.ll | 22 +- .../LoopVectorize/RISCV/reductions.ll | 267 +----- .../LoopVectorize/RISCV/reg-usage-prune-vf.ll | 60 -- .../LoopVectorize/RISCV/remark-reductions.ll | 12 +- .../RISCV/riscv-vector-reverse.ll | 28 +- .../LoopVectorize/RISCV/safe-dep-distance.ll | 48 - .../LoopVectorize/RISCV/scalable-basics.ll | 67 +- .../LoopVectorize/RISCV/scalable-tailfold.ll | 68 +- .../RISCV/select-cmp-reduction.ll | 98 +- .../LoopVectorize/RISCV/strided-accesses.ll | 59 -- .../RISCV/tail-folding-cast-intrinsics.ll | 11 - .../RISCV/tail-folding-cond-reduction.ll | 180 +--- .../LoopVectorize/RISCV/tail-folding-div.ll | 56 -- .../tail-folding-fixed-order-recurrence.ll | 72 +- .../RISCV/tail-folding-inloop-reduction.ll | 248 +---- .../RISCV/tail-folding-interleave.ll | 113 +-- .../LoopVectorize/RISCV/tail-folding-iv32.ll | 11 - .../RISCV/tail-folding-known-no-overflow.ll | 33 - .../RISCV/tail-folding-masked-loadstore.ll | 20 +- .../RISCV/tail-folding-ordered-reduction.ll | 14 +- .../RISCV/tail-folding-reduction.ll | 248 +---- .../RISCV/tail-folding-reverse-load-store.ll | 50 +- .../RISCV/tail-folding-safe-dep-distance.ll | 84 -- .../RISCV/tail-folding-uniform-store.ll | 10 - .../truncate-to-minimal-bitwidth-cost.ll | 60 -- .../truncate-to-minimal-bitwidth-evl-crash.ll | 14 - .../LoopVectorize/RISCV/uniform-load-store.ll | 182 +--- ...ctor-loop-backedge-elimination-with-evl.ll | 21 +- .../RISCV/vectorize-vp-intrinsics.ll | 14 - .../LoopVectorize/SystemZ/addressing.ll | 8 - .../SystemZ/force-target-instruction-cost.ll | 9 - .../LoopVectorize/SystemZ/pr47665.ll | 16 - .../predicated-first-order-recurrence.ll | 12 - ...demanding-all-lanes-and-first-lane-only.ll | 19 - .../LoopVectorize/X86/constant-fold.ll | 89 +- .../X86/cost-constant-known-via-scev.ll | 18 +- .../LoopVectorize/X86/cost-model.ll | 40 +- ...bounds-flags-for-reverse-vector-pointer.ll | 25 +- .../X86/fixed-order-recurrence.ll | 11 - .../LoopVectorize/X86/gather_scatter.ll | 36 +- .../X86/imprecise-through-phis.ll | 48 +- .../LoopVectorize/X86/induction-costs.ll | 23 +- .../LoopVectorize/X86/interleave-cost.ll | 52 - .../LoopVectorize/X86/interleaving.ll | 12 - .../X86/invariant-store-vectorization.ll | 10 - .../LoopVectorize/X86/load-deref-pred.ll | 274 +----- .../LoopVectorize/X86/metadata-enable.ll | 24 
- .../Transforms/LoopVectorize/X86/optsize.ll | 88 -- .../LoopVectorize/X86/parallel-loops.ll | 4 - .../X86/pr141968-instsimplifyfolder.ll | 17 +- .../Transforms/LoopVectorize/X86/pr34438.ll | 19 +- ...6-sunk-instruction-used-outside-of-loop.ll | 20 +- .../Transforms/LoopVectorize/X86/pr81872.ll | 20 +- .../LoopVectorize/X86/reduction-fastmath.ll | 42 +- .../X86/replicate-uniform-call.ll | 17 - .../X86/scev-checks-unprofitable.ll | 9 - .../LoopVectorize/X86/small-size.ll | 48 +- .../LoopVectorize/X86/strided_load_cost.ll | 148 ++- .../LoopVectorize/X86/tail_loop_folding.ll | 50 +- .../LoopVectorize/X86/uniform_mem_op.ll | 20 +- .../X86/vect.omp.force.small-tc.ll | 28 +- .../X86/vectorize-force-tail-with-evl.ll | 14 - .../X86/vectorize-interleaved-accesses-gap.ll | 15 - ...ned-value-used-as-scalar-and-first-lane.ll | 40 +- .../LoopVectorize/X86/x86-predication.ll | 25 +- .../Transforms/LoopVectorize/bsd_regex.ll | 4 - .../LoopVectorize/check-prof-info.ll | 8 - .../constantfolder-infer-correct-gepty.ll | 13 - .../LoopVectorize/constantfolder.ll | 105 --- .../LoopVectorize/create-induction-resume.ll | 12 - .../LoopVectorize/dead_instructions.ll | 12 - .../debugloc-optimize-vfuf-term.ll | 14 - .../dont-fold-tail-for-const-TC.ll | 9 - .../dont-fold-tail-for-divisible-TC.ll | 9 - ...irst-order-recurrence-dead-instructions.ll | 19 +- .../first-order-recurrence-interleave-only.ll | 15 +- ...t-order-recurrence-multiply-recurrences.ll | 15 - .../LoopVectorize/first-order-recurrence.ll | 189 +--- llvm/test/Transforms/LoopVectorize/flags.ll | 14 +- .../LoopVectorize/float-induction.ll | 12 - .../float-minmax-instruction-flag.ll | 15 +- .../LoopVectorize/if-pred-stores.ll | 59 +- ...ction-multiple-uses-in-same-instruction.ll | 10 - .../LoopVectorize/induction-step.ll | 12 - .../Transforms/LoopVectorize/induction.ll | 178 +--- .../instruction-only-used-outside-of-loop.ll | 71 +- .../interleave-with-i65-induction.ll | 13 - ...aved-accesses-different-insert-position.ll | 17 - .../interleaved-accesses-metadata.ll | 45 +- .../LoopVectorize/interleaved-accesses.ll | 38 +- .../Transforms/LoopVectorize/is_fpclass.ll | 12 - .../LoopVectorize/iv-select-cmp-decreasing.ll | 180 +--- .../LoopVectorize/iv-select-cmp-trunc.ll | 144 +-- .../LoopVectorize/iv_outside_user.ll | 156 +-- .../LoopVectorize/load-deref-pred-align.ll | 138 +-- .../LoopVectorize/load-deref-pred-neg-off.ll | 23 +- .../load-of-struct-deref-pred.ll | 45 - .../Transforms/LoopVectorize/loop-form.ll | 10 - .../LoopVectorize/make-followup-loop-id.ll | 22 +- .../LoopVectorize/memdep-fold-tail.ll | 12 - .../test/Transforms/LoopVectorize/metadata.ll | 126 +-- .../minimumnum-maximumnum-reductions.ll | 56 +- .../LoopVectorize/multiple-address-spaces.ll | 4 - .../multiple-result-intrinsics.ll | 63 +- .../LoopVectorize/noalias-scope-decl.ll | 14 - llvm/test/Transforms/LoopVectorize/optsize.ll | 115 +-- .../test/Transforms/LoopVectorize/phi-cost.ll | 8 +- ...r154045-dont-fold-extractelement-livein.ll | 16 - llvm/test/Transforms/LoopVectorize/pr32859.ll | 4 +- .../LoopVectorize/pr36983-multiple-lcssa.ll | 12 +- .../LoopVectorize/pr44488-predication.ll | 21 +- .../pr45679-fold-tail-by-masking.ll | 57 -- .../pr46525-expander-insertpoint.ll | 12 +- .../pr51614-fold-tail-by-masking.ll | 17 +- .../pr55167-fold-tail-live-out.ll | 26 +- llvm/test/Transforms/LoopVectorize/pr66616.ll | 29 +- .../LoopVectorize/predicate-switch.ll | 58 -- .../predicatedinst-loop-invariant.ll | 68 -- .../preserve-dbg-loc-and-loop-metadata.ll | 560 ++++++----- 
.../preserve-dbg-loc-reduction-inloop.ll | 2 +- .../LoopVectorize/reduction-inloop-min-max.ll | 24 - .../LoopVectorize/reduction-inloop-pred.ll | 76 +- .../LoopVectorize/reduction-inloop-uf4.ll | 24 +- .../LoopVectorize/reduction-inloop.ll | 834 ++--------------- .../LoopVectorize/reduction-predselect.ll | 40 - .../Transforms/LoopVectorize/reduction.ll | 16 +- .../LoopVectorize/remarks-reduction-inloop.ll | 14 +- .../reverse-induction-gep-nowrap-flags.ll | 48 +- .../LoopVectorize/reverse_induction.ll | 58 +- .../Transforms/LoopVectorize/runtime-check.ll | 4 - .../scev-exit-phi-invalidation.ll | 49 +- .../LoopVectorize/select-neg-cond.ll | 15 - ...tion-start-value-may-be-undef-or-poison.ll | 45 +- .../LoopVectorize/select-reduction.ll | 25 +- .../single-early-exit-cond-poison.ll | 28 +- .../single-early-exit-deref-assumptions.ll | 42 +- .../single-early-exit-interleave-hint.ll | 20 +- .../single-early-exit-interleave-only.ll | 32 +- .../single-early-exit-interleave.ll | 119 +-- .../LoopVectorize/single-value-blend-phis.ll | 105 +-- .../LoopVectorize/single_early_exit.ll | 39 +- .../single_early_exit_live_outs.ll | 424 +-------- ...e-reduction-results-in-tail-folded-loop.ll | 11 - .../strict-fadd-interleave-only.ll | 76 +- .../strided-accesses-interleave-only.ll | 10 - .../tail-folding-alloca-in-loop.ll | 10 - ...folding-optimize-vector-induction-width.ll | 81 -- .../LoopVectorize/tail-folding-switch.ll | 16 - .../tail-folding-vectorization-factor-1.ll | 17 - .../LoopVectorize/trunc-extended-icmps.ll | 19 - .../LoopVectorize/trunc-loads-p16.ll | 14 - .../LoopVectorize/trunc-reductions.ll | 12 - .../Transforms/LoopVectorize/trunc-shifts.ll | 84 -- .../LoopVectorize/uitofp-preserve-nneg.ll | 12 - .../Transforms/LoopVectorize/uniform-blend.ll | 64 -- .../uniform_across_vf_induction1.ll | 57 +- .../uniform_across_vf_induction1_and.ll | 35 +- .../uniform_across_vf_induction1_div_urem.ll | 18 +- .../uniform_across_vf_induction1_lshr.ll | 70 +- .../uniform_across_vf_induction2.ll | 104 +- .../unused-blend-mask-for-first-operand.ll | 69 +- ...or-loop-backedge-elimination-early-exit.ll | 84 +- ...p-backedge-elimination-outside-iv-users.ll | 50 +- .../vector-loop-backedge-elimination.ll | 68 +- .../widen-gep-all-indices-invariant.ll | 35 - .../LoopVectorize/widen-intrinsic.ll | 11 - .../PhaseOrdering/ARM/arm_mean_q7.ll | 6 +- 231 files changed, 1492 insertions(+), 11722 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 81f1956c96254..ffd2e59938510 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -968,24 +968,26 @@ void VPlan::execute(VPTransformState *State) { // logic generic during VPlan execution. State->CFG.DTU.applyUpdates( {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}}); - } else { + } + ReversePostOrderTraversal> RPOT( + Entry); + // Generate code for the VPlan, in parts of the vector skeleton, loop body and + // successor blocks including the middle, exit and scalar preheader blocks. + for (VPBlockBase *Block : RPOT) + Block->execute(State); + + // If the original loop is unreachable, delete it and all its blocks. + if (!ScalarPhVPBB->hasPredecessors()) { Loop *OrigLoop = State->LI->getLoopFor(getScalarHeader()->getIRBasicBlock()); - // If the original loop is unreachable, we need to delete it. 
auto Blocks = OrigLoop->getBlocksVector(); Blocks.push_back(cast(ScalarPhVPBB)->getIRBasicBlock()); for (auto *BB : Blocks) State->LI->removeBlock(BB); + DeleteDeadBlocks(Blocks, &State->CFG.DTU); State->LI->erase(OrigLoop); } - ReversePostOrderTraversal> RPOT( - Entry); - // Generate code for the VPlan, in parts of the vector skeleton, loop body and - // successor blocks including the middle, exit and scalar preheader blocks. - for (VPBlockBase *Block : RPOT) - Block->execute(State); - State->CFG.DTU.flush(); VPBasicBlock *Header = vputils::getFirstLoopHeader(*this, State->VPDT); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll index 387bb4302de60..23918427e7003 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll @@ -81,17 +81,6 @@ define void @powi_call(ptr %P) { ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP]], align 8 -; CHECK-NEXT: [[POWI:%.*]] = tail call double @llvm.powi.f64.i32(double [[L]], i32 3) -; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 56a4683298e3d..6e3d257e531ba 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -33,20 +33,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[DST]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]] -; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8 -; CHECK-NEXT: store i8 [[CONV4]], ptr [[P_OUT_TAIL_09]], align 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -108,20 +95,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[P_OUT_TAIL_09:%.*]] = phi ptr [ [[DST]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[SHR3:%.*]] = lshr i64 [[VAL]], [[TMP19]] -; CHECK-NEXT: [[CONV4:%.*]] = trunc i64 [[SHR3]] to i8 -; CHECK-NEXT: store i8 [[CONV4]], ptr [[P_OUT_TAIL_09]], align 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_OUT_TAIL_09]], i64 1 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index e4ee6776ae24c..6cf11be0e11f7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -362,8 +362,9 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104 ; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br [[EXIT:label %.*]] -; PRED: [[SCALAR_PH:.*:]] +; PRED-NEXT: br label %[[EXIT:.*]] +; PRED: [[EXIT]]: +; PRED-NEXT: ret void ; entry: br label %loop @@ -585,8 +586,9 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true ; PRED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br [[EXIT:label %.*]] -; PRED: [[SCALAR_PH:.*:]] +; PRED-NEXT: br label %[[EXIT:.*]] +; PRED: [[EXIT]]: +; PRED-NEXT: ret void ; entry: br label %loop @@ -609,7 +611,6 @@ exit: } define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { -; ; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store( ; COMMON-SAME: ptr [[DST:%.*]]) { ; COMMON-NEXT: [[ENTRY:.*:]] @@ -659,16 +660,16 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 ; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; COMMON: [[PRED_STORE_CONTINUE12]]: -; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] +; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]] ; COMMON: [[PRED_STORE_IF13]]: ; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 ; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 -; COMMON-NEXT: br label %[[EXIT]] -; COMMON: [[EXIT]]: -; COMMON-NEXT: br label %[[SCALAR_PH:.*]] -; COMMON: [[SCALAR_PH]]: -; COMMON-NEXT: br [[EXIT1:label %.*]] -; COMMON: [[SCALAR_PH1:.*:]] +; COMMON-NEXT: br label %[[EXIT1]] +; COMMON: [[EXIT1]]: +; COMMON-NEXT: br label %[[SCALAR_PH1:.*]] +; COMMON: [[SCALAR_PH1]]: +; COMMON-NEXT: br [[EXIT:label %.*]] +; COMMON: [[SCALAR_PH:.*:]] ; entry: br label %loop @@ -1160,8 +1161,9 @@ define void 
@redundant_branch_and_tail_folding(ptr %dst, i1 %c) { ; PRED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; PRED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br [[EXIT:label %.*]] -; PRED: [[SCALAR_PH:.*:]] +; PRED-NEXT: br label %[[EXIT:.*]] +; PRED: [[EXIT]]: +; PRED-NEXT: ret void ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll index 1af55e91e861a..71acac25e4efe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll @@ -65,36 +65,6 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds ; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds double, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[L_1:%.*]] = load double, ptr [[GEP_SRC_1]], align 8 -; CHECK-NEXT: [[ABS:%.*]] = tail call nnan double @llvm.fabs.f64(double [[L_1]]) -; CHECK-NEXT: [[C_0:%.*]] = fcmp olt double [[ABS]], 1.000000e+00 -; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[ELSE:.*]] -; CHECK: [[THEN]]: -; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[SRC_2]], align 8 -; CHECK-NEXT: [[IV_SUB_1:%.*]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_IV_SUB_1:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[IV_SUB_1]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_IV_SUB_1]], align 8 -; CHECK-NEXT: [[C_1:%.*]] = fcmp oeq double [[L_2]], 0.000000e+00 -; CHECK-NEXT: br i1 [[C_1]], label %[[MERGE:.*]], label %[[LOOP_LATCH]] -; CHECK: [[ELSE]]: -; CHECK-NEXT: [[IV_SUB_2:%.*]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[GEP_IV_SUB_2:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[IV_SUB_2]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_IV_SUB_2]], align 8 -; CHECK-NEXT: br label %[[MERGE]] -; CHECK: [[MERGE]]: -; CHECK-NEXT: [[MERGE_IV:%.*]] = phi i64 [ [[IV_SUB_2]], %[[ELSE]] ], [ [[IV_SUB_1]], %[[THEN]] ] -; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds i32, ptr [[DST_1]], i64 [[MERGE_IV]] -; CHECK-NEXT: store i32 10, ptr [[GEP_DST_1]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll index 890ff1dc05e4f..4bb8a0e72acb7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll @@ -69,20 +69,7 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) { ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SEXT:%.*]] = sext i16 [[SCALAR_RECUR]] to i32 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV_NEXT]] -; CHECK-NEXT: [[FOR_NEXT]] = load i16, ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i32, ptr [[B]], i64 [[IV_NEXT]] -; CHECK-NEXT: store i32 [[SEXT]], ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1001 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index db088f88e2d8a..bfee39eac0ae2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -18,21 +18,8 @@ define double @test_reduction_costs() { ; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_1:.*]] -; CHECK: [[LOOP_1]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_1]] ] -; CHECK-NEXT: [[R_1:%.*]] = phi double [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[R_1_NEXT:%.*]], %[[LOOP_1]] ] -; CHECK-NEXT: [[R_2:%.*]] = phi double [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[R_2_NEXT:%.*]], %[[LOOP_1]] ] -; CHECK-NEXT: [[R_1_NEXT]] = fadd double [[R_1]], 3.000000e+00 -; CHECK-NEXT: [[R_2_NEXT]] = fadd double [[R_2]], 9.000000e+00 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_1]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[R_1_NEXT_LCSSA:%.*]] = phi double [ [[R_1_NEXT]], %[[LOOP_1]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[R_2_NEXT_LCSSA:%.*]] = phi double [ [[R_2_NEXT]], %[[LOOP_1]] ], [ [[TMP1]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[DIV:%.*]] = fmul double [[R_1_NEXT_LCSSA]], [[R_2_NEXT_LCSSA]] +; CHECK-NEXT: [[DIV:%.*]] = fmul double [[TMP0]], [[TMP1]] ; CHECK-NEXT: ret double [[DIV]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index a74c33f26e58a..42a1940925968 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -169,22 +169,9 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) { ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] 
] -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[RECUR_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[PTR_IV]], align 4 -; CHECK-NEXT: [[RECUR_NEXT]] = zext i32 [[L]] to i64 -; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[RECUR_LCSSA:%.*]] = phi i64 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[RECUR_LCSSA]] +; CHECK-NEXT: ret i64 [[VECTOR_RECUR_EXTRACT_FOR_PHI]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll index f1571e67e5849..d80fdd1ce7270 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll @@ -51,22 +51,8 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] -; CHECK: [[THEN]]: -; CHECK-NEXT: [[REM_1:%.*]] = urem i32 10, [[X]] -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, %[[LOOP_HEADER]] ], [ [[REM_1]], %[[THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 99 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], %[[LOOP_LATCH]] ], [ [[TMP17]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP17]] ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index dd8bd273050c7..e424649cf50c6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -474,19 +474,8 @@ define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_0179:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD]] = add i32 [[SUM_0179]], [[TMP5]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], 
%[[FOR_BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ADD_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP4]] ; entry: br label %for.body @@ -520,6 +509,7 @@ define i32 @tc4_from_profile(ptr noundef readonly captures(none) %tmp, i64 %N) v ; CHECK-NEXT: [[ADD]] = add i32 [[SUM_0179]], [[TMP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ] ; CHECK-NEXT: ret i32 [[ADD_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll index 80bf956927c77..9f518e448eb19 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll @@ -62,18 +62,8 @@ define i32 @add_reduction_select_operand_constant_but_non_uniform() { ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[ADD2_REASS:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 42, %[[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[ADD2_REASS]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[RDX_NEXT]] = add i32 0, [[RDX]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD2_REASS]], 64 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ADD_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP3]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll index 544ef5c82c7ac..a6e0f8a2a1c3a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll @@ -32,14 +32,7 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 ; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 ; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 -; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 ; CHECK: [[EXIT:.*:]] ; ; CHECK-ARMPL-LABEL: define void @sincos_f32( @@ -112,14 +105,7 @@ define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 ; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 ; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: 
[[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 -; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 -; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 ; CHECK: [[EXIT:.*:]] ; ; CHECK-ARMPL-LABEL: define void @sincos_f64( @@ -209,15 +195,6 @@ define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly ; CHECK-ARMPL: [[TMP17:%.*]] = extractvalue { , } [[TMP15]], 1 ; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0( [[TMP16]], ptr [[TMP19:%.*]], i32 4, [[TMP14:%.*]]) ; CHECK-ARMPL: call void @llvm.masked.store.nxv4f32.p0( [[TMP17]], ptr [[TMP21:%.*]], i32 4, [[TMP14]]) -; CHECK-ARMPL: [[MIDDLE_BLOCK:.*:]] -; CHECK-ARMPL: [[SCALAR_PH:.*:]] -; CHECK-ARMPL: [[FOR_BODY:.*:]] -; CHECK-ARMPL: [[IF_THEN:.*:]] -; CHECK-ARMPL: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) -; CHECK-ARMPL: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK-ARMPL: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK-ARMPL: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 -; CHECK-ARMPL: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 ; CHECK-ARMPL: [[IF_MERGE:.*:]] ; CHECK-ARMPL: [[FOR_END:.*:]] ; @@ -277,14 +254,7 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 ; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 ; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.modf.f32(float [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 -; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 ; CHECK: [[EXIT:.*:]] ; ; CHECK-ARMPL-LABEL: define void @modf_f32( @@ -357,14 +327,7 @@ define void @modf_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 ; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 ; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.modf.f64(double [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 -; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 -; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 ; CHECK: [[EXIT:.*:]] ; ; CHECK-ARMPL-LABEL: define void @modf_f64( @@ -441,14 +404,7 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 ; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 ; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] 
= tail call { float, float } @llvm.sincospi.f32(float [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 -; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 ; CHECK: [[EXIT:.*:]] ; ; CHECK-ARMPL-LABEL: define void @sincospi_f32( @@ -521,14 +477,7 @@ define void @sincospi_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 ; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 ; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.sincospi.f64(double [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 -; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 -; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 ; CHECK: [[EXIT:.*:]] ; ; CHECK-ARMPL-LABEL: define void @sincospi_f64( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index ff3f6e906e82c..56ace5497b996 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -30,17 +30,6 @@ define void @always_vectorize(ptr %p, i32 %x) { ; DEFAULT-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; DEFAULT: [[SCALAR_PH:.*]]: -; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] -; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] -; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] -; DEFAULT-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 -; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 -; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; @@ -59,17 +48,6 @@ define void @always_vectorize(ptr %p, i32 %x) { ; OPTSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; OPTSIZE: [[SCALAR_PH:.*]]: -; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] -; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] -; OPTSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; OPTSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] -; OPTSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 -; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void 
; @@ -88,17 +66,6 @@ define void @always_vectorize(ptr %p, i32 %x) { ; MINSIZE-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; MINSIZE: [[SCALAR_PH:.*]]: -; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] -; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] -; MINSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; MINSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] -; MINSIZE-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 -; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 -; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; MINSIZE: [[FOR_COND_CLEANUP]]: ; MINSIZE-NEXT: ret void ; @@ -390,23 +357,6 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; DEFAULT: [[SCALAR_PH:.*]]: -; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] -; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; DEFAULT-NEXT: [[TMP72:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8 -; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP72]] -; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP72]], 1 -; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] -; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] -; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP72]], 2 -; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] -; DEFAULT-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] -; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]] -; DEFAULT-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 -; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 -; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; @@ -531,23 +481,6 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; DEFAULT: [[SCALAR_PH:.*]]: -; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] -; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; DEFAULT-NEXT: [[TMP26:%.*]] = trunc nuw nsw i64 [[IV]] to i8 -; DEFAULT-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP26]] -; DEFAULT-NEXT: [[SHR:%.*]] = lshr i8 [[TMP26]], 1 -; DEFAULT-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] -; DEFAULT-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] -; DEFAULT-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP26]], 2 -; DEFAULT-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] -; DEFAULT-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] -; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IV]] -; DEFAULT-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 -; DEFAULT-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; 
DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15 -; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; @@ -598,23 +531,6 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; OPTSIZE: [[SCALAR_PH:.*]]: -; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] -; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; OPTSIZE-NEXT: [[TMP26:%.*]] = trunc nuw nsw i64 [[IV]] to i8 -; OPTSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP26]] -; OPTSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP26]], 1 -; OPTSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] -; OPTSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] -; OPTSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP26]], 2 -; OPTSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] -; OPTSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] -; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IV]] -; OPTSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 -; OPTSIZE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -665,23 +581,6 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; MINSIZE: [[SCALAR_PH:.*]]: -; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] -; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; MINSIZE-NEXT: [[TMP26:%.*]] = trunc nuw nsw i64 [[IV]] to i8 -; MINSIZE-NEXT: [[MUL:%.*]] = mul i8 [[A]], [[TMP26]] -; MINSIZE-NEXT: [[SHR:%.*]] = lshr i8 [[TMP26]], 1 -; MINSIZE-NEXT: [[MUL5:%.*]] = mul i8 [[SHR]], [[B]] -; MINSIZE-NEXT: [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]] -; MINSIZE-NEXT: [[SHR7:%.*]] = lshr i8 [[TMP26]], 2 -; MINSIZE-NEXT: [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]] -; MINSIZE-NEXT: [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]] -; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[IV]] -; MINSIZE-NEXT: store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1 -; MINSIZE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 15 -; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; MINSIZE: [[FOR_COND_CLEANUP]]: ; MINSIZE-NEXT: ret void ; @@ -746,23 +645,6 @@ define void @dont_vectorize_with_minsize() { ; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; DEFAULT: [[SCALAR_PH:.*]]: -; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] -; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] -; DEFAULT-NEXT: [[BVAL:%.*]] = load 
i32, ptr [[ARRAYIDX]], align 4 -; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] -; DEFAULT-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] -; DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] -; DEFAULT-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 -; DEFAULT-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] -; DEFAULT-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 -; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; @@ -789,23 +671,6 @@ define void @dont_vectorize_with_minsize() { ; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; OPTSIZE: [[SCALAR_PH:.*]]: -; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] -; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] -; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] -; OPTSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; OPTSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] -; OPTSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] -; OPTSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; OPTSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 -; OPTSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] -; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 -; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -832,23 +697,6 @@ define void @dont_vectorize_with_minsize() { ; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; MINSIZE: [[SCALAR_PH:.*]]: -; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] -; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] -; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] -; MINSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; MINSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] -; MINSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] -; MINSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], 
align 2 -; MINSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 -; MINSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] -; MINSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 -; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] ; MINSIZE: [[FOR_COND_CLEANUP]]: ; MINSIZE-NEXT: ret void ; @@ -913,23 +761,6 @@ define void @vectorization_forced_minsize_reduce_width() { ; DEFAULT-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; DEFAULT: [[SCALAR_PH:.*]]: -; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] -; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] -; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] -; DEFAULT-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; DEFAULT-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] -; DEFAULT-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] -; DEFAULT-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; DEFAULT-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 -; DEFAULT-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] -; DEFAULT-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 -; DEFAULT-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; DEFAULT-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; DEFAULT-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; DEFAULT: [[FOR_COND_CLEANUP]]: ; DEFAULT-NEXT: ret void ; @@ -956,23 +787,6 @@ define void @vectorization_forced_minsize_reduce_width() { ; OPTSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; OPTSIZE: [[SCALAR_PH:.*]]: -; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] -; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] -; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] -; OPTSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; OPTSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] -; OPTSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] -; OPTSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; OPTSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 -; OPTSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] -; OPTSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 -; OPTSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; OPTSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OPTSIZE-NEXT: br i1 [[EXITCOND_NOT]], label 
%[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; OPTSIZE: [[FOR_COND_CLEANUP]]: ; OPTSIZE-NEXT: ret void ; @@ -999,23 +813,6 @@ define void @vectorization_forced_minsize_reduce_width() { ; MINSIZE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; MINSIZE: [[SCALAR_PH:.*]]: -; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] -; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] -; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] -; MINSIZE-NEXT: [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; MINSIZE-NEXT: [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]] -; MINSIZE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]] -; MINSIZE-NEXT: [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2 -; MINSIZE-NEXT: [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16 -; MINSIZE-NEXT: [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]] -; MINSIZE-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2 -; MINSIZE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; MINSIZE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; MINSIZE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; MINSIZE: [[FOR_COND_CLEANUP]]: ; MINSIZE-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 24375dd864fae..dd239c023c686 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -28,7 +28,8 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK: middle.block: ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: for.exit: +; CHECK-NEXT: ret i32 [[TMP11]] ; entry: br label %for.body @@ -80,7 +81,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] @@ -111,7 +112,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]] ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label 
[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] @@ -135,7 +136,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 ; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1 ; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: while.end.loopexit: ; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret void @@ -494,11 +495,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret i32 [[TMP182]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 43fccdc5a0706..49e9989b65d2f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -261,7 +261,8 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: for.exit: +; CHECK-NEXT: ret i32 [[TMP13]] ; ; CHECK-NOI8MM-LABEL: define i32 @sudot_neon( ; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR1:[0-9]+]] { @@ -296,7 +297,8 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -349,12 +351,13 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; 
CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: for.exit: +; CHECK-NEXT: ret i32 [[TMP13]] ; ; CHECK-NOI8MM-LABEL: define i32 @usdot_neon( ; CHECK-NOI8MM-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR1]] { @@ -384,12 +387,13 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) ; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 410993b4f4776..801eb810d8625 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -30,7 +30,8 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP9]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { @@ -65,7 +66,8 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP14]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { @@ -90,7 +92,8 @@ define i32 @dotp(ptr %a, ptr %b) { ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP9]] ; entry: br label %for.body @@ -196,11 +199,12 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP71]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -354,12 +358,13 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP138]] = add <16 x i32> [[TMP136]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP139:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP139]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], [[TMP137]] ; CHECK-INTERLEAVED-NEXT: [[TMP140:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP140]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -442,11 +447,12 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP69]] = add <16 x i32> [[TMP68]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP70:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP70]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP71]] ; entry: br label %for.body @@ -491,11 +497,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP11]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -517,11 +524,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: 
[[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP11]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -543,11 +551,12 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP11]] ; entry: br label %for.body @@ -594,11 +603,12 @@ define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) { ; CHECK-INTERLEAVE1-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP13]], align 4 ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP12]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -622,11 +632,12 @@ define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) { ; CHECK-INTERLEAVED-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP13]], align 4 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP12]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi( ; CHECK-MAXBW-SAME: ptr 
[[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { @@ -650,11 +661,12 @@ define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) { ; CHECK-MAXBW-NEXT: store <16 x i32> [[TMP8]], ptr [[TMP13]], align 4 ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP12]] ; entry: br label %for.body @@ -733,7 +745,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) @@ -831,7 +843,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP50]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE10]], [[PARTIAL_REDUCE13]] ; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -897,7 +909,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP31]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) @@ -1292,11 +1304,12 @@ define i32 
@dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-INTERLEAVE1-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP182]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_predicated( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1627,11 +1640,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-INTERLEAVED-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP182]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp_predicated( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1962,11 +1976,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-MAXBW-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP182]] ; entry: br label %for.body @@ -2010,12 +2025,14 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 
15 ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP10]], [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -2045,13 +2062,15 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add <16 x i32> [[TMP12]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP14]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP10]], i32 15 ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP16]], [[TMP17]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -2072,12 +2091,14 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: [[TMP8]] = add <16 x i32> [[TMP7]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP8]]) ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = add i32 [[TMP10]], [[TMP11]] +; CHECK-MAXBW-NEXT: ret i32 [[RESULT]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 09917fcab80c4..6e11e559151da 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -501,7 +501,8 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP71]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_different_types( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -660,7 +661,8 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP138]], 
[[TMP137]] ; CHECK-INTERLEAVED-NEXT: [[TMP142:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP142]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_different_types( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -747,7 +749,8 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP138]]) ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP71]] ; entry: br label %for.body @@ -800,7 +803,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP16]], [[TMP17]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 @@ -848,7 +851,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8 @@ -890,7 +893,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add [[TMP25]], [[TMP26]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 @@ -949,7 +952,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add [[TMP16]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; 
CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -987,7 +990,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add [[TMP30]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8 @@ -1019,7 +1022,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -1108,7 +1111,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP41]] = add [[TMP40]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP41]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP35]]) @@ -1226,7 +1229,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP81]] = add [[TMP79]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP81]], [[TMP80]] ; CHECK-INTERLEAVED-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) @@ -1296,7 +1299,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32( [[VEC_PHI4]], [[TMP73]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE16]]) ; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[PARTIAL_REDUCE17]]) @@ -1393,11 +1396,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP22]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_predicated( ; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1430,11 +1434,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP22]] ; ; CHECK-MAXBW-LABEL: define i32 @dotp_predicated( ; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1467,11 +1472,12 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP21]] ; entry: br label %for.body @@ -1519,7 +1525,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add [[TMP13]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1566,7 +1572,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP24]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) @@ -1601,7 +1607,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1660,7 +1666,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add [[VEC_PHI]], [[TMP14]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP15]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1707,7 +1713,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP25]] = add [[VEC_PHI1]], [[TMP23]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP25]], [[TMP24]] ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[BIN_RDX]]) @@ -1742,7 +1748,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP14]] = add [[VEC_PHI]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] 
; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP14]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1860,7 +1866,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -1972,7 +1978,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2010,7 +2016,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2047,7 +2053,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2105,7 +2111,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 
%n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2143,7 +2149,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2180,7 +2186,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add [[TMP10]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2247,7 +2253,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP18]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2301,7 +2307,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add [[TMP26]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVED: 
middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP28]], [[TMP27]] ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[BIN_RDX]]) @@ -2343,7 +2349,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[TMP20]] = add [[TMP17]], [[VEC_PHI]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = call i64 @llvm.vector.reduce.add.nxv8i64( [[TMP20]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2465,7 +2471,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) @@ -2565,7 +2571,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2665,7 +2671,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP28]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> 
[[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 1ef5b208a7b32..db3166cc0ec8d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -499,7 +499,8 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 { ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP13]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @zext_add_reduc_i8_i32_predicated( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]]) #[[ATTR0]] { @@ -527,7 +528,8 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 { ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]]) ; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]] ; ; CHECK-MAXBW-LABEL: define i32 @zext_add_reduc_i8_i32_predicated( ; CHECK-MAXBW-SAME: ptr [[A:%.*]]) #[[ATTR0]] { @@ -555,7 +557,8 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 { ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP12]] ; entry: br label %for.body @@ -674,7 +677,7 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = sub <16 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] @@ -700,7 +703,7 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-INTERLEAVED-NEXT: [[TMP7]] = sub <16 x i32> [[VEC_PHI1]], [[TMP5]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -726,7 +729,7 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-MAXBW-NEXT: [[TMP10]] = sub [[VEC_PHI]], [[TMP9]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] @@ -768,7 +771,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = add <16 x i32> [[TMP3]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] @@ -794,7 +797,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[TMP5]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -820,7 +823,7 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] @@ -871,7 +874,7 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] @@ -906,7 +909,7 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI2]], 
[[BROADCAST_SPLAT]] ; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add nuw i32 [[VEC_PHI1]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP21]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -942,7 +945,7 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-NEXT: [[TMP11]] = add [[VEC_PHI]], [[BROADCAST_SPLAT]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP11]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] @@ -993,7 +996,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP6]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] @@ -1028,7 +1031,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add <16 x i32> [[VEC_PHI2]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP8]], [[TMP7]] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -1064,7 +1067,7 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: 
[[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll index c4feabe960a67..edf7e280d7416 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll @@ -50,22 +50,9 @@ define i32 @pr70988(ptr %src, i32 %n) { ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP17]], i32 [[TMP18]]) -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDUC:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDUC_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[TMP24:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDUC]] -; CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[GEP]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 -; CHECK-NEXT: [[TMP24]] = tail call i32 @llvm.smax.i32(i32 [[TMP23]], i32 [[MAX]]) -; CHECK-NEXT: [[INDUC_NEXT]] = add nuw nsw i64 [[INDUC]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDUC_NEXT]], [[UMAX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP24]], [[LOOP]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 [[RDX_MINMAX]] ; entry: %1 = and i32 %n, 15 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 0c7dc29cb46d8..0f82de629afa9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -241,42 +241,8 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP41]]) ; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH:.*]]: -; PRED-NEXT: br label %[[LOOP:.*]] -; PRED: [[LOOP]]: -; PRED-NEXT: [[TMP45:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[TMP53:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[SCALAR_RECUR10:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[TMP45]], %[[LOOP]] ] -; PRED-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[SUM_RED:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[RED_2:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[TMP52:%.*]] = add i64 [[Y]], 1 -; PRED-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP52]] -; PRED-NEXT: [[TMP53]] = load i32, ptr [[GEP_1]], align 4 -; PRED-NEXT: [[OR3:%.*]] = or i32 [[SCALAR_RECUR10]], [[X]] -; PRED-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 -; PRED-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 1 -; PRED-NEXT: [[TMP54:%.*]] = shl i32 [[OR3]], 1 -; PRED-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 2 -; PRED-NEXT: [[SHL19:%.*]] = shl i32 [[X]], 1 -; PRED-NEXT: [[TMP56:%.*]] = or i32 [[SHR]], [[SHL19]] -; PRED-NEXT: [[TMP57:%.*]] = or i32 [[TMP56]], [[TMP55]] -; PRED-NEXT: [[TMP58:%.*]] = or i32 [[TMP57]], [[X]] -; PRED-NEXT: [[OR20:%.*]] = or i32 [[Z]], [[X]] -; PRED-NEXT: [[NOT:%.*]] = and i32 [[OR20]], 1 -; PRED-NEXT: 
[[AND:%.*]] = xor i32 [[NOT]], 1 -; PRED-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64 -; PRED-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]] -; PRED-NEXT: [[TMP59:%.*]] = load i32, ptr [[GEP_2]], align 4 -; PRED-NEXT: [[SHR24:%.*]] = lshr i32 [[TMP58]], 1 -; PRED-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64 -; PRED-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]] -; PRED-NEXT: [[TMP60:%.*]] = load i32, ptr [[GEP_3]], align 4 -; PRED-NEXT: [[RED_1:%.*]] = or i32 [[TMP59]], [[SUM_RED]] -; PRED-NEXT: [[RED_2]] = or i32 [[RED_1]], [[TMP60]] -; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[Y]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; PRED: [[EXIT]]: -; PRED-NEXT: [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], %[[LOOP]] ], [ [[TMP44]], %[[MIDDLE_BLOCK]] ] -; PRED-NEXT: ret i32 [[RED_2_LCSSA]] +; PRED-NEXT: ret i32 [[TMP44]] ; entry: br label %loop @@ -497,21 +463,8 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16( [[TMP16]]) ; PRED-NEXT: br label %[[EXIT:.*]] -; PRED: [[SCALAR_PH:.*]]: -; PRED-NEXT: br label %[[LOOP:.*]] -; PRED: [[LOOP]]: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[RED:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]] -; PRED-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2 -; PRED-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]] -; PRED-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]] -; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; PRED: [[EXIT]]: -; PRED-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ] -; PRED-NEXT: ret i16 [[RED_NEXT_LCSSA]] +; PRED-NEXT: ret i16 [[TMP19]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index 885c79048aaf7..5072058ed5b8f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -144,20 +144,8 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] -; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-ORDERED-TF: for.body: -; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP12]], [[SUM_07]] -; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; 
CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] +; CHECK-ORDERED-TF-NEXT: ret float [[TMP9]] ; @@ -390,23 +378,11 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT14]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP39]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP40:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP41:%.*]] = xor i1 [[TMP40]], true -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] -; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-ORDERED-TF: for.body: -; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP42:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD]] = fadd float [[TMP42]], [[SUM_07]] -; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: ret float [[ADD_LCSSA]] +; CHECK-ORDERED-TF-NEXT: ret float [[TMP30]] ; @@ -630,30 +606,12 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] -; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-ORDERED-TF: for.body: -; CHECK-ORDERED-TF-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[SCALAR_PH:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDXB1]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD1]] = fadd float [[TMP19]], [[ADD_PHI2]] -; CHECK-ORDERED-TF-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr 
inbounds float, ptr [[B]], i64 [[OR]] -; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDXB2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD2]] = fadd float [[TMP20]], [[ADD_PHI1]] -; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2 -; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: store float [[ADD1_LCSSA]], ptr [[A]], align 4 -; CHECK-ORDERED-TF-NEXT: store float [[ADD2_LCSSA]], ptr [[ARRAYIDXA]], align 4 +; CHECK-ORDERED-TF-NEXT: store float [[TMP16]], ptr [[A]], align 4 +; CHECK-ORDERED-TF-NEXT: store float [[TMP14]], ptr [[ARRAYIDXA]], align 4 ; CHECK-ORDERED-TF-NEXT: ret void ; @@ -863,28 +821,13 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP7]]) ; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] -; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-ORDERED-TF: for.body: -; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[SCALAR_PH]] ] -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ADD:%.*]] = fadd float [[TMP15]], [[TMP16]] -; CHECK-ORDERED-TF-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]] -; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-ORDERED-TF: for.end.loopexit: -; CHECK-ORDERED-TF-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP12]], [[FOR_END_LOOPEXIT]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[RES]] ; @@ -1081,31 +1024,11 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] 
= call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] -; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-ORDERED-TF: for.body: -; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[RES:%.*]] = phi float [ 1.000000e+00, [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ] -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP15]], 0.000000e+00 -; CHECK-ORDERED-TF-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; CHECK-ORDERED-TF: if.then: -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: br label [[FOR_INC]] -; CHECK-ORDERED-TF: for.inc: -; CHECK-ORDERED-TF-NEXT: [[PHI:%.*]] = phi float [ [[TMP16]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]] -; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] +; CHECK-ORDERED-TF-NEXT: ret float [[TMP12]] ; @@ -1245,7 +1168,7 @@ define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b, ; CHECK-ORDERED-TF-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]] ; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-ORDERED-TF: for.end: ; CHECK-ORDERED-TF-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ] ; CHECK-ORDERED-TF-NEXT: ret float [[RDX]] @@ -1542,25 +1465,11 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = xor i1 [[TMP54]], true -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; 
CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] -; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-ORDERED-TF: for.body: -; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP56]], float [[TMP57]], float [[SUM_07]]) -; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] +; CHECK-ORDERED-TF-NEXT: ret float [[TMP44]] ; @@ -1852,25 +1761,11 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT18]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP53]], i64 [[TMP6]]) ; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = xor i1 [[TMP54]], true -; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-ORDERED-TF-NEXT: br i1 [[TMP55]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: ; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] -; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-ORDERED-TF: for.body: -; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-ORDERED-TF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-ORDERED-TF-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP56]], float [[TMP57]], float [[SUM_07]]) -; CHECK-ORDERED-TF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-ORDERED-TF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-ORDERED-TF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-ORDERED-TF: for.end: -; CHECK-ORDERED-TF-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ] -; CHECK-ORDERED-TF-NEXT: ret float [[MULADD_LCSSA]] +; CHECK-ORDERED-TF-NEXT: ret float [[TMP44]] ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 4e989c5d3eca8..3b016f8d0a9ff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -129,20 +129,8 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -203,21 +191,8 @@ define i64 @loop_contains_safe_call() #1 { ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -365,22 +340,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]] -; 
CHECK: loop.inc: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 79fb3fd181cc5..c775b44bd1ba6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -88,16 +88,7 @@ define void @cost_store_i8(ptr %dst) #0 { ; PRED-NEXT: [[TMP12:%.*]] = xor i1 [[TMP14]], true ; PRED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: middle.block: -; PRED-NEXT: br label [[EXIT:%.*]] -; PRED: scalar.ph: ; PRED-NEXT: br label [[LOOP:%.*]] -; PRED: loop: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] -; PRED-NEXT: store i8 0, ptr [[GEP]], align 1 -; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 100 -; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; PRED: exit: ; PRED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll index 3f230b7b9c3c4..e084307c0c2ae 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -490,8 +490,7 @@ define float @fadd_predicated(ptr noalias nocapture %a, i64 %n) { ; CHECK-ORDERED: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> splat (float -0.000000e+00) ; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]]) ; CHECK-ORDERED: for.end: -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-ORDERED: ret float %[[RES_PHI]] +; CHECK-ORDERED: ret float %[[RDX]] ; CHECK-UNORDERED-LABEL: @fadd_predicated ; CHECK-UNORDERED: vector.ph @@ -507,12 +506,8 @@ define float @fadd_predicated(ptr noalias nocapture %a, i64 %n) { ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd ; CHECK-UNORDERED: middle.block ; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %[[MASK]]) -; CHECK-UNORDERED: for.body -; CHECK-UNORDERED: %[[LOAD:.*]] = load float, ptr -; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[LOAD]] ; CHECK-UNORDERED: for.end -; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ] -; CHECK-UNORDERED: ret float %[[SUM]] +; CHECK-UNORDERED: ret float %[[RDX]] ; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated ; CHECK-NOT-VECTORIZED-NOT: vector.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll 
index bdbbfdfa97427..9526a848f8eab 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/struct-return-cost.ll @@ -31,10 +31,7 @@ define void @struct_return_widen(ptr noalias %in, ptr noalias writeonly %out_a, ; CHECK: [[VECTOR_BODY:.*:]] ; CHECK: [[TMP2:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD:%.*]]) ; CHECK: [[TMP3:%.*]] = call { <2 x half>, <2 x half> } @fixed_vec_foo(<2 x half> [[WIDE_LOAD1:%.*]]) -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR2:[0-9]+]] ; CHECK: [[EXIT:.*:]] ; entry: @@ -82,12 +79,9 @@ define void @struct_return_replicate(ptr noalias %in, ptr noalias writeonly %out ; CHECK: [[ENTRY:.*:]] ; CHECK: [[VECTOR_PH:.*:]] ; CHECK: [[VECTOR_BODY:.*:]] -; CHECK: [[TMP2:%.*]] = tail call { half, half } @foo(half [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] -; CHECK: [[TMP4:%.*]] = tail call { half, half } @foo(half [[TMP3:%.*]]) #[[ATTR3]] +; CHECK: [[TMP2:%.*]] = tail call { half, half } @foo(half [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] +; CHECK: [[TMP4:%.*]] = tail call { half, half } @foo(half [[TMP3:%.*]]) #[[ATTR2]] ; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] -; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]] ; CHECK: [[EXIT:.*:]] ; entry: @@ -162,7 +156,7 @@ define void @struct_return_scalable(ptr noalias %in, ptr noalias writeonly %out_ ; CHECK: [[MIDDLE_BLOCK:.*:]] ; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR3]] +; CHECK: [[CALL:%.*]] = tail call { half, half } @foo(half [[IN_VAL:%.*]]) #[[ATTR2]] ; CHECK: [[EXIT:.*:]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 33b3629337e8b..3b0bd87587cc0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -116,7 +116,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard to i32 @@ -243,10 +244,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_TAIL_FOLDING: scalar.ph: +; 
PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard to i32 @@ -377,10 +379,11 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard1 to i32 @@ -537,10 +540,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] -; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard to i32 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index 16acd3f5ccdbd..b8b4fbd3140de 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -69,7 +69,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll index 069d369a4cdf0..cb2c003872573 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -29,7 +29,8 @@ define void @trip1025_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapt ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index 61448bdbbc651..33ee0d6e2ae2f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -33,7 +33,10 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void ; entry: %cmp6.not = icmp eq i32 %N, 0 @@ -87,10 +90,13 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void ; entry: %cmp6.not = icmp eq i64 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index b725669f78c30..b5544dc3310c9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -36,21 +36,9 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) -; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[VAL]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: while.end.loopexit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP19]] ; ; CHECK-IN-LOOP-LABEL: @add_reduction_i32( ; CHECK-IN-LOOP-NEXT: entry: @@ -81,21 +69,9 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true ; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP0:![0-9]+]] ; CHECK-IN-LOOP: middle.block: -; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK-IN-LOOP: scalar.ph: ; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK-IN-LOOP: while.body: -; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-IN-LOOP-NEXT: [[RED:%.*]] = phi i32 [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH]] ] -; CHECK-IN-LOOP-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-IN-LOOP-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-IN-LOOP-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[VAL]] -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-IN-LOOP-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-IN-LOOP-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-IN-LOOP: while.end.loopexit: -; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; CHECK-IN-LOOP-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; CHECK-IN-LOOP-NEXT: ret i32 [[TMP15]] ; entry: br label %while.body @@ -141,23 +117,11 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ 0.000000e+00, [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: [[VAL:%.*]] = load float, ptr [[GEP]], align 4 -; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], [[VAL]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: while.end.loopexit: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] +; CHECK-NEXT: ret float [[TMP14]] ; ; CHECK-IN-LOOP-LABEL: @add_reduction_f32( ; CHECK-IN-LOOP-NEXT: entry: @@ -185,23 +149,11 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true -; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-IN-LOOP: middle.block: -; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK-IN-LOOP: scalar.ph: 
; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK-IN-LOOP: while.body: -; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-IN-LOOP-NEXT: [[RED:%.*]] = phi float [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ 0.000000e+00, [[SCALAR_PH]] ] -; CHECK-IN-LOOP-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[PTR]], i64 [[INDEX]] -; CHECK-IN-LOOP-NEXT: [[VAL:%.*]] = load float, ptr [[GEP]], align 4 -; CHECK-IN-LOOP-NEXT: [[RED_NEXT]] = fadd float [[RED]], [[VAL]] -; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-IN-LOOP-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-IN-LOOP-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-IN-LOOP: while.end.loopexit: -; CHECK-IN-LOOP-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] -; CHECK-IN-LOOP-NEXT: ret float [[RED_NEXT_LCSSA]] +; CHECK-IN-LOOP-NEXT: ret float [[TMP14]] ; entry: br label %while.body @@ -251,32 +203,12 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP16]], true -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP20]]) -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 7, [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[IV]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP26]], 5 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP27]] -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br label [[FOR_INC:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RES_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP25]] ; ; CHECK-IN-LOOP-LABEL: @cond_xor_reduction( ; CHECK-IN-LOOP-NEXT: entry: @@ -308,31 +240,11 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; 
CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = xor i1 [[TMP22]], true -; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-IN-LOOP: middle.block: -; CHECK-IN-LOOP-NEXT: br label [[FOR_END:%.*]] -; CHECK-IN-LOOP: scalar.ph: -; CHECK-IN-LOOP-NEXT: br label [[FOR_BODY:%.*]] -; CHECK-IN-LOOP: for.body: -; CHECK-IN-LOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-IN-LOOP-NEXT: [[RDX:%.*]] = phi i32 [ 7, [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ] -; CHECK-IN-LOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[IV]] -; CHECK-IN-LOOP-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-IN-LOOP-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP24]], 5 -; CHECK-IN-LOOP-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; CHECK-IN-LOOP: if.then: -; CHECK-IN-LOOP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-IN-LOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-IN-LOOP-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP25]] -; CHECK-IN-LOOP-NEXT: br label [[FOR_INC]] -; CHECK-IN-LOOP: for.inc: -; CHECK-IN-LOOP-NEXT: [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ] -; CHECK-IN-LOOP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-IN-LOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-IN-LOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-IN-LOOP-NEXT: br label [[FOR_INC:%.*]] ; CHECK-IN-LOOP: for.end: -; CHECK-IN-LOOP-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] -; CHECK-IN-LOOP-NEXT: ret i32 [[RES_LCSSA]] +; CHECK-IN-LOOP-NEXT: ret i32 [[TMP19]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index 18793864531a9..5531b3ca51140 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -72,7 +72,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ -176,10 +177,11 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT16]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP93]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP66:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP67:%.*]] = xor i1 [[TMP66]], true -; CHECK-NEXT: br i1 [[TMP67]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP67]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index ec178727ce73b..9ebe79096adc4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -33,7 +33,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ -73,10 +74,11 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = xor i1 [[TMP6]], true -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ -120,10 +122,11 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP12]], true -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ -180,10 +183,11 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ -231,10 +235,11 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ 
-284,10 +289,11 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = xor i1 [[TMP14]], true -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: @@ -342,10 +348,11 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = xor i1 [[TMP17]], true -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: @@ -403,10 +410,11 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP13]], true -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: @@ -454,10 +462,11 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ -509,10 +518,11 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = xor i1 [[TMP14]], true -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: 
while.end.loopexit: +; CHECK-NEXT: ret void ; entry: br label %while.body @@ -551,7 +561,7 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index e7d25a0446a70..742097bdae890 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -562,7 +562,8 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: for.exit: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -626,7 +627,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -719,7 +720,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64( [[TMP6]], i64 1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll index e6ff39bebeda3..6da3c77cd35c1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -317,19 +317,7 @@ define void @test_v4_v4m(ptr noalias %a, ptr readonly %b) #3 { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -369,19 +357,7 @@ define void @test_v2_v4m(ptr noalias %a, ptr readonly %b) #3 { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -421,19 +397,7 @@ define void @test_v2_v4(ptr noalias %a, ptr readonly %b) #3 { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll index c44db7db673fe..1607755e624a3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -71,16 +71,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) 
"target-features ; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] ; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DATA: middle.block: -; DATA-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; DATA: scalar.ph: ; DATA-NEXT: br label [[WHILE_BODY:%.*]] -; DATA: while.body: -; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; DATA-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; DATA-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; DATA-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; DATA-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; DATA-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; DATA: while.end.loopexit: ; DATA-NEXT: ret void ; @@ -115,16 +106,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] ; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DATA_NO_LANEMASK: middle.block: -; DATA_NO_LANEMASK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; DATA_NO_LANEMASK: scalar.ph: ; DATA_NO_LANEMASK-NEXT: br label [[WHILE_BODY:%.*]] -; DATA_NO_LANEMASK: while.body: -; DATA_NO_LANEMASK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; DATA_NO_LANEMASK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; DATA_NO_LANEMASK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; DATA_NO_LANEMASK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; DATA_NO_LANEMASK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; DATA_NO_LANEMASK: while.end.loopexit: ; DATA_NO_LANEMASK-NEXT: ret void ; @@ -150,16 +132,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_AND_CONTROL-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true ; DATA_AND_CONTROL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DATA_AND_CONTROL: middle.block: -; DATA_AND_CONTROL-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; DATA_AND_CONTROL: scalar.ph: ; DATA_AND_CONTROL-NEXT: br label [[WHILE_BODY:%.*]] -; DATA_AND_CONTROL: while.body: -; DATA_AND_CONTROL-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; DATA_AND_CONTROL-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; DATA_AND_CONTROL-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; DATA_AND_CONTROL-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; DATA_AND_CONTROL-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; DATA_AND_CONTROL: while.end.loopexit: ; DATA_AND_CONTROL-NEXT: ret void ; @@ -190,16 +163,7 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP15]], true ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; DATA_AND_CONTROL_NO_RT_CHECK: middle.block: -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; 
DATA_AND_CONTROL_NO_RT_CHECK: scalar.ph: ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; DATA_AND_CONTROL_NO_RT_CHECK: while.body: -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; DATA_AND_CONTROL_NO_RT_CHECK: while.end.loopexit: ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll index 038330b99b0f5..c26176028626b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll @@ -22,21 +22,6 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br label %[[EXIT:.*]] -; VF2: [[SCALAR_PH:.*]]: -; VF2-NEXT: br label %[[LOOP:.*]] -; VF2: [[LOOP]]: -; VF2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1 -; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]] -; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 -; VF2-NEXT: store i64 [[L_0]], ptr [[DATA_0]], align 8 -; VF2-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1 -; VF2-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]] -; VF2-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 -; VF2-NEXT: store i64 [[L_1]], ptr [[DATA_1]], align 8 -; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 2 -; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; VF2: [[EXIT]]: ; VF2-NEXT: ret void ; @@ -86,33 +71,18 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF4-NEXT: br i1 false, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; VF4: [[PRED_STORE_IF5]]: ; VF4-NEXT: [[TMP27:%.*]] = shl nsw i64 3, 1 -; VF4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]] -; VF4-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; VF4-NEXT: store i64 [[TMP29]], ptr [[TMP28]], align 8 -; VF4-NEXT: [[TMP30:%.*]] = or disjoint i64 [[TMP27]], 1 -; VF4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP30]] -; VF4-NEXT: [[TMP32:%.*]] = load i64, ptr [[TMP31]], align 8 -; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP31]], align 8 -; VF4-NEXT: br label %[[PRED_STORE_CONTINUE6]] -; VF4: [[PRED_STORE_CONTINUE6]]: -; VF4-NEXT: br label %[[MIDDLE_BLOCK:.*]] -; VF4: [[MIDDLE_BLOCK]]: -; VF4-NEXT: br label %[[EXIT:.*]] -; VF4: [[SCALAR_PH:.*]]: -; VF4-NEXT: br label %[[LOOP:.*]] -; VF4: [[LOOP]]: -; VF4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF4-NEXT: [[MUL_2:%.*]] = shl nsw i64 
[[IV]], 1 -; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]] +; VF4-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP27]] ; VF4-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 ; VF4-NEXT: store i64 [[L_0]], ptr [[DATA_0]], align 8 -; VF4-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1 +; VF4-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[TMP27]], 1 ; VF4-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]] ; VF4-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 ; VF4-NEXT: store i64 [[L_1]], ptr [[DATA_1]], align 8 -; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 2 -; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE6]] +; VF4: [[PRED_STORE_CONTINUE6]]: +; VF4-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; VF4: [[MIDDLE_BLOCK]]: +; VF4-NEXT: br label %[[EXIT:.*]] ; VF4: [[EXIT]]: ; VF4-NEXT: ret void ; @@ -237,27 +207,6 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias % ; VF2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br label %[[EXIT:.*]] -; VF2: [[SCALAR_PH:.*]]: -; VF2-NEXT: br label %[[LOOP:.*]] -; VF2: [[LOOP]]: -; VF2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]] -; VF2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]] -; VF2-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 -; VF2-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_0]], i64 4 -; VF2-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 -; VF2-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 -; VF2-NEXT: [[ADD_0:%.*]] = fadd float [[L_A_0]], [[L_B_0]] -; VF2-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_0]], i64 4 -; VF2-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; VF2-NEXT: [[ADD_1:%.*]] = fadd float [[L_A_1]], [[L_B_1]] -; VF2-NEXT: [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV]] -; VF2-NEXT: store float [[ADD_0]], ptr [[GEP_RES_0]], align 4 -; VF2-NEXT: [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 4 -; VF2-NEXT: store float [[ADD_1]], ptr [[GEP_RES_1]], align 4 -; VF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; VF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4 -; VF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; VF2: [[EXIT]]: ; VF2-NEXT: ret void ; @@ -282,27 +231,6 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias % ; VF4-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF4: [[MIDDLE_BLOCK]]: ; VF4-NEXT: br label %[[EXIT:.*]] -; VF4: [[SCALAR_PH:.*]]: -; VF4-NEXT: br label %[[LOOP:.*]] -; VF4: [[LOOP]]: -; VF4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF4-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]] -; VF4-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]] -; VF4-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 -; VF4-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_0]], i64 4 -; VF4-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 -; VF4-NEXT: [[L_B_0:%.*]] = load float, ptr 
[[GEP_B_0]], align 4 -; VF4-NEXT: [[ADD_0:%.*]] = fadd float [[L_A_0]], [[L_B_0]] -; VF4-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_0]], i64 4 -; VF4-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; VF4-NEXT: [[ADD_1:%.*]] = fadd float [[L_A_1]], [[L_B_1]] -; VF4-NEXT: [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV]] -; VF4-NEXT: store float [[ADD_0]], ptr [[GEP_RES_0]], align 4 -; VF4-NEXT: [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 4 -; VF4-NEXT: store float [[ADD_1]], ptr [[GEP_RES_1]], align 4 -; VF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; VF4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4 -; VF4-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; VF4: [[EXIT]]: ; VF4-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index a044ae8f5d90e..d290f2d4f5bc3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -27,21 +27,6 @@ define void @load_store_interleave_group(ptr noalias %data) { ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1 -; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]] -; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 -; CHECK-NEXT: store i64 [[L_0]], ptr [[DATA_0]], align 8 -; CHECK-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[MUL_2]], 1 -; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[ADD_1]] -; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 -; CHECK-NEXT: store i64 [[L_1]], ptr [[DATA_1]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -105,25 +90,6 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] -; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = shl nsw i64 [[IV]], 1 -; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP13]] -; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 -; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]] -; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = or disjoint i64 [[TMP13]], 1 -; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]] -; CHECK-NEXT: 
[[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8 -; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]] -; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll index edb951946d873..187edb580f8e2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll @@ -49,23 +49,6 @@ define void @test0(ptr noalias %M3, ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_INC1286_LOOPEXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[IF_THEN1165_US:%.*]] -; CHECK: if.then1165.us: -; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDVARS_IV1783]] -; CHECK-NEXT: [[L_A:%.*]] = load i16, ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[CONV1177_US:%.*]] = zext i16 [[L_A]] to i32 -; CHECK-NEXT: [[ADD1178_US:%.*]] = add nsw i32 [[CONV1177_US]], 10 -; CHECK-NEXT: [[CONV1179_US:%.*]] = trunc i32 [[ADD1178_US]] to i16 -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDVARS_IV1783]] -; CHECK-NEXT: [[L_B:%.*]] = load i64, ptr [[GEP_B]], align 8 -; CHECK-NEXT: [[IDXPROM1181_US:%.*]] = ashr exact i64 [[L_B]], 32 -; CHECK-NEXT: [[ARRAYIDX1185_US:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[IDXPROM1181_US]] -; CHECK-NEXT: store i16 [[CONV1179_US]], ptr [[ARRAYIDX1185_US]], align 2 -; CHECK-NEXT: [[INDVARS_IV_NEXT1784]] = add nuw nsw i64 [[INDVARS_IV1783]], 1 -; CHECK-NEXT: [[EXITCOND1785:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1784]], 16 -; CHECK-NEXT: br i1 [[EXITCOND1785]], label [[FOR_INC1286_LOOPEXIT]], label [[IF_THEN1165_US]] ; CHECK: for.inc1286.loopexit: ; CHECK-NEXT: ret void ; @@ -141,24 +124,6 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_INC1286_LOOPEXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[IF_THEN1165_US:%.*]] -; CHECK: if.then1165.us: -; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ] -; CHECK-NEXT: [[FPTR:%.*]] = load i32, ptr [[C]], align 4 -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDVARS_IV1783]] -; CHECK-NEXT: [[L_A:%.*]] = load i16, ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[CONV1177_US:%.*]] = zext i16 [[L_A]] to i32 -; CHECK-NEXT: [[ADD1178_US:%.*]] = add nsw i32 [[CONV1177_US]], [[FPTR]] -; CHECK-NEXT: [[CONV1179_US:%.*]] = trunc i32 [[ADD1178_US]] to i16 -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDVARS_IV1783]] -; CHECK-NEXT: [[L_B:%.*]] = load i64, ptr [[GEP_B]], align 8 -; CHECK-NEXT: [[IDXPROM1181_US:%.*]] = ashr exact i64 [[L_B]], 32 -; CHECK-NEXT: [[ARRAYIDX1185_US:%.*]] = getelementptr inbounds i16, ptr [[M3]], i64 [[IDXPROM1181_US]] -; 
CHECK-NEXT: store i16 [[CONV1179_US]], ptr [[ARRAYIDX1185_US]], align 2 -; CHECK-NEXT: [[INDVARS_IV_NEXT1784]] = add nuw nsw i64 [[INDVARS_IV1783]], 1 -; CHECK-NEXT: [[EXITCOND1785:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1784]], 16 -; CHECK-NEXT: br i1 [[EXITCOND1785]], label [[FOR_INC1286_LOOPEXIT]], label [[IF_THEN1165_US]] ; CHECK: for.inc1286.loopexit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index c8eecd7283f1e..96a25a853f880 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -127,7 +127,8 @@ define void @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll index d5d0c14cf2c82..bc9cf4fe93622 100644 --- a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll @@ -23,11 +23,7 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; GFX9-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; GFX9-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; GFX9: middle.block: -; GFX9-NEXT: br label [[FOR_END:%.*]] -; GFX9: scalar.ph: ; GFX9-NEXT: br label [[FOR_BODY:%.*]] -; GFX9: for.body: -; GFX9-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; GFX9: for.end: ; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]] ; GFX9-NEXT: [[ADD_LCSSA:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[BIN_RDX]]) @@ -52,11 +48,7 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; VI-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; VI-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VI: middle.block: -; VI-NEXT: br label [[FOR_END:%.*]] -; VI: scalar.ph: ; VI-NEXT: br label [[FOR_BODY:%.*]] -; VI: for.body: -; VI-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; VI: for.end: ; VI-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]] ; VI-NEXT: [[ADD_LCSSA:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[BIN_RDX]]) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll index e83ac2eed2d49..58a24ee7c4677 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll @@ -36,18 +36,6 @@ define void @f0(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, 
ptr [[SRC]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 3 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i8 [[MUL]], ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: @@ -81,7 +69,4 @@ attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+ ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} -; CHECK: [[META4]] = !{!"llvm.loop.vectorize.width", i32 16} -; CHECK: [[META5]] = !{!"llvm.loop.interleave.count", i32 2} ;. diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll index e52d85c51ab76..9a76019ec5f46 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll @@ -25,21 +25,7 @@ define void @test_stride1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 1 -; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP8]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]] -; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -212,21 +198,7 @@ define void @test_stride3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 3 -; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = add 
nsw i32 5, [[TMP8]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]] -; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -273,21 +245,7 @@ define void @test_stride4_4i32(ptr readonly %data, ptr noalias nocapture %dst, i ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 4 -; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP8]] -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]] -; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[END]], label [[FOR_BODY]] ; CHECK: end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll index 4cdfcf2c87b97..0a4ed7ff2eb38 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll @@ -22,11 +22,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 [[TMP2]] ; @@ -75,11 +71,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 [[TMP8]] ; @@ -126,11 +118,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; 
CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]] @@ -177,11 +165,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] @@ -228,11 +212,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] @@ -279,11 +259,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] @@ -330,11 +306,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]]) ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] @@ -381,11 +353,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]]) ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll index fc7922762a0b4..029d8bd64fe50 100644 --- 
a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -34,28 +34,11 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[RES_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP12]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP13]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[RES_010]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_011]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[RES_0_LCSSA]]
 ;
 entry:
@@ -112,28 +95,11 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[RES_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP12]] to i32
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[I_011]]
-; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP13]] to i32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[RES_010]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_011]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[RES_0_LCSSA]]
 ;
 entry:
@@ -183,25 +149,13 @@ define i32 @add_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = add nsw i32 [[TMP7]], [[R_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -245,26 +199,14 @@ define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 1, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = mul nsw i32 [[TMP7]], [[R_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -308,26 +250,14 @@ define i32 @and_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ -1, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = and i32 [[TMP7]], [[R_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ -1, [[ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -371,26 +301,14 @@ define i32 @or_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = or i32 [[TMP7]], [[R_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -434,26 +352,14 @@ define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = xor i32 [[TMP7]], [[R_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -497,26 +403,14 @@ define float @fadd_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fadd fast float [[TMP7]], [[R_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret float [[R_0_LCSSA]]
 ;
 entry:
@@ -560,26 +454,14 @@ define float @fmul_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select fast <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP3]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT:    [[R_07:%.*]] = phi float [ [[ADD:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[I_08]]
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD]] = fmul fast float [[TMP7]], [[R_07]]
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; CHECK:       for.cond.cleanup.loopexit:
-; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    [[R_0_LCSSA:%.*]] = phi float [ 1.000000e+00, [[ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    ret float [[R_0_LCSSA]]
 ;
 entry:
@@ -622,7 +504,7 @@ define i32 @smin_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
@@ -640,7 +522,7 @@ define i32 @smin_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -689,7 +571,7 @@ define i32 @smax_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
@@ -707,7 +589,7 @@ define i32 @smax_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -756,7 +638,7 @@ define i32 @umin_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
@@ -774,7 +656,7 @@ define i32 @umin_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
@@ -823,7 +705,7 @@ define i32 @umax_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
@@ -841,7 +723,7 @@ define i32 @umax_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT:    [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP7]]
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index 3426fb16841c5..6ea075f76aed4 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -30,17 +30,6 @@ define void @always_vectorize(ptr %p, i32 %x) {
 ; DEFAULT-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT:       [[SCALAR_PH:.*]]:
-; DEFAULT-NEXT:    br label %[[FOR_BODY:.*]]
-; DEFAULT:       [[FOR_BODY]]:
-; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]]
-; DEFAULT-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ; DEFAULT:       [[FOR_COND_CLEANUP]]:
 ; DEFAULT-NEXT:    ret void
 ;
@@ -59,17 +48,6 @@ define void @always_vectorize(ptr %p, i32 %x) {
 ; OPTSIZE-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; OPTSIZE:       [[MIDDLE_BLOCK]]:
 ; OPTSIZE-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; OPTSIZE:       [[SCALAR_PH:.*]]:
-; OPTSIZE-NEXT:    br label %[[FOR_BODY:.*]]
-; OPTSIZE:       [[FOR_BODY]]:
-; OPTSIZE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; OPTSIZE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]]
-; OPTSIZE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; OPTSIZE-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]]
-; OPTSIZE-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
-; OPTSIZE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; OPTSIZE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; OPTSIZE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ; OPTSIZE:       [[FOR_COND_CLEANUP]]:
 ; OPTSIZE-NEXT:    ret void
 ;
@@ -88,17 +66,6 @@ define void @always_vectorize(ptr %p, i32 %x) {
 ; MINSIZE-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; MINSIZE:       [[MIDDLE_BLOCK]]:
 ; MINSIZE-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; MINSIZE:       [[SCALAR_PH:.*]]:
-; MINSIZE-NEXT:    br label %[[FOR_BODY:.*]]
-; MINSIZE:       [[FOR_BODY]]:
-; MINSIZE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; MINSIZE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]]
-; MINSIZE-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; MINSIZE-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]]
-; MINSIZE-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
-; MINSIZE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; MINSIZE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; MINSIZE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ; MINSIZE:       [[FOR_COND_CLEANUP]]:
 ; MINSIZE-NEXT:    ret void
 ;
@@ -386,23 +353,6 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT:       [[SCALAR_PH:.*]]:
-; DEFAULT-NEXT:    br label %[[FOR_BODY:.*]]
-; DEFAULT:       [[FOR_BODY]]:
-; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[TMP72:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i8
-; DEFAULT-NEXT:    [[MUL:%.*]] = mul i8 [[A]], [[TMP72]]
-; DEFAULT-NEXT:    [[SHR:%.*]] = lshr i8 [[TMP72]], 1
-; DEFAULT-NEXT:    [[MUL5:%.*]] = mul i8 [[SHR]], [[B]]
-; DEFAULT-NEXT:    [[ADD:%.*]] = add i8 [[MUL5]], [[MUL]]
-; DEFAULT-NEXT:    [[SHR7:%.*]] = lshr i8 [[TMP72]], 2
-; DEFAULT-NEXT:    [[MUL9:%.*]] = mul i8 [[SHR7]], [[C]]
-; DEFAULT-NEXT:    [[ADD10:%.*]] = add i8 [[ADD]], [[MUL9]]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    store i8 [[ADD10]], ptr [[ARRAYIDX]], align 1
-; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15
-; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ; DEFAULT:       [[FOR_COND_CLEANUP]]:
 ; DEFAULT-NEXT:    ret void
 ;
@@ -502,23 +452,6 @@ define void @dont_vectorize_with_minsize() {
 ; DEFAULT-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT:       [[SCALAR_PH:.*]]:
-; DEFAULT-NEXT:    br label %[[FOR_BODY:.*]]
-; DEFAULT:       [[FOR_BODY]]:
-; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DEFAULT-NEXT:    [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]]
-; DEFAULT-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
-; DEFAULT-NEXT:    [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16
-; DEFAULT-NEXT:    [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]]
-; DEFAULT-NEXT:    store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2
-; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
-; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ; DEFAULT:       [[FOR_COND_CLEANUP]]:
 ; DEFAULT-NEXT:    ret void
 ;
@@ -545,23 +478,6 @@ define void @dont_vectorize_with_minsize() {
 ; OPTSIZE-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; OPTSIZE:       [[MIDDLE_BLOCK]]:
 ; OPTSIZE-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; OPTSIZE:       [[SCALAR_PH:.*]]:
-; OPTSIZE-NEXT:    br label %[[FOR_BODY:.*]]
-; OPTSIZE:       [[FOR_BODY]]:
-; OPTSIZE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; OPTSIZE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]]
-; OPTSIZE-NEXT:    [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; OPTSIZE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]]
-; OPTSIZE-NEXT:    [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; OPTSIZE-NEXT:    [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]]
-; OPTSIZE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]]
-; OPTSIZE-NEXT:    [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
-; OPTSIZE-NEXT:    [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16
-; OPTSIZE-NEXT:    [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]]
-; OPTSIZE-NEXT:    store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2
-; OPTSIZE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; OPTSIZE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
-; OPTSIZE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ; OPTSIZE:       [[FOR_COND_CLEANUP]]:
 ; OPTSIZE-NEXT:    ret void
 ;
@@ -588,23 +504,6 @@ define void @dont_vectorize_with_minsize() {
 ; MINSIZE-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; MINSIZE:       [[MIDDLE_BLOCK]]:
 ; MINSIZE-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; MINSIZE:       [[SCALAR_PH:.*]]:
-; MINSIZE-NEXT:    br label %[[FOR_BODY:.*]]
-; MINSIZE:       [[FOR_BODY]]:
-; MINSIZE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; MINSIZE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]]
-; MINSIZE-NEXT:    [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; MINSIZE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]]
-; MINSIZE-NEXT:    [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; MINSIZE-NEXT:    [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]]
-; MINSIZE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]]
-; MINSIZE-NEXT:    [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
-; MINSIZE-NEXT:    [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16
-; MINSIZE-NEXT:    [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]]
-; MINSIZE-NEXT:    store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2
-; MINSIZE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; MINSIZE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
-; MINSIZE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
 ; MINSIZE:       [[FOR_COND_CLEANUP]]:
 ; MINSIZE-NEXT:    ret void
 ;
@@ -659,23 +558,6 @@ define void @vectorization_forced() {
 ; DEFAULT-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; DEFAULT:       [[SCALAR_PH:.*]]:
-; DEFAULT-NEXT:    br label %[[FOR_BODY:.*]]
-; DEFAULT:       [[FOR_BODY]]:
-; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; DEFAULT-NEXT:    [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]]
-; DEFAULT-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
-; DEFAULT-NEXT:    [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16
-; DEFAULT-NEXT:    [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]]
-; DEFAULT-NEXT:    store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2
-; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
-; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; DEFAULT:       [[FOR_COND_CLEANUP]]:
 ; DEFAULT-NEXT:    ret void
 ;
@@ -702,23 +584,6 @@ define void @vectorization_forced() {
 ; OPTSIZE-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; OPTSIZE:       [[MIDDLE_BLOCK]]:
 ; OPTSIZE-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; OPTSIZE:       [[SCALAR_PH:.*]]:
-; OPTSIZE-NEXT:    br label %[[FOR_BODY:.*]]
-; OPTSIZE:       [[FOR_BODY]]:
-; OPTSIZE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; OPTSIZE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]]
-; OPTSIZE-NEXT:    [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; OPTSIZE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]]
-; OPTSIZE-NEXT:    [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; OPTSIZE-NEXT:    [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]]
-; OPTSIZE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]]
-; OPTSIZE-NEXT:    [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
-; OPTSIZE-NEXT:    [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16
-; OPTSIZE-NEXT:    [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]]
-; OPTSIZE-NEXT:    store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2
-; OPTSIZE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; OPTSIZE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
-; OPTSIZE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; OPTSIZE:       [[FOR_COND_CLEANUP]]:
 ; OPTSIZE-NEXT:    ret void
 ;
@@ -745,23 +610,6 @@ define void @vectorization_forced() {
 ; MINSIZE-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; MINSIZE:       [[MIDDLE_BLOCK]]:
 ; MINSIZE-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
-; MINSIZE:       [[SCALAR_PH:.*]]:
-; MINSIZE-NEXT:    br label %[[FOR_BODY:.*]]
-; MINSIZE:       [[FOR_BODY]]:
-; MINSIZE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; MINSIZE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]]
-; MINSIZE-NEXT:    [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; MINSIZE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]]
-; MINSIZE-NEXT:    [[CVAL:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; MINSIZE-NEXT:    [[MUL:%.*]] = mul nsw i32 [[BVAL]], [[CVAL]]
-; MINSIZE-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [1000 x i16], ptr @A, i64 0, i64 [[INDVARS_IV]]
-; MINSIZE-NEXT:    [[AVAL:%.*]] = load i16, ptr [[ARRAYIDX4]], align 2
-; MINSIZE-NEXT:    [[TRUNC:%.*]] = trunc i32 [[MUL]] to i16
-; MINSIZE-NEXT:    [[ADD:%.*]] = add i16 [[TRUNC]], [[AVAL]]
-; MINSIZE-NEXT:    store i16 [[ADD]], ptr [[ARRAYIDX4]], align 2
-; MINSIZE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; MINSIZE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
-; MINSIZE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; MINSIZE:       [[FOR_COND_CLEANUP]]:
 ; MINSIZE-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
index 625f7a643a3ac..1ae71c8695401 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
@@ -52,7 +52,7 @@ define dso_local void @predicate_loop_hint(ptr noalias nocapture %A, ptr noalias
 ; CHECK: %index.next = add nuw i64 %index, 4
 ; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body, !llvm.loop [[VEC_LOOP2:![0-9]+]]
 ;
-; CHECK: br i1 %{{.*}}, label %{{.*}}, label %for.body, !llvm.loop [[SCALAR_LOOP2:![0-9]+]]
+; CHECK-NOT: br i1 %{{.*}}, label %{{.*}}, label %for.body, !llvm.loop
 
 entry:
   br label %for.body
@@ -78,9 +78,6 @@ for.body:
 ; CHECK-NEXT: [[MD_RT_UNROLL_DIS]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK-NEXT: [[SCALAR_LOOP1]] = distinct !{[[SCALAR_LOOP1]], [[MD_RT_UNROLL_DIS]], [[MD_IS_VEC]]}
 ; CHECK-NEXT: [[VEC_LOOP2]] = distinct !{[[VEC_LOOP2]], [[MD_IS_VEC]], [[MD_RT_UNROLL_DIS]]}
-; CHECK-NEXT: [[SCALAR_LOOP2]] = distinct !{[[SCALAR_LOOP2]], [[ORIG_PRED_ENABLED:!.+]], [[ORIG_VEC_ENABLED:!.+]]}
-; CHECK-NEXT: [[ORIG_PRED_ENABLED]] = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
-; CHECK-NEXT: [[ORIG_VEC_ENABLED]] = !{!"llvm.loop.vectorize.enable", i1 true}
 
 !6 = distinct !{!6, !7, !8}
 !7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
index 0b13343f6ff86..7afa8ce998121 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll
@@ -33,18 +33,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
-; CHECK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
index a7f0206089abf..024194db39332 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -46,19 +46,6 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; ZVFBFMIN-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; ZVFBFMIN:       [[MIDDLE_BLOCK]]:
 ; ZVFBFMIN-NEXT:    br label %[[EXIT:.*]]
-; ZVFBFMIN:       [[SCALAR_PH:.*]]:
-; ZVFBFMIN-NEXT:    br label %[[LOOP:.*]]
-; ZVFBFMIN:       [[LOOP]]:
-; ZVFBFMIN-NEXT:    [[I:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; ZVFBFMIN-NEXT:    [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
-; ZVFBFMIN-NEXT:    [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
-; ZVFBFMIN-NEXT:    [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
-; ZVFBFMIN-NEXT:    [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
-; ZVFBFMIN-NEXT:    [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
-; ZVFBFMIN-NEXT:    store bfloat [[Z]], ptr [[A_GEP]], align 2
-; ZVFBFMIN-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
-; ZVFBFMIN-NEXT:    [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
-; ZVFBFMIN-NEXT:    br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]]
 ; ZVFBFMIN:       [[EXIT]]:
 ; ZVFBFMIN-NEXT:    ret void
 ;
@@ -155,23 +142,6 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
 ; ZVFBFMIN-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; ZVFBFMIN:       [[MIDDLE_BLOCK]]:
 ; ZVFBFMIN-NEXT:    br label %[[EXIT:.*]]
-; ZVFBFMIN:       [[SCALAR_PH:.*]]:
-; ZVFBFMIN-NEXT:    br label %[[LOOP:.*]]
-; ZVFBFMIN:       [[LOOP]]:
-; ZVFBFMIN-NEXT:    [[I:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
-; ZVFBFMIN-NEXT:    [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
-; ZVFBFMIN-NEXT:    [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
-; ZVFBFMIN-NEXT:    [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
-; ZVFBFMIN-NEXT:    [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
-; ZVFBFMIN-NEXT:    [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
-; ZVFBFMIN-NEXT:    [[Z:%.*]] = load float, ptr [[C_GEP]], align 4
-; ZVFBFMIN-NEXT:    [[X_EXT:%.*]] = fpext bfloat [[X]] to float
-; ZVFBFMIN-NEXT:    [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
-; ZVFBFMIN-NEXT:    [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
-; ZVFBFMIN-NEXT:    store float [[FMULADD]], ptr [[C_GEP]], align 4
-; ZVFBFMIN-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
-; ZVFBFMIN-NEXT:    [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
-; ZVFBFMIN-NEXT:    br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]]
 ; ZVFBFMIN:       [[EXIT]]:
 ; ZVFBFMIN-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index 612e7c083bda1..2087218bf3ea3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -33,24 +33,6 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i16 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[DEAD_GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    store i16 [[XOR]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP25]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -106,24 +88,6 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i16 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    [[DEAD_GEP:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    store i16 [[XOR]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -179,27 +143,6 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i16 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[DEAD_GEP_1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    [[DEAD_GEP_2:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    store i16 [[XOR]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -262,29 +205,6 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i16 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    br label %[[THEN_1:.*]]
-; CHECK:       [[THEN_1]]:
-; CHECK-NEXT:    [[DEAD_GEP_1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    [[DEAD_GEP_2:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    store i16 [[XOR]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -347,31 +267,6 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i16 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    br label %[[THEN_1:.*]]
-; CHECK:       [[THEN_1]]:
-; CHECK-NEXT:    [[DEAD_GEP_1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    br label %[[ELSE_2:.*]]
-; CHECK:       [[ELSE_2]]:
-; CHECK-NEXT:    [[DEAD_GEP_2:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    store i16 [[XOR]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -450,31 +345,6 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i16 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i16 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    br i1 [[IC]], label %[[THEN_1:.*]], label %[[ELSE]]
-; CHECK:       [[THEN_1]]:
-; CHECK-NEXT:    [[DEAD_GEP_1:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    br label %[[ELSE_2:.*]]
-; CHECK:       [[ELSE_2]]:
-; CHECK-NEXT:    [[DEAD_GEP_2:%.*]] = getelementptr i64, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    store i16 [[XOR]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 3
-; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -537,24 +407,6 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i32 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[P:%.*]] = phi i16 [ [[L]], %[[LOOP_HEADER]] ], [ 99, %[[THEN]] ]
-; CHECK-NEXT:    store i16 [[P]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -607,24 +459,6 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    [[XOR1315:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[XOR]] = xor i32 0, 0
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 0
-; CHECK-NEXT:    br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[P:%.*]] = phi i16 [ [[L]], %[[LOOP_HEADER]] ], [ 99, %[[ELSE]] ]
-; CHECK-NEXT:    store i16 [[P]], ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[TMP18]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 96c3a0d1a2f01..10f8f742bb1e2 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -42,16 +42,6 @@ define void @dead_load(ptr %p, i16 %start) {
 ; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[START_EXT]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT:    store i16 0, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP]], align 2
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 3
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[IV]], 111
-; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -326,21 +316,6 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) {
 ; CHECK-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
-; CHECK:       [[LOOP_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT:    br i1 false, label %[[LOOP_LATCH]], label %[[THEN:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[NOT_A:%.*]] = xor i32 [[A]], -1
-; CHECK-NEXT:    br label %[[LOOP_LATCH]]
-; CHECK:       [[LOOP_LATCH]]:
-; CHECK-NEXT:    [[P:%.*]] = phi i32 [ [[NOT_A]], %[[THEN]] ], [ 0, %[[LOOP_HEADER]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[P]], ptr [[GEP]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 9
-; CHECK-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV]], 322
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -408,21 +383,6 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
-; CHECK:       [[SCALAR_PH:.*]]:
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L_DEAD:%.*]] = load i8, ptr [[GEP_SRC_0]], align 1
-; CHECK-NEXT:    [[IV_1:%.*]] = add i64 [[IV]], 1
-; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
-; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
-; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[L_1]] to i32
-; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store i32 [[EXT]], ptr [[GEP_DST]], align 4
-; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
-; CHECK-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
index b6230dc1e09ab..3fd90b2848848 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
@@ -32,18 +32,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
-; CHECK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -86,21 +75,9 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) {
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
+; CHECK-NEXT:    ret i64 [[TMP11]]
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index d20dd0587f44e..01b4502308c95 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -29,18 +29,7 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[DIVREM:%.*]] = udiv i64 [[ELEM]], [[V]]
-; CHECK-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -61,18 +50,7 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FIXED:       middle.block:
-; FIXED-NEXT:    br label [[FOR_END:%.*]]
-; FIXED:       scalar.ph:
 ; FIXED-NEXT:    br label [[FOR_BODY:%.*]]
-; FIXED:       for.body:
-; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; FIXED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; FIXED-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[DIVREM:%.*]] = udiv i64 [[ELEM]], [[V]]
-; FIXED-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; FIXED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; FIXED:       for.end:
 ; FIXED-NEXT:    ret void
 ;
@@ -113,20 +91,9 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[DIVREM:%.*]] = sdiv i64 [[ELEM]], [[V]]
-; CHECK-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -147,18 +114,7 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; FIXED:       middle.block:
-; FIXED-NEXT:    br label [[FOR_END:%.*]]
-; FIXED:       scalar.ph:
 ; FIXED-NEXT:    br label [[FOR_BODY:%.*]]
-; FIXED:       for.body:
-; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; FIXED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; FIXED-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[DIVREM:%.*]] = sdiv i64 [[ELEM]], [[V]]
-; FIXED-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; FIXED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; FIXED:       for.end:
 ; FIXED-NEXT:    ret void
 ;
@@ -199,20 +155,9 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[DIVREM:%.*]] = urem i64 [[ELEM]], [[V]]
-; CHECK-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -233,18 +178,7 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; FIXED:       middle.block:
-; FIXED-NEXT:    br label [[FOR_END:%.*]]
-; FIXED:       scalar.ph:
 ; FIXED-NEXT:    br label [[FOR_BODY:%.*]]
-; FIXED:       for.body:
-; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; FIXED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; FIXED-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[DIVREM:%.*]] = urem i64 [[ELEM]], [[V]]
-; FIXED-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; FIXED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; FIXED:       for.end:
 ; FIXED-NEXT:    ret void
 ;
@@ -285,20 +219,9 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[DIVREM:%.*]] = srem i64 [[ELEM]], [[V]]
-; CHECK-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -319,18 +242,7 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; FIXED:       middle.block:
-; FIXED-NEXT:    br label [[FOR_END:%.*]]
-; FIXED:       scalar.ph:
 ; FIXED-NEXT:    br label [[FOR_BODY:%.*]]
-; FIXED:       for.body:
-; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; FIXED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; FIXED-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[DIVREM:%.*]] = srem i64 [[ELEM]], [[V]]
-; FIXED-NEXT:    store i64 [[DIVREM]], ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; FIXED-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; FIXED:       for.end:
 ; FIXED-NEXT:    ret void
 ;
@@ -379,26 +291,9 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]]
 ; CHECK-NEXT:    [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[C:%.*]] = icmp ne i64 [[V]], 0
-; CHECK-NEXT:    br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]]
-; CHECK:       do_op:
-; CHECK-NEXT:    [[DIVREM:%.*]] = udiv i64 [[ELEM]], [[V]]
-; CHECK-NEXT:    br label [[LATCH]]
-; CHECK:       latch:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ]
-; CHECK-NEXT:    store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK-NEXT:    br label [[LATCH:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -422,24 +317,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; FIXED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; FIXED:       middle.block:
-; FIXED-NEXT:    br label [[FOR_END:%.*]]
-; FIXED:       scalar.ph:
-; FIXED-NEXT:    br label [[FOR_BODY:%.*]]
-; FIXED:       for.body:
-; FIXED-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; FIXED-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; FIXED-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:    [[C:%.*]] = icmp ne i64 [[V]], 0
-; FIXED-NEXT:    br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]]
-; FIXED:       do_op:
-; FIXED-NEXT:    [[DIVREM:%.*]] = udiv i64 [[ELEM]], [[V]]
-; FIXED-NEXT:    br label [[LATCH]]
-; FIXED:       latch:
-; FIXED-NEXT:    [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ]
-; FIXED-NEXT:    store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
-; FIXED-NEXT:
[[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; FIXED-NEXT: br label [[LATCH:%.*]] ; FIXED: for.end: ; FIXED-NEXT: ret void ; @@ -494,26 +372,9 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0 -; CHECK-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; CHECK: do_op: -; CHECK-NEXT: [[DIVREM:%.*]] = sdiv i64 [[ELEM]], [[V]] -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -537,24 +398,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; FIXED: middle.block: -; FIXED-NEXT: br label [[FOR_END:%.*]] -; FIXED: scalar.ph: -; FIXED-NEXT: br label [[FOR_BODY:%.*]] -; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; FIXED-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0 -; FIXED-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; FIXED: do_op: -; FIXED-NEXT: [[DIVREM:%.*]] = sdiv i64 [[ELEM]], [[V]] -; FIXED-NEXT: br label [[LATCH]] -; FIXED: latch: -; FIXED-NEXT: [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; FIXED-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 -; FIXED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; FIXED-NEXT: br label [[LATCH:%.*]] ; FIXED: for.end: ; FIXED-NEXT: ret void ; @@ -601,26 +445,9 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP12]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; 
CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 -; CHECK-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; CHECK: do_op: -; CHECK-NEXT: [[DIVREM:%.*]] = udiv i64 [[ELEM]], 27 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -641,24 +468,7 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: -; FIXED-NEXT: br label [[FOR_END:%.*]] -; FIXED: scalar.ph: -; FIXED-NEXT: br label [[FOR_BODY:%.*]] -; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; FIXED-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 -; FIXED-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; FIXED: do_op: -; FIXED-NEXT: [[DIVREM:%.*]] = udiv i64 [[ELEM]], 27 -; FIXED-NEXT: br label [[LATCH]] -; FIXED: latch: -; FIXED-NEXT: [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; FIXED-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 -; FIXED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; FIXED-NEXT: br label [[LATCH:%.*]] ; FIXED: for.end: ; FIXED-NEXT: ret void ; @@ -705,26 +515,9 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP12]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 -; CHECK-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; CHECK: do_op: -; CHECK-NEXT: 
[[DIVREM:%.*]] = sdiv i64 [[ELEM]], 27 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -745,24 +538,7 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: middle.block: -; FIXED-NEXT: br label [[FOR_END:%.*]] -; FIXED: scalar.ph: -; FIXED-NEXT: br label [[FOR_BODY:%.*]] -; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; FIXED-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 -; FIXED-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; FIXED: do_op: -; FIXED-NEXT: [[DIVREM:%.*]] = sdiv i64 [[ELEM]], 27 -; FIXED-NEXT: br label [[LATCH]] -; FIXED: latch: -; FIXED-NEXT: [[PHI:%.*]] = phi i64 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; FIXED-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8 -; FIXED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; FIXED-NEXT: br label [[LATCH:%.*]] ; FIXED: for.end: ; FIXED-NEXT: ret void ; @@ -815,26 +591,9 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[ELEM]], -128 -; CHECK-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; CHECK: do_op: -; CHECK-NEXT: [[DIVREM:%.*]] = sdiv i8 [[ELEM]], -1 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[PHI:%.*]] = phi i8 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; CHECK-NEXT: store i8 [[PHI]], ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -856,24 +615,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP10:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: -; FIXED-NEXT: br label [[FOR_END:%.*]] -; FIXED: scalar.ph: -; FIXED-NEXT: br label [[FOR_BODY:%.*]] -; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] -; FIXED-NEXT: [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; FIXED-NEXT: [[C:%.*]] = icmp ne i8 [[ELEM]], -128 -; FIXED-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]] -; FIXED: do_op: -; FIXED-NEXT: [[DIVREM:%.*]] = sdiv i8 [[ELEM]], -1 -; FIXED-NEXT: br label [[LATCH]] -; FIXED: latch: -; FIXED-NEXT: [[PHI:%.*]] = phi i8 [ [[ELEM]], [[FOR_BODY]] ], [ [[DIVREM]], [[DO_OP]] ] -; FIXED-NEXT: store i8 [[PHI]], ptr [[ARRAYIDX]], align 1 -; FIXED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; FIXED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; FIXED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] +; FIXED-NEXT: br label [[LATCH:%.*]] ; FIXED: for.end: ; FIXED-NEXT: ret void ; @@ -945,7 +687,7 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT7]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 2 @@ -972,7 +714,7 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { ; CHECK-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0 ; CHECK-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[MERGE_LCSSA]] @@ -1004,28 +746,9 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) { ; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 -; FIXED-NEXT: br label [[EXIT:%.*]] -; FIXED: scalar.ph: -; FIXED-NEXT: br label [[LOOP_HEADER:%.*]] -; FIXED: loop.header: -; FIXED-NEXT: [[IV:%.*]] = phi i16 [ -12, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; FIXED-NEXT: [[NARROW_IV:%.*]] = phi i8 [ -12, [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ] -; FIXED-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] -; FIXED: then: -; FIXED-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X]] -; FIXED-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16 -; FIXED-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y]] -; FIXED-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32 -; FIXED-NEXT: br label [[LOOP_LATCH]] -; FIXED: loop.latch: -; FIXED-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] 
] -; FIXED-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1 -; FIXED-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; FIXED-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8 -; FIXED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]] +; FIXED-NEXT: br label [[LOOP_LATCH:%.*]] ; FIXED: exit: -; FIXED-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] -; FIXED-NEXT: ret i32 [[MERGE_LCSSA]] +; FIXED-NEXT: ret i32 [[TMP7]] ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll index 0a605563e45a9..21272cb72f4d6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll @@ -30,16 +30,7 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV1]] -; CHECK-NEXT: store i64 [[IV1]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; @@ -84,18 +75,7 @@ define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[B]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[ADDR]], i64 8 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store ptr [[ADDR]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll index a2ab7c4cc52ad..143a51dc811f1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll @@ -46,19 +46,6 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { ; ZVFHMIN-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; ZVFHMIN: [[MIDDLE_BLOCK]]: ; ZVFHMIN-NEXT: br label %[[EXIT:.*]] -; ZVFHMIN: [[SCALAR_PH:.*]]: -; ZVFHMIN-NEXT: br label %[[LOOP:.*]] -; ZVFHMIN: [[LOOP]]: -; ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] 
-; ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
-; ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
-; ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
-; ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
-; ZVFHMIN-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
-; ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
-; ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
-; ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
-; ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]]
 ; ZVFHMIN: [[EXIT]]:
 ; ZVFHMIN-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
index 5df4f703c1b1b..1c6954c187e5f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll
@@ -116,17 +116,7 @@ define void @predicated_strided_store(ptr %start) {
 ; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; RVA23: middle.block:
-; RVA23-NEXT: br label [[EXIT:%.*]]
-; RVA23: scalar.ph:
 ; RVA23-NEXT: br label [[LOOP:%.*]]
-; RVA23: loop:
-; RVA23-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; RVA23-NEXT: [[TMP8:%.*]] = mul i64 [[IV]], 7
-; RVA23-NEXT: [[ADD_PTR:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP8]]
-; RVA23-NEXT: store i8 0, ptr [[ADD_PTR]], align 1
-; RVA23-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; RVA23-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 585
-; RVA23-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
 ; RVA23: exit:
 ; RVA23-NEXT: ret void
 ;
@@ -153,17 +143,7 @@ define void @predicated_strided_store(ptr %start) {
 ; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; RVA23ZVL1024B: middle.block:
-; RVA23ZVL1024B-NEXT: br label [[EXIT:%.*]]
-; RVA23ZVL1024B: scalar.ph:
 ; RVA23ZVL1024B-NEXT: br label [[LOOP:%.*]]
-; RVA23ZVL1024B: loop:
-; RVA23ZVL1024B-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; RVA23ZVL1024B-NEXT: [[TMP8:%.*]] = mul i64 [[IV]], 7
-; RVA23ZVL1024B-NEXT: [[ADD_PTR:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP8]]
-; RVA23ZVL1024B-NEXT: store i8 0, ptr [[ADD_PTR]], align 1
-; RVA23ZVL1024B-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; RVA23ZVL1024B-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 585
-; RVA23ZVL1024B-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
 ; RVA23ZVL1024B: exit:
 ; RVA23ZVL1024B-NEXT: ret void
 ;
@@ -216,21 +196,7 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P1]], i64 [[IV]]
-; CHECK-NEXT: store ptr [[P0]], ptr [[ARRAYIDX11]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[P2]], align 4
-; CHECK-NEXT: [[BITS_TO_GO:%.*]] = getelementptr i8, ptr [[P3]], i64 [[TMP10]]
-; CHECK-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
-; CHECK-NEXT: store i32 0, ptr [[BITS_TO_GO]], align 4
-; CHECK-NEXT: store i8 0, ptr [[BITS_TO_GO]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
index 4d97a659e94e9..4ccec2ca61778 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll
@@ -153,21 +153,6 @@ define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 {
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV_0:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[IV_0_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_OR:%.*]] = or i32 [[IV_2]], [[IV_0]]
-; CHECK-NEXT: [[IV_OR_EXT:%.*]] = sext i32 [[IV_OR]] to i64
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_OR_EXT]]
-; CHECK-NEXT: store ptr [[GEP_SRC]], ptr [[DST]], align 8
-; CHECK-NEXT: [[IV_0_NEXT]] = add i32 [[IV_0]], 2
-; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1
-; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 2
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -228,27 +213,6 @@ define void @redundant_iv_trunc_for_cse(ptr noalias %src, ptr noalias %dst, i64
 ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
-; CHECK-NEXT: [[C_0:%.*]] = icmp eq i32 [[L]], 0
-; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[THEN]]:
-; CHECK-NEXT: [[TRUNC_IV_2:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[SHL_IV:%.*]] = shl i32 [[TRUNC_IV_2]], 16
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[SHL_IV]], %[[THEN]] ], [ [[TRUNC_IV]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[TRUNC_P:%.*]] = trunc i32 [[P]] to i8
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i8 [[TRUNC_P]], ptr [[GEP_DST]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index 63d1af38e93f0..7e6e45feaa834 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -133,24 +133,11 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL-OUTLOOP: middle.block:
 ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; IF-EVL-OUTLOOP: scalar.ph:
 ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
-; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit:
-; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
 ; IF-EVL-OUTLOOP: for.cond.cleanup:
-; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP12]], [[FOR_BODY]] ]
 ; IF-EVL-OUTLOOP-NEXT: ret i32 [[R_0_LCSSA]]
 ;
 ; IF-EVL-INLOOP-LABEL: @add_i16_i32(
@@ -176,24 +163,11 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
 ; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
-; IF-EVL-INLOOP: scalar.ph:
 ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-INLOOP: for.body:
-; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; IF-EVL-INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ]
-; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; IF-EVL-INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
-; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
-; IF-EVL-INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; IF-EVL-INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ; IF-EVL-INLOOP: for.cond.cleanup.loopexit:
-; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
 ; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
 ; IF-EVL-INLOOP: for.cond.cleanup:
-; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; IF-EVL-INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP11]], [[FOR_BODY]] ]
 ; IF-EVL-INLOOP-NEXT: ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -330,22 +304,9 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL-OUTLOOP: middle.block:
 ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP15]])
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-OUTLOOP: scalar.ph:
 ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-OUTLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP19]], [[RDX]]
-; IF-EVL-OUTLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]]
-; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; IF-EVL-OUTLOOP: for.end:
-; IF-EVL-OUTLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-OUTLOOP-NEXT: ret i32 [[SMIN_LCSSA]]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[TMP18]]
 ;
 ; IF-EVL-INLOOP-LABEL: @smin(
 ; IF-EVL-INLOOP-NEXT: entry:
@@ -367,22 +328,9 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-INLOOP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-INLOOP: scalar.ph:
 ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-INLOOP: for.body:
-; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-INLOOP-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP16]], [[RDX]]
-; IF-EVL-INLOOP-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP16]], i32 [[RDX]]
-; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
 ; IF-EVL-INLOOP: for.end:
-; IF-EVL-INLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-INLOOP-NEXT: ret i32 [[SMIN_LCSSA]]
+; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_MINMAX]]
 ;
 ; IF-EVL-LABEL: @smin(
 ; IF-EVL-NEXT: entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 43560d25f8ce2..31c8b74194062 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -31,24 +31,7 @@ define void @load_store_factor2_i32(ptr %p) {
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -73,24 +56,7 @@ define void @load_store_factor2_i32(ptr %p) {
 ; FIXED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
 ; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; FIXED: exit:
 ; FIXED-NEXT: ret void
 ;
@@ -121,24 +87,7 @@ define void @load_store_factor2_i32(ptr %p) {
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
 ; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; SCALABLE: exit:
 ; SCALABLE-NEXT: ret void
 ;
@@ -194,24 +143,7 @@ define void @load_store_factor2_i64(ptr %p) {
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -236,24 +168,7 @@ define void @load_store_factor2_i64(ptr %p) {
 ; FIXED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
 ; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; FIXED: exit:
 ; FIXED-NEXT: ret void
 ;
@@ -284,24 +199,7 @@ define void @load_store_factor2_i64(ptr %p) {
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
 ; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; SCALABLE: exit:
 ; SCALABLE-NEXT: ret void
 ;
@@ -359,29 +257,7 @@ define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4
-; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3
-; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -410,29 +286,7 @@ define void @load_store_factor3_i32(ptr %p) {
 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
 ; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; FIXED-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4
-; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3
-; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; FIXED: exit:
 ; FIXED-NEXT: ret void
 ;
@@ -465,29 +319,7 @@ define void @load_store_factor3_i32(ptr %p) {
 ; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
 ; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1
-; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2
-; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4
-; SCALABLE-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4
-; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3
-; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; SCALABLE: exit:
 ; SCALABLE-NEXT: ret void
 ;
@@ -551,29 +383,7 @@ define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -602,29 +412,7 @@ define void @load_store_factor3_i64(ptr %p) {
 ; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
 ; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; FIXED: exit:
 ; FIXED-NEXT: ret void
 ;
@@ -657,29 +445,7 @@ define void @load_store_factor3_i64(ptr %p) {
 ; SCALABLE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; SCALABLE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
 ; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; SCALABLE: exit:
 ; SCALABLE-NEXT: ret void
 ;
@@ -745,34 +511,7 @@ define void @load_store_factor4(ptr %p) {
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 4
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -803,34 +542,7 @@ define void @load_store_factor4(ptr %p) {
 ; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
 ; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 4
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; FIXED-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; FIXED-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; FIXED-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; FIXED: exit:
 ; FIXED-NEXT: ret void
 ;
@@ -865,34 +577,7 @@ define void @load_store_factor4(ptr %p) {
 ; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; SCALABLE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
 ; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 4
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; SCALABLE-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; SCALABLE: exit:
 ; SCALABLE-NEXT: ret void
 ;
@@ -966,39 +651,7 @@ define void @load_store_factor5(ptr %p) {
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 5
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; CHECK-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; CHECK-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; CHECK-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -1033,39 +686,7 @@ define void @load_store_factor5(ptr %p) {
 ; FIXED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
 ; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 5
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; FIXED-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; FIXED-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; FIXED-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; FIXED-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; FIXED-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; FIXED-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; FIXED-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; FIXED-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; FIXED: exit:
 ; FIXED-NEXT: ret void
 ;
@@ -1102,39 +723,7 @@ define void @load_store_factor5(ptr %p) {
 ; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; SCALABLE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
 ; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 5
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; SCALABLE-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; SCALABLE-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; SCALABLE-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; SCALABLE-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; SCALABLE-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; SCALABLE: exit:
 ; SCALABLE-NEXT: ret void
 ;
@@ -1216,44 +805,7 @@ define void @load_store_factor6(ptr %p) {
 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 6
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; CHECK-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; CHECK-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; CHECK-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; CHECK-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
-; CHECK-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
-; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8
-; CHECK-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6
-; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -1291,44 +843,7 @@ define void @load_store_factor6(ptr %p) {
 ; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
 ; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 6
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load
i64, ptr [[Q1]], align 8 -; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1 -; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] -; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 -; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 -; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1 -; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] -; FIXED-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8 -; FIXED-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4 -; FIXED-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8 -; FIXED-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1 -; FIXED-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]] -; FIXED-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8 -; FIXED-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5 -; FIXED-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8 -; FIXED-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1 -; FIXED-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]] -; FIXED-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8 -; FIXED-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6 -; FIXED-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -1367,44 +882,7 @@ define void @load_store_factor6(ptr %p) { ; SCALABLE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; SCALABLE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: br label [[EXIT:%.*]] -; SCALABLE: scalar.ph: ; SCALABLE-NEXT: br label [[LOOP:%.*]] -; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 6 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 -; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; SCALABLE-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1 -; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] -; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 -; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1 -; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] -; SCALABLE-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8 -; SCALABLE-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4 -; SCALABLE-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8 -; SCALABLE-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1 -; SCALABLE-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]] -; SCALABLE-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8 -; SCALABLE-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5 -; SCALABLE-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8 -; SCALABLE-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1 -; SCALABLE-NEXT: [[Q5:%.*]] = 
getelementptr i64, ptr [[P]], i64 [[OFFSET5]] -; SCALABLE-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8 -; SCALABLE-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6 -; SCALABLE-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -1494,49 +972,7 @@ define void @load_store_factor7(ptr %p) { ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 7 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 -; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 -; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] -; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 -; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 -; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1 -; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] -; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 -; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 -; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1 -; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] -; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8 -; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4 -; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8 -; CHECK-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1 -; CHECK-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]] -; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8 -; CHECK-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5 -; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8 -; CHECK-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1 -; CHECK-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]] -; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8 -; CHECK-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6 -; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8 -; CHECK-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1 -; CHECK-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]] -; CHECK-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8 -; CHECK-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7 -; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1578,49 +1014,7 @@ define void @load_store_factor7(ptr %p) { ; FIXED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; FIXED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: middle.block: -; FIXED-NEXT: br label [[EXIT:%.*]] -; FIXED: scalar.ph: ; FIXED-NEXT: br label [[LOOP:%.*]] -; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] 
-; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 7
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; FIXED-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; FIXED-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; FIXED-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; FIXED-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; FIXED-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; FIXED-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; FIXED-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; FIXED-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; FIXED-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
-; FIXED-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
-; FIXED-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8
-; FIXED-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6
-; FIXED-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8
-; FIXED-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
-; FIXED-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
-; FIXED-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8
-; FIXED-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7
-; FIXED-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; FIXED: exit:
; FIXED-NEXT: ret void
;
@@ -1661,49 +1055,7 @@ define void @load_store_factor7(ptr %p) {
; SCALABLE-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; SCALABLE-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 7
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; SCALABLE-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; SCALABLE-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; SCALABLE-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; SCALABLE-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; SCALABLE-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; SCALABLE-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
-; SCALABLE-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
-; SCALABLE-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8
-; SCALABLE-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6
-; SCALABLE-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8
-; SCALABLE-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
-; SCALABLE-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
-; SCALABLE-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8
-; SCALABLE-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7
-; SCALABLE-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; SCALABLE: exit:
; SCALABLE-NEXT: ret void
;
@@ -1801,54 +1153,7 @@ define void @load_store_factor8(ptr %p) {
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; CHECK-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; CHECK-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; CHECK-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; CHECK-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; CHECK-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; CHECK-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; CHECK-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; CHECK-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; CHECK-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; CHECK-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; CHECK-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
-; CHECK-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
-; CHECK-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8
-; CHECK-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6
-; CHECK-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8
-; CHECK-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
-; CHECK-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
-; CHECK-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8
-; CHECK-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7
-; CHECK-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8
-; CHECK-NEXT: [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1
-; CHECK-NEXT: [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]]
-; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8
-; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8
-; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -1891,54 +1196,7 @@ define void @load_store_factor8(ptr %p) {
; FIXED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; FIXED-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; FIXED-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; FIXED-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; FIXED-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; FIXED-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; FIXED-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; FIXED-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; FIXED-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; FIXED-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; FIXED-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; FIXED-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
-; FIXED-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
-; FIXED-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8
-; FIXED-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6
-; FIXED-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8
-; FIXED-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
-; FIXED-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
-; FIXED-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8
-; FIXED-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7
-; FIXED-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8
-; FIXED-NEXT: [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1
-; FIXED-NEXT: [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]]
-; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8
-; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8
-; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; FIXED: exit:
; FIXED-NEXT: ret void
;
@@ -1981,54 +1239,7 @@ define void @load_store_factor8(ptr %p) {
; SCALABLE-NEXT: [[TMP25:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; SCALABLE-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2
-; SCALABLE-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[OFFSET2:%.*]] = add i64 [[OFFSET1]], 1
-; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
-; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3
-; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8
-; SCALABLE-NEXT: [[OFFSET3:%.*]] = add i64 [[OFFSET2]], 1
-; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]]
-; SCALABLE-NEXT: [[X3:%.*]] = load i64, ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[Y3:%.*]] = add i64 [[X3]], 4
-; SCALABLE-NEXT: store i64 [[Y3]], ptr [[Q3]], align 8
-; SCALABLE-NEXT: [[OFFSET4:%.*]] = add i64 [[OFFSET3]], 1
-; SCALABLE-NEXT: [[Q4:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET4]]
-; SCALABLE-NEXT: [[X4:%.*]] = load i64, ptr [[Q4]], align 8
-; SCALABLE-NEXT: [[Y4:%.*]] = add i64 [[X4]], 5
-; SCALABLE-NEXT: store i64 [[Y4]], ptr [[Q4]], align 8
-; SCALABLE-NEXT: [[OFFSET5:%.*]] = add i64 [[OFFSET4]], 1
-; SCALABLE-NEXT: [[Q5:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET5]]
-; SCALABLE-NEXT: [[X5:%.*]] = load i64, ptr [[Q5]], align 8
-; SCALABLE-NEXT: [[Y5:%.*]] = add i64 [[X5]], 6
-; SCALABLE-NEXT: store i64 [[Y5]], ptr [[Q5]], align 8
-; SCALABLE-NEXT: [[OFFSET6:%.*]] = add i64 [[OFFSET5]], 1
-; SCALABLE-NEXT: [[Q6:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET6]]
-; SCALABLE-NEXT: [[X6:%.*]] = load i64, ptr [[Q6]], align 8
-; SCALABLE-NEXT: [[Y6:%.*]] = add i64 [[X6]], 7
-; SCALABLE-NEXT: store i64 [[Y6]], ptr [[Q6]], align 8
-; SCALABLE-NEXT: [[OFFSET7:%.*]] = add i64 [[OFFSET6]], 1
-; SCALABLE-NEXT: [[Q7:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET7]]
-; SCALABLE-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8
-; SCALABLE-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8
-; SCALABLE-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; SCALABLE: exit:
; SCALABLE-NEXT: ret void
;
@@ -2118,23 +1329,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; CHECK-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; CHECK-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
-; CHECK-NEXT: store i32 [[RES]], ptr [[DST]], align 4
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -2157,23 +1352,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; FIXED-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; FIXED-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
-; FIXED-NEXT: store i32 [[RES]], ptr [[DST]], align 4
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; FIXED: exit:
; FIXED-NEXT: ret void
;
@@ -2202,23 +1381,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4
-; SCALABLE-NEXT: [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
-; SCALABLE-NEXT: store i32 [[RES]], ptr [[DST]], align 4
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; SCALABLE: exit:
; SCALABLE-NEXT: ret void
;
@@ -2273,23 +1436,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; CHECK-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
-; CHECK-NEXT: store i64 [[RES]], ptr [[DST]], align 8
-; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
@@ -2312,23 +1459,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; FIXED: middle.block:
-; FIXED-NEXT: br label [[EXIT:%.*]]
-; FIXED: scalar.ph:
; FIXED-NEXT: br label [[LOOP:%.*]]
-; FIXED: loop:
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; FIXED-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; FIXED-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
-; FIXED-NEXT: store i64 [[RES]], ptr [[DST]], align 8
-; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; FIXED: exit:
; FIXED-NEXT: ret void
;
@@ -2357,23 +1488,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; SCALABLE: middle.block:
-; SCALABLE-NEXT: br label [[EXIT:%.*]]
-; SCALABLE: scalar.ph:
; SCALABLE-NEXT: br label [[LOOP:%.*]]
-; SCALABLE: loop:
-; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
-; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
-; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8
-; SCALABLE-NEXT: [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; SCALABLE-NEXT: [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
-; SCALABLE-NEXT: store i64 [[RES]], ptr [[DST]], align 8
-; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]]
; SCALABLE: exit:
; SCALABLE-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index a30aebb16a8c1..ef0f0cf8777e7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -96,7 +96,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_DATA: middle.block:
; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_DATA: scalar.ph:
+; PREDICATED_DATA: for.end:
+; PREDICATED_DATA-NEXT: ret void
;
; PREDICATED_DATA-WITH-EVL-LABEL: define void @masked_strided_factor2
; PREDICATED_DATA-WITH-EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -135,9 +136,13 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP10]], [[INTERLEAVED_MASK4]], i32 [[INTERLEAVE_EVL3]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP12:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; PREDICATED_DATA-WITH-EVL: middle.block:
; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_DATA-WITH-EVL: scalar.ph:
+; PREDICATED_DATA-WITH-EVL: for.end:
+; PREDICATED_DATA-WITH-EVL-NEXT: ret void
;
entry:
%conv = zext i8 %guard to i32
@@ -270,10 +275,11 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP1]]
; PREDICATED_DATA-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]]
; PREDICATED_DATA-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDICATED_DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_DATA-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; PREDICATED_DATA: middle.block:
; PREDICATED_DATA-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_DATA: scalar.ph:
+; PREDICATED_DATA: for.end:
+; PREDICATED_DATA-NEXT: ret void
;
; PREDICATED_DATA-WITH-EVL-LABEL: define void @masked_strided_factor4
; PREDICATED_DATA-WITH-EVL-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] {
@@ -316,9 +322,13 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.vp.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP15]], [[INTERLEAVED_MASK4]], i32 [[INTERLEAVE_EVL3]])
; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]]
; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP16:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
+; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; PREDICATED_DATA-WITH-EVL: middle.block:
; PREDICATED_DATA-WITH-EVL-NEXT: br label [[FOR_END:%.*]]
-; PREDICATED_DATA-WITH-EVL: scalar.ph:
+; PREDICATED_DATA-WITH-EVL: for.end:
+; PREDICATED_DATA-WITH-EVL-NEXT: ret void
;
entry:
%conv = zext i8 %guard to i32
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
index cf2f78b578981..328ee16a92db4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
@@ -62,18 +62,7 @@ define void @load_store(ptr %p) {
; LMUL2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; LMUL2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL2: middle.block:
-; LMUL2-NEXT: br label [[FOR_END:%.*]]
-; LMUL2: scalar.ph:
; LMUL2-NEXT: br label [[FOR_BODY:%.*]]
-; LMUL2: for.body:
-; LMUL2-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; LMUL2-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
-; LMUL2-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8
-; LMUL2-NEXT: [[W:%.*]] = add i64 [[V]], 1
-; LMUL2-NEXT: store i64 [[W]], ptr [[Q]], align 8
-; LMUL2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; LMUL2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; LMUL2: for.end:
; LMUL2-NEXT: ret void
;
@@ -96,18 +85,7 @@ define void @load_store(ptr %p) {
; LMUL4-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; LMUL4-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL4: middle.block:
-; LMUL4-NEXT: br label [[FOR_END:%.*]]
-; LMUL4: scalar.ph:
; LMUL4-NEXT: br label [[FOR_BODY:%.*]]
-; LMUL4: for.body:
-; LMUL4-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; LMUL4-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
-; LMUL4-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8
-; LMUL4-NEXT: [[W:%.*]] = add i64 [[V]], 1
-; LMUL4-NEXT: store i64 [[W]], ptr [[Q]], align 8
-; LMUL4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; LMUL4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; LMUL4: for.end:
; LMUL4-NEXT: ret void
;
@@ -130,18 +108,7 @@ define void @load_store(ptr %p) {
; LMUL8-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; LMUL8-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; LMUL8: middle.block:
-; LMUL8-NEXT: br label [[FOR_END:%.*]]
-; LMUL8: scalar.ph:
; LMUL8-NEXT: br label [[FOR_BODY:%.*]]
-; LMUL8: for.body:
-; LMUL8-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; LMUL8-NEXT: [[Q:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[IV]]
-; LMUL8-NEXT: [[V:%.*]] = load i64, ptr [[Q]], align 8
-; LMUL8-NEXT: [[W:%.*]] = add i64 [[V]], 1
-; LMUL8-NEXT: store i64 [[W]], ptr [[Q]], align 8
-; LMUL8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; LMUL8-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; LMUL8-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; LMUL8: for.end:
; LMUL8-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 53907fadf8187..8ef53cade01ac 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -133,21 +133,7 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0( [[TMP7]], ptr align 1 [[TMP12]], splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP15]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP16]]
-; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 8
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -186,21 +172,7 @@ define void @trip16_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: call void @llvm.vp.store.nxv8i8.p0( [[TMP11]], ptr align 1 [[TMP4]], splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
-; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -240,21 +212,7 @@ define void @trip32_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP11]], ptr align 1 [[TMP4]], splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP7]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP8]]
-; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 32
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -293,21 +251,7 @@ define void @trip24_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[TMP7]], ptr align 1 [[DST]], splat (i1 true), i32 [[TMP5]])
; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP8]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP9]]
-; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 24
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index ae6c90c5ce188..06b47aa6551a0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -40,25 +40,7 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VLENUNK: middle.block:
-; VLENUNK-NEXT: br label [[FOR_END:%.*]]
-; VLENUNK: scalar.ph:
-; VLENUNK-NEXT: br label [[FOR_BODY:%.*]]
-; VLENUNK: for.body:
-; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; VLENUNK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[IV]], 512
-; VLENUNK-NEXT: br i1 [[ICMP]], label [[DO_LOAD:%.*]], label [[LATCH]]
-; VLENUNK: do_load:
-; VLENUNK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; VLENUNK-NEXT: [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; VLENUNK-NEXT: br label [[LATCH]]
-; VLENUNK: latch:
-; VLENUNK-NEXT: [[PHI:%.*]] = phi i32 [ [[ELEM]], [[DO_LOAD]] ], [ 0, [[FOR_BODY]] ]
-; VLENUNK-NEXT: [[ADD:%.*]] = add i32 [[PHI]], [[V]]
-; VLENUNK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; VLENUNK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
-; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
+; VLENUNK-NEXT: br label [[LATCH:%.*]]
; VLENUNK: for.end:
; VLENUNK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index e0bd8aa3a7a2a..0a9b1e0af48bc 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -108,7 +108,8 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] ; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) ; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-V: scalar.ph: +; FIXED-V: for.exit: +; FIXED-V-NEXT: ret i32 [[TMP15]] ; ; FIXED-ZVQDOTQ-LABEL: define i32 @vqdot( ; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { @@ -143,7 +144,8 @@ define i32 @vqdot(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) ; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-ZVQDOTQ: scalar.ph: +; FIXED-ZVQDOTQ: for.exit: +; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] ; entry: br label %for.body @@ -263,12 +265,13 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] ; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED-V: middle.block: ; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] ; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) ; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-V: scalar.ph: +; FIXED-V: for.exit: +; FIXED-V-NEXT: ret i32 [[TMP15]] ; ; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotu( ; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -298,12 +301,13 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED-ZVQDOTQ: middle.block: ; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) ; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-ZVQDOTQ: scalar.ph: +; FIXED-ZVQDOTQ: for.exit: +; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] ; entry: br label %for.body @@ -423,12 +427,13 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] ; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED-V: middle.block: ; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] ; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) ; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-V: scalar.ph: +; FIXED-V: for.exit: +; FIXED-V-NEXT: ret i32 [[TMP15]] ; ; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu( ; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -458,12 +463,13 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED-ZVQDOTQ: middle.block: ; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) ; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-ZVQDOTQ: scalar.ph: +; FIXED-ZVQDOTQ: for.exit: +; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] ; entry: br label %for.body @@ -582,12 +588,13 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] ; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED-V: middle.block: ; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] ; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) ; FIXED-V-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-V: scalar.ph: +; FIXED-V: for.exit: +; FIXED-V-NEXT: ret i32 [[TMP15]] ; ; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu2( ; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -617,12 +624,13 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 { ; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) ; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED-ZVQDOTQ: middle.block: ; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) ; FIXED-ZVQDOTQ-NEXT: br label [[FOR_EXIT:%.*]] -; FIXED-ZVQDOTQ: scalar.ph: +; FIXED-ZVQDOTQ: for.exit: +; FIXED-ZVQDOTQ-NEXT: ret i32 [[TMP13]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll index 782c2f6c24fa4..65928f80a76f6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll @@ -49,30 +49,7 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[C_1:%.*]] = icmp ule i64 [[IV]], [[A]] -; CHECK-NEXT: br i1 [[C_1]], label [[THEN_1:%.*]], label [[ELSE_1:%.*]] -; CHECK: then.1: -; CHECK-NEXT: [[C_2:%.*]] = icmp ule i64 [[IV]], [[B]] -; CHECK-NEXT: br i1 [[C_2]], label [[ELSE_1]], label [[MERGE:%.*]] -; CHECK: else.1: -; CHECK-NEXT: [[C_3:%.*]] = icmp ule i64 [[IV]], [[C]] -; CHECK-NEXT: br i1 [[C_3]], label [[THEN_2:%.*]], label [[LOOP_LATCH]] -; CHECK: then.2: -; CHECK-NEXT: br label [[MERGE]] -; CHECK: merge: -; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ poison, [[THEN_1]] ], [ [[IV]], [[THEN_2]] ] -; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[IDX]] -; CHECK-NEXT: store i16 0, ptr [[GETELEMENTPTR]], align 2 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[IV]], 1000 -; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index 3739f85afe740..8d4d282a5236d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -37,27 +37,7 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT1:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_COND1:%.*]] -; CHECK: for.cond: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH1:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY:%.*]] ] -; CHECK-NEXT: [[ADD]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i32 [[IV]], 2 -; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], 48 -; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[SHL]], 52 -; CHECK-NEXT: [[TRUNC_I32:%.*]] = trunc i64 [[ASHR]] to i32 -; CHECK-NEXT: br i1 [[CMP_SLT]], label [[COND_FALSE:%.*]], label [[FOR_BODY]] -; CHECK: cond.false: -; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B]] to i32 -; CHECK-NEXT: br label [[FOR_BODY]] -; CHECK: for.body: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TRUNC_I32]], [[FOR_COND1]] ], [ [[ZEXT]], [[COND_FALSE]] ] -; CHECK-NEXT: [[SHL_I32:%.*]] = shl i32 [[COND]], 8 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8 -; CHECK-NEXT: store i8 [[TRUNC]], ptr [[P]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV]], 8 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1]], label [[EXIT1]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: exit: ; 
CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
index 9b6bc684249f1..735fb769de8b9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reductions.ll
@@ -29,20 +29,8 @@ define i32 @add(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP8]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP10]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP11]]
;
entry:
br label %for.body
@@ -85,20 +73,8 @@ define i32 @sub(ptr %a, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP3]])
; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 1024, %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT: [[SUB]] = sub i32 [[RDX]], [[X]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUB_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP6]]
;
entry:
br label %loop
@@ -144,23 +120,8 @@ define i32 @addsub(ptr %a, ptr %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP5]])
; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[SUB:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP_A]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RDX]], [[X]]
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i32, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[Y:%.*]] = load i32, ptr [[GEP_B]], align 4
-; CHECK-NEXT: [[SUB]] = sub i32 [[ADD]], [[Y]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]]
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUB_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP8]]
;
entry:
br label %loop
@@ -209,20 +170,8 @@ define i32 @or(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP8]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[OR:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[OR]] = or i32 [[TMP10]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[OR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP11]]
;
entry:
br label %for.body
@@ -267,20 +216,8 @@ define i32 @and(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> [[TMP8]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[AND:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[AND]] = and i32 [[TMP10]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[AND_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP11]]
;
entry:
br label %for.body
@@ -325,20 +262,8 @@ define i32 @xor(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP8]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[XOR:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[XOR]] = xor i32 [[TMP10]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP11]]
;
entry:
br label %for.body
@@ -384,21 +309,8 @@ define i32 @smin(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP11]], [[SUM_010]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[SUM_010]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP12]]
;
entry:
br label %for.body
@@ -445,21 +357,8 @@ define i32 @umax(ptr nocapture %a, ptr nocapture readonly %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP11]], [[SUM_010]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP11]], i32 [[SUM_010]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP12]]
;
entry:
br label %for.body
@@ -505,20 +404,8 @@ define float @fadd_fast(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> [[TMP8]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP10]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[ADD_LCSSA]]
+; CHECK-NEXT: ret float [[TMP11]]
;
entry:
br label %for.body
@@ -561,20 +448,8 @@ define half @fadd_fast_half_zvfh(ptr noalias nocapture readonly %a, i64 %n) "tar
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP11:%.*]] = call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> [[TMP8]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast half [[TMP10]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi half [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret half [[ADD_LCSSA]]
+; CHECK-NEXT: ret half [[TMP11]]
;
entry:
br label %for.body
@@ -744,21 +619,8 @@ define float @fmin_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt float [[TMP11]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP11]], float [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret float [[TMP12]]
;
entry:
br label %for.body
@@ -803,21 +665,8 @@ define half @fmin_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call half @llvm.vector.reduce.fmin.nxv8f16(<vscale x 8 x half> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt half [[TMP11]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], half [[TMP11]], half [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi half [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret half [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret half [[TMP12]]
;
entry:
br label %for.body
@@ -862,21 +711,8 @@ define bfloat @fmin_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ 0xR0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt bfloat [[TMP11]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], bfloat [[TMP11]], bfloat [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi bfloat [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret bfloat [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret bfloat [[TMP12]]
;
entry:
br label %for.body
@@ -923,21 +759,8 @@ define float @fmax_fast(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt float [[TMP11]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP11]], float [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret float [[TMP12]]
;
entry:
br label %for.body
@@ -982,21 +805,8 @@ define half @fmax_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) #
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call fast half @llvm.vector.reduce.fmax.nxv8f16(<vscale x 8 x half> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt half [[TMP11]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], half [[TMP11]], half [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi half [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret half [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret half [[TMP12]]
;
entry:
br label %for.body
@@ -1041,21 +851,8 @@ define bfloat @fmax_fast_bfloat_zvfbfmin(ptr noalias nocapture readonly %a, i64
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP12:%.*]] = call fast bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ 0xR0000, %[[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt bfloat [[TMP11]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], bfloat [[TMP11]], bfloat [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi bfloat [ [[DOTSROA_SPECULATED]], %[[FOR_BODY]] ], [ [[TMP12]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret bfloat [[DOTSROA_SPECULATED_LCSSA]]
+; CHECK-NEXT: ret bfloat [[TMP12]]
;
entry:
br label %for.body
@@ -1243,22 +1040,8 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP16:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[MULADD:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[SUM_07]])
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[MULADD_LCSSA]]
+; CHECK-NEXT: ret float [[TMP16]]
;
entry:
br label %for.body
@@ -1305,22 +1088,8 @@ define half @fmuladd_f16_zvfh(ptr %a, ptr %b, i64 %n) "target-features"="+zvfh"
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP16:%.*]] = call reassoc half @llvm.vector.reduce.fadd.nxv8f16(half 0xH8000, <vscale x 8 x half> [[TMP9]])
; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi half [ 0xH0000, %[[SCALAR_PH]] ], [ [[MULADD:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[MULADD]] = tail call reassoc half @llvm.fmuladd.f16(half [[TMP11]], half [[TMP12]], half [[SUM_07]])
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi half [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP16]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret half [[MULADD_LCSSA]]
+; CHECK-NEXT: ret half [[TMP16]]
;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll
index 93c0a7455165b..850a6cb7ddb0d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll
@@ -58,36 +58,6 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) {
; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[WIDE_IV_0:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_0_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[WIDE_IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_1_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[WIDE_IV_2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_2_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[WIDE_IV_0_SUB:%.*]] = sub i64 [[WIDE_IV_0]], 1
-; CHECK-NEXT: [[A_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_0_SUB]]
-; CHECK-NEXT: [[A:%.*]] = load i8, ptr [[A_GEP0]], align 1
-; CHECK-NEXT: [[WIDE_IV_1_SUB:%.*]] = sub i64 [[WIDE_IV_1]], 1
-; CHECK-NEXT: [[B_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_1_SUB]]
-; CHECK-NEXT: [[B:%.*]] = load i8, ptr [[B_GEP0]], align 1
-; CHECK-NEXT: [[WIDE_IV_2_SUB:%.*]] = sub i64 [[WIDE_IV_2]], 1
-; CHECK-NEXT: [[C_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_2_SUB]]
-; CHECK-NEXT: 
[[C:%.*]] = load i8, ptr [[C_GEP0]], align 1 -; CHECK-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV]], 3 -; CHECK-NEXT: [[BASE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[IV_MUL]] -; CHECK-NEXT: [[A_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 0 -; CHECK-NEXT: store i8 [[A]], ptr [[A_GEP1]], align 1 -; CHECK-NEXT: [[B_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 1 -; CHECK-NEXT: store i8 [[B]], ptr [[B_GEP1]], align 1 -; CHECK-NEXT: [[C_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 2 -; CHECK-NEXT: store i8 [[C]], ptr [[C_GEP1]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[WIDE_IV_0_NEXT]] = add i64 [[WIDE_IV_0]], 2 -; CHECK-NEXT: [[WIDE_IV_1_NEXT]] = add i64 [[WIDE_IV_1]], 3 -; CHECK-NEXT: [[WIDE_IV_2_NEXT]] = add i64 [[WIDE_IV_2]], 4 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -145,36 +115,6 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { ; NO-REG-PRESSURE-CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; NO-REG-PRESSURE-CHECK: [[MIDDLE_BLOCK]]: ; NO-REG-PRESSURE-CHECK-NEXT: br label %[[EXIT:.*]] -; NO-REG-PRESSURE-CHECK: [[SCALAR_PH:.*]]: -; NO-REG-PRESSURE-CHECK-NEXT: br label %[[LOOP:.*]] -; NO-REG-PRESSURE-CHECK: [[LOOP]]: -; NO-REG-PRESSURE-CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_0:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_0_NEXT:%.*]], %[[LOOP]] ] -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_1_NEXT:%.*]], %[[LOOP]] ] -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[WIDE_IV_2_NEXT:%.*]], %[[LOOP]] ] -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_0_SUB:%.*]] = sub i64 [[WIDE_IV_0]], 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[A_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_0_SUB]] -; NO-REG-PRESSURE-CHECK-NEXT: [[A:%.*]] = load i8, ptr [[A_GEP0]], align 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_1_SUB:%.*]] = sub i64 [[WIDE_IV_1]], 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[B_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_1_SUB]] -; NO-REG-PRESSURE-CHECK-NEXT: [[B:%.*]] = load i8, ptr [[B_GEP0]], align 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_2_SUB:%.*]] = sub i64 [[WIDE_IV_2]], 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[C_GEP0:%.*]] = getelementptr i8, ptr [[P0]], i64 [[WIDE_IV_2_SUB]] -; NO-REG-PRESSURE-CHECK-NEXT: [[C:%.*]] = load i8, ptr [[C_GEP0]], align 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV]], 3 -; NO-REG-PRESSURE-CHECK-NEXT: [[BASE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[IV_MUL]] -; NO-REG-PRESSURE-CHECK-NEXT: [[A_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 0 -; NO-REG-PRESSURE-CHECK-NEXT: store i8 [[A]], ptr [[A_GEP1]], align 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[B_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 1 -; NO-REG-PRESSURE-CHECK-NEXT: store i8 [[B]], ptr [[B_GEP1]], align 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[C_GEP1:%.*]] = getelementptr i8, ptr [[BASE]], i8 2 -; NO-REG-PRESSURE-CHECK-NEXT: store i8 [[C]], ptr [[C_GEP1]], align 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_0_NEXT]] = add i64 [[WIDE_IV_0]], 2 -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_1_NEXT]] = add i64 [[WIDE_IV_1]], 3 -; NO-REG-PRESSURE-CHECK-NEXT: [[WIDE_IV_2_NEXT]] = add i64 [[WIDE_IV_2]], 4 -; 
NO-REG-PRESSURE-CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV]], 1024 -; NO-REG-PRESSURE-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; NO-REG-PRESSURE-CHECK: [[EXIT]]: ; NO-REG-PRESSURE-CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll index 7b8404abdc54b..b80368df96089 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll @@ -21,18 +21,8 @@ define float @s311(float %a_0, float %s311_sum) { ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ [[S311_SUM]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED_NEXT]] = fadd float [[A_0]], [[RED]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1200 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret float [[RED_LCSSA]] +; CHECK-NEXT: ret float [[TMP6]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index a165dde0d217e..5ca9bfdb29c2c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -53,10 +53,9 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RV64-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: br [[EXIT:label %.*]] -; RV64: [[SCALAR_PH:.*:]] -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_BODY]]: +; RV64-NEXT: br label %[[EXIT:.*]] +; RV64: [[EXIT]]: +; RV64-NEXT: ret void ; ; RV32-LABEL: define void @vector_reverse_i32( ; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { @@ -93,10 +92,9 @@ define void @vector_reverse_i32(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RV32-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: br [[EXIT:label %.*]] -; RV32: [[SCALAR_PH:.*:]] -; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_BODY]]: +; RV32-NEXT: br label %[[EXIT:.*]] +; RV32: [[EXIT]]: +; RV32-NEXT: ret void ; ; RV64-UF2-LABEL: define void @vector_reverse_i32( ; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] { @@ -718,10 +716,9 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) { ; RV64-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RV64-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; RV64: [[MIDDLE_BLOCK]]: -; RV64-NEXT: br [[EXIT:label %.*]] -; RV64: [[SCALAR_PH:.*:]] -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_BODY]]: +; RV64-NEXT: br label %[[EXIT:.*]] +; RV64: 
[[EXIT]]: +; RV64-NEXT: ret void ; ; RV32-LABEL: define void @vector_reverse_f32_simplify( ; RV32-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { @@ -758,10 +755,9 @@ define void @vector_reverse_f32_simplify(ptr noalias %A, ptr noalias %B) { ; RV32-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RV32-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; RV32: [[MIDDLE_BLOCK]]: -; RV32-NEXT: br [[EXIT:label %.*]] -; RV32: [[SCALAR_PH:.*:]] -; RV32-NEXT: br label %[[FOR_BODY:.*]] -; RV32: [[FOR_BODY]]: +; RV32-NEXT: br label %[[EXIT:.*]] +; RV32: [[EXIT]]: +; RV32-NEXT: ret void ; ; RV64-UF2-LABEL: define void @vector_reverse_f32_simplify( ; RV64-UF2-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] { diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index ecde1646ab2b4..e046816b694c0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -28,19 +28,7 @@ define void @test(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] -; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 -; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 200 -; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] -; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -81,19 +69,7 @@ define void @test_may_clobber(ptr %p) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] -; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 -; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 -; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] -; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -137,19 +113,7 @@ define void @trivial_due_max_vscale(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] -; 
CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 -; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 8192 -; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] -; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -193,19 +157,7 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] -; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 -; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 1024 -; CHECK-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]] -; CHECK-NEXT: store i64 [[V]], ptr [[A2]], align 32 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll index 544ddc539c832..7330ce61515d9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll @@ -27,18 +27,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -84,18 +73,7 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ELEM]], [[V]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: 
[[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -179,18 +157,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
-; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
-; CHECK-NEXT: store i64 [[V]], ptr [[AADDR]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -235,23 +202,9 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
-; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
-; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
-; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP11]]
;
entry:
br label %for.body
@@ -292,16 +245,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -340,16 +284,7 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: store ptr [[V]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index a596c639d08d1..3c90908b0a08f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -28,18 +28,7 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
-; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -84,18 +73,7 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
-; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
-; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
-; CHECK-NEXT: store i64 [[V]], ptr [[AADDR]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -140,23 +118,9 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP11]])
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
; 
CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8 -; CHECK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]] -; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]] -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i64 [[TMP14]] ; entry: br label %for.body @@ -197,16 +161,7 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -246,17 +201,7 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: store i64 [[V]], ptr [[B]], align 8 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -356,18 +301,7 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = add i64 
[[ELEM]], [[V]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll index 4bfe9a4487604..8971b0cadfa48 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll @@ -29,21 +29,8 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0 ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[A:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP12]], [[X]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[COND_LCSSA]] +; CHECK-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body @@ -91,21 +78,8 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[Y]], i32 0 ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[A:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[COND:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP12]], [[X]] -; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[COND_LCSSA]] +; CHECK-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body @@ -151,21 +125,8 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 7, i32 3 ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; 
CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ 3, %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 3 -; CHECK-NEXT: [[TMP17]] = select i1 [[TMP16]], i32 [[TMP13]], i32 7 -; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[N]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[DOTLCSSA]] +; CHECK-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body @@ -211,21 +172,8 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 [[B]], i32 [[A]] ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[A]], %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 3 -; CHECK-NEXT: [[TMP17]] = select i1 [[TMP16]], i32 [[TMP13]], i32 [[B]] -; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[N]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[DOTLCSSA]] +; CHECK-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body @@ -271,21 +219,8 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP10]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP11]], i32 1, i32 2 ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[TMP12:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TMP18:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ 2, %[[SCALAR_PH]] ], [ [[TMP17:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast ueq float [[TMP15]], 3.000000e+00 -; CHECK-NEXT: [[TMP17]] = select i1 [[TMP16]], i32 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP12]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[N]] -; CHECK-NEXT: br i1 [[TMP19]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP17]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[DOTLCSSA]] +; CHECK-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body @@ -373,29 +308,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1 ; CHECK-NEXT: 
[[TMP13:%.*]] = freeze i1 [[TMP12]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 0 ; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %[[FOR_INC]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP14]], 35 -; CHECK-NEXT: br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]] -; CHECK: [[IF_THEN]]: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP15]], 2 -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] -; CHECK-NEXT: br label %[[FOR_INC]] -; CHECK: [[FOR_INC]]: -; CHECK-NEXT: [[R_1]] = phi i32 [ [[R_012]], %[[FOR_BODY]] ], [ [[SPEC_SELECT]], %[[IF_THEN]] ] -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %[[FOR_INC]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[R_1_LCSSA]] +; CHECK-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index ca1c710e614f3..2fbc73ef74d16 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -31,19 +31,7 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH1:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET:%.*]] = mul nuw nsw i64 [[I]], 8 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[SCALAR_PH]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -147,20 +135,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[OFFSET_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 
[[OFFSET]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 -; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET_NEXT]] = add nuw nsw i64 [[OFFSET]], 64 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -264,19 +239,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH1:%.*]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[P]], [[SCALAR_PH1]] ], [ [[PTR_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[PTR]], align 4 -; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[PTR]], align 4 -; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[SCALAR_PH]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -1357,18 +1320,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; NOSTRIDED: middle.block: -; NOSTRIDED-NEXT: br label [[EXIT:%.*]] -; NOSTRIDED: scalar.ph: ; NOSTRIDED-NEXT: br label [[LOOP:%.*]] -; NOSTRIDED: loop: -; NOSTRIDED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; NOSTRIDED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], i64 [[IV]] -; NOSTRIDED-NEXT: [[TMP8:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; NOSTRIDED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[IV]] -; NOSTRIDED-NEXT: store i64 [[TMP8]], ptr [[ARRAYIDX2]], align 8 -; NOSTRIDED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; NOSTRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; NOSTRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]] ; NOSTRIDED: exit: ; NOSTRIDED-NEXT: ret void ; @@ -1452,18 +1404,7 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; STRIDED: middle.block: -; STRIDED-NEXT: br label [[EXIT:%.*]] -; STRIDED: scalar.ph: ; STRIDED-NEXT: br label [[LOOP:%.*]] -; STRIDED: loop: -; STRIDED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; STRIDED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[IN]], i64 [[IV]] -; STRIDED-NEXT: [[TMP8:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; STRIDED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT]], i64 [[IV]] -; STRIDED-NEXT: store i64 [[TMP8]], ptr [[ARRAYIDX2]], align 8 -; STRIDED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; STRIDED-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; STRIDED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]] ; STRIDED: 
exit:
; STRIDED-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
index 6652fefb35d60..8ab0f6f4c14f1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll
@@ -1206,17 +1206,6 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) {
; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
; IF-EVL-NEXT: br label %[[EXIT:.*]]
-; IF-EVL: [[SCALAR_PH:.*]]:
-; IF-EVL-NEXT: br label %[[LOOP:.*]]
-; IF-EVL: [[LOOP]]:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; IF-EVL-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[GEP]] to i64
-; IF-EVL-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; IF-EVL-NEXT: store i64 [[TMP0]], ptr [[GEP2]], align 8
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]]
; IF-EVL: [[EXIT]]:
; IF-EVL-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
index 61f97aa0a47ed..34a82757eccc0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll
@@ -43,23 +43,9 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-OUTLOOP: scalar.ph:
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP27]], 3
-; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP27]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
-; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-OUTLOOP: for.end:
-; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[TMP24]]
;
; IF-EVL-INLOOP-LABEL: define i32 @cond_add(
; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -84,23 +70,9 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-INLOOP: scalar.ph:
; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-INLOOP: for.body:
-; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP25]], 3
-; IF-EVL-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP25]], i32 0
-; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
-; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-INLOOP: for.end:
-; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+; IF-EVL-INLOOP-NEXT: ret i32 [[TMP22]]
;
; NO-VP-OUTLOOP-LABEL: define i32 @cond_add(
; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -239,30 +211,12 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP23]]
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PREDPHI]])
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-OUTLOOP: scalar.ph:
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], 3
-; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; IF-EVL-OUTLOOP: if.then:
-; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP28]]
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_INC]]
-; IF-EVL-OUTLOOP: for.inc:
-; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_INC:%.*]]
; IF-EVL-OUTLOOP: for.end:
-; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[TMP27]]
;
; IF-EVL-INLOOP-LABEL: define i32 @cond_add_pred(
; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
@@ -284,29 +238,11 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP11]], [[TMP23]]
; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-INLOOP: scalar.ph:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-INLOOP: for.body:
-; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; IF-EVL-INLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[FOR_INC]] ]
-; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP25]], 3
-; IF-EVL-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; IF-EVL-INLOOP: if.then:
-; IF-EVL-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[RDX]], [[TMP25]]
-; IF-EVL-INLOOP-NEXT: br label [[FOR_INC]]
-; IF-EVL-INLOOP: for.inc:
-; IF-EVL-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_INC:%.*]]
; IF-EVL-INLOOP: for.end:
-; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+; IF-EVL-INLOOP-NEXT: ret i32 [[TMP22]]
;
; NO-VP-OUTLOOP-LABEL: define i32 @cond_add_pred(
; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
@@ -466,27 +402,12 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]])
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-OUTLOOP: scalar.ph:
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP37:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
-; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP37]], [[IV_TRUNC]]
-; IF-EVL-OUTLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP37]], i32 0
-; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]]
-; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]]
; IF-EVL-OUTLOOP: for.end:
-; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[TMP22]]
;
; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add(
; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
@@ -516,26 +437,11 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]]
; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-INLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-INLOOP: scalar.ph:
; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-INLOOP: for.body:
-; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[RDX1:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-INLOOP-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
-; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP28]], [[IV_TRUNC]]
-; IF-EVL-INLOOP-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[TMP28]], i32 0
-; IF-EVL-INLOOP-NEXT: [[ADD1]] = add nsw i32 [[SELECT]], [[RDX1]]
-; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]]
; IF-EVL-INLOOP: for.end:
-; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD1]], [[FOR_BODY]] ], [ [[ADD]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]]
+; IF-EVL-INLOOP-NEXT: ret i32 [[ADD]]
;
; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add(
; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
@@ -700,31 +606,12 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]]
; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add <vscale x 4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT2]]
; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP24]])
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-OUTLOOP: scalar.ph:
-; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-OUTLOOP: for.body:
-; IF-EVL-OUTLOOP-NEXT: [[IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ]
-; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP38:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
-; IF-EVL-OUTLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV1]] to i32
-; IF-EVL-OUTLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP38]], [[IV_TRUNC]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]]
-; IF-EVL-OUTLOOP: if.then:
-; IF-EVL-OUTLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP38]]
-; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK]]
-; IF-EVL-OUTLOOP: for.inc:
-; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[FOR_BODY]] ]
-; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]]
+; IF-EVL-OUTLOOP-NEXT: br label [[MIDDLE_BLOCK:%.*]]
; IF-EVL-OUTLOOP: for.end:
-; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[MIDDLE_BLOCK1]] ]
-; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]]
+; IF-EVL-OUTLOOP-NEXT: ret i32 [[TMP27]]
;
; IF-EVL-INLOOP-LABEL: define i32 @step_cond_add_pred(
; IF-EVL-INLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] {
@@ -753,30 +640,11 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]]
; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-INLOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL-INLOOP: scalar.ph:
-; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL-INLOOP: for.body:
-; IF-EVL-INLOOP-NEXT: [[IV1:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE_BLOCK:%.*]] ]
-; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[RDX_ADD:%.*]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-INLOOP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
-; IF-EVL-INLOOP-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
-; IF-EVL-INLOOP-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV1]] to i32
-; IF-EVL-INLOOP-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP35]], [[IV_TRUNC]]
-; IF-EVL-INLOOP-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[MIDDLE_BLOCK]]
-; IF-EVL-INLOOP: if.then:
-; IF-EVL-INLOOP-NEXT: [[ADD_PRED:%.*]] = add nsw i32 [[BC_MERGE_RDX]], [[TMP35]]
-; IF-EVL-INLOOP-NEXT: br label [[MIDDLE_BLOCK]]
-; IF-EVL-INLOOP: for.inc:
-; IF-EVL-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[FOR_BODY]] ]
-; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
-; 
IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]] +; IF-EVL-INLOOP-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; IF-EVL-INLOOP: for.end: -; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP17]], [[MIDDLE_BLOCK1]] ] -; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] +; IF-EVL-INLOOP-NEXT: ret i32 [[TMP17]] ; ; NO-VP-OUTLOOP-LABEL: define i32 @step_cond_add_pred( ; NO-VP-OUTLOOP-SAME: ptr [[A:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0]] { @@ -931,20 +799,16 @@ for.end: ; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; IF-EVL-OUTLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} -; IF-EVL-OUTLOOP: [[META4]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; IF-EVL-OUTLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; IF-EVL-OUTLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} ;. ; IF-EVL-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; IF-EVL-INLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} -; IF-EVL-INLOOP: [[META4]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; IF-EVL-INLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; IF-EVL-INLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} ;. 
; NO-VP-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll index 22d216e059af3..8cd540c3888db 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll @@ -33,20 +33,6 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] -; IF-EVL: [[SCALAR_PH:.*]]: -; IF-EVL-NEXT: br label %[[LOOP:.*]] -; IF-EVL: [[LOOP]]: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ] -; IF-EVL-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP16:%.*]] = load i64, ptr [[A_GEP]], align 8 -; IF-EVL-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i64, ptr [[B_GEP]], align 8 -; IF-EVL-NEXT: [[TMP18:%.*]] = sdiv i64 [[TMP16]], [[TMP17]] -; IF-EVL-NEXT: [[C_GEP:%.*]] = getelementptr i64, ptr [[C]], i64 [[IV]] -; IF-EVL-NEXT: store i64 [[TMP18]], ptr [[C_GEP]], align 8 -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IF-EVL-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; IF-EVL-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -143,20 +129,6 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] -; IF-EVL: [[SCALAR_PH:.*]]: -; IF-EVL-NEXT: br label %[[LOOP:.*]] -; IF-EVL: [[LOOP]]: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ] -; IF-EVL-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP16:%.*]] = load i64, ptr [[A_GEP]], align 8 -; IF-EVL-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i64, ptr [[B_GEP]], align 8 -; IF-EVL-NEXT: [[TMP18:%.*]] = udiv i64 [[TMP16]], [[TMP17]] -; IF-EVL-NEXT: [[C_GEP:%.*]] = getelementptr i64, ptr [[C]], i64 [[IV]] -; IF-EVL-NEXT: store i64 [[TMP18]], ptr [[C_GEP]], align 8 -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IF-EVL-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; IF-EVL-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -252,20 +224,6 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] -; IF-EVL: [[SCALAR_PH:.*]]: -; IF-EVL-NEXT: br label %[[LOOP:.*]] -; IF-EVL: [[LOOP]]: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ] -; IF-EVL-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP16:%.*]] = load i64, ptr [[A_GEP]], align 8 -; IF-EVL-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i64, ptr [[B_GEP]], align 8 -; IF-EVL-NEXT: [[TMP18:%.*]] = srem i64 [[TMP16]], [[TMP17]] -; IF-EVL-NEXT: 
[[C_GEP:%.*]] = getelementptr i64, ptr [[C]], i64 [[IV]] -; IF-EVL-NEXT: store i64 [[TMP18]], ptr [[C_GEP]], align 8 -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IF-EVL-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; IF-EVL-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -361,20 +319,6 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] -; IF-EVL: [[SCALAR_PH:.*]]: -; IF-EVL-NEXT: br label %[[LOOP:.*]] -; IF-EVL: [[LOOP]]: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ] -; IF-EVL-NEXT: [[A_GEP:%.*]] = getelementptr i64, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP16:%.*]] = load i64, ptr [[A_GEP]], align 8 -; IF-EVL-NEXT: [[B_GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i64, ptr [[B_GEP]], align 8 -; IF-EVL-NEXT: [[TMP18:%.*]] = urem i64 [[TMP16]], [[TMP17]] -; IF-EVL-NEXT: [[C_GEP:%.*]] = getelementptr i64, ptr [[C]], i64 [[IV]] -; IF-EVL-NEXT: store i64 [[TMP18]], ptr [[C_GEP]], align 8 -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IF-EVL-NEXT: [[DONE:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; IF-EVL-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll index b153328663184..c7ba826295de8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll @@ -42,19 +42,6 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FOR_END:.*]] -; IF-EVL: [[SCALAR_PH:.*]]: -; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] -; IF-EVL: [[FOR_BODY]]: -; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ 33, %[[SCALAR_PH]] ], [ [[TMP24:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] -; IF-EVL-NEXT: [[TMP24]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[TMP24]] -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]] -; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: [[FOR_END]]: ; IF-EVL-NEXT: ret void ; @@ -167,23 +154,9 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP23]] ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 
[[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
; IF-EVL-NEXT: br label %[[FOR_END:.*]]
-; IF-EVL: [[SCALAR_PH:.*]]:
-; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
-; IF-EVL: [[FOR_BODY]]:
-; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ 33, %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ 22, %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
-; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]]
-; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]]
-; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3]]
; IF-EVL: [[FOR_END]]:
; IF-EVL-NEXT: ret void
;
@@ -316,25 +289,9 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP27]]
; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
; IF-EVL-NEXT: br label %[[FOR_END:.*]]
-; IF-EVL: [[SCALAR_PH:.*]]:
-; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
-; IF-EVL: [[FOR_BODY]]:
-; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ 33, %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ 22, %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ 11, %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
-; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]]
-; IF-EVL-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[FOR1]]
-; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS]]
-; IF-EVL-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3]]
; IF-EVL: [[FOR_END]]:
; IF-EVL-NEXT: ret void
;
@@ -469,7 +426,7 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP12]], align 4
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS]], [[TMP3]]
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
; IF-EVL-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4
@@ -495,7 +452,7 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4
; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; IF-EVL: [[FOR_END]]:
; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]]
@@ -613,20 +570,9 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) {
; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
; IF-EVL-NEXT: br label %[[FOR_END:.*]]
-; IF-EVL: [[SCALAR_PH:.*]]:
-; IF-EVL-NEXT: br label %[[FOR_BODY:.*]]
-; IF-EVL: [[FOR_BODY]]:
-; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV1_NEXT:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[FOR1:%.*]] = phi i64 [ 33, %[[SCALAR_PH]] ], [ [[TMP14:%.*]], %[[FOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP14]] = add i64 [[IV1]], 42
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[IV1]]
-; IF-EVL-NEXT: store i64 [[FOR1]], ptr [[ARRAYIDX]], align 8
-; IF-EVL-NEXT: [[IV1_NEXT]] = add nuw nsw i64 [[IV1]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV1_NEXT]], [[TC]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3]]
; IF-EVL: [[FOR_END]]:
; IF-EVL-NEXT: ret void
;
@@ -713,13 +659,11 @@ for.end:
; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
-; IF-EVL: [[META4]] = !{!"llvm.loop.vectorize.enable", i1 true}
+; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
-; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
-; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
-; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
;.
; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll index df550ecac561e..b9a4e97cd9f24 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll @@ -30,21 +30,9 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP15]] ; ; NO-VP-LABEL: @add( ; NO-VP-NEXT: entry: @@ -129,7 +117,7 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP5]] = mul i32 [[VEC_PHI1]], [[TMP4]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP5]], [[MUL]] ; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] @@ -146,7 +134,7 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[MUL1]] = mul nsw i32 [[TMP0]], [[RDX1]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL1]], [[FOR_BODY1]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]] @@ -231,23 +219,11 @@ define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label 
[[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[OR]] = or i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[OR_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP15]] ; ; NO-VP-LABEL: @or( ; NO-VP-NEXT: entry: @@ -327,23 +303,11 @@ define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[AND]] = and i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[AND_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP15]] ; ; NO-VP-LABEL: @and( ; NO-VP-NEXT: entry: @@ -423,23 +387,11 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 
[[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP15]] ; ; NO-VP-LABEL: @xor( ; NO-VP-NEXT: entry: @@ -519,24 +471,11 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP8]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP17]], [[RDX]] -; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] +; IF-EVL-NEXT: ret i32 [[RDX_MINMAX]] ; ; NO-VP-LABEL: @smin( ; NO-VP-NEXT: entry: @@ -618,24 +557,11 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP8]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP17]], [[RDX]] -; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 
[[SMAX_LCSSA]] +; IF-EVL-NEXT: ret i32 [[RDX_MINMAX]] ; ; NO-VP-LABEL: @smax( ; NO-VP-NEXT: entry: @@ -717,24 +643,11 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP8]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP17]], [[RDX]] -; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]] +; IF-EVL-NEXT: ret i32 [[RDX_MINMAX]] ; ; NO-VP-LABEL: @umin( ; NO-VP-NEXT: entry: @@ -816,24 +729,11 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP8]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP17]], [[RDX]] -; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP17]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]] +; IF-EVL-NEXT: ret i32 [[RDX_MINMAX]] ; ; NO-VP-LABEL: @umax( ; NO-VP-NEXT: entry: @@ -915,23 +815,11 @@ define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp 
eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[ADD_LCSSA]] +; IF-EVL-NEXT: ret float [[TMP15]] ; ; NO-VP-LABEL: @fadd( ; NO-VP-NEXT: entry: @@ -1016,7 +904,7 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP5]] = fmul reassoc float [[VEC_PHI1]], [[TMP4]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[BIN_RDX:%.*]] = fmul reassoc float [[TMP5]], [[MUL]] ; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] @@ -1033,7 +921,7 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MUL1]] = fmul reassoc float [[TMP0]], [[RDX1]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL1]], [[FOR_BODY1]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MUL_LCSSA]] @@ -1119,24 +1007,11 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP8]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load 
float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP17]], [[RDX]] -; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[MIN_LCSSA]] +; IF-EVL-NEXT: ret float [[RDX_MINMAX_SELECT]] ; ; NO-VP-LABEL: @fmin( ; NO-VP-NEXT: entry: @@ -1220,24 +1095,11 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP8]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP17]], [[RDX]] -; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP17]], float [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[MAX_LCSSA]] +; IF-EVL-NEXT: ret float [[RDX_MINMAX_SELECT]] ; ; NO-VP-LABEL: @fmax( ; NO-VP-NEXT: entry: @@ -1324,7 +1186,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI2]], <8 x float> [[WIDE_LOAD3]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP3]], <8 x float> [[TMP4]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]]) @@ -1342,7 +1204,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MIN_LCSSA]] @@ -1432,7 +1294,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI2]], <8 x float> [[WIDE_LOAD3]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP3]], <8 x float> [[TMP4]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]]) @@ -1450,7 +1312,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP32:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MAX_LCSSA]] @@ -1539,25 +1401,11 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[RDX]]) -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[MULADD_LCSSA]] +; IF-EVL-NEXT: ret float [[TMP18]] ; ; NO-VP-LABEL: @fmuladd( ; NO-VP-NEXT: entry: @@ -1644,27 +1492,14 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; 
IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP21]], 3 -; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] +; IF-EVL-NEXT: ret i32 [[RDX_SELECT]] ; ; NO-VP-LABEL: @anyof_icmp( ; NO-VP-NEXT: entry: @@ -1749,27 +1584,14 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP16]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP21]], 3.000000e+00 -; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]] 
+; IF-EVL-NEXT: ret i32 [[RDX_SELECT]] ; ; NO-VP-LABEL: @anyof_fcmp( ; NO-VP-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index 7c05f4613b575..0c22a9eb2acab 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -32,21 +32,7 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0 -; IF-EVL-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1 -; IF-EVL-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] -; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: for.cond.cleanup: ; IF-EVL-NEXT: ret void ; @@ -156,30 +142,12 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) -; IF-EVL-NEXT: br label [[EXIT:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 0 -; IF-EVL-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[RDX]], [[TMP16]] -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 1 -; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[TMP17]] -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 3 -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD2]] = add nsw i32 [[ADD1]], [[TMP18]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label 
[[EXIT]], label [[FOR_BODY]] ; IF-EVL: exit: -; IF-EVL-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[ADD2_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP15]] ; ; NO-VP-LABEL: @load_factor_4_with_gap( ; NO-VP-NEXT: entry: @@ -299,22 +267,9 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND2]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[EXIT:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[TMP15:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i32 [[TMP15]], i32 0 -; IF-EVL-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i32 [[TMP15]], i32 1 -; IF-EVL-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX1]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i32 [[TMP15]], i32 3 -; IF-EVL-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[TMP15]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -427,30 +382,12 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP12]]) -; IF-EVL-NEXT: br label [[EXIT:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 0 -; IF-EVL-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[RDX]], [[TMP16]] -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 1 -; IF-EVL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[TMP17]] -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 2 -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD2]] = add nsw i32 [[ADD1]], [[TMP18]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label 
[[EXIT]], label [[FOR_BODY]] ; IF-EVL: exit: -; IF-EVL-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[ADD2_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP15]] ; ; NO-VP-LABEL: @load_factor_4_with_tail_gap( ; NO-VP-NEXT: entry: @@ -571,22 +508,9 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; IF-EVL-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND2]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[EXIT:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[TMP15:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i32 [[TMP15]], i32 0 -; IF-EVL-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i32 [[TMP15]], i32 1 -; IF-EVL-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX1]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i32 [[TMP15]], i32 2 -; IF-EVL-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[TMP15]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; @@ -697,33 +621,12 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) -; IF-EVL-NEXT: br label [[EXIT:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[N]], [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 0 -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[RDX]], [[TMP20]] -; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 1 -; IF-EVL-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD1:%.*]] = add nsw i32 [[ADD]], [[TMP21]] -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 2 -; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD2:%.*]] = add nsw i32 [[ADD1]], [[TMP22]] -; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x i32], ptr [[A]], i64 [[IV]], i32 3 -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX3]], 
align 4 -; IF-EVL-NEXT: [[ADD3]] = add nsw i32 [[ADD2]], [[TMP23]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp sgt i64 [[IV_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]] ; IF-EVL: exit: -; IF-EVL-NEXT: [[ADD3_LCSSA:%.*]] = phi i32 [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[ADD3_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP19]] ; ; NO-VP-LABEL: @load_factor_4_reverse( ; NO-VP-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll index 00c88a46c3a0a..1aea6aaff66a3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-iv32.ll @@ -26,18 +26,7 @@ define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) { ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV1:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV1]] -; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV1]] -; IF-EVL-NEXT: store i32 [[TMP0]], ptr [[ARRAYIDX4]], align 4 -; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT1]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY1]] ; IF-EVL: for.cond.cleanup: ; IF-EVL-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll index a03b4306bad66..e94e64fe11d2f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll @@ -32,17 +32,6 @@ define void @trip_count_max_1024(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[P]], i64 [[I]] -; CHECK-NEXT: [[X:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[Y:%.*]] = add i64 [[X]], 1 -; CHECK-NEXT: store i64 [[Y]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp uge i64 [[I_NEXT]], [[TC]] -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]] ; CHECK: [[EXIT_LOOPEXIT]]: ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: @@ -92,17 +81,6 @@ define void @overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; 
CHECK: [[LOOP]]: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[P]], i64 [[I]] -; CHECK-NEXT: [[X:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[Y:%.*]] = add i64 [[X]], 1 -; CHECK-NEXT: store i64 [[Y]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[TC]] -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]] ; CHECK: [[EXIT_LOOPEXIT]]: ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: @@ -152,17 +130,6 @@ define void @no_overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[P]], i64 [[I]] -; CHECK-NEXT: [[X:%.*]] = load i64, ptr [[GEP]], align 8 -; CHECK-NEXT: [[Y:%.*]] = add i64 [[X]], 1 -; CHECK-NEXT: store i64 [[Y]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[I_NEXT]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[TC_ADD]] -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]] ; CHECK: [[EXIT_LOOPEXIT]]: ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll index 58b4c5311dbec..b13c671ae3d56 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-masked-loadstore.ll @@ -30,25 +30,7 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[EXIT:%.*]] -; IF-EVL: scalar.ph: -; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[SCALAR_PH:%.*]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP23]], 0 -; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; IF-EVL: if.then: -; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]] -; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], [[TMP24]] -; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4 -; IF-EVL-NEXT: br label [[FOR_INC]] -; IF-EVL: for.inc: -; IF-EVL-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]] +; IF-EVL-NEXT: br label [[FOR_INC:%.*]] ; IF-EVL: exit: ; IF-EVL-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-ordered-reduction.ll index 6c487ab8090d6..dcb7bf484f4ae 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-ordered-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-ordered-reduction.ll @@ -29,21 +29,9 @@ define float @fadd(ptr noalias nocapture readonly %a, i64 %n) { ; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = fadd float [[TMP17]], [[SUM_07]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[ADD_LCSSA]] +; IF-EVL-NEXT: ret float [[TMP14]] ; ; NO-VP-LABEL: @fadd( ; NO-VP-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll index e14ff7ce29a10..7179e7dc48c8d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll @@ -30,21 +30,9 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = add nsw i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[ADD_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP17]] ; ; NO-VP-LABEL: @add( ; NO-VP-NEXT: entry: @@ -129,7 +117,7 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP4]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; 
IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP4]], [[TMP5]] ; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP6]]) @@ -147,7 +135,7 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]] @@ -233,24 +221,12 @@ define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] ; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32( [[TMP14]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[OR]] = or i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[OR_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP17]] ; ; NO-VP-LABEL: @or( ; NO-VP-NEXT: entry: @@ -332,24 +308,12 @@ define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] ; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32( [[TMP14]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[AND]] = and i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 
[[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[AND_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP17]] ; ; NO-VP-LABEL: @and( ; NO-VP-NEXT: entry: @@ -431,24 +395,12 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] ; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP14]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[XOR]] = xor i32 [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[XOR_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP17]] ; ; NO-VP-LABEL: @xor( ; NO-VP-NEXT: entry: @@ -532,25 +484,12 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP15]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP19]], [[RDX]] -; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] 
], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP18]] ; ; NO-VP-LABEL: @smin( ; NO-VP-NEXT: entry: @@ -638,25 +577,12 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32( [[TMP15]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[SMAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[TMP19]], [[RDX]] -; IF-EVL-NEXT: [[SMAX]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[SMAX_LCSSA:%.*]] = phi i32 [ [[SMAX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[SMAX_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP18]] ; ; NO-VP-LABEL: @smax( ; NO-VP-NEXT: entry: @@ -744,25 +670,12 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32( [[TMP15]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[UMIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ult i32 [[TMP19]], [[RDX]] -; IF-EVL-NEXT: [[UMIN]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMIN_LCSSA:%.*]] = phi i32 [ [[UMIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[UMIN_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP18]] ; ; NO-VP-LABEL: @umin( ; NO-VP-NEXT: entry: @@ -850,25 +763,12 @@ 
define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32( [[TMP15]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[UMAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP19]], [[RDX]] -; IF-EVL-NEXT: [[UMAX]] = select i1 [[CMP_I]], i32 [[TMP19]], i32 [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[UMAX_LCSSA:%.*]] = phi i32 [ [[UMAX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret i32 [[UMAX_LCSSA]] +; IF-EVL-NEXT: ret i32 [[TMP18]] ; ; NO-VP-LABEL: @umax( ; NO-VP-NEXT: entry: @@ -954,24 +854,12 @@ define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] ; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP14]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ADD]] = fadd reassoc float [[TMP18]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[ADD_LCSSA]] +; IF-EVL-NEXT: ret float [[TMP17]] ; ; NO-VP-LABEL: @fadd( ; NO-VP-NEXT: entry: @@ -1056,7 +944,7 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP4]] = fmul reassoc <8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]] ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP7:%.*]] = 
icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP6:%.*]] = fmul reassoc <8 x float> [[TMP4]], [[TMP5]] ; IF-EVL-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[TMP6]]) @@ -1074,7 +962,7 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MUL_LCSSA]] @@ -1162,25 +1050,12 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP15]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[MIN:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP19]], [[RDX]] -; IF-EVL-NEXT: [[MIN]] = select i1 [[CMP]], float [[TMP19]], float [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[MIN_LCSSA]] +; IF-EVL-NEXT: ret float [[TMP18]] ; ; NO-VP-LABEL: @fmin( ; NO-VP-NEXT: entry: @@ -1268,25 +1143,12 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32( [[TMP15]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label 
[[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[MAX:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP19:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP:%.*]] = fcmp fast ogt float [[TMP19]], [[RDX]] -; IF-EVL-NEXT: [[MAX]] = select i1 [[CMP]], float [[TMP19]], float [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[MAX_LCSSA]] +; IF-EVL-NEXT: ret float [[TMP18]] ; ; NO-VP-LABEL: @fmax( ; NO-VP-NEXT: entry: @@ -1375,7 +1237,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]]) @@ -1393,7 +1255,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MIN_LCSSA]] @@ -1483,7 +1345,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]]) @@ -1501,7 +1363,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] -; IF-EVL-NEXT: br i1 
[[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP32:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MAX_LCSSA]] @@ -1590,26 +1452,12 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP20:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP17]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP22:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[MULADD]] = tail call reassoc float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[RDX]]) -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] -; IF-EVL-NEXT: ret float [[MULADD_LCSSA]] +; IF-EVL-NEXT: ret float [[TMP20]] ; ; NO-VP-LABEL: @fmuladd( ; NO-VP-NEXT: entry: @@ -1696,27 +1544,14 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP15]]) ; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP19]], i32 [[INV:%.*]], i32 [[START:%.*]] -; IF-EVL-NEXT: br label [[FOR_END:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], 3 -; 
IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
 ; IF-EVL: for.end:
-; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
+; IF-EVL-NEXT: ret i32 [[RDX_SELECT]]
 ;
 ; NO-VP-LABEL: @anyof_icmp(
 ; NO-VP-NEXT: entry:
@@ -1801,27 +1636,14 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]]
 ; IF-EVL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
+; IF-EVL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; IF-EVL: middle.block:
 ; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]])
 ; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]]
 ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP19]], i32 [[INV:%.*]], i32 [[START:%.*]]
-; IF-EVL-NEXT: br label [[FOR_END:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], [[SCALAR_PH]] ], [ [[ANYOF:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP20]], 3.000000e+00
-; IF-EVL-NEXT: [[ANYOF]] = select i1 [[CMP_I]], i32 [[INV]], i32 [[RDX]]
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; IF-EVL: for.end:
-; IF-EVL-NEXT: [[ANYOF_LCSSA:%.*]] = phi i32 [ [[ANYOF]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; IF-EVL-NEXT: ret i32 [[ANYOF_LCSSA]]
+; IF-EVL-NEXT: ret i32 [[RDX_SELECT]]
 ;
 ; NO-VP-LABEL: @anyof_fcmp(
 ; NO-VP-NEXT: entry:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
index 5b9bc501afff4..e70894b981dff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reverse-load-store.ll
@@ -43,20 +43,7 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL]], [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
-; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD]]
-; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
-; IF-EVL-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
-; IF-EVL-NEXT: store i32 [[TMP]], ptr [[GEPS]], align 4
-; IF-EVL-NEXT: [[INC]] = add i32 [[I]], 1
-; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]]
 ; IF-EVL: loopend:
 ; IF-EVL-NEXT: ret void
 ;
@@ -179,27 +166,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[LOOPEND:%.*]]
-; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[STARTVAL]], [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ]
-; IF-EVL-NEXT: [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC]] ]
-; IF-EVL-NEXT: [[ADD]] = add i64 [[ADD_PHI]], -1
-; IF-EVL-NEXT: [[GEPL:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[I]]
-; IF-EVL-NEXT: [[TMP:%.*]] = load i32, ptr [[GEPL]], align 4
-; IF-EVL-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP]], 100
-; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; IF-EVL: if.then:
-; IF-EVL-NEXT: [[GEPL1:%.*]] = getelementptr inbounds i32, ptr [[PTR1]], i64 [[ADD]]
-; IF-EVL-NEXT: [[V:%.*]] = load i32, ptr [[GEPL1]], align 4
-; IF-EVL-NEXT: [[GEPS:%.*]] = getelementptr inbounds i32, ptr [[PTR2]], i64 [[ADD]]
-; IF-EVL-NEXT: store i32 [[V]], ptr [[GEPS]], align 4
-; IF-EVL-NEXT: br label [[FOR_INC]]
-; IF-EVL: for.inc:
-; IF-EVL-NEXT: [[INC]] = add i32 [[I]], 1
-; IF-EVL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 1024
-; IF-EVL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEND]]
+; IF-EVL-NEXT: br label [[FOR_INC:%.*]]
 ; IF-EVL: loopend:
 ; IF-EVL-NEXT: ret void
 ;
@@ -351,22 +318,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT: [[TMP32:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[EXIT:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
-; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 1024, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
-; IF-EVL-NEXT: [[X:%.*]] = load i8, ptr [[GEP_A]], align 1
-; IF-EVL-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i8 [[X]]
-; IF-EVL-NEXT: [[Y:%.*]] = load i8, ptr [[GEP_B]], align 1
-; IF-EVL-NEXT: [[GEP_C:%.*]] = getelementptr i8, ptr [[C]], i64 [[IV]]
-; IF-EVL-NEXT: store i8 [[Y]], ptr [[GEP_C]], align 1
-; IF-EVL-NEXT: [[GEP_D:%.*]] = getelementptr i8, ptr [[D]], i64 [[IV]]
-; IF-EVL-NEXT: store i8 [[Y]], ptr [[GEP_D]], align 1
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], -1
-; IF-EVL-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[IV]], 0
-; IF-EVL-NEXT: br i1 [[CMP_NOT]], label [[EXIT]], label [[LOOP]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll
index b13f97d41862e..e1c62fe2d043d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll
@@ -31,19 +31,7 @@ define void @test(ptr %p) {
 ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[EXIT:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
-; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
-; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 8
-; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 200
-; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
-; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 8
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -125,19 +113,7 @@ define void @test_may_clobber1(ptr %p) {
 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[EXIT:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
-; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
-; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100
-; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
-; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -157,19 +133,7 @@ define void @test_may_clobber1(ptr %p) {
 ; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; NO-VP: middle.block:
-; NO-VP-NEXT: br label [[EXIT:%.*]]
-; NO-VP: scalar.ph:
 ; NO-VP-NEXT: br label [[LOOP:%.*]]
-; NO-VP: loop:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
-; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100
-; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
-; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]]
 ; NO-VP: exit:
 ; NO-VP-NEXT: ret void
 ;
@@ -259,19 +223,7 @@ define void @test_may_clobber3(ptr %p) {
 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[EXIT:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
-; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
-; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 10
-; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
-; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -291,19 +243,7 @@ define void @test_may_clobber3(ptr %p) {
 ; NO-VP-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; NO-VP-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; NO-VP: middle.block:
-; NO-VP-NEXT: br label [[EXIT:%.*]]
-; NO-VP: scalar.ph:
 ; NO-VP-NEXT: br label [[LOOP:%.*]]
-; NO-VP: loop:
-; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
-; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 10
-; NO-VP-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
-; NO-VP-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; NO-VP-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; NO-VP-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; NO-VP-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]]
 ; NO-VP: exit:
 ; NO-VP-NEXT: ret void
 ;
@@ -347,19 +287,7 @@ define void @trivial_due_max_vscale(ptr %p) {
 ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[EXIT:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
-; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
-; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 8192
-; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
-; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 199
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
@@ -446,19 +374,7 @@ define void @no_high_lmul_or_interleave(ptr %p) {
 ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[EXIT:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[LOOP:%.*]]
-; IF-EVL: loop:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]]
-; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32
-; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 1024
-; IF-EVL-NEXT: [[A2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET]]
-; IF-EVL-NEXT: store i64 [[V]], ptr [[A2]], align 32
-; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; IF-EVL-NEXT: [[CMP:%.*]] = icmp ne i64 [[IV]], 3001
-; IF-EVL-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]]
 ; IF-EVL: exit:
 ; IF-EVL-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
index 0bb7ad0d57055..f804329169fe0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-uniform-store.ll
@@ -38,16 +38,6 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = sub nuw nsw i64 1, [[IV1]]
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP22]]
-; CHECK-NEXT: store i64 0, ptr [[ARRAYIDX14]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 3
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index 5c89f218fdf7d..c5319c6165f89 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -31,20 +31,6 @@ define void @test_pr98413_zext_removed(ptr %src, ptr noalias %dst, i64 %x) {
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP_SRC]], align 8
-; CHECK-NEXT: [[EXT_L:%.*]] = zext i16 [[L]] to i64
-; CHECK-NEXT: [[AND:%.*]] = and i64 [[X]], [[EXT_L]]
-; CHECK-NEXT: [[TRUNC_AND:%.*]] = trunc i64 [[AND]] to i8
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i8 [[TRUNC_AND]], ptr [[GEP_DST]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 96
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -95,20 +81,6 @@ define void @test_pr98413_sext_removed(ptr %src, ptr noalias %dst, i64 %x) {
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[GEP_SRC]], align 8
-; CHECK-NEXT: [[EXT_L:%.*]] = sext i16 [[L]] to i64
-; CHECK-NEXT: [[AND:%.*]] = and i64 [[X]], [[EXT_L]]
-; CHECK-NEXT: [[TRUNC_AND:%.*]] = trunc i64 [[AND]] to i8
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i8 [[TRUNC_AND]], ptr [[GEP_DST]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 96
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -151,21 +123,6 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[F_039:%.*]] = phi i8 [ 0, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = or i8 23, [[X]]
-; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i8 [[TMP4]] to i1
-; CHECK-NEXT: br i1 [[EXTRACT_T]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[THEN]]:
-; CHECK-NEXT: store i8 0, ptr [[DST]], align 1
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[ADD]] = add i8 [[F_039]], 1
-; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[F_039]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[CONV]], 8
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -260,23 +217,6 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[T1:%.*]] = trunc i64 [[N]] to i32
-; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[T1]], [[T]]
-; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[THEN]]:
-; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[X]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr double, ptr [[SRC]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[RETVAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: store double [[RETVAL]], ptr [[DST]], align 8
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[V]]
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
index 6efb0358242c7..000dc4a13f63a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
@@ -22,20 +22,6 @@ define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) {
 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV1]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[GEP_SRC1]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32
-; CHECK-NEXT: [[MUL16:%.*]] = mul i32 0, [[CONV]]
-; CHECK-NEXT: [[SHR35:%.*]] = lshr i32 [[MUL16]], 1
-; CHECK-NEXT: [[CONV36:%.*]] = trunc i32 [[SHR35]] to i8
-; CHECK-NEXT: store i8 [[CONV36]], ptr null, align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], 8
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 9095d6e87ad4f..bae97e53a1ff9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -29,16 +29,6 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
 ; SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; SCALABLE: [[SCALAR_PH:.*]]:
-; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
-; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -97,16 +87,6 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; TF-SCALABLE: [[SCALAR_PH:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; TF-SCALABLE: [[FOR_END]]:
 ; TF-SCALABLE-NEXT: ret void
 ;
@@ -292,22 +272,6 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
 ; SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; SCALABLE: [[SCALAR_PH:.*]]:
-; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SCALABLE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
-; SCALABLE-NEXT: br i1 [[CMP]], label %[[DO_LOAD:.*]], label %[[LATCH]]
-; SCALABLE: [[DO_LOAD]]:
-; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
-; SCALABLE-NEXT: br label %[[LATCH]]
-; SCALABLE: [[LATCH]]:
-; SCALABLE-NEXT: [[PHI:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[V]], %[[DO_LOAD]] ]
-; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; SCALABLE-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
-; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -389,22 +353,6 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; TF-SCALABLE: [[SCALAR_PH:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; TF-SCALABLE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
-; TF-SCALABLE-NEXT: br i1 [[CMP]], label %[[DO_LOAD:.*]], label %[[LATCH]]
-; TF-SCALABLE: [[DO_LOAD]]:
-; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
-; TF-SCALABLE-NEXT: br label %[[LATCH]]
-; TF-SCALABLE: [[LATCH]]:
-; TF-SCALABLE-NEXT: [[PHI:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[V]], %[[DO_LOAD]] ]
-; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; TF-SCALABLE: [[FOR_END]]:
 ; TF-SCALABLE-NEXT: ret void
 ;
@@ -451,19 +399,9 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]]
 ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
 ; SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; SCALABLE: [[SCALAR_PH:.*]]:
-; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 1
-; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -519,19 +457,9 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]]
 ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; TF-SCALABLE: [[SCALAR_PH:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 1
-; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; TF-SCALABLE: [[FOR_END]]:
 ; TF-SCALABLE-NEXT: ret void
 ;
@@ -571,19 +499,9 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
 ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
 ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
 ; SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; SCALABLE: [[SCALAR_PH:.*]]:
-; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
-; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -639,19 +557,9 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
 ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; TF-SCALABLE: [[SCALAR_PH:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
-; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; TF-SCALABLE: [[FOR_END]]:
 ; TF-SCALABLE-NEXT: ret void
 ;
@@ -700,19 +608,9 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]]
 ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
 ; SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; SCALABLE: [[SCALAR_PH:.*]]:
-; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; SCALABLE-NEXT: store i64 [[IV]], ptr [[B]], align 8
-; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -781,19 +679,9 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]]
 ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; TF-SCALABLE: [[SCALAR_PH:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-SCALABLE-NEXT: store i64 [[IV]], ptr [[B]], align 8
-; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; TF-SCALABLE: [[FOR_END]]:
 ; TF-SCALABLE-NEXT: ret void
 ;
@@ -843,24 +731,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]]
 ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
 ; SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; SCALABLE: [[SCALAR_PH:.*]]:
-; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; SCALABLE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
-; SCALABLE-NEXT: br i1 [[CMP]], label %[[DO_STORE:.*]], label %[[LATCH]]
-; SCALABLE: [[DO_STORE]]:
-; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
-; SCALABLE-NEXT: br label %[[LATCH]]
-; SCALABLE: [[LATCH]]:
-; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -939,24 +812,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]]
 ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; TF-SCALABLE: [[SCALAR_PH:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; TF-SCALABLE-NEXT: [[CMP:%.*]] = icmp ugt i64 [[IV]], 10
-; TF-SCALABLE-NEXT: br i1 [[CMP]], label %[[DO_STORE:.*]], label %[[LATCH]]
-; TF-SCALABLE: [[DO_STORE]]:
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 8
-; TF-SCALABLE-NEXT: br label %[[LATCH]]
-; TF-SCALABLE: [[LATCH]]:
-; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; TF-SCALABLE: [[FOR_END]]:
 ; TF-SCALABLE-NEXT: ret void
 ;
@@ -1002,19 +860,9 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP10]], [[INDEX]]
 ; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP10]]
 ; SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SCALABLE: [[MIDDLE_BLOCK]]:
 ; SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; SCALABLE: [[SCALAR_PH:.*]]:
-; SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; SCALABLE: [[FOR_BODY]]:
-; SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1
-; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; SCALABLE: [[FOR_END]]:
 ; SCALABLE-NEXT: ret void
 ;
@@ -1070,19 +918,9 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[INDEX]]
 ; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]]
 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
-; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
 ; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
-; TF-SCALABLE: [[SCALAR_PH:.*]]:
-; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
-; TF-SCALABLE: [[FOR_BODY]]:
-; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[B]], align 1
-; TF-SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
-; TF-SCALABLE-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8
-; TF-SCALABLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; TF-SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1025
-; TF-SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; TF-SCALABLE: [[FOR_END]]:
 ; TF-SCALABLE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll
index 8c67b4cb7996e..1676461863583 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll
@@ -15,15 +15,6 @@ define void @foo(ptr %arg) #0 {
 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr [3 x i64], ptr [[ARG]], i64 0, i64 [[IV]]
-; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 3
-; CHECK-NEXT: br i1 [[COND]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -61,18 +52,8 @@ define i32 @test_remove_iv(i32 %start) #0 {
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP5]])
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[START]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED_NEXT]] = xor i32 [[RED]], 3
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 5
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP6]]
 ;
 entry:
 br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
index 649ce601c66d1..0a64723b6ff9d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -30,21 +30,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
 ; IF-EVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: middle.block:
-; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
-; IF-EVL: scalar.ph:
 ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
-; IF-EVL: for.body:
-; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP22]]
-; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
-; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
 ; IF-EVL: for.cond.cleanup:
 ; IF-EVL-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
index b0f0c39711274..b106f99130785 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -25,11 +25,7 @@ define i32 @foo(ptr nocapture %A) {
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 poison
 ;
@@ -76,11 +72,7 @@ define i32 @foo1(ptr nocapture noalias %A, ptr nocapture %PtrPtr) {
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 poison
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
index 1d4cbc3cebcde..78c71fd3beb89 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll
@@ -38,15 +38,6 @@ define void @test_scalar_steps_target_instruction_cost(ptr %dst) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[IV]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 3
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], 22
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
index a423f06ae9892..02e82b43fdd80 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -91,23 +91,7 @@ define void @test(ptr %p, i40 %a) {
 ; CHECK: pred.store.continue30:
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SHL:%.*]] = shl i40 [[A]], 24
-; CHECK-NEXT: [[ASHR:%.*]] = ashr i40 [[SHL]], 28
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i40 [[ASHR]] to i32
-; CHECK-NEXT: [[ICMP_EQ:%.*]] = icmp eq i32 [[TRUNC]], 0
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[ICMP_EQ]] to i32
-; CHECK-NEXT: [[ICMP_ULT:%.*]] = icmp ult i32 0, [[ZEXT]]
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[ICMP_ULT]], true
-; CHECK-NEXT: [[ICMP_SGT:%.*]] = icmp sgt i1 [[OR]], false
-; CHECK-NEXT: store i1 [[ICMP_SGT]], ptr [[P]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
index 3c788b2ef539a..ee84ef243570a 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll
@@ -63,19 +63,7 @@ define void @func_21() {
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[LV:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds [5 x i32], ptr @A, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LV]] = load i32, ptr [[A_PTR]], align 4
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds [5 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[SCALAR_RECUR]], ptr [[B_PTR]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 5
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
index d40cb6ea2f60e..cfb180594b0ec 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll
@@ -66,25 +66,6 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p
 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[MUL_IV:%.*]] = mul nsw i64 [[IV]], 4
-; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV]]
-; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
-; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L_1]], 0
-; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[THEN]]:
-; CHECK-NEXT: [[IV_OR:%.*]] = or disjoint i64 [[IV]], 4
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds [8 x i32], ptr @src, i64 0, i64 [[IV_OR]]
-; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC]], align 4
-; CHECK-NEXT: store i32 [[L_2]], ptr [[DST]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 9dd7e9f0e97d5..f65a9d7d45ed8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -22,19 +22,7 @@ define void @f1() {
 ; CHECK-NEXT: store <2 x ptr> <ptr @a, ptr @a>, ptr [[TMP1]], align 8
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[BB3:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[BB2:%.*]]
-; CHECK: bb2:
-; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ [[_TMP9:%.*]], [[BB2]] ]
-; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64
-; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], ptr @a, i16 0, i64 [[_TMP1]]
-; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64
-; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x ptr], ptr @b, i16 0, i64 [[_TMP6]]
-; CHECK-NEXT: store ptr [[_TMP2]], ptr [[_TMP7]], align 8
-; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
-; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]]
 ; CHECK: bb3:
 ; CHECK-NEXT: ret void
 ;
@@ -102,25 +90,7 @@ define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK: pred.store.continue8:
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
-; CHECK: then.1:
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], true
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_0]], i1 false
-; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
-; CHECK: then.2:
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
-; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -195,25 +165,7 @@ define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK: pred.store.continue8:
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
-; CHECK: then.1:
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2
-; CHECK-NEXT: [[OR:%.*]] = or i1 true, [[CMP]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1]], i1 false
-; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
-; CHECK: then.2:
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
-; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -289,25 +241,7 @@ define void @redundant_and_1(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK: pred.store.continue8:
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
-; CHECK: then.1:
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2
-; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], false
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1]], i1 false
-; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
-; CHECK: then.2:
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
-; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -341,6 +275,23 @@ exit:
 define void @redundant_and_2(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK-LABEL: @redundant_and_2(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: loop.header:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT: br i1 [[C_0:%.*]], label [[LOOP_LATCH]], label [[THEN_1:%.*]]
+; CHECK: then.1:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 2
+; CHECK-NEXT: [[OR:%.*]] = and i1 false, [[CMP]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[OR]], i1 [[C_1:%.*]], i1 false
+; CHECK-NEXT: br i1 [[COND]], label [[THEN_2:%.*]], label [[LOOP_LATCH]]
+; CHECK: then.2:
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[IV]]
+; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4
+; CHECK-NEXT: br label [[LOOP_LATCH]]
+; CHECK: loop.latch:
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 3
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
index ee88abbe4d1c0..e0dd3768ec111 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll
@@ -92,24 +92,8 @@ define i64 @second_lshr_operand_zero_via_scev() {
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]]
 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX]])
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOPS:.*]]
-; CHECK: [[LOOPS]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOPS]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOPS]] ]
-; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP14]], [[EXT_0]]
-; CHECK-NEXT: [[CONV_1:%.*]] = zext i32 [[SHR]] to i64
-; CHECK-NEXT: [[RED_NEXT_V:%.*]] = select i1 [[C]], i64 [[AND]], i64 [[CONV_1]]
-; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED_NEXT_V]], [[RED]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOPS]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOPS]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RES]]
+; CHECK-NEXT: ret i64 [[TMP13]]
 ;
 entry:
 %ext.0 = sext i8 0 to i32
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 0ba885d7811b2..9453ad7c61f68 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -696,19 +696,9 @@ define i64 @live_in_known_1_via_scev() {
 ; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[VEC_PHI]])
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 3, [[SCALAR_PH]] ], [ [[RED_MUL:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RED_MUL]] = mul nsw i64 [[RED]], [[P_EXT]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RES]]
+; CHECK-NEXT: ret i64 [[TMP3]]
 ;
 entry:
 %sel = select i1 false, i32 3, i32 0
@@ -753,22 +743,9 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) {
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]])
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT_I_I_I:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 1, [[SCALAR_PH]] ], [ [[RED_MUL:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[NOT_X:%.*]] = xor i1 [[X]], true
-; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[NOT_X]] to i64
-; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[Y]], [[EXT]]
-; CHECK-NEXT: [[RED_MUL]] = mul i64 [[SHL]], [[RED]]
-; CHECK-NEXT: [[IV_NEXT_I_I_I]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[RED_MUL_LCSSA:%.*]] = phi i64 [ [[RED_MUL]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RED_MUL_LCSSA]]
+; CHECK-NEXT: ret i64 [[TMP4]]
 ;
 entry:
 br label %loop
@@ -808,20 +785,9 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 {
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
 ; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP1:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[VEC_EPILOG_PH:%.*]] ], [ [[INC:%.*]], [[LOOP1]] ]
-; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ 0, [[VEC_EPILOG_PH]] ], [ [[OR:%.*]], [[LOOP1]] ]
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[OR13]], 1
-; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]]
-; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 16
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]]
 ; CHECK: exit:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP1]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[OR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP21]]
 ;
 entry:
 %conv = zext i1 %cmp to i32
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
index 3d07eca646380..249efe1706e0f 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll
@@ -39,30 +39,9 @@ define i1 @fn(ptr %nno) #0 {
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[FOR_BODY20:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 10, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC35:%.*]] ]
-; CHECK-NEXT: [[SUM_01:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC35]] ]
-; CHECK-NEXT: [[REM4:%.*]] = and i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP21:%.*]] = icmp eq i64 [[REM4]], 0
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT: br i1 [[CMP21]], label [[IF_THEN22:%.*]], label [[FOR_INC35]]
-; CHECK: if.then:
-; CHECK-NEXT: [[MUL:%.*]] = shl i32 [[TMP15]], 1
-; CHECK-NEXT: [[REM27:%.*]] = urem i32 [[MUL]], 10
-; CHECK-NEXT: br label [[FOR_INC35]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[REM27_PN:%.*]] = phi i32 [ [[REM27]], [[IF_THEN22]] ], [ [[TMP15]], [[FOR_BODY20]] ]
-; CHECK-NEXT: [[SUM_1]] = or i32 [[REM27_PN]], [[SUM_01]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
-; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[EXIT]], label [[FOR_BODY20]]
+; CHECK-NEXT: br label [[FOR_INC35:%.*]]
 ; CHECK: exit:
-; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_INC35]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[CMP41:%.*]] = icmp eq i32 [[SUM_1_LCSSA]], 0
+; CHECK-NEXT: [[CMP41:%.*]] = icmp eq i32 [[TMP14]], 0
 ; CHECK-NEXT: ret i1 [[CMP41]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index d0c311eb4521f..cc84fabd00ecc 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -495,18 +495,7 @@ define void @test_first_order_recurrence_tried_to_scalarized(ptr %dst, i1 %c, i3
 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 4, [[SCALAR_PH]] ], [ [[IV]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 10, [[FOR]]
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i32 [[IV]]
-; CHECK-NEXT: store i32 [[SUB]], ptr [[GEP_DST]], align 4
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 9528510f568fa..2f33e111d8ca7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -45,7 +45,8 @@ define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n
 ; AVX512-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: br label [[FOR_END:%.*]]
-; AVX512: scalar.ph:
+; AVX512: for.end:
+; AVX512-NEXT: ret void
 ;
 ; FVW2-LABEL: @foo1(
 ; FVW2-NEXT: entry:
@@ -70,7 +71,8 @@ define void @foo1(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n
 ; FVW2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; FVW2: middle.block:
 ; FVW2-NEXT: br label [[FOR_END:%.*]]
-; FVW2: scalar.ph:
+; FVW2: for.end:
+; FVW2-NEXT: ret void
 ;
 entry:
 br label %for.body
@@ -137,7 +139,8 @@ define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n
 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: br label [[FOR_END:%.*]]
-; AVX512: scalar.ph:
+; AVX512: for.end:
+; AVX512-NEXT: ret void
 ;
 ; FVW2-LABEL: @foo2(
 ; FVW2-NEXT: entry:
@@ -182,7 +185,8 @@ define void @foo2(ptr noalias %in, ptr noalias %out, ptr noalias %trigger, ptr n
 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; FVW2: middle.block:
 ; FVW2-NEXT: br label [[FOR_END:%.*]]
-; FVW2: scalar.ph:
+; FVW2: for.end:
+; FVW2-NEXT: ret void
 ;
 entry:
 br label %for.body
@@ -250,7 +254,8 @@ define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) {
 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: br label [[FOR_END:%.*]]
-; AVX512: scalar.ph:
+; AVX512: for.end:
+; AVX512-NEXT: ret void
 ;
 ; FVW2-LABEL: @foo3(
 ; FVW2-NEXT: entry:
@@ -295,7 +300,8 @@ define void @foo3(ptr noalias %in, ptr noalias %out, ptr noalias %trigger) {
 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; FVW2: middle.block:
 ; FVW2-NEXT: br label [[FOR_END:%.*]]
-; FVW2: scalar.ph:
+; FVW2: for.end:
+; FVW2-NEXT: ret void
 ;
 entry:
 br label %for.body
@@ -350,7 +356,8 @@ define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noali
 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: br label [[FOR_END:%.*]]
-; AVX512: scalar.ph:
+; AVX512: for.end:
+; AVX512-NEXT: ret void
 ;
 ; FVW2-LABEL: @foo2_addrspace(
 ; FVW2-NEXT: entry:
@@ -395,7 +402,8 @@ define void @foo2_addrspace(ptr addrspace(1) noalias %in, ptr addrspace(1) noali
 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; FVW2: middle.block:
 ; FVW2-NEXT: br label [[FOR_END:%.*]]
-; FVW2: scalar.ph:
+; FVW2: for.end:
+; FVW2-NEXT: ret void
 ;
 entry:
 br label %for.body
@@ -449,7 +457,8 @@ define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noal
 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: br label [[FOR_END:%.*]]
-; AVX512: scalar.ph:
+; AVX512: for.end:
+; AVX512-NEXT: ret void
 ;
 ; FVW2-LABEL: @foo2_addrspace2(
 ; FVW2-NEXT: entry:
@@ -494,7 +503,8 @@ define void @foo2_addrspace2(ptr addrspace(1) noalias %in, ptr addrspace(0) noal
 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; FVW2: middle.block:
 ; FVW2-NEXT: br label [[FOR_END:%.*]]
-; FVW2: scalar.ph:
+; FVW2: for.end:
+; FVW2-NEXT: ret void
 ;
 entry:
 br label %for.body
@@ -548,7 +558,8 @@ define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noal
 ; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: br label [[FOR_END:%.*]]
-; AVX512: scalar.ph:
+; AVX512: for.end:
+; AVX512-NEXT: ret void
 ;
 ; FVW2-LABEL: @foo2_addrspace3(
 ; FVW2-NEXT: entry:
@@ -593,7 +604,8 @@ define void @foo2_addrspace3(ptr addrspace(0) noalias %in, ptr addrspace(1) noal
 ; FVW2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; FVW2: middle.block:
 ; FVW2-NEXT: br label [[FOR_END:%.*]]
-; FVW2: scalar.ph:
+; FVW2: for.end:
+; FVW2-NEXT: ret void
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
index b2d587cbb1df9..877fcd4d638eb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -90,29 +90,9 @@ define double @sumIfVector(ptr nocapture readonly %arr) {
 ; SSE: middle.block:
 ; SSE-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[PREDPHI3]], [[PREDPHI]]
 ; SSE-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
-; SSE-NEXT: br label [[DONE:%.*]]
-; SSE: scalar.ph:
-; SSE-NEXT: br label [[LOOP:%.*]]
-; SSE:
loop: -; SSE-NEXT: [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] -; SSE-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] -; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]] -; SSE-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8 -; SSE-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01 -; SSE-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]] -; SSE: do.add: -; SSE-NEXT: [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]] -; SSE-NEXT: br label [[NEXT_ITER]] -; SSE: no.add: -; SSE-NEXT: br label [[NEXT_ITER]] -; SSE: next.iter: -; SSE-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ] -; SSE-NEXT: [[I_NEXT]] = add i32 [[I]], 1 -; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 -; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]] +; SSE-NEXT: br label [[NEXT_ITER:%.*]] ; SSE: done: -; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] -; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]] +; SSE-NEXT: ret double [[TMP11]] ; ; AVX-LABEL: @sumIfVector( ; AVX-NEXT: entry: @@ -153,29 +133,9 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; AVX-NEXT: [[BIN_RDX10:%.*]] = fadd fast <4 x double> [[PREDPHI8]], [[BIN_RDX]] ; AVX-NEXT: [[BIN_RDX11:%.*]] = fadd fast <4 x double> [[PREDPHI9]], [[BIN_RDX10]] ; AVX-NEXT: [[TMP21:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[BIN_RDX11]]) -; AVX-NEXT: br label [[DONE:%.*]] -; AVX: scalar.ph: -; AVX-NEXT: br label [[LOOP:%.*]] -; AVX: loop: -; AVX-NEXT: [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] -; AVX-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] -; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]] -; AVX-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8 -; AVX-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01 -; AVX-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]] -; AVX: do.add: -; AVX-NEXT: [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]] -; AVX-NEXT: br label [[NEXT_ITER]] -; AVX: no.add: -; AVX-NEXT: br label [[NEXT_ITER]] -; AVX: next.iter: -; AVX-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ] -; AVX-NEXT: [[I_NEXT]] = add i32 [[I]], 1 -; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 -; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]] +; AVX-NEXT: br label [[NEXT_ITER:%.*]] ; AVX: done: -; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]] +; AVX-NEXT: ret double [[TMP21]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 27eef017727dc..a19b294541172 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -409,21 +409,9 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: 
br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[REC_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[IV]] to i16 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[IV_NEXT]] to i16 -; CHECK-NEXT: [[REC_NEXT]] = mul i16 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i16 [[REC_LCSSA]] +; CHECK-NEXT: ret i16 [[VECTOR_RECUR_EXTRACT_FOR_PHI]] ; entry: br label %loop @@ -612,16 +600,7 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) { ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[DST]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 91c7e7a37eb93..2f9627855a2c9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -38,36 +38,6 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[P_INVAR]], align 4 -; CHECK-NEXT: [[IV_MUL:%.*]] = shl i64 [[IV]], 2 -; CHECK-NEXT: [[GEP_DST_19:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[IV_MUL]] -; CHECK-NEXT: store float [[L_0]], ptr [[GEP_DST_19]], align 4 -; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[P_INVAR]], align 4 -; CHECK-NEXT: [[ADD_1:%.*]] = or disjoint i64 [[IV_MUL]], 1 -; CHECK-NEXT: [[GEP_DST_119:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[ADD_1]] -; CHECK-NEXT: store float [[L_1]], ptr [[GEP_DST_119]], align 4 -; CHECK-NEXT: [[ADD_2:%.*]] = or disjoint i64 [[IV_MUL]], 2 -; CHECK-NEXT: [[GEP_DST_129:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[ADD_2]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_DST_129]], align 4 -; CHECK-NEXT: [[ADD_3:%.*]] = or disjoint i64 [[IV_MUL]], 3 -; CHECK-NEXT: [[GEP_DST_140:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[ADD_3]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_DST_140]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[P_INVAR]], align 4 -; CHECK-NEXT: [[GEP_DST_247:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[IV_MUL]] -; 
CHECK-NEXT: store float [[L_2]], ptr [[GEP_DST_247]], align 4 -; CHECK-NEXT: [[GEP_DST_255:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[ADD_1]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_DST_255]], align 4 -; CHECK-NEXT: [[GEP_DST_265:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[ADD_2]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_DST_265]], align 4 -; CHECK-NEXT: [[GEP_DST_276:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[ADD_3]] -; CHECK-NEXT: store float 0.000000e+00, ptr [[GEP_DST_276]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -504,17 +474,6 @@ define void @interleave_store_double_i64(ptr %dst) { ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]], i32 1 -; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_1]], align 8 -; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_0]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -616,17 +575,6 @@ define void @interleave_store_i64_double_2(ptr %dst) { ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_0]], align 8 -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]], i32 1 -; CHECK-NEXT: store double 0.000000e+00, ptr [[GEP_1]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll index 228bc80cef9d1..e2329fe31cd56 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -34,13 +34,9 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; SSE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; SSE-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SSE: middle.block: -; SSE-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; SSE: scalar.ph: ; SSE-NEXT: br label [[FOR_BODY:%.*]] ; SSE: for.cond.cleanup: ; SSE-NEXT: ret void -; SSE: for.body: -; SSE-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; ; AVX1-LABEL: @foo( ; AVX1-NEXT: entry: @@ -88,13 +84,9 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; AVX1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; AVX1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: -; AVX1-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; AVX1: scalar.ph: ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.cond.cleanup: ; AVX1-NEXT: ret void -; AVX1: for.body: -; AVX1-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; ; AVX2-LABEL: @foo( ; AVX2-NEXT: entry: @@ -142,13 +134,9 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; AVX2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; AVX2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: -; AVX2-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; AVX2: scalar.ph: ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.cond.cleanup: ; AVX2-NEXT: ret void -; AVX2: for.body: -; AVX2-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; ; ATOM-LABEL: @foo( ; ATOM-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 5853e914ce112..5d40e6ab954fd 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -409,16 +409,6 @@ define void @test_store_of_final_reduction_value(i64 %x, ptr %dst) { ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP0]]) ; CHECK-NEXT: store i64 [[TMP1]], ptr [[DST]], align 8 ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV4:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED_NEXT]] = mul i64 [[RED]], [[X]] -; CHECK-NEXT: store i64 [[RED_NEXT]], ptr [[DST]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV4]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV4]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 9e0ef737eb59f..2a8c698f3f7fa 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -63,27 +63,9 @@ define i32 @test_explicit_pred(i64 %len) { ; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP18]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP19]], [[BIN_RDX13]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EARLYCND:%.*]] = icmp slt i64 [[IV]], [[LEN]] -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: 
[[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP21]] ; entry: %alloca = alloca [4096 x i32] @@ -212,28 +194,9 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %alloca = alloca [4096 x i32] @@ -390,27 +353,9 @@ define i32 @test_invariant_address(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP98]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP99]], [[BIN_RDX7]] ; CHECK-NEXT: [[TMP101:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ALLOCA]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ 
[[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP101]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP101]] ; entry: %alloca = alloca [4096 x i32] @@ -659,28 +604,9 @@ define i32 @test_step_narrower_than_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP146]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP147]], [[BIN_RDX37]] ; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR_I16P:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR_I16P]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP149]] ; entry: %alloca = alloca [4096 x i32] @@ -974,28 +900,9 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1024, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %alloca = alloca [4096 x i32] 
@@ -1216,28 +1123,9 @@ define i32 @test_non_unit_stride(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]] ; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 2 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4093 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP117]] ; entry: %alloca = alloca [4096 x i32] @@ -1366,28 +1254,9 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %alloca = alloca [1024 x i32] @@ -1516,28 +1385,9 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> 
[[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %alloca = alloca [4095 x i32] @@ -1666,28 +1516,9 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %alloca = alloca [16383 x i8] @@ -1985,28 +1816,9 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; 
CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %allocation = call nonnull ptr @my_alloc(i32 16384) @@ -2136,28 +1948,9 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %allocation = call nonnull ptr @my_array_alloc(i32 4096, i32 4) @@ -2297,28 +2090,9 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP74]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP75]], [[BIN_RDX10]] ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ 
[[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[ALLOCATION]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP77]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP77]] ; entry: %allocation = call nonnull ptr @my_alloc(i32 16384) diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll index d0991a5c52fd2..e23f8a9b63ef0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -1199,19 +1199,7 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b ; O1VEC2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; O1VEC2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; O1VEC2: middle.block: -; O1VEC2-NEXT: br label [[FOR_END:%.*]] -; O1VEC2: scalar.ph: ; O1VEC2-NEXT: br label [[FOR_BODY:%.*]] -; O1VEC2: for.body: -; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV]] -; O1VEC2-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[N]] -; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS_IV]] -; O1VEC2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]] ; O1VEC2: for.end: ; O1VEC2-NEXT: [[TMP11:%.*]] = load i32, ptr [[A]], align 4 ; O1VEC2-NEXT: ret i32 [[TMP11]] @@ -1239,19 +1227,7 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b ; OzVEC2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; OzVEC2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; OzVEC2: middle.block: -; OzVEC2-NEXT: br label [[FOR_END:%.*]] -; OzVEC2: scalar.ph: ; OzVEC2-NEXT: br label [[FOR_BODY:%.*]] -; OzVEC2: for.body: -; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV]] -; OzVEC2-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[N]] -; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = 
getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS_IV]] -; OzVEC2-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 -; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 -; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]] ; OzVEC2: for.end: ; OzVEC2-NEXT: [[TMP11:%.*]] = load i32, ptr [[A]], align 4 ; OzVEC2-NEXT: ret i32 [[TMP11]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll index fc37e5f96c309..e1140b59681fe 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -32,18 +32,6 @@ define i32 @foo_optsize() #0 { ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; CHECK-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret i32 0 ; @@ -69,18 +57,6 @@ define i32 @foo_optsize() #0 { ; AUTOVF-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTOVF: [[MIDDLE_BLOCK]]: ; AUTOVF-NEXT: br label %[[FOR_END:.*]] -; AUTOVF: [[SCALAR_PH:.*]]: -; AUTOVF-NEXT: br label %[[FOR_BODY:.*]] -; AUTOVF: [[FOR_BODY]]: -; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] -; AUTOVF-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 -; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; AUTOVF-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 -; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 -; AUTOVF-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; AUTOVF: [[FOR_END]]: ; AUTOVF-NEXT: ret i32 0 ; @@ -128,18 +104,6 @@ define i32 @foo_minsize() #1 { ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; CHECK-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret i32 0 ; @@ -165,18 +129,6 @@ define i32 @foo_minsize() #1 { ; AUTOVF-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AUTOVF: [[MIDDLE_BLOCK]]: ; AUTOVF-NEXT: br label %[[FOR_END:.*]] -; AUTOVF: [[SCALAR_PH:.*]]: -; AUTOVF-NEXT: br label %[[FOR_BODY:.*]] -; AUTOVF: [[FOR_BODY]]: -; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 [[I_08]] -; AUTOVF-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP7]], 0 -; AUTOVF-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 -; AUTOVF-NEXT: store i8 [[DOT]], ptr [[ARRAYIDX]], align 1 -; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 -; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 -; AUTOVF-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; AUTOVF: [[FOR_END]]: ; AUTOVF-NEXT: ret i32 0 ; @@ -226,18 +178,6 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_07]] -; CHECK-NEXT: store i32 [[TMP6]], ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]] ; CHECK: [[FOR_END_LOOPEXIT]]: ; CHECK-NEXT: ret void ; @@ -263,18 +203,6 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; AUTOVF-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTOVF: [[MIDDLE_BLOCK]]: ; AUTOVF-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] -; AUTOVF: [[SCALAR_PH:.*]]: -; AUTOVF-NEXT: br label %[[FOR_BODY:.*]] -; AUTOVF: [[FOR_BODY]]: -; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; AUTOVF-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] -; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]] -; AUTOVF-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; AUTOVF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[I_07]] -; AUTOVF-NEXT: store i32 [[TMP6]], ptr [[ARRAYIDX1]], align 4 -; AUTOVF-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 -; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256 -; AUTOVF-NEXT: br i1 [[EXITCOND]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]] ; AUTOVF: [[FOR_END_LOOPEXIT]]: ; AUTOVF-NEXT: ret void ; @@ -431,14 +359,6 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label 
%[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 -72 -; CHECK-NEXT: store ptr null, ptr [[PTR_IV]], align 8 -; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -475,14 +395,6 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; AUTOVF-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AUTOVF: [[MIDDLE_BLOCK]]: ; AUTOVF-NEXT: br label %[[EXIT:.*]] -; AUTOVF: [[SCALAR_PH:.*]]: -; AUTOVF-NEXT: br label %[[LOOP:.*]] -; AUTOVF: [[LOOP]]: -; AUTOVF-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; AUTOVF-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 -72 -; AUTOVF-NEXT: store ptr null, ptr [[PTR_IV]], align 8 -; AUTOVF-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; AUTOVF-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; AUTOVF: [[EXIT]]: ; AUTOVF-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll index 65f84871e9b34..5d76dfb781636 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll @@ -108,11 +108,7 @@ define void @parallel_loop(ptr nocapture %a, ptr nocapture %b) nounwind uwtable ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll index 62eacf6ab5953..619693abf51e4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll @@ -104,23 +104,8 @@ define i8 @pr141968(i1 %cond, i8 %v) { ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: br i1 [[COND]], label %[[LOOP_LATCH]], label %[[COND_FALSE:.*]] -; CHECK: [[COND_FALSE]]: -; CHECK-NEXT: [[SDIV:%.*]] = sdiv i16 [[SEXT]], [[ZEXT_TRUE]] -; CHECK-NEXT: [[SDIV_TRUNC:%.*]] = trunc i16 [[SDIV]] to i8 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[RET:%.*]] = phi i8 [ [[SDIV_TRUNC]], %[[COND_FALSE]] ], [ 0, %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i8 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RET_LCSSA:%.*]] = phi i8 [ 
[[RET]], %[[LOOP_LATCH]] ], [ [[PREDPHI]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i8 [[RET_LCSSA]] +; CHECK-NEXT: ret i8 [[PREDPHI]] ; entry: %zext.true = zext i1 true to i16 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll index 972164fe49624..47db49c72766a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -16,26 +16,13 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP2:%.*]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[B:%.*]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[A:%.*]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[A]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP6]], [[TMP7]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll index 00980655b61ed..e7f56a45ebdc6 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll @@ -43,26 +43,8 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[CMP_1]], label 
%[[LOOP_LATCH]], label %[[THEN:.*]] -; CHECK: [[THEN]]: -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[M:%.*]] = phi i32 [ [[L]], %[[THEN]] ], [ 0, %[[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store i32 [[M]], ptr [[GEP_DST]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[CMP_2:%.*]] = icmp slt i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[CMP_2]], label %[[LOOP_HEADER]], label %[[EXIT]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[GEP_LCSSA:%.*]] = phi ptr [ [[GEP_SRC]], %[[LOOP_LATCH]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret ptr [[GEP_LCSSA]] +; CHECK-NEXT: ret ptr [[TMP2]] ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll index 3922796a1a4b8..36163790706ed 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll @@ -39,23 +39,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF0:![0-9]+]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[BB6:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 99, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1 -; CHECK-NEXT: [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0 -; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF5:![0-9]+]] -; CHECK: bb18: -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 -; CHECK-NEXT: [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]] -; CHECK-NEXT: store i64 1, ptr [[GETELEMENTPTR19]], align 8 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90 -; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: bb6: ; CHECK-NEXT: ret void ; @@ -99,6 +83,4 @@ attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" } ; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[META4]] = !{!"llvm.loop.estimated_trip_count", i32 24} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 1, i32 1} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 1, i32 95} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll index 2bc3a97d162f0..f066000fe9f66 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -71,23 +71,11 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) { ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP7]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ 0.000000e+00, [[SCALAR_PH]] ] -; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]] -; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4 -; CHECK-NEXT: [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]] -; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1 -; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096 -; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]] ; CHECK: loop.exit.loopexit: -; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_EXIT]] ; CHECK: loop.exit: -; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP9]], [[LOOP]] ] ; CHECK-NEXT: ret float [[SUM_LCSSA]] ; entry: @@ -134,23 +122,11 @@ define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) { ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP7]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ -0.000000e+00, [[SCALAR_PH]] ] -; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]] -; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4 -; CHECK-NEXT: [[SUM_INC]] = fadd reassoc float [[SUM]], [[VALUE]] -; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1 -; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096 -; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]] ; CHECK: loop.exit.loopexit: -; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_EXIT]] ; CHECK: loop.exit: -; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP9]], [[LOOP]] ] ; CHECK-NEXT: ret float [[SUM_LCSSA]] ; entry: @@ -197,23 +173,11 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array) ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP7]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = call 
reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ -0.000000e+00, [[SCALAR_PH]] ] -; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]] -; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4 -; CHECK-NEXT: [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]] -; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1 -; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096 -; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]] ; CHECK: loop.exit.loopexit: -; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_EXIT]] ; CHECK: loop.exit: -; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[SUM_INC_LCSSA]], [[LOOP_EXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi float [ -0.000000e+00, [[ENTRY:%.*]] ], [ [[TMP9]], [[LOOP]] ] ; CHECK-NEXT: ret float [[SUM_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll index 90f3df50153a2..70b05ac34559e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll @@ -50,23 +50,6 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]] -; CHECK: [[ELSE]]: -; CHECK-NEXT: [[REM1:%.*]] = urem i64 [[MUL]], [[X]] -; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[REM1]], i64 0) -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, %[[LOOP_HEADER]] ], [ [[SMAX]], %[[ELSE]] ] -; CHECK-NEXT: [[IV_NEXT:%.*]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[DST]], i64 [[IV_NEXT]] -; CHECK-NEXT: store i64 0, ptr [[GEP1]], align 8 -; CHECK-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT1]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll index b713a39c078d5..272b62bdbd5aa 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll @@ -33,8 +33,6 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT_1_LOOPEXIT1:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_3:.*]] ; CHECK: [[LOOP_2_PREHEADER]]: ; CHECK-NEXT: br label %[[LOOP_2:.*]] ; CHECK: 
[[LOOP_2]]: @@ -48,13 +46,6 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr ; CHECK-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i64 [[IV_2]], [[IV_1_LCSSA]] ; CHECK-NEXT: br i1 [[EC_2]], label %[[LOOP_2]], label %[[EXIT_1_LOOPEXIT:.*]] -; CHECK: [[LOOP_3]]: -; CHECK-NEXT: [[IV_4:%.*]] = phi i64 [ [[IV_4_NEXT:%.*]], %[[LOOP_3]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP_DST_2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_4]] -; CHECK-NEXT: store i8 0, ptr [[GEP_DST_2]], align 1 -; CHECK-NEXT: [[IV_4_NEXT]] = add i64 [[IV_4]], 1 -; CHECK-NEXT: [[EC_3:%.*]] = icmp ult i64 [[IV_4_NEXT]], [[IV_1_LCSSA]] -; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP_3]], label %[[EXIT_1_LOOPEXIT1]] ; CHECK: [[EXIT_1_LOOPEXIT]]: ; CHECK-NEXT: br label %[[EXIT_1:.*]] ; CHECK: [[EXIT_1_LOOPEXIT1]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index f877e1b311cea..e99ffda9e4043 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -39,12 +39,8 @@ define void @example1() optsize { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[TMP7:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP6:%.*]] ; CHECK: 6: -; CHECK-NEXT: br i1 poison, label [[TMP7]], label [[TMP6]] -; CHECK: 7: ; CHECK-NEXT: ret void ; br label %1 @@ -123,8 +119,6 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[DOT_PREHEADER_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[DOTLR_PH5:%.*]] ; CHECK: ..preheader_crit_edge: ; CHECK-NEXT: [[PHITMP:%.*]] = zext nneg i32 [[N]] to i64 ; CHECK-NEXT: br label [[DOTPREHEADER]] @@ -134,7 +128,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: br i1 [[TMP16]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]] ; CHECK: .lr.ph.preheader: ; CHECK-NEXT: br label [[VECTOR_PH8:%.*]] -; CHECK: vector.ph8: +; CHECK: vector.ph7: ; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: [[N_RND_UP10:%.*]] = add nuw nsw i64 [[TMP17]], 3 ; CHECK-NEXT: [[N_VEC12:%.*]] = and i64 [[N_RND_UP10]], 8589934588 @@ -142,7 +136,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_114]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT19]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY13:%.*]] -; CHECK: vector.body15: +; CHECK: vector.body14: ; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ 0, [[VECTOR_PH8]] ], [ [[INDEX_NEXT29:%.*]], [[PRED_STORE_CONTINUE26:%.*]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX16]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX16]], i64 0 @@ -151,7 +145,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]] ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i64 0 ; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] -; CHECK: 
pred.store.if19: +; CHECK: pred.store.if18: ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr @b, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr @c, i64 [[OFFSET_IDX]] @@ -160,10 +154,10 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[TMP25:%.*]] = and i32 [[TMP23]], [[TMP21]] ; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] -; CHECK: pred.store.continue20: +; CHECK: pred.store.continue19: ; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP18]], i64 1 ; CHECK-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] -; CHECK: pred.store.if21: +; CHECK: pred.store.if20: ; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr @b, i64 [[TMP27]] ; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP28]], align 4 @@ -173,10 +167,10 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[TMP33:%.*]] = and i32 [[TMP31]], [[TMP29]] ; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] -; CHECK: pred.store.continue22: +; CHECK: pred.store.continue21: ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP18]], i64 2 ; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] -; CHECK: pred.store.if23: +; CHECK: pred.store.if22: ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 2 ; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr @b, i64 [[TMP35]] ; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 @@ -186,10 +180,10 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[TMP41:%.*]] = and i32 [[TMP39]], [[TMP37]] ; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]] -; CHECK: pred.store.continue24: +; CHECK: pred.store.continue23: ; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[TMP18]], i64 3 ; CHECK-NEXT: br i1 [[TMP42]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26]] -; CHECK: pred.store.if25: +; CHECK: pred.store.if24: ; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[OFFSET_IDX]], 3 ; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr @b, i64 [[TMP43]] ; CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[TMP44]], align 4 @@ -199,18 +193,12 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[TMP49:%.*]] = and i32 [[TMP47]], [[TMP45]] ; CHECK-NEXT: store i32 [[TMP49]], ptr [[TMP48]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]] -; CHECK: pred.store.continue26: +; CHECK: pred.store.continue25: ; CHECK-NEXT: [[INDEX_NEXT29]] = add nuw i64 [[INDEX16]], 4 ; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]] -; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK28:%.*]], label [[VECTOR_BODY13]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: middle.block28: -; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: scalar.ph7: +; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK27:%.*]], label [[VECTOR_BODY13]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block27: ; CHECK-NEXT: br label [[DOTLR_PH1:%.*]] -; CHECK: .lr.ph5: -; CHECK-NEXT: br i1 poison, label [[DOT_PREHEADER_CRIT_EDGE]], label [[DOTLR_PH5]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOTLR_PH]], label [[DOTLR_PH1]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: @@ 
-328,11 +316,7 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: @@ -418,12 +402,8 @@ define void @example23b(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[TMP5:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP4:%.*]] ; CHECK: 4: -; CHECK-NEXT: br i1 poison, label [[TMP5]], label [[TMP4]] -; CHECK: 5: ; CHECK-NEXT: ret void ; br label %1 @@ -516,12 +496,8 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[TMP26:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP25:%.*]] ; CHECK: 25: -; CHECK-NEXT: br i1 poison, label [[TMP26]], label [[TMP25]] -; CHECK: 26: ; CHECK-NEXT: ret void ; br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 931c927d304ed..15e26782f8e66 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -15,17 +15,17 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[ITER_CHECK:.*]]: ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J]] to i64 -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] -; CHECK: [[VECTOR_PH1]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP144:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP145:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP146:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP147:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP144:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = 
phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP145:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP146:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP147:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -184,15 +184,15 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: br i1 false, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], %[[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], %[[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0 ; CHECK-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1 ; CHECK-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2 @@ -216,20 +216,20 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]] ; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 -; CHECK-NEXT: br i1 [[TMP169]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP169]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) -; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ 
[[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: br label %[[FOR_BODY1:.*]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_COND_CLEANUP]]: -; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], %[[FOR_BODY1]] ], [ [[TMP149]], %[[MIDDLE_BLOCK]] ], [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], %[[FOR_BODY]] ], [ [[TMP149]], %[[MIDDLE_BLOCK]] ], [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ADD7_LCSSA]] -; CHECK: [[FOR_BODY1]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY1]] ] -; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], %[[SCALAR_PH]] ], [ [[ADD7]], %[[FOR_BODY1]] ] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD7]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA1]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] @@ -239,24 +239,24 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; CHECK-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; ; MAX-BW-LABEL: define i32 @matrix_row_col( ; MAX-BW-SAME: ptr readonly captures(none) [[DATA:%.*]], i32 [[I:%.*]], i32 [[J:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; MAX-BW-NEXT: [[ITER_CHECK:.*]]: ; MAX-BW-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 ; MAX-BW-NEXT: [[IDXPROM5:%.*]] = sext i32 [[J]] to i64 -; MAX-BW-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; MAX-BW-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; MAX-BW: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; MAX-BW-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; MAX-BW: [[VECTOR_PH]]: -; MAX-BW-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] -; MAX-BW: [[VECTOR_PH1]]: ; MAX-BW-NEXT: br label %[[VECTOR_BODY:.*]] ; MAX-BW: [[VECTOR_BODY]]: -; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP144:%.*]], %[[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP145:%.*]], %[[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP146:%.*]], %[[VECTOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH1]] ], [ [[TMP147:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI:%.*]] = phi <8 x 
i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP144:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP145:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP146:%.*]], %[[VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP147:%.*]], %[[VECTOR_BODY]] ] ; MAX-BW-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; MAX-BW-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; MAX-BW-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 @@ -415,15 +415,15 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX8]]) ; MAX-BW-NEXT: br i1 false, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; MAX-BW: [[VEC_EPILOG_ITER_CHECK]]: -; MAX-BW-NEXT: br i1 false, label %[[SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; MAX-BW-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]] ; MAX-BW: [[VEC_EPILOG_PH]]: -; MAX-BW-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] -; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] +; MAX-BW-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; MAX-BW-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; MAX-BW-NEXT: [[TMP171:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 -; MAX-BW-NEXT: br label %[[FOR_BODY:.*]] -; MAX-BW: [[FOR_BODY]]: -; MAX-BW-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], %[[FOR_BODY]] ] -; MAX-BW-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], %[[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], %[[FOR_BODY]] ] +; MAX-BW-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; MAX-BW: [[VEC_EPILOG_VECTOR_BODY]]: +; MAX-BW-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; MAX-BW-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP171]], %[[VEC_EPILOG_PH]] ], [ [[TMP168:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; MAX-BW-NEXT: [[TMP172:%.*]] = add i64 [[INDEX9]], 0 ; MAX-BW-NEXT: [[TMP173:%.*]] = add i64 [[INDEX9]], 1 ; MAX-BW-NEXT: [[TMP174:%.*]] = add i64 [[INDEX9]], 2 @@ -447,20 +447,20 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[TMP168]] = add <4 x i32> [[TMP167]], [[TMP166]] ; MAX-BW-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 4 ; MAX-BW-NEXT: [[TMP169:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 100 -; MAX-BW-NEXT: br i1 [[TMP169]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; MAX-BW-NEXT: br i1 [[TMP169]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; MAX-BW: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; MAX-BW-NEXT: [[TMP170:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP168]]) -; MAX-BW-NEXT: br i1 true, label %[[FOR_COND_CLEANUP]], label %[[SCALAR_PH]] -; MAX-BW: [[SCALAR_PH]]: +; MAX-BW-NEXT: br i1 true, label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]] +; MAX-BW: [[VEC_EPILOG_SCALAR_PH]]: ; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ 100, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; MAX-BW-NEXT: [[BC_MERGE_RDX13:%.*]] = phi i32 [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP149]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; MAX-BW-NEXT: br label %[[FOR_BODY1:.*]] +; MAX-BW-NEXT: br label %[[FOR_BODY:.*]] ; MAX-BW: [[FOR_COND_CLEANUP]]: -; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], %[[FOR_BODY1]] ], [ [[TMP149]], %[[MIDDLE_BLOCK]] ], [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; MAX-BW-NEXT: [[ADD7_LCSSA:%.*]] = phi i32 [ [[ADD7:%.*]], %[[FOR_BODY]] ], [ [[TMP149]], %[[MIDDLE_BLOCK]] ], [ [[TMP170]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; MAX-BW-NEXT: ret i32 [[ADD7_LCSSA]] -; MAX-BW: [[FOR_BODY1]]: -; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY1]] ] -; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], %[[SCALAR_PH]] ], [ [[ADD7]], %[[FOR_BODY1]] ] +; MAX-BW: [[FOR_BODY]]: +; MAX-BW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MAX-BW-NEXT: [[SUM_015:%.*]] = phi i32 [ [[BC_MERGE_RDX13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD7]], %[[FOR_BODY]] ] ; MAX-BW-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[IDXPROM]], i64 [[INDVARS_IV]] ; MAX-BW-NEXT: [[TMP150:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[INT_TBAA1]] ; MAX-BW-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x i32], ptr [[DATA]], i64 [[INDVARS_IV]], i64 [[IDXPROM5]] @@ -470,7 +470,7 @@ define i32 @matrix_row_col(ptr nocapture readonly %data, i32 %i, i32 %j) local_u ; MAX-BW-NEXT: [[ADD7]] = add i32 [[ADD]], [[MUL]] ; MAX-BW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; MAX-BW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; MAX-BW-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] +; MAX-BW-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; entry: %idxprom = sext i32 %i to i64 @@ -555,26 +555,9 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK-NEXT: store i8 [[TMP35]], ptr [[TMP27]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0 -; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]] -; CHECK-NEXT: [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]] -; CHECK-NEXT: [[V0:%.*]] = load i32, ptr [[IN0]], align 4 -; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[IN1]], align 4 -; CHECK-NEXT: [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]] -; CHECK-NEXT: [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8 -; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], 
i64 0, i64 [[IV_0]] -; CHECK-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] ; CHECK: [[FOR_COND_CLEANUP]]: ; CHECK-NEXT: ret void ; @@ -675,26 +658,9 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW-NEXT: store i8 [[TMP67]], ptr [[TMP51]], align 1 ; MAX-BW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; MAX-BW-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 -; MAX-BW-NEXT: br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; MAX-BW-NEXT: br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; MAX-BW: [[MIDDLE_BLOCK]]: ; MAX-BW-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; MAX-BW: [[SCALAR_PH:.*]]: -; MAX-BW-NEXT: br label %[[FOR_BODY:.*]] -; MAX-BW: [[FOR_BODY]]: -; MAX-BW-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] -; MAX-BW-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0 -; MAX-BW-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1 -; MAX-BW-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]] -; MAX-BW-NEXT: [[IN1:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_1]] -; MAX-BW-NEXT: [[V0:%.*]] = load i32, ptr [[IN0]], align 4 -; MAX-BW-NEXT: [[V1:%.*]] = load i32, ptr [[IN1]], align 4 -; MAX-BW-NEXT: [[REDUCE_ADD_0:%.*]] = add i32 [[V0]], [[V1]] -; MAX-BW-NEXT: [[REDUCE_ADD_0_NARROW:%.*]] = trunc i32 [[REDUCE_ADD_0]] to i8 -; MAX-BW-NEXT: [[OUT:%.*]] = getelementptr inbounds [1024 x i8], ptr [[B]], i64 0, i64 [[IV_0]] -; MAX-BW-NEXT: store i8 [[REDUCE_ADD_0_NARROW]], ptr [[OUT]], align 1 -; MAX-BW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV_0]], 2 -; MAX-BW-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 1024 -; MAX-BW-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP]] ; MAX-BW: [[FOR_COND_CLEANUP]]: ; MAX-BW-NEXT: ret void ; @@ -745,9 +711,10 @@ attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+s ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} ; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]], [[META7]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]], [[META6]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META6]], [[META7]]} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 4, i32 28} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]], [[META7]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]], [[META6]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META6]], [[META7]]} ;. 
; MAX-BW: [[INT_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0} ; MAX-BW: [[META2]] = !{!"int", [[META3:![0-9]+]], i64 0} @@ -756,7 +723,8 @@ attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+s ; MAX-BW: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} ; MAX-BW: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} ; MAX-BW: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} -; MAX-BW: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]], [[META7]]} -; MAX-BW: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]], [[META6]]} -; MAX-BW: [[LOOP10]] = distinct !{[[LOOP10]], [[META6]], [[META7]]} +; MAX-BW: [[PROF8]] = !{!"branch_weights", i32 4, i32 28} +; MAX-BW: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]], [[META7]]} +; MAX-BW: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]], [[META6]]} +; MAX-BW: [[LOOP11]] = distinct !{[[LOOP11]], [[META6]], [[META7]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index 669e9252256de..7069534f3b683 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -28,23 +28,9 @@ define dso_local void @tail_folding_enabled(ptr noalias nocapture %A, ptr noalia ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: br label %for.body @@ -89,25 +75,11 @@ define dso_local void @tail_folding_disabled(ptr noalias nocapture %A, ptr noali ; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP6]], ptr [[TMP7]], i32 4, <8 x i1> [[TMP1]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 
[[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; entry: br label %for.body @@ -170,28 +142,12 @@ define i32 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, ; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP10]], <8 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]]) -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[SUM_0:%.*]] = phi i32 [ [[SUM_1:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4 -; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[SUM_1]] = add nuw nsw i32 [[ADD]], [[SUM_0]] -; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP13]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index 27150cb6cca0d..63f9a1310d15a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -398,27 +398,9 @@ define i32 @test_count_bits(ptr %test_base) { ; CHECK-NEXT: [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX13]] ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; 
CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[BYTE:%.*]] = udiv i64 [[IV]], 8 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[BYTE]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: [[BIT:%.*]] = urem i64 [[IV]], 8 -; CHECK-NEXT: [[BIT_TRUNC:%.*]] = trunc i64 [[BIT]] to i8 -; CHECK-NEXT: [[MASK:%.*]] = lshr i8 [[EARLYCND]], [[BIT_TRUNC]] -; CHECK-NEXT: [[TEST:%.*]] = and i8 [[MASK]], 1 -; CHECK-NEXT: [[VAL:%.*]] = zext i8 [[TEST]] to i32 -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP41]] ; entry: %alloca = alloca [4096 x i32] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 3ae8001f9e439..28de5c7915a84 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -141,20 +141,7 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP7]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -200,22 +187,9 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP3]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; 
CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP7]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP7]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll index 282e9a503e6ed..1e94f83a24d0a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll @@ -38,21 +38,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; IF-EVL: scalar.ph: ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] -; IF-EVL: for.body: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]] -; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] -; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; IF-EVL: for.cond.cleanup: ; IF-EVL-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll index 69cdd655f9dc6..455fe83dbb6df 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll @@ -74,22 +74,7 @@ define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 { ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10008 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_MUL:%.*]] = mul nuw i64 [[IV]], 6 -; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: store i8 [[L]], ptr [[B]], align 
1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[ARRAYIDX77:%.*]] = getelementptr i8, ptr [[L_OUT]], i64 [[IV_MUL]] -; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX77]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[IV_MUL]], 2 -; CHECK-NEXT: [[ARRAYIDX97:%.*]] = getelementptr i8, ptr [[L_OUT]], i64 [[ADD_2]] -; CHECK-NEXT: store i8 0, ptr [[ARRAYIDX97]], align 1, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 10000 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll index bdedcca391a19..9ea9e1193f956 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll @@ -48,25 +48,7 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[G_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[G_SRC]], align 8 -; CHECK-NEXT: [[IV_4:%.*]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[C:%.*]] = icmp ule i64 [[L]], 128 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_THEN:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.then: -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV_4]], 1 -; CHECK-NEXT: [[G_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[OR]] -; CHECK-NEXT: store i64 [[IV_4]], ptr [[G_DST]], align 4 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP_HEADER]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -131,25 +113,7 @@ define void @iv.4_used_as_first_lane(ptr %src, ptr noalias %dst) { ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[G_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[G_SRC]], align 8 -; CHECK-NEXT: [[IV_4:%.*]] = add nuw nsw i64 [[IV]], 4 -; CHECK-NEXT: [[C:%.*]] = icmp ule i64 [[L]], 128 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_THEN:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.then: -; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV_4]], 1 -; CHECK-NEXT: [[G_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[OR]] -; CHECK-NEXT: store i64 [[L]], ptr [[G_DST]], align 4 -; 
CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP_HEADER]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll index f9403b8e3fb4a..774f0dba47224 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -134,30 +134,9 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; SINK-GATHER-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SINK-GATHER: middle.block: ; SINK-GATHER-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP47]]) -; SINK-GATHER-NEXT: br label [[FOR_END:%.*]] -; SINK-GATHER: scalar.ph: -; SINK-GATHER-NEXT: br label [[FOR_BODY:%.*]] -; SINK-GATHER: for.body: -; SINK-GATHER-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] -; SINK-GATHER-NEXT: [[R:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[T7:%.*]], [[FOR_INC]] ] -; SINK-GATHER-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] -; SINK-GATHER-NEXT: [[T1:%.*]] = load i32, ptr [[T0]], align 4 -; SINK-GATHER-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; SINK-GATHER: if.then: -; SINK-GATHER-NEXT: [[T2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] -; SINK-GATHER-NEXT: [[T3:%.*]] = load i32, ptr [[T2]], align 4 -; SINK-GATHER-NEXT: [[T4:%.*]] = sdiv i32 [[T3]], [[X]] -; SINK-GATHER-NEXT: [[T5:%.*]] = add nsw i32 [[T4]], [[T1]] -; SINK-GATHER-NEXT: br label [[FOR_INC]] -; SINK-GATHER: for.inc: -; SINK-GATHER-NEXT: [[T6:%.*]] = phi i32 [ [[T1]], [[FOR_BODY]] ], [ [[T5]], [[IF_THEN]] ] -; SINK-GATHER-NEXT: [[T7]] = add i32 [[R]], [[T6]] -; SINK-GATHER-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; SINK-GATHER-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 10000 -; SINK-GATHER-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]] +; SINK-GATHER-NEXT: br label [[FOR_INC:%.*]] ; SINK-GATHER: for.end: -; SINK-GATHER-NEXT: [[T8:%.*]] = phi i32 [ [[T7]], [[FOR_INC]] ], [ [[TMP49]], [[MIDDLE_BLOCK]] ] -; SINK-GATHER-NEXT: ret i32 [[T8]] +; SINK-GATHER-NEXT: ret i32 [[TMP49]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll index afdbfaa92835b..f64255f29d335 100644 --- a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll +++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll @@ -37,11 +37,7 @@ define i32 @foo(ptr nocapture %A) { ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 undef ; diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll index ce9d1f24ac883..b59ad8481597c 100644 --- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll +++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll @@ -19,12 +19,8 
@@ define void @_Z3foov() {
 ; CHECK: vector.body:
 ; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK: br label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: scalar.ph:
 ; CHECK: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
-; CHECK: for.body:
-; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF5:![0-9]+]]
 ;
 ; CHECK-MASKED-LABEL: @_Z3foov(
 ; CHECK-MASKED: entry:
@@ -34,12 +30,8 @@ define void @_Z3foov() {
 ; CHECK-MASKED: vector.body:
 ; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK-MASKED: middle.block:
-; CHECK-MASKED: br label [[FOR_COND_CLEANUP:%.*]]
-; CHECK-MASKED: scalar.ph:
 ; CHECK-MASKED: br label [[FOR_BODY:%.*]]
 ; CHECK-MASKED: for.cond.cleanup:
-; CHECK-MASKED: for.body:
-; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF5:![0-9]+]]
 ;
 ; CHECK-SCALABLE-LABEL: @_Z3foov(
 ; CHECK-SCALABLE: entry:
diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll
index bd0655ddff379..143a0afd77195 100644
--- a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll
+++ b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll
@@ -19,19 +19,6 @@ define void @test(ptr %data) {
 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[OR_IV_1:%.*]] = or disjoint i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_POSTSCALE:%.*]] = getelementptr [64 x float], ptr @postscale, i64 0, i64 [[OR_IV_1]]
-; CHECK-NEXT: [[LOAD_POSTSCALE:%.*]] = load float, ptr [[GEP_POSTSCALE]], align 4, !tbaa [[FLOAT_TBAA0]]
-; CHECK-NEXT: [[LRINT:%.*]] = tail call i64 @llvm.lrint.i64.f32(float [[LOAD_POSTSCALE]])
-; CHECK-NEXT: [[LRINT_TRUNC:%.*]] = trunc i64 [[LRINT]] to i16
-; CHECK-NEXT: store i16 [[LRINT_TRUNC]], ptr [[DATA]], align 2, !tbaa [[SHORT_TBAA4]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], 8
-; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder.ll b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
index 37f2e73b0cf9f..66592b0ccf677 100644
--- a/llvm/test/Transforms/LoopVectorize/constantfolder.ll
+++ b/llvm/test/Transforms/LoopVectorize/constantfolder.ll
@@ -16,20 +16,6 @@ define void @const_fold_ptradd(ptr %dst, i64 %d) {
 ; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[CONST_0:%.*]] = phi i64 [ [[D]], %[[ELSE]] ], [ 0, %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i64 [[CONST_0]]
-; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -70,20 +56,6 @@ define void @const_fold_inbounds_ptradd(ptr %dst, i64 %d) {
 ; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[CONST_0:%.*]] = phi i64 [ [[D]], %[[ELSE]] ], [ 0, %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[CONST_0]]
-; CHECK-NEXT: store i16 0, ptr [[GEP]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -125,20 +97,6 @@ define void @const_fold_select(ptr %dst, i64 %d) {
 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[CONST_1:%.*]] = phi i64 [ [[D]], %[[ELSE]] ], [ 1, %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[OR:%.*]] = or i64 [[D]], [[CONST_1]]
-; CHECK-NEXT: store i64 [[OR]], ptr [[DST]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -179,24 +137,6 @@ define void @const_fold_add_sub_mul_ashr_lshr(ptr %dst, i64 %d) {
 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[CONST_1:%.*]] = phi i64 [ [[D]], %[[ELSE]] ], [ 1, %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[ADD:%.*]] = add i64 2, [[CONST_1]]
-; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[ADD]], [[CONST_1]]
-; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[SUB]], [[CONST_1]]
-; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ASHR]], 3
-; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[MUL]], [[CONST_1]]
-; CHECK-NEXT: store i64 [[LSHR]], ptr [[DST]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -241,22 +181,6 @@ define void @const_fold_and_or_xor(ptr %dst, i64 %d) {
 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[CONST_1:%.*]] = phi i64 [ [[D]], %[[ELSE]] ], [ 1, %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[OR:%.*]] = or i64 2, [[CONST_1]]
-; CHECK-NEXT: [[AND:%.*]] = and i64 [[OR]], [[CONST_1]]
-; CHECK-NEXT: [[XOR:%.*]] = and i64 [[AND]], [[CONST_1]]
-; CHECK-NEXT: store i64 [[XOR]], ptr [[DST]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -299,21 +223,6 @@ define void @const_fold_cmp_zext(ptr %dst, i64 %d) {
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[CONST_1:%.*]] = phi i64 [ [[D]], %[[ELSE]] ], [ 1, %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[VAL:%.*]] = icmp ugt i64 2, [[CONST_1]]
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[VAL]] to i8
-; CHECK-NEXT: store i8 [[ZEXT]], ptr [[DST]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -355,20 +264,6 @@ define void @const_fold_trunc(ptr %dst, i64 %d) {
 ; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[CONST_0:%.*]] = phi i64 [ [[D]], %[[ELSE]] ], [ 0, %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[CONST_0]] to i16
-; CHECK-NEXT: store i16 [[TRUNC]], ptr [[DST]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
index 33e688c418d0e..62399c5d4b4ee 100644
--- a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
+++ b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll
@@ -67,19 +67,7 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) {
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[L2_HEADER_LOOPEXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[L2_INNER_HEADER:%.*]]
-; CHECK: L2.Inner.header:
-; CHECK-NEXT: [[L2_ACCUM:%.*]] = phi i32 [ [[L2_ACCUM_NEXT:%.*]], [[L2_INNER_HEADER]] ], [ 1, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[L2_IV:%.*]] = phi i64 [ [[L2_IV_NEXT:%.*]], [[L2_INNER_HEADER]] ], [ 1, [[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2_ACCUM_NEXT]] = sub i32 [[L2_ACCUM]], [[L1_EXIT_VAL]]
-; CHECK-NEXT: [[L2_DUMMY_BUT_NEED_IT:%.*]] = sext i32 [[L2_ACCUM_NEXT]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[L2_IV]]
-; CHECK-NEXT: store i64 [[L2_DUMMY_BUT_NEED_IT]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[L2_IV_NEXT]] = add nuw nsw i64 [[L2_IV]], 1
-; CHECK-NEXT: [[L2_EXIT_COND:%.*]] = icmp ugt i64 [[L2_IV]], 11
-; CHECK-NEXT: br i1 [[L2_EXIT_COND]], label [[L2_HEADER_LOOPEXIT]], label [[L2_INNER_HEADER]]
 ; CHECK: L2.exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
index 0a8e9dc0b4093..02e1d0e9e7004 100644
--- a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
@@ -94,20 +94,8 @@ define void @pr47390(ptr %a) {
 ; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[PRIMARY:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[PRIMARY_ADD:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[USE_PRIMARY:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[PRIMARY]], %[[LOOP]] ]
-; CHECK-NEXT: [[SECONDARY:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[SECONDARY_ADD:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[PRIMARY_ADD]] = add i32 [[PRIMARY]], 1
-; CHECK-NEXT: [[SECONDARY_ADD]] = add i32 [[SECONDARY]], 1
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[SECONDARY]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 8
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[SECONDARY]], 5
-; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
index f61478bfc8856..b31b73274e1cc 100644
--- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll
@@ -15,15 +15,6 @@ define i32 @foo(ptr %p) {
 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]], !dbg [[DBG3]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ], !dbg [[DBG7:![0-9]+]]
-; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG9:![0-9]+]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1, !dbg [[DBG10:![0-9]+]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG11:![0-9]+]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret i32 0
 ;
@@ -64,9 +55,4 @@ exit: ; preds = %loop
 ; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 11, type: [[META5:![0-9]+]], spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]])
 ; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]])
 ; CHECK: [[META6]] = !{}
-; CHECK: [[DBG7]] = !DILocation(line: 4, scope: [[META4]])
-; CHECK: [[DBG8]] = !DILocation(line: 5, scope: [[META4]])
-; CHECK: [[DBG9]] = !DILocation(line: 7, scope: [[META4]])
-; CHECK: [[DBG10]] = !DILocation(line: 8, scope: [[META4]])
-; CHECK: [[DBG11]] = !DILocation(line: 9, scope: [[META4]])
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
index d97624fa6eace..274bd043cd86b 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll
@@ -24,16 +24,7 @@ define dso_local void @constTC(ptr noalias nocapture %A) optsize {
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]]
-; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 1800
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
index 4f5a26e9c89cb..156c2bdca7b0e 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -198,16 +198,7 @@ define dso_local void @cannotProveAlignedTC(ptr noalias nocapture %A, i32 %p, i3
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]]
-; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]]
 ; CHECK: exit.loopexit:
 ; CHECK-NEXT: br label [[EXIT]]
 ; CHECK: exit:
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
index ff2baec8c912e..eca39e6f0b6ba 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
@@ -108,25 +108,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; CHECK-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true
-; CHECK-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
-; CHECK-NEXT: [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; CHECK-NEXT: [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; CHECK-NEXT: [[FOR_PREV]] = zext i16 [[B1]] to i32
-; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32
-; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; CHECK-NEXT: store i32 0, ptr [[A_GEP]], align 4
-; CHECK-NEXT: br i1 [[VEC_DEAD]], label %[[FOR_END]], label %[[LOOP]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[FOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
index fd19760159e68..ebfe16bf78abd 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
@@ -22,21 +22,8 @@ define float @for_load_interleave_only(ptr %src) {
 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[SRC]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[FOR:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 16
-; CHECK-NEXT: [[L]] = load float, ptr [[PTR_IV]], align 4
-; CHECK-NEXT: store float 0.000000e+00, ptr [[PTR_IV]], align 4
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[FOR_LCSSA:%.*]] = phi float [ [[FOR]], %[[LOOP]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[FOR_LCSSA]]
+; CHECK-NEXT: ret float [[TMP2]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
index 149157aaa4b55..74129806ad6fb 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll
@@ -119,22 +119,7 @@ define void @test_pr54223_sink_after_insertion_order(ptr noalias %a, ptr noalias
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[SCALAR_RECUR6:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[FOR_2_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[NEG:%.*]] = fneg float [[SCALAR_RECUR6]]
-; CHECK-NEXT: [[MULADD:%.*]] = call float @llvm.fmuladd.f32(float [[SCALAR_RECUR]], float [[NEG]], float 0.000000e+00)
-; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[FOR_1_NEXT]] = load float, ptr [[A]], align 4
-; CHECK-NEXT: [[FOR_2_NEXT]] = load float, ptr [[B]], align 4
-; CHECK-NEXT: store float [[MULADD]], ptr [[DST_GEP]], align 4
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 10000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 443e44b6de944..bd0c098d335a2 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -1193,19 +1193,9 @@ define i64 @constant_folded_previous_value() {
 ; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: br label [[SCALAR_BODY:%.*]]
-; UNROLL-NO-IC: scalar.body:
-; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VAR2:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VAR3]] = add i64 0, 1
-; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 1000
-; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_END]], label [[SCALAR_BODY]]
 ; UNROLL-NO-IC: for.end:
-; UNROLL-NO-IC-NEXT: [[VAR2_LCSSA:%.*]] = phi i64 [ [[VAR2]], [[SCALAR_BODY]] ], [ 1, [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT: ret i64 [[VAR2_LCSSA]]
+; UNROLL-NO-IC-NEXT: ret i64 1
 ;
 ; UNROLL-NO-VF-LABEL: @constant_folded_previous_value(
 ; UNROLL-NO-VF-NEXT: entry:
@@ -1218,19 +1208,9 @@ define i64 @constant_folded_previous_value() {
 ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NO-VF-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NO-VF: scalar.ph:
 ; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]]
-; UNROLL-NO-VF: scalar.body:
-; UNROLL-NO-VF-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VAR2:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VAR3]] = add i64 0, 1
-; UNROLL-NO-VF-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; UNROLL-NO-VF-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 1000
-; UNROLL-NO-VF-NEXT: br i1 [[COND]], label [[FOR_END]], label [[SCALAR_BODY]]
 ; UNROLL-NO-VF: for.end:
-; UNROLL-NO-VF-NEXT: [[VAR2_LCSSA:%.*]] = phi i64 [ [[VAR2]], [[SCALAR_BODY]] ], [ 1, [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-VF-NEXT: ret i64 [[VAR2_LCSSA]]
+; UNROLL-NO-VF-NEXT: ret i64 1
 ;
 ; SINK-AFTER-LABEL: @constant_folded_previous_value(
 ; SINK-AFTER-NEXT: entry:
@@ -1243,19 +1223,9 @@ define i64 @constant_folded_previous_value() {
 ; SINK-AFTER-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; SINK-AFTER-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: br label [[FOR_END:%.*]]
-; SINK-AFTER: scalar.ph:
 ; SINK-AFTER-NEXT: br label [[SCALAR_BODY:%.*]]
-; SINK-AFTER: scalar.body:
-; SINK-AFTER-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
-; SINK-AFTER-NEXT: [[VAR2:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ]
-; SINK-AFTER-NEXT: [[VAR3]] = add i64 0, 1
-; SINK-AFTER-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; SINK-AFTER-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 1000
-; SINK-AFTER-NEXT: br i1 [[COND]], label [[FOR_END]], label [[SCALAR_BODY]]
 ; SINK-AFTER: for.end:
-; SINK-AFTER-NEXT: [[VAR2_LCSSA:%.*]] = phi i64 [ [[VAR2]], [[SCALAR_BODY]] ], [ 1, [[MIDDLE_BLOCK]] ]
-; SINK-AFTER-NEXT: ret i64 [[VAR2_LCSSA]]
+; SINK-AFTER-NEXT: ret i64 1
 ;
 entry:
   br label %scalar.body
@@ -2725,21 +2695,9 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]]
 ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br label [[BB1:%.*]]
-; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]]
 ; UNROLL-NO-IC: bb1:
-; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT: ret i32 [[VAR]]
-; UNROLL-NO-IC: bb2:
-; UNROLL-NO-IC-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[Y]], [[SCALAR_PH:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[VAR4:%.*]] = phi i32 [ [[VAR7:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR6]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
-; UNROLL-NO-IC-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]]
-; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27:![0-9]+]]
+; UNROLL-NO-IC-NEXT: ret i32 [[TMP51]]
 ;
 ; UNROLL-NO-VF-LABEL: @sink_into_replication_region(
 ; UNROLL-NO-VF-NEXT: bb:
@@ -2785,21 +2743,9 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]]
 ; UNROLL-NO-VF: middle.block:
 ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]]
-; UNROLL-NO-VF-NEXT: br label [[BB1:%.*]]
-; UNROLL-NO-VF: scalar.ph:
 ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]]
 ; UNROLL-NO-VF: bb1:
-; UNROLL-NO-VF-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-VF-NEXT: ret i32 [[VAR]]
-; UNROLL-NO-VF: bb2:
-; UNROLL-NO-VF-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[Y]], [[SCALAR_PH:%.*]] ]
-; UNROLL-NO-VF-NEXT: [[VAR4:%.*]] = phi i32 [ [[VAR7:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR6]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
-; UNROLL-NO-VF-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]]
-; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27:![0-9]+]]
+; UNROLL-NO-VF-NEXT: ret i32 [[BIN_RDX]]
 ;
 ; SINK-AFTER-LABEL: @sink_into_replication_region(
 ; SINK-AFTER-NEXT: bb:
@@ -2868,21 +2814,9 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]]
 ; SINK-AFTER: middle.block:
 ; SINK-AFTER-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
-; SINK-AFTER-NEXT: br label [[BB1:%.*]]
-; SINK-AFTER: scalar.ph:
 ; SINK-AFTER-NEXT: br label [[BB2:%.*]]
 ; SINK-AFTER: bb1:
-; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; SINK-AFTER-NEXT: ret i32 [[VAR]]
-; SINK-AFTER: bb2:
-; SINK-AFTER-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[Y]], [[SCALAR_PH:%.*]] ]
-; SINK-AFTER-NEXT: [[VAR4:%.*]] = phi i32 [ [[VAR7:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR6]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
-; SINK-AFTER-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]]
-; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1
-; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27:![0-9]+]]
+; SINK-AFTER-NEXT: ret i32 [[TMP27]]
 ;
 bb:
   br label %bb2
@@ -3078,25 +3012,9 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP73]], [[TMP72]]
 ; UNROLL-NO-IC-NEXT: [[TMP75:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br label [[BB1:%.*]]
-; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]]
 ; UNROLL-NO-IC: bb1:
-; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP75]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT: ret i32 [[VAR]]
-; UNROLL-NO-IC: bb2:
-; UNROLL-NO-IC-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[Y]], [[SCALAR_PH:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[VAR4:%.*]] = phi i32 [ [[VAR7:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR6]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT: [[G:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[IV]]
-; UNROLL-NO-IC-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
-; UNROLL-NO-IC-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]]
-; UNROLL-NO-IC-NEXT: store i32 [[VAR3]], ptr [[G]], align 4
-; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27]]
+; UNROLL-NO-IC-NEXT: ret i32 [[TMP75]]
 ;
 ; UNROLL-NO-VF-LABEL: @sink_into_replication_region_multiple(
 ; UNROLL-NO-VF-NEXT: bb:
@@ -3155,25 +3073,9 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP28:![0-9]+]]
 ; UNROLL-NO-VF: middle.block:
 ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]]
-; UNROLL-NO-VF-NEXT: br label [[BB1:%.*]]
-; UNROLL-NO-VF: scalar.ph:
 ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]]
 ; UNROLL-NO-VF: bb1:
-; UNROLL-NO-VF-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-VF-NEXT: ret i32 [[VAR]]
-; UNROLL-NO-VF: bb2:
-; UNROLL-NO-VF-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[Y]], [[SCALAR_PH:%.*]] ]
-; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT: [[VAR4:%.*]] = phi i32 [ [[VAR7:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR6]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT: [[G:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[IV]]
-; UNROLL-NO-VF-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
-; UNROLL-NO-VF-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]]
-; UNROLL-NO-VF-NEXT: store i32 [[VAR3]], ptr [[G]], align 4
-; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27]]
+; UNROLL-NO-VF-NEXT: ret i32 [[BIN_RDX]]
 ;
 ; SINK-AFTER-LABEL: @sink_into_replication_region_multiple(
 ; SINK-AFTER-NEXT: bb:
@@ -3273,25 +3175,9 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) {
 ; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP28:![0-9]+]]
 ; SINK-AFTER: middle.block:
 ; SINK-AFTER-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP37]])
-; SINK-AFTER-NEXT: br label [[BB1:%.*]]
-; SINK-AFTER: scalar.ph:
 ; SINK-AFTER-NEXT: br label [[BB2:%.*]]
 ; SINK-AFTER: bb1:
-; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ]
-; SINK-AFTER-NEXT: ret i32 [[VAR]]
-; SINK-AFTER: bb2:
-; SINK-AFTER-NEXT: [[VAR3:%.*]] = phi i32 [ [[VAR8:%.*]], [[BB2]] ], [ [[Y]], [[SCALAR_PH:%.*]] ]
-; SINK-AFTER-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT: [[VAR4:%.*]] = phi i32 [ [[VAR7:%.*]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT: [[VAR5:%.*]] = phi i32 [ [[VAR6]], [[BB2]] ], [ 0, [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT: [[G:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[IV]]
-; SINK-AFTER-NEXT: [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
-; SINK-AFTER-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]]
-; SINK-AFTER-NEXT: store i32 [[VAR3]], ptr [[G]], align 4
-; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1
-; SINK-AFTER-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
-; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27]]
+; SINK-AFTER-NEXT: ret i32 [[TMP39]]
 ;
 bb:
   br label %bb2
@@ -3341,26 +3227,9 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) {
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
-; UNROLL-NO-IC: loop:
-; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true
-; UNROLL-NO-IC-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-IC-NEXT: [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; UNROLL-NO-IC-NEXT: [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; UNROLL-NO-IC-NEXT: [[FOR_PREV]] = zext i16 [[B1]] to i32
-; UNROLL-NO-IC-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32
-; UNROLL-NO-IC-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; UNROLL-NO-IC-NEXT: store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-IC-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]]
 ; UNROLL-NO-IC: for.end:
-; UNROLL-NO-IC-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT: ret i32 [[FOR_LCSSA]]
+; UNROLL-NO-IC-NEXT: ret i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]]
 ;
 ; UNROLL-NO-VF-LABEL: @sink_after_dead_inst(
 ; UNROLL-NO-VF-NEXT: entry:
@@ -3382,26 +3251,9 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) {
 ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
 ; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; UNROLL-NO-VF: middle.block:
-; UNROLL-NO-VF-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NO-VF: scalar.ph:
 ; UNROLL-NO-VF-NEXT: br label [[LOOP:%.*]]
-; UNROLL-NO-VF: loop:
-; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-VF-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; UNROLL-NO-VF-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true
-; UNROLL-NO-VF-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-VF-NEXT: [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; UNROLL-NO-VF-NEXT: [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; UNROLL-NO-VF-NEXT: [[FOR_PREV]] = zext i16 [[B1]] to i32
-; UNROLL-NO-VF-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32
-; UNROLL-NO-VF-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; UNROLL-NO-VF-NEXT: store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]]
 ; UNROLL-NO-VF: for.end:
-; UNROLL-NO-VF-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-VF-NEXT: ret i32 [[FOR_LCSSA]]
+; UNROLL-NO-VF-NEXT: ret i32 [[TMP10]]
 ;
 ; SINK-AFTER-LABEL: @sink_after_dead_inst(
 ; SINK-AFTER-NEXT: entry:
@@ -3423,26 +3275,9 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) {
 ; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; SINK-AFTER: middle.block:
 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SINK-AFTER-NEXT: br label [[FOR_END:%.*]]
-; SINK-AFTER: scalar.ph:
 ; SINK-AFTER-NEXT: br label [[LOOP:%.*]]
-; SINK-AFTER: loop:
-; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; SINK-AFTER-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; SINK-AFTER-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true
-; SINK-AFTER-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; SINK-AFTER-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
-; SINK-AFTER-NEXT: [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; SINK-AFTER-NEXT: [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; SINK-AFTER-NEXT: [[FOR_PREV]] = zext i16 [[B1]] to i32
-; SINK-AFTER-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32
-; SINK-AFTER-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; SINK-AFTER-NEXT: store i32 0, ptr [[A_GEP]], align 4
-; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]]
 ; SINK-AFTER: for.end:
-; SINK-AFTER-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
-; SINK-AFTER-NEXT: ret i32 [[FOR_LCSSA]]
+; SINK-AFTER-NEXT: ret i32 [[VECTOR_RECUR_EXTRACT_FOR_PHI]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/flags.ll b/llvm/test/Transforms/LoopVectorize/flags.ll
index cb86f5f190b54..2268085e5fe73 100644
--- a/llvm/test/Transforms/LoopVectorize/flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/flags.ll
@@ -129,20 +129,8 @@ define float @fast_math(ptr noalias %s) {
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP1]])
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[S]], i64 [[IV]]
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast float [[RED]], [[TMP4]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[ADD_LCSSA]]
+; CHECK-NEXT: ret float [[TMP3]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
index 901f67ee676ee..f56699a45320d 100644
--- a/llvm/test/Transforms/LoopVectorize/float-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -1649,11 +1649,7 @@ define i32 @float_induction_with_dbg_on_fadd(ptr %dst) {
 ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; VEC4_INTERL1-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; VEC4_INTERL1: middle.block:
-; VEC4_INTERL1-NEXT: br label [[EXIT:%.*]]
-; VEC4_INTERL1: scalar.ph:
 ; VEC4_INTERL1-NEXT: br label [[LOOP:%.*]]
-; VEC4_INTERL1: loop:
-; VEC4_INTERL1-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; VEC4_INTERL1: exit:
 ; VEC4_INTERL1-NEXT: ret i32 0
 ;
@@ -1672,11 +1668,7 @@ define i32 @float_induction_with_dbg_on_fadd(ptr %dst) {
 ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; VEC4_INTERL2-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; VEC4_INTERL2: middle.block:
-; VEC4_INTERL2-NEXT: br label [[EXIT:%.*]]
-; VEC4_INTERL2: scalar.ph:
 ; VEC4_INTERL2-NEXT: br label [[LOOP:%.*]]
-; VEC4_INTERL2: loop:
-; VEC4_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; VEC4_INTERL2: exit:
 ; VEC4_INTERL2-NEXT: ret i32 0
 ;
@@ -1699,11 +1691,7 @@ define i32 @float_induction_with_dbg_on_fadd(ptr %dst) {
 ; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
 ; VEC1_INTERL2-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; VEC1_INTERL2: middle.block:
-; VEC1_INTERL2-NEXT: br label [[EXIT:%.*]]
-; VEC1_INTERL2: scalar.ph:
 ; VEC1_INTERL2-NEXT: br label [[LOOP:%.*]]
-; VEC1_INTERL2: loop:
-; VEC1_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; VEC1_INTERL2: exit:
 ; VEC1_INTERL2-NEXT: ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
index 93031c757582a..555e695cfa935 100644
--- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
@@ -66,22 +66,9 @@ define float @minloopattr(ptr nocapture readonly %arg) #0 {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP4]])
-; CHECK-NEXT: br label [[OUT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]]
-; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[T3]], align 4
-; CHECK-NEXT: [[T5:%.*]] = fcmp olt float [[T2]], [[T4]]
-; CHECK-NEXT: [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
-; CHECK-NEXT: [[T7]] = add i64 [[T1]], 1
-; CHECK-NEXT: [[T8:%.*]] = icmp eq i64 [[T7]], 65537
-; CHECK-NEXT: br i1 [[T8]], label [[OUT]], label [[LOOP]]
 ; CHECK: out:
-; CHECK-NEXT: [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[T6_LCSSA]]
+; CHECK-NEXT: ret float [[TMP6]]
 ;
 top:
   %t = load float, ptr %arg
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index c86e27173bffa..f7376a0f8e205 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -67,23 +67,7 @@ define i32 @test(ptr nocapture %f) #0 {
 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
-; UNROLL-NOSIMPLIFY: for.body:
-; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; UNROLL-NOSIMPLIFY-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; UNROLL-NOSIMPLIFY: if.then:
-; UNROLL-NOSIMPLIFY-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], 20
-; UNROLL-NOSIMPLIFY-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC]]
-; UNROLL-NOSIMPLIFY: for.inc:
-; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC:%.*]]
 ; UNROLL-NOSIMPLIFY: for.end:
 ; UNROLL-NOSIMPLIFY-NEXT: ret i32 0
 ;
@@ -449,25 +433,7 @@ define void @minimal_bit_widths(i1 %c) {
 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
-; UNROLL-NOSIMPLIFY: for.body:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 1000, [[SCALAR_PH]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; UNROLL-NOSIMPLIFY: if.then:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP6]], ptr [[TMP3]], align 1
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC]]
-; UNROLL-NOSIMPLIFY: for.inc:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP1]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP7]] = add i64 [[TMP2]], -1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC:%.*]]
 ; UNROLL-NOSIMPLIFY: for.end:
 ; UNROLL-NOSIMPLIFY-NEXT: ret void
 ;
@@ -575,26 +541,7 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) {
 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; UNROLL-NOSIMPLIFY: middle.block:
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
-; UNROLL-NOSIMPLIFY: for.body:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ 0, [[SCALAR_PH:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 1000, [[SCALAR_PH]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP1]]
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
-; UNROLL-NOSIMPLIFY-NEXT: store i8 0, ptr [[TMP3]], align 1
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; UNROLL-NOSIMPLIFY: if.then:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i8
-; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP6]], ptr [[TMP3]], align 1
-; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC]]
-; UNROLL-NOSIMPLIFY: for.inc:
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP1]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP7]] = add i64 [[TMP2]], -1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 0
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[FOR_BODY]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_INC:%.*]]
 ; UNROLL-NOSIMPLIFY: for.end:
 ; UNROLL-NOSIMPLIFY-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
index f0b32c618947a..ccf05d73945ff 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll
@@ -24,17 +24,7 @@ define void @multiple_iv_uses_in_same_instruction(ptr %ptr) {
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr [[PTR]], i64 0, i64 [[IV]], i64 [[IV]]
-; CHECK-NEXT: [[T:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: store i32 [[T]], ptr [[GEP]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
index 362de0e0bba7a..53d5ac472c892 100644
--- a/llvm/test/Transforms/LoopVectorize/induction-step.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -291,18 +291,6 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[IV]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT:%.*]] = add i64 [[IV]], 1
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT_P]] = phi i64 [ [[IV_NEXT]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT_P]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 60c844c3f6415..cc55a51e134a6 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -2764,19 +2764,9 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[B_0:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4
-; CHECK-NEXT: [[B_NEXT]] = add i8 [[B_0]], -1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[B_NEXT]], 0
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[A_0_AND_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP2]]
 ;
 ; IND-LABEL: @i8_loop(
 ; IND-NEXT: entry:
@@ -2789,11 +2779,7 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; IND-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; IND-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; IND: middle.block:
-; IND-NEXT: br label [[EXIT:%.*]]
-; IND: scalar.ph:
 ; IND-NEXT: br label [[LOOP:%.*]]
-; IND: loop:
-; IND-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; IND: exit:
 ; IND-NEXT: ret i32 0
 ;
@@ -2808,11 +2794,7 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; UNROLL-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: br label [[EXIT:%.*]]
-; UNROLL: scalar.ph:
 ; UNROLL-NEXT: br label [[LOOP:%.*]]
-; UNROLL: loop:
-; UNROLL-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; UNROLL: exit:
 ; UNROLL-NEXT: ret i32 0
 ;
@@ -2833,19 +2815,9 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]]
 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]]
-; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
-; UNROLL-NO-IC: loop:
-; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4
-; UNROLL-NO-IC-NEXT: [[B_NEXT]] = add i8 [[B_0]], -1
-; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i8 [[B_NEXT]], 0
-; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; UNROLL-NO-IC: exit:
-; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT: ret i32 [[A_0_AND_LCSSA]]
+; UNROLL-NO-IC-NEXT: ret i32 [[TMP3]]
 ;
 ; INTERLEAVE-LABEL: @i8_loop(
 ; INTERLEAVE-NEXT: entry:
@@ -2858,11 +2830,7 @@ define i32 @i8_loop() nounwind readnone ssp uwtable {
 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; INTERLEAVE-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: br label [[EXIT:%.*]]
-; INTERLEAVE: scalar.ph:
 ; INTERLEAVE-NEXT: br label [[LOOP:%.*]]
-; INTERLEAVE: loop:
-; INTERLEAVE-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; INTERLEAVE: exit:
 ; INTERLEAVE-NEXT: ret i32 0
 ;
@@ -2897,19 +2865,9 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]])
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[B_0:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[B_0_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4
-; CHECK-NEXT: [[B_0_NEXT]] = add i16 [[B_0]], -1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[B_0_NEXT]], 0
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[A_0_AND_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP2]]
 ;
 ; IND-LABEL: @i16_loop(
 ; IND-NEXT: entry:
@@ -2922,11 +2880,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; IND-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
 ; IND-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; IND: middle.block:
-; IND-NEXT: br label [[EXIT:%.*]]
-; IND: scalar.ph:
 ; IND-NEXT: br label [[LOOP:%.*]]
-; IND: loop:
-; IND-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; IND: exit:
 ; IND-NEXT: ret i32 0
 ;
@@ -2941,11 +2895,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
 ; UNROLL-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: br label [[EXIT:%.*]]
-; UNROLL: scalar.ph:
 ; UNROLL-NEXT: br label [[LOOP:%.*]]
-; UNROLL: loop:
-; UNROLL-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; UNROLL: exit:
 ; UNROLL-NEXT: ret i32 0
 ;
@@ -2966,19 +2916,9 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = and <2 x i32> [[TMP1]], [[TMP0]]
 ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]]
-; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
-; UNROLL-NO-IC: loop:
-; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH:%.*]] ], [ [[A_0_AND:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[B_0_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4
-; UNROLL-NO-IC-NEXT: [[B_0_NEXT]] = add i16 [[B_0]], -1
-; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i16 [[B_0_NEXT]], 0
-; UNROLL-NO-IC-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; UNROLL-NO-IC: exit:
-; UNROLL-NO-IC-NEXT: [[A_0_AND_LCSSA:%.*]] = phi i32 [ [[A_0_AND]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT: ret i32 [[A_0_AND_LCSSA]]
+; UNROLL-NO-IC-NEXT: ret i32 [[TMP3]]
 ;
 ; INTERLEAVE-LABEL: @i16_loop(
 ; INTERLEAVE-NEXT: entry:
@@ -2991,11 +2931,7 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 65536
 ; INTERLEAVE-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; INTERLEAVE: middle.block:
-; INTERLEAVE-NEXT: br label [[EXIT:%.*]]
-; INTERLEAVE: scalar.ph:
 ; INTERLEAVE-NEXT: br label [[LOOP:%.*]]
-; INTERLEAVE: loop:
-; INTERLEAVE-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; INTERLEAVE: exit:
 ; INTERLEAVE-NEXT: ret i32 0
 ;
@@ -5025,28 +4961,9 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP15]])
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ -20, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[VAR0:%.*]] = phi i32 [ [[A]], [[SCALAR_PH]] ], [ [[VAR6:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT: [[VAR1:%.*]] = trunc i32 [[I]] to i16
-; CHECK-NEXT: [[VAR2:%.*]] = icmp eq i16 [[VAR1]], 0
-; CHECK-NEXT: br i1 [[VAR2]], label [[FOR_INC]], label [[FOR_COND:%.*]]
-; CHECK: for.cond:
-; CHECK-NEXT: [[VAR3:%.*]] = urem i16 [[B]], [[VAR1]]
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[VAR4:%.*]] = phi i16 [ [[VAR3]], [[FOR_COND]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT: [[VAR5:%.*]] = sext i16 [[VAR4]] to i32
-; CHECK-NEXT: [[VAR6]] = or i32 [[VAR0]], [[VAR5]]
-; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], 0
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK-NEXT: br label [[FOR_INC:%.*]]
 ; CHECK: for.end:
-; CHECK-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[VAR7]]
+; CHECK-NEXT: ret i32 [[TMP17]]
 ;
 ; IND-LABEL: @PR32419(
 ; IND-NEXT: entry:
@@ -5086,15 +5003,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; IND-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20
 ; IND-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; IND: middle.block:
-; IND-NEXT: br label [[FOR_END:%.*]]
-; IND: scalar.ph:
-; IND-NEXT: br label [[FOR_BODY:%.*]]
-; IND: for.body:
-; IND-NEXT: br i1 poison, label [[FOR_INC:%.*]], label [[FOR_COND:%.*]]
-; IND: for.cond:
-; IND-NEXT: br label [[FOR_INC]]
-; IND: for.inc:
-; IND-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
+; IND-NEXT: br label [[FOR_INC:%.*]]
 ; IND: for.end:
 ; IND-NEXT: [[VAR7:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP14]])
 ; IND-NEXT: ret i32 [[VAR7]]
@@ -5160,15 +5069,7 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; UNROLL-NEXT: [[TMP28:%.*]] = icmp eq i32 [[INDEX_NEXT]], 20
 ; UNROLL-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: br label [[FOR_END:%.*]]
-; UNROLL: scalar.ph:
-; UNROLL-NEXT: br label [[FOR_BODY:%.*]]
-; UNROLL: for.body:
-; UNROLL-NEXT: br i1 poison, label [[FOR_INC:%.*]], label [[FOR_COND:%.*]]
-; UNROLL: for.cond:
-; UNROLL-NEXT: br label [[FOR_INC]]
-; UNROLL: for.inc:
-; UNROLL-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
+; UNROLL-NEXT: br label [[FOR_INC:%.*]]
 ; UNROLL: for.end:
 ; UNROLL-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP27]], [[TMP26]]
 ; UNROLL-NEXT: [[VAR7:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]])
@@ -5239,28 +5140,9 @@ define i32 @PR32419(i32 %a, i16 %b) {
 ; UNROLL-NO-IC: middle.block:
 ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP29]], [[TMP28]]
 ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]])
-; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]]
-; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]]
-; UNROLL-NO-IC: for.body:
-; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i32 [ -20, [[SCALAR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[VAR0:%.*]] = phi i32 [ [[A]], [[SCALAR_PH]] ], [ [[VAR6:%.*]], [[FOR_INC]] ]
-; UNROLL-NO-IC-NEXT: [[VAR1:%.*]] = trunc i32 [[I]] to i16
-; UNROLL-NO-IC-NEXT: [[VAR2:%.*]] = icmp eq i16 [[VAR1]], 0
-; UNROLL-NO-IC-NEXT: br i1 [[VAR2]], label [[FOR_INC]], label [[FOR_COND:%.*]]
-; UNROLL-NO-IC: for.cond:
-; UNROLL-NO-IC-NEXT: [[VAR3:%.*]] = urem i16 [[B]], [[VAR1]]
-; UNROLL-NO-IC-NEXT: br label [[FOR_INC]]
-; UNROLL-NO-IC: for.inc:
-; UNROLL-NO-IC-NEXT: [[VAR4:%.*]] = phi i16 [ [[VAR3]], [[FOR_COND]] ], [ 0, [[FOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VAR5:%.*]] = sext i16 [[VAR4]] to i32
-; UNROLL-NO-IC-NEXT: [[VAR6]] = or i32 [[VAR0]], [[VAR5]]
-; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1
-; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], 0
-; UNROLL-NO-IC-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]]
+; UNROLL-NO-IC-NEXT: br label [[FOR_INC:%.*]]
 ; UNROLL-NO-IC: for.end:
-; UNROLL-NO-IC-NEXT: [[VAR7:%.*]] = phi i32 [ [[VAR6]], [[FOR_INC]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT: ret i32 [[VAR7]]
+; UNROLL-NO-IC-NEXT: ret i32 [[TMP31]]
 ;
 ; INTERLEAVE-LABEL: @PR32419(
 ; INTERLEAVE-NEXT: entry:
@@ -5818,23 +5700,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RECUR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[LV]], [[RECUR]]
-; CHECK-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IV_TRUNC]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[IV_TRUNC]], [[MUL]]
-; CHECK-NEXT: store i32 [[ADD]], ptr [[DST_GEP]], align 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -5862,11 +5728,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
 ; IND-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; IND-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
 ; IND: middle.block:
-; IND-NEXT: br label [[EXIT:%.*]]
-; IND: scalar.ph:
 ; IND-NEXT: br label [[LOOP:%.*]]
-; IND: loop:
-; IND-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; IND: exit:
 ; IND-NEXT: ret void
 ;
@@ -5900,11 +5762,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
 ; UNROLL-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; UNROLL-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
 ; UNROLL: middle.block:
-; UNROLL-NEXT: br label [[EXIT:%.*]]
-; UNROLL: scalar.ph:
 ; UNROLL-NEXT: br label [[LOOP:%.*]]
-; UNROLL: loop:
-; UNROLL-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; UNROLL: exit:
 ; UNROLL-NEXT: ret void
 ;
@@ -5937,23 +5795,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr %
 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; UNROLL-NO-IC-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
 ; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]]
-; UNROLL-NO-IC: scalar.ph:
 ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]]
-; UNROLL-NO-IC: loop:
-; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[RECUR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT: [[LV:%.*]] = load i32, ptr [[SRC]], align 4
-; UNROLL-NO-IC-NEXT: [[MUL:%.*]] = mul nsw i32 [[LV]], [[RECUR]]
-; UNROLL-NO-IC-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1
-; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; UNROLL-NO-IC-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; UNROLL-NO-IC-NEXT: [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IV_TRUNC]]
-; UNROLL-NO-IC-NEXT: [[ADD:%.*]] = add i32 [[IV_TRUNC]], [[MUL]]
-; UNROLL-NO-IC-NEXT: store i32 [[ADD]], ptr [[DST_GEP]], align 4
-; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TRUNC_IV_NEXT]], 100
-; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]]
 ; UNROLL-NO-IC: exit:
 ; UNROLL-NO-IC-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
index 9222af933475b..8975c058c6b79 100644
--- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll
@@ -18,23 +18,9 @@ define i32 @one_direct_branch(ptr %src) {
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
-; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP5]]
 ;
 entry:
   br label %loop
@@ -73,26 +59,9 @@ define i32 @two_direct_branch(ptr %src) {
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]]
-; CHECK-NEXT: br label [[BB:%.*]]
-; CHECK: bb:
-; CHECK-NEXT: [[PHI_XOR_1:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[PHI_XOR_1]], [[BB]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
-; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP5]]
 ;
 entry:
   br label %loop
@@ -141,26 +110,9 @@ define i32 @cond_branch(i32 %a, ptr %src) {
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[IV]], [[A]]
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_LATCH]], label [[THEN:%.*]]
-; CHECK: then:
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[PHI_XOR:%.*]] = phi i32 [ [[XOR]], [[LOOP]] ], [ 10, [[THEN]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[EXIT]], label [[LOOP]]
+; CHECK-NEXT: br label [[LOOP_LATCH:%.*]]
 ; CHECK: exit:
-; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[PHI_XOR]], [[LOOP_LATCH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP7]]
 ;
 entry:
   br label %loop
@@ -205,18 +157,9 @@ define i32 @optimizable_trunc_used_outside() {
 ; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[IV_TRUNC_LCSSA:%.*]] = phi i32 [ [[IV_TRUNC]], [[LOOP]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[IV_TRUNC_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP1]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll
b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll index 1128dd354f659..2c97bb7622740 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll @@ -33,19 +33,6 @@ define void @i65_induction_with_negative_step(ptr %dst) { ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_I65:%.*]] = phi i65 [ 0, %[[SCALAR_PH]] ], [ [[IV_I65_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TRUNC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[TRUNC]] = trunc i65 [[IV_I65]] to i64 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TRUNC]] -; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; CHECK-NEXT: [[IV_I65_NEXT]] = add i65 [[IV_I65]], -1 -; CHECK-NEXT: br i1 [[ICMP]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll index 85e7477837cde..eca9c1fe74c21 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll @@ -27,23 +27,6 @@ define void @gep_for_first_member_does_not_dominate_insert_point(ptr %str, ptr n ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[OR_1:%.*]] = or disjoint i64 [[IV2]], 1 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[STR]], i64 [[OR_1]] -; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[GEP1]], align 1 -; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr [[STR]], i64 [[IV2]] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[GEP0]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store i8 [[ADD]], ptr [[GEP_DST]], align 1 -; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV2]], 2 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll index 4dc9cfd5264bc..bd0fd77e7c391 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll @@ -45,23 +45,6 @@ define void @merge_tbaa_interleave_group(ptr nocapture readonly %p, ptr noalias ; CHECK-NEXT: br i1 [[TMP18]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], ptr [[P]], i64 [[IV]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[X]], align 8, !tbaa [[TBAA0]] -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP19]], 2.000000e+00 -; CHECK-NEXT: [[X4:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], ptr [[CP]], i64 0, i64 [[IV]], i32 0 -; CHECK-NEXT: store double [[MUL]], ptr [[X4]], align 8, !tbaa [[TBAA10:![0-9]+]] -; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], ptr [[P]], i64 [[IV]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[Y]], align 8, !tbaa [[TBAA5]] -; CHECK-NEXT: [[MUL7:%.*]] = fmul double [[TMP20]], 3.000000e+00 -; CHECK-NEXT: [[Y10:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], ptr [[CP]], i64 0, i64 [[IV]], i32 1 -; CHECK-NEXT: store double [[MUL7]], ptr [[Y10]], align 8, !tbaa [[TBAA12:![0-9]+]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 4 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -126,20 +109,20 @@ define void @ir_tbaa_different(ptr %base, ptr %end, ptr %src) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[BASE]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SRC]], align 4, !alias.scope [[META13:![0-9]+]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[SRC]], align 4, !alias.scope [[META10:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP11]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META13]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x float>, ptr [[NEXT_GEP]], align 4, !alias.scope [[META13:![0-9]+]], !noalias [[META10]] ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x float> [[WIDE_VEC]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x float> [[WIDE_VEC]], <4 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[STRIDED_VEC3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP7]], <4 x i32> ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: store <4 x float> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META16]], !noalias [[META13]] +; CHECK-NEXT: store <4 x float> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 4, !alias.scope [[META13]], !noalias [[META10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -152,9 +135,9 @@ define void @ir_tbaa_different(ptr %base, ptr %end, ptr %src) { ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8 ; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[PTR_IV]], align 4 ; CHECK-NEXT: [[MUL_1:%.*]] = fmul float [[L_1]], [[L_INVAR]] -; CHECK-NEXT: store float [[MUL_1]], ptr [[PTR_IV]], align 4, !tbaa [[TBAA10]] +; CHECK-NEXT: store float [[MUL_1]], ptr [[PTR_IV]], align 4, !tbaa [[TBAA16:![0-9]+]] ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 4 -; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_1]], align 4, !tbaa [[TBAA12]] +; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_1]], align 4, !tbaa [[TBAA18:![0-9]+]] ; CHECK-NEXT: [[MUL_2:%.*]] = fmul float [[L_2]], [[L_INVAR]] ; CHECK-NEXT: store float [[MUL_2]], ptr [[GEP_1]], align 4 ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] @@ -278,15 +261,15 @@ exit: ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]], [[META9:![0-9]+]]} ; CHECK: [[META8]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META9]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[TBAA10]] = !{[[META11:![0-9]+]], [[META2]], i64 0} -; CHECK: [[META11]] = !{!"Vec2r", [[META2]], i64 0, [[META2]], i64 8} -; CHECK: [[TBAA12]] = !{[[META11]], [[META2]], i64 8} +; CHECK: [[META10]] = !{[[META11:![0-9]+]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]]} +; CHECK: [[META12]] = distinct !{[[META12]], !"LVerDomain"} ; CHECK: [[META13]] = !{[[META14:![0-9]+]]} -; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]], !"LVerDomain"} -; CHECK: [[META16]] = !{[[META17:![0-9]+]]} -; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META8]], [[META9]]} +; CHECK: [[META14]] = distinct !{[[META14]], [[META12]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META8]], [[META9]]} +; CHECK: [[TBAA16]] = !{[[META17:![0-9]+]], [[META2]], i64 0} +; CHECK: [[META17]] = !{!"Vec2r", [[META2]], i64 0, [[META2]], i64 8} +; CHECK: [[TBAA18]] = !{[[META17]], [[META2]], i64 8} ; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META8]]} ; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META8]], [[META9]]} ; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META9]], [[META8]]} diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 4885dd2e33815..b4cad1142134c 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -47,11 +47,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -124,11 +120,7 @@ define void @test_struct_array_load3_store3() { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -206,11 +198,7 @@ define i32 @test_struct_load4(ptr nocapture readonly %S) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: ret i32 [[SUB8_LCSSA]] @@ -279,13 +267,9 @@ define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias n ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; entry: br label %for.body @@ -365,13 +349,9 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] ; entry: br label %for.body @@ -619,11 +599,7 @@ define void @load_gap_reverse(ptr noalias nocapture %P1, ptr noalias nocapture % ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_EXIT]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -681,13 +657,9 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] ; entry: br label %for.body @@ -753,13 +725,9 @@ define void @mixed_load3_store3(ptr nocapture %A) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: 
scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; entry: br label %for.body @@ -836,17 +804,13 @@ define void @int_float_struct(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) +; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr @SA, align 4 ; CHECK-NEXT: store float [[ADD3_LCSSA]], ptr @SB, align 4 ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll index ab70c14a0be61..6c4ee5b7359dc 100644 --- a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll +++ b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll @@ -20,19 +20,7 @@ define void @d() { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[I7:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[I3:%.*]] = load float, ptr null, align 4 -; CHECK-NEXT: [[I4:%.*]] = getelementptr float, ptr @d, i64 [[I]] -; CHECK-NEXT: [[I5:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[I3]], i32 0) -; CHECK-NEXT: [[I6:%.*]] = select i1 [[I5]], float 0.000000e+00, float 1.000000e+00 -; CHECK-NEXT: store float [[I6]], ptr [[I4]], align 4 -; CHECK-NEXT: [[I7]] = add i64 [[I]], 1 -; CHECK-NEXT: [[I8:%.*]] = icmp eq i64 [[I7]], 128 -; CHECK-NEXT: br i1 [[I8]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index e662039ee6eff..70b1ea13677b8 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -31,21 +31,8 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], 9223372036854775807 ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331 ; IC1VF4-NEXT: br label %[[EXIT:.*]] -; IC1VF4: [[SCALAR_PH:.*]]: -; IC1VF4-NEXT: br label %[[LOOP:.*]] -; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; IC1VF4-NEXT: 
[[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 -; IC1VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] -; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 -; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC1VF4: [[EXIT]]: -; IC1VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC1VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; IC1VF4-NEXT: ret i64 [[RDX_SELECT]] ; ; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start( ; IC4VF4-SAME: ptr [[A:%.*]]) { @@ -101,21 +88,8 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP18]], 9223372036854775807 ; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331 ; IC4VF4-NEXT: br label %[[EXIT:.*]] -; IC4VF4: [[SCALAR_PH:.*]]: -; IC4VF4-NEXT: br label %[[LOOP:.*]] -; IC4VF4: [[LOOP]]: -; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; IC4VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 -; IC4VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] -; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 -; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF4: [[EXIT]]: -; IC4VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; IC4VF4-NEXT: ret i64 [[RDX_SELECT]] ; ; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_const_start( ; IC4VF1-SAME: ptr [[A:%.*]]) { @@ -159,21 +133,8 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], 9223372036854775807 ; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331 ; IC4VF1-NEXT: br label %[[EXIT:.*]] -; IC4VF1: [[SCALAR_PH:.*]]: -; IC4VF1-NEXT: br label %[[LOOP:.*]] -; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; IC4VF1-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 -; IC4VF1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] -; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 -; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF1: [[EXIT]]: -; IC4VF1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF1-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; IC4VF1-NEXT: ret i64 [[RDX_SELECT]] ; entry: br label %loop @@ -227,21 +188,8 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767 ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 
[[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0 ; IC1VF4-NEXT: br label %[[EXIT:.*]] -; IC1VF4: [[SCALAR_PH:.*]]: -; IC1VF4-NEXT: br label %[[LOOP:.*]] -; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] -; IC1VF4-NEXT: [[LD_TABLE:%.*]] = load i16, ptr [[GEP_TABLE_IV]], align 1 -; IC1VF4-NEXT: [[CMP_TABLE_VAL:%.*]] = icmp ugt i16 [[LD_TABLE]], [[VAL]] -; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], -1 -; IC1VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_TABLE_VAL]], i16 [[IV_NEXT]], i16 [[RDX]] -; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC1VF4: [[EXIT]]: -; IC1VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC1VF4-NEXT: ret i16 [[SPEC_SELECT_LCSSA]] +; IC1VF4-NEXT: ret i16 [[RDX_SELECT]] ; ; IC4VF4-LABEL: define i16 @select_decreasing_induction_icmp_table_i16( ; IC4VF4-SAME: i16 noundef [[VAL:%.*]]) { @@ -460,21 +408,8 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP116]], 32767 ; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP116]], i16 0 ; IC4VF4-NEXT: br label %[[EXIT:.*]] -; IC4VF4: [[SCALAR_PH:.*]]: -; IC4VF4-NEXT: br label %[[LOOP:.*]] -; IC4VF4: [[LOOP]]: -; IC4VF4-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] -; IC4VF4-NEXT: [[LD_TABLE:%.*]] = load i16, ptr [[GEP_TABLE_IV]], align 1 -; IC4VF4-NEXT: [[CMP_TABLE_VAL:%.*]] = icmp ugt i16 [[LD_TABLE]], [[VAL]] -; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], -1 -; IC4VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_TABLE_VAL]], i16 [[IV_NEXT]], i16 [[RDX]] -; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF4: [[EXIT]]: -; IC4VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF4-NEXT: ret i16 [[SPEC_SELECT_LCSSA]] +; IC4VF4-NEXT: ret i16 [[RDX_SELECT]] ; ; IC4VF1-LABEL: define i16 @select_decreasing_induction_icmp_table_i16( ; IC4VF1-SAME: i16 noundef [[VAL:%.*]]) { @@ -523,21 +458,8 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[RDX_MINMAX5]], 32767 ; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[RDX_MINMAX5]], i16 0 ; IC4VF1-NEXT: br label %[[EXIT:.*]] -; IC4VF1: [[SCALAR_PH:.*]]: -; IC4VF1-NEXT: br label %[[LOOP:.*]] -; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] -; IC4VF1-NEXT: [[LD_TABLE:%.*]] = load i16, ptr [[GEP_TABLE_IV]], align 1 -; IC4VF1-NEXT: [[CMP_TABLE_VAL:%.*]] = icmp ugt i16 [[LD_TABLE]], [[VAL]] -; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], -1 
-; IC4VF1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_TABLE_VAL]], i16 [[IV_NEXT]], i16 [[RDX]] -; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF1: [[EXIT]]: -; IC4VF1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF1-NEXT: ret i16 [[SPEC_SELECT_LCSSA]] +; IC4VF1-NEXT: ret i16 [[RDX_SELECT]] ; entry: br label %loop @@ -592,21 +514,8 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP7]], 32767 ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0 ; IC1VF4-NEXT: br label %[[EXIT:.*]] -; IC1VF4: [[SCALAR_PH:.*]]: -; IC1VF4-NEXT: br label %[[LOOP:.*]] -; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] -; IC1VF4-NEXT: [[LD_TABLE:%.*]] = load half, ptr [[GEP_TABLE_IV]], align 1 -; IC1VF4-NEXT: [[CMP_TABLE_VAL:%.*]] = fcmp ugt half [[LD_TABLE]], [[VAL]] -; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], -1 -; IC1VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_TABLE_VAL]], i16 [[IV_NEXT]], i16 [[RDX]] -; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC1VF4: [[EXIT]]: -; IC1VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC1VF4-NEXT: ret i16 [[SPEC_SELECT_LCSSA]] +; IC1VF4-NEXT: ret i16 [[RDX_SELECT]] ; ; IC4VF4-LABEL: define i16 @select_decreasing_induction_icmp_table_half( ; IC4VF4-SAME: half noundef [[VAL:%.*]]) { @@ -825,21 +734,8 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[TMP116]], 32767 ; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP116]], i16 0 ; IC4VF4-NEXT: br label %[[EXIT:.*]] -; IC4VF4: [[SCALAR_PH:.*]]: -; IC4VF4-NEXT: br label %[[LOOP:.*]] -; IC4VF4: [[LOOP]]: -; IC4VF4-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] -; IC4VF4-NEXT: [[LD_TABLE:%.*]] = load half, ptr [[GEP_TABLE_IV]], align 1 -; IC4VF4-NEXT: [[CMP_TABLE_VAL:%.*]] = fcmp ugt half [[LD_TABLE]], [[VAL]] -; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], -1 -; IC4VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_TABLE_VAL]], i16 [[IV_NEXT]], i16 [[RDX]] -; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF4: [[EXIT]]: -; IC4VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF4-NEXT: ret i16 [[SPEC_SELECT_LCSSA]] +; IC4VF4-NEXT: ret i16 [[RDX_SELECT]] ; ; IC4VF1-LABEL: define i16 @select_decreasing_induction_icmp_table_half( ; IC4VF1-SAME: half noundef [[VAL:%.*]]) { @@ -888,21 +784,8 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i16 [[RDX_MINMAX5]], 32767 ; 
IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[RDX_MINMAX5]], i16 0 ; IC4VF1-NEXT: br label %[[EXIT:.*]] -; IC4VF1: [[SCALAR_PH:.*]]: -; IC4VF1-NEXT: br label %[[LOOP:.*]] -; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] -; IC4VF1-NEXT: [[LD_TABLE:%.*]] = load half, ptr [[GEP_TABLE_IV]], align 1 -; IC4VF1-NEXT: [[CMP_TABLE_VAL:%.*]] = fcmp ugt half [[LD_TABLE]], [[VAL]] -; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], -1 -; IC4VF1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_TABLE_VAL]], i16 [[IV_NEXT]], i16 [[RDX]] -; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i16 [[IV_NEXT]], 0 -; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF1: [[EXIT]]: -; IC4VF1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i16 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF1-NEXT: ret i16 [[SPEC_SELECT_LCSSA]] +; IC4VF1-NEXT: ret i16 [[RDX_SELECT]] ; entry: br label %loop @@ -954,21 +837,8 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -1 ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331 ; IC1VF4-NEXT: br label %[[EXIT:.*]] -; IC1VF4: [[SCALAR_PH:.*]]: -; IC1VF4-NEXT: br label %[[LOOP:.*]] -; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; IC1VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 -; IC1VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] -; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 -; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC1VF4: [[EXIT]]: -; IC1VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC1VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; IC1VF4-NEXT: ret i64 [[RDX_SELECT]] ; ; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned( ; IC4VF4-SAME: ptr [[A:%.*]]) { @@ -1024,21 +894,8 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP18]], -1 ; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331 ; IC4VF4-NEXT: br label %[[EXIT:.*]] -; IC4VF4: [[SCALAR_PH:.*]]: -; IC4VF4-NEXT: br label %[[LOOP:.*]] -; IC4VF4: [[LOOP]]: -; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; IC4VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 -; IC4VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] -; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 -; 
IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF4: [[EXIT]]: -; IC4VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; IC4VF4-NEXT: ret i64 [[RDX_SELECT]] ; ; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned( ; IC4VF1-SAME: ptr [[A:%.*]]) { @@ -1082,21 +939,8 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -1 ; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331 ; IC4VF1-NEXT: br label %[[EXIT:.*]] -; IC4VF1: [[SCALAR_PH:.*]]: -; IC4VF1-NEXT: br label %[[LOOP:.*]] -; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 -; IC4VF1-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 -; IC4VF1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]] -; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0 -; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]] ; IC4VF1: [[EXIT]]: -; IC4VF1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; IC4VF1-NEXT: ret i64 [[SPEC_SELECT_LCSSA]] +; IC4VF1-NEXT: ret i64 [[RDX_SELECT]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index 0ace54731dc2d..b991d58eb2b8d 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -261,22 +261,8 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648 ; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 331 ; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] -; CHECK-VF4IC1: [[SCALAR_PH:.*]]: -; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP7]], 3 -; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP8]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-NEXT: ret i32 [[RDX_SELECT]] ; ; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( 
; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { @@ -322,22 +308,8 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], -2147483648 ; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 331 ; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] -; CHECK-VF4IC4: [[SCALAR_PH:.*]]: -; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP16]], 3 -; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP17]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC4-NEXT: ret i32 [[RDX_SELECT]] ; ; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { @@ -384,22 +356,8 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648 ; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 331 ; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]] -; CHECK-VF1IC4: [[SCALAR_PH:.*]]: -; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF1IC4: [[FOR_BODY]]: -; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP26]], 3 -; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP27]], i32 [[RDX]] -; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 20000 -; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF1IC4: [[EXIT]]: -; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF1IC4-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body @@ -446,22 +404,8 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP6]], -2147483648 ; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 -1 ; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] -; CHECK-VF4IC1: [[SCALAR_PH:.*]]: -; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: 
[[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP7]], 0.000000e+00 -; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP8]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483648 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-NEXT: ret i32 [[RDX_SELECT]] ; ; CHECK-VF4IC4-LABEL: define i32 @select_fcmp_max_valid_const_ub( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { @@ -507,22 +451,8 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP15]], -2147483648 ; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 -1 ; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] -; CHECK-VF4IC4: [[SCALAR_PH:.*]]: -; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP16]], 0.000000e+00 -; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP17]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483648 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC4-NEXT: ret i32 [[RDX_SELECT]] ; ; CHECK-VF1IC4-LABEL: define i32 @select_fcmp_max_valid_const_ub( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { @@ -569,22 +499,8 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX5]], -2147483648 ; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 -1 ; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]] -; CHECK-VF1IC4: [[SCALAR_PH:.*]]: -; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF1IC4: [[FOR_BODY]]: -; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-VF1IC4-NEXT: 
[[CMP:%.*]] = fcmp fast olt float [[TMP26]], 0.000000e+00 -; CHECK-VF1IC4-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[TMP27]], i32 [[RDX]] -; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 2147483648 -; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF1IC4: [[EXIT]]: -; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF1IC4-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body @@ -636,22 +552,8 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP5]], 0 ; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP5]], i32 331 ; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] -; CHECK-VF4IC1: [[SCALAR_PH:.*]]: -; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 2147483646, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] -; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 -; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 -; CHECK-VF4IC1-NEXT: [[CONV:%.*]] = trunc i64 [[IV1]] to i32 -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294 -; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF4IC1: [[EXIT]]: -; CHECK-VF4IC1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4IC1-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC1-NEXT: ret i32 [[RDX_SELECT]] ; ; CHECK-VF4IC4-LABEL: define i32 @select_icmp_truncated_unsigned_iv_range( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { @@ -698,22 +600,8 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP14]], 0 ; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP14]], i32 331 ; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] -; CHECK-VF4IC4: [[SCALAR_PH:.*]]: -; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP15]], 3 -; CHECK-VF4IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF4IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294 -; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF4IC4: [[EXIT]]: -; CHECK-VF4IC4-NEXT: 
[[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF4IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF4IC4-NEXT: ret i32 [[RDX_SELECT]] ; ; CHECK-VF1IC4-LABEL: define i32 @select_icmp_truncated_unsigned_iv_range( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { @@ -762,22 +650,8 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF1IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[RDX_MINMAX6]], 0 ; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX6]], i32 331 ; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]] -; CHECK-VF1IC4: [[SCALAR_PH:.*]]: -; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-VF1IC4: [[FOR_BODY]]: -; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP24]], 3 -; CHECK-VF1IC4-NEXT: [[CONV:%.*]] = trunc i64 [[IV]] to i32 -; CHECK-VF1IC4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP]], i32 [[CONV]], i32 [[RDX]] -; CHECK-VF1IC4-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF1IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4294967294 -; CHECK-VF1IC4-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]] ; CHECK-VF1IC4: [[EXIT]]: -; CHECK-VF1IC4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i32 [ [[SPEC_SELECT]], %[[FOR_BODY]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] -; CHECK-VF1IC4-NEXT: ret i32 [[SPEC_SELECT_LCSSA]] +; CHECK-VF1IC4-NEXT: ret i32 [[RDX_SELECT]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 3f91baa117b7f..86515ebe25637 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -102,16 +102,8 @@ define i32 @constpre() { ; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ 32, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC]] = sub nsw i32 [[INC_PHI]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], 0 -; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[INC_PHI_LCSSA:%.*]] = phi i32 [ [[INC_PHI]], %[[FOR_BODY]] ], [ 2, %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[INC_PHI_LCSSA]] +; CHECK-NEXT: ret i32 2 ; entry: br label %for.body @@ -142,18 +134,8 @@ define ptr @geppre(ptr %ptr) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -16 ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[PTR_PHI:%.*]] = phi ptr [ [[PTR]], %[[SCALAR_PH]] ], [ [[INC_PTR:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 -; CHECK-NEXT: [[INC_PTR]] = getelementptr i32, ptr [[PTR_PHI]], i32 4 -; CHECK-NEXT: [[CMP:%.*]] = 
icmp eq i32 [[INC]], 32 -; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[PTR_PHI_LCSSA:%.*]] = phi ptr [ [[PTR_PHI]], %[[FOR_BODY]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret ptr [[PTR_PHI_LCSSA]] +; CHECK-NEXT: ret ptr [[IND_ESCAPE]] ; entry: br label %for.body @@ -411,18 +393,8 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) { ; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[EXIT:.*]] -; VEC: [[SCALAR_PH:.*]]: -; VEC-NEXT: br label %[[LOOP:.*]] -; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 -; VEC-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] -; VEC-NEXT: store i64 [[IV]], ptr [[GEP_PTR]], align 4 -; VEC-NEXT: [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000 -; VEC-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; VEC: [[EXIT]]: -; VEC-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ] -; VEC-NEXT: ret i64 [[IV_LCSSA]] +; VEC-NEXT: ret i64 1001 ; ; INTERLEAVE-LABEL: define i64 @iv_scalar_steps_and_outside_users( ; INTERLEAVE-SAME: ptr [[PTR:%.*]]) { @@ -442,18 +414,8 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) { ; INTERLEAVE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 -; INTERLEAVE-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] -; INTERLEAVE-NEXT: store i64 [[IV]], ptr [[GEP_PTR]], align 4 -; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000 -; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; INTERLEAVE: [[EXIT]]: -; INTERLEAVE-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 1001, %[[MIDDLE_BLOCK]] ] -; INTERLEAVE-NEXT: ret i64 [[IV_LCSSA]] +; INTERLEAVE-NEXT: ret i64 1001 ; entry: br label %loop @@ -491,20 +453,8 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[EXIT:.*]] -; VEC: [[SCALAR_PH:.*]]: -; VEC-NEXT: br label %[[LOOP:.*]] -; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 -; VEC-NEXT: [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2 -; VEC-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] -; VEC-NEXT: store i64 [[IV]], ptr [[GEP_PTR]], align 4 -; VEC-NEXT: [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000 -; VEC-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; VEC: [[EXIT]]: -; VEC-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ] -; VEC-NEXT: ret i32 [[IV_2_LCSSA]] +; VEC-NEXT: ret i32 2002 ; ; INTERLEAVE-LABEL: define i32 @iv_2_dead_in_loop_only_used_outside( ; INTERLEAVE-SAME: ptr [[PTR:%.*]]) { @@ -524,20 +474,8 @@ define i32 
@iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; INTERLEAVE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 -; INTERLEAVE-NEXT: [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2 -; INTERLEAVE-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] -; INTERLEAVE-NEXT: store i64 [[IV]], ptr [[GEP_PTR]], align 4 -; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp ugt i64 [[IV]], 1000 -; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; INTERLEAVE: [[EXIT]]: -; INTERLEAVE-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP]] ], [ 2002, %[[MIDDLE_BLOCK]] ] -; INTERLEAVE-NEXT: ret i32 [[IV_2_LCSSA]] +; INTERLEAVE-NEXT: ret i32 2002 ; entry: br label %loop @@ -1092,18 +1030,8 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[E_EXIT:.*]] -; VEC: [[SCALAR_PH:.*]]: -; VEC-NEXT: br label %[[LOOP:.*]] -; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] -; VEC-NEXT: store i16 0, ptr [[GEP_DST]], align 2 -; VEC-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]] -; VEC-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8 -; VEC-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]] ; VEC: [[E_EXIT]]: -; VEC-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] -; VEC-NEXT: ret i32 [[RES]] +; VEC-NEXT: ret i32 [[TMP5]] ; ; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification( ; INTERLEAVE-SAME: ptr [[DST:%.*]]) { @@ -1126,18 +1054,8 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; INTERLEAVE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[E_EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] -; INTERLEAVE-NEXT: store i16 0, ptr [[GEP_DST]], align 2 -; INTERLEAVE-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]] -; INTERLEAVE-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8 -; INTERLEAVE-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]] ; INTERLEAVE: [[E_EXIT]]: -; INTERLEAVE-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] -; INTERLEAVE-NEXT: ret i32 [[RES]] +; INTERLEAVE-NEXT: ret i32 [[TMP5]] ; entry: %step.1 = sext i8 0 to i32 @@ -1187,19 +1105,8 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 ; VEC-NEXT: br label %[[E_EXIT:.*]] -; VEC: [[SCALAR_PH:.*]]: -; VEC-NEXT: br label %[[LOOP:.*]] 
-; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] -; VEC-NEXT: store i16 0, ptr [[GEP_DST]], align 2 -; VEC-NEXT: [[INC:%.*]] = add i32 [[IV]], 1 -; VEC-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]] -; VEC-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8 -; VEC-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]] ; VEC: [[E_EXIT]]: -; VEC-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] -; VEC-NEXT: ret i32 [[RES]] +; VEC-NEXT: ret i32 [[TMP7]] ; ; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2( ; INTERLEAVE-SAME: ptr [[DST:%.*]]) { @@ -1224,19 +1131,8 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; INTERLEAVE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[E_EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] -; INTERLEAVE-NEXT: store i16 0, ptr [[GEP_DST]], align 2 -; INTERLEAVE-NEXT: [[INC:%.*]] = add i32 [[IV]], 1 -; INTERLEAVE-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[INC]] -; INTERLEAVE-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 8 -; INTERLEAVE-NEXT: br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]] ; INTERLEAVE: [[E_EXIT]]: -; INTERLEAVE-NEXT: [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] -; INTERLEAVE-NEXT: ret i32 [[RES]] +; INTERLEAVE-NEXT: ret i32 [[TMP5]] ; entry: %step.1 = sext i8 0 to i32 @@ -1356,24 +1252,12 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1 ; VEC-NEXT: store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2 ; VEC-NEXT: [[TMP5:%.*]] = add i64 1, -1 -; VEC-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 +; VEC-NEXT: [[IV_1_NEXT_LCSSA1:%.*]] = add i64 [[TMP5]], 1 ; VEC-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[EXIT:.*]] -; VEC: [[SCALAR_PH:.*]]: -; VEC-NEXT: br label %[[LOOP:.*]] -; VEC: [[LOOP]]: -; VEC-NEXT: [[IV_1:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[IV_2:%.*]] = phi i64 [ 2, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV_1]] -; VEC-NEXT: store i16 1, ptr [[GEP]], align 2 -; VEC-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 -; VEC-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_2_NEXT]], 0 -; VEC-NEXT: [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1 -; VEC-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; VEC: [[EXIT]]: -; VEC-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA]] +; VEC-NEXT: ret i64 [[IV_1_NEXT_LCSSA1]] ; ; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented( ; INTERLEAVE-SAME: ptr [[DST:%.*]]) { @@ -1387,24 +1271,12 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; INTERLEAVE-NEXT: store i16 1, ptr [[TMP0]], align 2 ; INTERLEAVE-NEXT: store i16 1, ptr [[TMP1]], align 2 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = add i64 1, -1 -; INTERLEAVE-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 1 +; 
INTERLEAVE-NEXT: [[IV_1_NEXT_LCSSA1:%.*]] = add i64 [[TMP2]], 1 ; INTERLEAVE-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV_1:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i64 [ 2, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV_1]] -; INTERLEAVE-NEXT: store i16 1, ptr [[GEP]], align 2 -; INTERLEAVE-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 -; INTERLEAVE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_2_NEXT]], 0 -; INTERLEAVE-NEXT: [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1 -; INTERLEAVE-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; INTERLEAVE: [[EXIT]]: -; INTERLEAVE-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ] -; INTERLEAVE-NEXT: ret i64 [[IV_1_NEXT_LCSSA]] +; INTERLEAVE-NEXT: ret i64 [[IV_1_NEXT_LCSSA1]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 11d48df8b8aaa..9358fd9cc8440 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -48,29 +48,9 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) { ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP17:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP15]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[L_T]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i16 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i16 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i16 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i16 [[TMP17]] ; entry: %alloca = alloca [163840 x i16], align 4 @@ -142,29 +122,9 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP15]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; 
CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[L_T]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i32, ptr [[START]], i64 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ADDR]], align 4 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i32 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP17]] ; entry: %alloca = alloca [163840 x i32], align 4 @@ -370,26 +330,7 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1023, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]] -; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3 -; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP20]], 2 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]] -; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]] +; CHECK-NEXT: br label [[FOR_INC:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) ; CHECK-NEXT: ret void @@ -481,27 +422,7 @@ define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %des ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1023, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[OFF:%.*]] = 
add i64 [[IV]], -1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[OFF]] -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP22]], 3 -; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[OFF]] -; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP23]], 2 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[OFF]] -; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]] +; CHECK-NEXT: br label [[FOR_INC:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) ; CHECK-NEXT: ret void @@ -574,30 +495,9 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] -; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i8 [[L_T]], 0 -; CHECK-NEXT: br i1 [[CMP]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[IV_STRIDE:%.*]] = mul i64 [[IV]], 2 -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i16, ptr [[ALLOCA]], i64 [[IV_STRIDE]] -; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[ADDR]], align 2 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i16 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i16 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp eq i64 [[IV]], 4095 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i16 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i16 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i16 [[TMP15]] ; entry: %alloca = alloca [163840 x i16], align 4 @@ -681,27 +581,7 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 511, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, 
i64 [[IV]] -; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP21]], 3 -; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[IV_STRIDED:%.*]] = mul i64 [[IV]], 2 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[IV_STRIDED]] -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP22]], 2 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[IV]] -; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX7]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[CMP2_NOT:%.*]] = icmp eq i64 [[IV]], 0 -; CHECK-NEXT: br i1 [[CMP2_NOT]], label [[EXIT]], label [[FOR_BODY]] +; CHECK-NEXT: br label [[FOR_INC:%.*]] ; CHECK: exit: ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST:%.*]], ptr [[LOCAL_DEST]], i64 1024, i1 false) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll index b224534720a2d..b14a1cdff92c2 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll @@ -52,28 +52,9 @@ define i8 @test_negative_off(i16 %len, ptr %test_base) { ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]]) -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ -1000, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 -; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]] -; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 -; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] -; CHECK: pred: -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i8, ptr [[ALLOCA]], i16 [[IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr [[ADDR]], align 1 -; CHECK-NEXT: br label [[LATCH]] -; CHECK: latch: -; CHECK-NEXT: [[VAL_PHI:%.*]] = phi i8 [ 0, [[LOOP]] ], [ [[VAL]], [[PRED]] ] -; CHECK-NEXT: [[ACCUM_NEXT]] = add i8 [[ACCUM]], [[VAL_PHI]] -; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i16 [[IV]], -990 -; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]] +; CHECK-NEXT: br label [[LATCH:%.*]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i8 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i8 [[ACCUM_NEXT_LCSSA]] +; CHECK-NEXT: ret i8 [[TMP20]] ; entry: %alloca = alloca [64638 x i8] diff --git a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll index f44fc4e5568b1..096a0a87cbb8a 100644 --- a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll @@ -30,28 +30,6 @@ define void @accesses_to_struct_dereferenceable(ptr noalias %dst) { ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: [[D:%.*]] = load i32, ptr [[GEP_DST]], align 4 -; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[D]], 0 -; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: if.else: -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[TMP_0:%.*]] = phi i32 [ [[L_A]], [[IF_THEN]] ], [ [[L_B]], [[IF_ELSE]] ] -; CHECK-NEXT: store i32 [[TMP_0]], ptr [[GEP_DST]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -265,29 +243,6 @@ define void @accesses_to_struct_may_not_be_dereferenceable_access_size(ptr noali ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: [[D:%.*]] = load i32, ptr [[GEP_DST]], align 4 -; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[D]], 0 -; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 0, i64 [[IV]] -; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: if.else: -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr @foo, i64 0, i32 1, i64 [[IV]] -; CHECK-NEXT: [[L_B:%.*]] = load i64, ptr [[GEP_B]], align 4 -; CHECK-NEXT: [[T:%.*]] = trunc i64 [[L_B]] to i32 -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[TMP_0:%.*]] = phi i32 [ [[L_A]], [[IF_THEN]] ], [ [[T]], [[IF_ELSE]] ] -; CHECK-NEXT: store i32 [[TMP_0]], ptr [[GEP_DST]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll index c589c77895353..aed1e2920bbdc 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -79,17 +79,7 @@ define void @bottom_tested(ptr %p, i32 %n) { ; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TAILFOLD: middle.block: -; TAILFOLD-NEXT: br label [[IF_END:%.*]] -; TAILFOLD: scalar.ph: ; TAILFOLD-NEXT: br label [[FOR_COND:%.*]] -; TAILFOLD: for.cond: -; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[INC:%.*]], [[FOR_COND]] ] -; TAILFOLD-NEXT: [[IPROM:%.*]] = sext i32 [[I]] to i64 -; TAILFOLD-NEXT: [[B:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IPROM]] -; TAILFOLD-NEXT: store i16 0, ptr [[B]], align 4 -; TAILFOLD-NEXT: [[INC]] = add nsw i32 [[I]], 1 -; TAILFOLD-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] -; TAILFOLD-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[IF_END]] ; TAILFOLD: if.end: ; TAILFOLD-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll b/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll index 781980dce87b3..1fe802f9e1093 100644 --- a/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll +++ b/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll @@ -32,17 +32,6 @@ define void @scalar_loop_dead(ptr noundef captures(none) %a, float noundef %x) { ; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[X]], [[LOAD]] -; CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[COMP:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[COMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -83,7 +72,7 @@ define void @scalar_loop_live(ptr noundef captures(none) %a, float noundef %x, i ; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[TMP0]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -98,7 +87,7 @@ define void @scalar_loop_live(ptr noundef captures(none) %a, float noundef %x, i ; CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[COMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[COMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -128,9 +117,6 @@ exit: ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized"} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.count", i32 8} ; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]]} -; CHECK: [[META5]] = !{!"llvm.loop.vectorize.enable", i1 true} -; CHECK: [[META6]] = 
!{!"llvm.loop.vectorize.followup_all", [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll index bb5199208ba15..30ee4803de607 100644 --- a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll @@ -69,19 +69,7 @@ define void @maxvf3() { ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[J:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[J]] -; CHECK-NEXT: store i8 69, ptr [[AJ]], align 8 -; CHECK-NEXT: [[JP3:%.*]] = add nuw nsw i32 3, [[J]] -; CHECK-NEXT: [[AJP3:%.*]] = getelementptr inbounds [18 x i8], ptr @a, i32 0, i32 [[JP3]] -; CHECK-NEXT: store i8 7, ptr [[AJP3]], align 8 -; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index e2dadff3e985b..3c59a279e077d 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -142,18 +142,6 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4, !tbaa [[CHAR_TBAA0]], !range [[RNG9:![0-9]+]] -; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]], !range [[RNG9]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -180,18 +168,6 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], 
i64 [[IV]] -; INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4, !tbaa [[CHAR_TBAA0]], !range [[RNG9:![0-9]+]] -; INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]], !range [[RNG9]] -; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; INTERLEAVE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 -; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; INTERLEAVE: [[EXIT]]: ; INTERLEAVE-NEXT: ret void ; @@ -229,21 +205,9 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] -; CHECK-NEXT: [[CALL:%.*]] = call double @bar(double [[LOAD]]) #[[ATTR2:[0-9]+]], !fpmath [[META3]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -267,21 +231,9 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE-NEXT: store <2 x double> [[TMP4]], ptr [[TMP7]], align 8 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] -; INTERLEAVE-NEXT: [[CALL:%.*]] = call double @bar(double [[LOAD]]) #[[ATTR2:[0-9]+]], !fpmath [[META3]] -; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] -; INTERLEAVE-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 -; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; INTERLEAVE: [[EXIT]]: ; INTERLEAVE-NEXT: ret void ; @@ -319,21 +271,9 @@ define void @widen_intrinsic(ptr noalias %a, 
ptr readonly %b) { ; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = call i64 @llvm.abs.i64(i64 [[LOAD]], i1 true), !range [[RNG9]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -357,21 +297,9 @@ define void @widen_intrinsic(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP7]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] -; INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 -; INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @llvm.abs.i64(i64 [[LOAD]], i1 true), !range [[RNG9]] -; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; INTERLEAVE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 -; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; INTERLEAVE: [[EXIT]]: ; INTERLEAVE-NEXT: ret void ; @@ -409,21 +337,9 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load double, 
ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] -; CHECK-NEXT: [[CALL:%.*]] = call double @llvm.sin.f64(double [[LOAD]]) #[[ATTR2]], !fpmath [[META3]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -447,21 +363,9 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE-NEXT: store <2 x double> [[TMP4]], ptr [[TMP7]], align 8 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] -; INTERLEAVE: [[SCALAR_PH:.*]]: -; INTERLEAVE-NEXT: br label %[[LOOP:.*]] -; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] -; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[CHAR_TBAA0]] -; INTERLEAVE-NEXT: [[CALL:%.*]] = call double @llvm.sin.f64(double [[LOAD]]) #[[ATTR2]], !fpmath [[META3]] -; INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]] -; INTERLEAVE-NEXT: store double [[CALL]], ptr [[ARRAYIDX]], align 8 -; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]] ; INTERLEAVE: [[EXIT]]: ; INTERLEAVE-NEXT: ret void ; @@ -506,7 +410,7 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) { ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i32> [[TMP3]], splat (i32 2) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -522,7 +426,7 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) { ; CHECK-NEXT: store ptr [[ARRAYIDX_2]], ptr [[ARRAYIDX_1]], align 8, !custom_md [[META2]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !custom_md [[META2]] ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]], !custom_md [[META2]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP14:![0-9]+]], !custom_md [[META2]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP13:![0-9]+]], !custom_md [[META2]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -555,7 +459,7 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) { ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], splat (i64 2) ; INTERLEAVE-NEXT: 
[[VEC_IND_NEXT2]] = add <2 x i32> [[STEP_ADD3]], splat (i32 2) ; INTERLEAVE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; INTERLEAVE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]] ; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -571,7 +475,7 @@ define void @unknown_metadata(ptr nocapture %a, ptr noalias %b, i64 %size) { ; INTERLEAVE-NEXT: store ptr [[ARRAYIDX_2]], ptr [[ARRAYIDX_1]], align 8, !custom_md [[META2]] ; INTERLEAVE-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !custom_md [[META2]] ; INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]], !custom_md [[META2]] -; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP14:![0-9]+]], !custom_md [[META2]] +; INTERLEAVE-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP13:![0-9]+]], !custom_md [[META2]] ; INTERLEAVE: [[EXIT]]: ; INTERLEAVE-NEXT: ret void ; @@ -617,12 +521,11 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; CHECK: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META6]], [[META5]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META5]], [[META6]]} -; CHECK: [[RNG9]] = !{i64 0, i64 2} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META5]], [[META6]]} ; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]} ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]], [[META6]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META6]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} ;. ; INTERLEAVE: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} ; INTERLEAVE: [[META1]] = !{!"omnipotent char", [[META2]]} @@ -633,10 +536,9 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; INTERLEAVE: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"} ; INTERLEAVE: [[LOOP7]] = distinct !{[[LOOP7]], [[META6]], [[META5]]} ; INTERLEAVE: [[LOOP8]] = distinct !{[[LOOP8]], [[META5]], [[META6]]} -; INTERLEAVE: [[RNG9]] = !{i64 0, i64 2} +; INTERLEAVE: [[LOOP9]] = distinct !{[[LOOP9]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} -; INTERLEAVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]], [[META6]]} -; INTERLEAVE: [[LOOP14]] = distinct !{[[LOOP14]], [[META6]], [[META5]]} +; INTERLEAVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll index 7866728168888..47a2a84b44601 100644 --- a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll @@ -26,20 +26,8 @@ define float @maximumnum_intrinsic(ptr readonly %x) { ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.maximumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] +; CHECK-NEXT: ret float [[TMP6]] ; entry: br label %loop @@ -82,20 +70,8 @@ define float @maximumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.maximumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] +; CHECK-NEXT: ret float [[TMP6]] ; entry: br label %loop @@ -138,20 +114,8 @@ define float @minimumnum_intrinsic(ptr readonly %x) { ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] -; 
CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.minimumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] +; CHECK-NEXT: ret float [[TMP6]] ; entry: br label %loop @@ -194,20 +158,8 @@ define float @minimumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call fast <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.minimumnum.f32(float [[RED]], float [[L]]) -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV1]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[INC]], 1024 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]] +; CHECK-NEXT: ret float [[TMP6]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll index 2e88ff6e99fdf..a1fc1b8f34ff3 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll @@ -34,10 +34,6 @@ define i32 @main() #0 { ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll index d928a4b7ebe4b..b19f9c5a3b60d 100644 --- a/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/multiple-result-intrinsics.ll @@ -12,14 +12,7 @@ define void @sincos_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 ; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 ; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK: store float [[EXTRACT_A]], ptr 
[[ARRAYIDX2:%.*]], align 4 -; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 ; CHECK: [[EXIT:.*:]] ; entry: @@ -55,14 +48,7 @@ define void @sincos_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noali ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 ; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 ; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.sincos.f64(double [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 -; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 -; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 ; CHECK: [[EXIT:.*:]] ; entry: @@ -91,9 +77,9 @@ define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly ; CHECK-LABEL: define void @predicated_sincos( ; CHECK-SAME: float [[X:%.*]], ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { ; CHECK: [[ENTRY:.*:]] -; CHECK: [[VECTOR_BODY1:.*]]: -; CHECK: [[VECTOR_BODY:.*:]] -; CHECK: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY1]] ], [ [[INDEX_NEXT:%.*]], %[[IF_THEN2:.*]] ] +; CHECK: [[VECTOR_BODY:.*]]: +; CHECK: [[VECTOR_BODY1:.*:]] +; CHECK: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_BODY]] ], [ [[INDEX_NEXT:%.*]], %[[IF_THEN1:.*]] ] ; CHECK: [[TMP4:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> [[WIDE_LOAD:%.*]]) ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 0 ; CHECK: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 1 @@ -107,23 +93,14 @@ define void @predicated_sincos(float %x, ptr noalias %in, ptr noalias writeonly ; CHECK: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: ; CHECK: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK: br i1 [[TMP12]], label %[[PRED_STORE_IF1:.*]], label %[[IF_THEN2]] +; CHECK: br i1 [[TMP12]], label %[[PRED_STORE_IF1:.*]], label %[[IF_THEN1]] ; CHECK: [[PRED_STORE_IF1]]: ; CHECK: [[TMP15:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 ; CHECK: store float [[TMP15]], ptr [[TMP14:%.*]], align 4 ; CHECK: [[TMP17:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 ; CHECK: store float [[TMP17]], ptr [[TMP16:%.*]], align 4 -; CHECK: br label %[[IF_THEN2]] -; CHECK: [[IF_THEN2]]: -; CHECK: [[IF_THEN:.*:]] -; CHECK: [[IF_THEN3:.*:]] -; CHECK: [[IF_THEN4:.*:]] -; CHECK: [[IF_THEN1:.*:]] -; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincos.f32(float [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 -; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 +; CHECK: br label %[[IF_THEN1]] +; CHECK: [[IF_THEN1]]: ; CHECK: [[IF_MERGE:.*:]] ; CHECK: [[FOR_END:.*:]] ; @@ -167,14 +144,7 @@ define void @modf_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 ; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 ; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] 
= tail call { float, float } @llvm.modf.f32(float [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 -; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 ; CHECK: [[EXIT:.*:]] ; entry: @@ -210,14 +180,7 @@ define void @modf_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 ; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 ; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.modf.f64(double [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 -; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 -; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 ; CHECK: [[EXIT:.*:]] ; entry: @@ -253,14 +216,7 @@ define void @sincospi_f32(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1 ; CHECK: store <2 x float> [[TMP4]], ptr [[TMP7:%.*]], align 4 ; CHECK: store <2 x float> [[TMP5]], ptr [[TMP9:%.*]], align 4 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { float, float } @llvm.sincospi.f32(float [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 -; CHECK: store float [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 4 -; CHECK: store float [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 4 ; CHECK: [[EXIT:.*:]] ; entry: @@ -296,14 +252,7 @@ define void @sincospi_f64(ptr noalias %in, ptr noalias writeonly %out_a, ptr noa ; CHECK: [[TMP5:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP3]], 1 ; CHECK: store <2 x double> [[TMP4]], ptr [[TMP7:%.*]], align 8 ; CHECK: store <2 x double> [[TMP5]], ptr [[TMP9:%.*]], align 8 -; CHECK: [[MIDDLE_BLOCK:.*:]] -; CHECK: [[SCALAR_PH:.*:]] ; CHECK: [[FOR_BODY:.*:]] -; CHECK: [[CALL:%.*]] = tail call { double, double } @llvm.sincospi.f64(double [[IN_VAL:%.*]]) -; CHECK: [[EXTRACT_A:%.*]] = extractvalue { double, double } [[CALL]], 0 -; CHECK: [[EXTRACT_B:%.*]] = extractvalue { double, double } [[CALL]], 1 -; CHECK: store double [[EXTRACT_A]], ptr [[ARRAYIDX2:%.*]], align 8 -; CHECK: store double [[EXTRACT_B]], ptr [[ARRAYIDX4:%.*]], align 8 ; CHECK: [[EXIT:.*:]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll index 9b6774e3d63fe..481fa04cf7164 100644 --- a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll +++ b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll @@ -26,20 +26,6 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], 
%[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP7]], 1.000000e+02 -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]]) -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP7]], 1.000000e+00 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 819cfaadeecbf..9f82795e1f71c 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -273,19 +273,8 @@ define void @pr43371() optsize { ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY29:.*]] ; CHECK: [[FOR_COND_CLEANUP28]]: ; CHECK-NEXT: unreachable -; CHECK: [[FOR_BODY29]]: -; CHECK-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; CHECK-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] -; CHECK-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] -; CHECK-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 -; CHECK-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 -; CHECK-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 -; CHECK-NEXT: br i1 [[CMP26]], label %[[FOR_BODY29]], label %[[FOR_COND_CLEANUP28]] ; ; PGSO-LABEL: define void @pr43371( ; PGSO-SAME: ) #[[ATTR0]] { @@ -310,19 +299,8 @@ define void @pr43371() optsize { ; PGSO-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] -; PGSO: [[SCALAR_PH:.*]]: -; PGSO-NEXT: br label %[[FOR_BODY29:.*]] ; PGSO: [[FOR_COND_CLEANUP28]]: ; PGSO-NEXT: unreachable -; PGSO: [[FOR_BODY29]]: -; PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; PGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] -; PGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 -; PGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] -; PGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 -; PGSO-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 -; PGSO-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 -; PGSO-NEXT: br i1 [[CMP26]], label %[[FOR_BODY29]], label %[[FOR_COND_CLEANUP28]] ; ; NPGSO-LABEL: define void @pr43371( ; NPGSO-SAME: ) #[[ATTR0]] { @@ -347,19 +325,8 @@ define void @pr43371() optsize { ; NPGSO-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] -; NPGSO: [[SCALAR_PH:.*]]: -; NPGSO-NEXT: br label %[[FOR_BODY29:.*]] ; NPGSO: [[FOR_COND_CLEANUP28]]: ; NPGSO-NEXT: unreachable -; NPGSO: [[FOR_BODY29]]: -; NPGSO-NEXT: 
[[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] -; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 -; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] -; NPGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 -; NPGSO-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 -; NPGSO-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 -; NPGSO-NEXT: br i1 [[CMP26]], label %[[FOR_BODY29]], label %[[FOR_COND_CLEANUP28]] ; ; We do not want to generate SCEV predicates when optimising for size, because ; that will lead to extra code generation such as the SCEV overflow runtime @@ -407,19 +374,8 @@ define void @pr43371_pgso() !prof !14 { ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY29:.*]] ; CHECK: [[FOR_COND_CLEANUP28]]: ; CHECK-NEXT: unreachable -; CHECK: [[FOR_BODY29]]: -; CHECK-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; CHECK-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] -; CHECK-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] -; CHECK-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 -; CHECK-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 -; CHECK-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 -; CHECK-NEXT: br i1 [[CMP26]], label %[[FOR_BODY29]], label %[[FOR_COND_CLEANUP28]] ; ; PGSO-LABEL: define void @pr43371_pgso( ; PGSO-SAME: ) !prof [[PROF14]] { @@ -444,19 +400,8 @@ define void @pr43371_pgso() !prof !14 { ; PGSO-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] -; PGSO: [[SCALAR_PH:.*]]: -; PGSO-NEXT: br label %[[FOR_BODY29:.*]] ; PGSO: [[FOR_COND_CLEANUP28]]: ; PGSO-NEXT: unreachable -; PGSO: [[FOR_BODY29]]: -; PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; PGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] -; PGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 -; PGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] -; PGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 -; PGSO-NEXT: [[INC37]] = add i16 [[I24_0170]], 1 -; PGSO-NEXT: [[CMP26:%.*]] = icmp ult i16 [[INC37]], 756 -; PGSO-NEXT: br i1 [[CMP26]], label %[[FOR_BODY29]], label %[[FOR_COND_CLEANUP28]] ; ; NPGSO-LABEL: define void @pr43371_pgso( ; NPGSO-SAME: ) !prof [[PROF14]] { @@ -686,16 +631,6 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]] -; CHECK-NEXT: [[GEPOFB:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[MULB]] -; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = 
icmp eq i32 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; @@ -734,16 +669,6 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[FOR_END:.*]] -; PGSO: [[SCALAR_PH:.*]]: -; PGSO-NEXT: br label %[[FOR_BODY:.*]] -; PGSO: [[FOR_BODY]]: -; PGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; PGSO-NEXT: [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]] -; PGSO-NEXT: [[GEPOFB:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[MULB]] -; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4 -; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; PGSO: [[FOR_END]]: ; PGSO-NEXT: ret void ; @@ -782,16 +707,6 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br label %[[FOR_END:.*]] -; NPGSO: [[SCALAR_PH:.*]]: -; NPGSO-NEXT: br label %[[FOR_BODY:.*]] -; NPGSO: [[FOR_BODY]]: -; NPGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; NPGSO-NEXT: [[MULB:%.*]] = mul nsw i32 [[IV]], [[BSTRIDE]] -; NPGSO-NEXT: [[GEPOFB:%.*]] = getelementptr inbounds i16, ptr [[B]], i32 [[MULB]] -; NPGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4 -; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; NPGSO: [[FOR_END]]: ; NPGSO-NEXT: ret void ; @@ -830,7 +745,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; CHECK-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -843,7 +758,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; CHECK-NEXT: store i16 42, ptr [[GEPOFB]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; @@ -862,7 +777,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; PGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2 ; PGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; PGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], 
label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[SCALAR_PH]] ; PGSO: [[SCALAR_PH]]: @@ -875,7 +790,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; PGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4 ; PGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; PGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; PGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; PGSO: [[FOR_END]]: ; PGSO-NEXT: ret void ; @@ -894,7 +809,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; NPGSO-NEXT: store <2 x i16> splat (i16 42), ptr [[TMP1]], align 4 ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP0]], 2 ; NPGSO-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; NPGSO-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br label %[[SCALAR_PH]] ; NPGSO: [[SCALAR_PH]]: @@ -907,7 +822,7 @@ define void @stride1_pgso(ptr noalias %B, i32 %BStride) !prof !14 { ; NPGSO-NEXT: store i16 42, ptr [[GEPOFB]], align 4 ; NPGSO-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; NPGSO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 1025 -; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; NPGSO-NEXT: br i1 [[EXITCOND]], label %[[FOR_END:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; NPGSO: [[FOR_END]]: ; NPGSO-NEXT: ret void ; @@ -1092,10 +1007,8 @@ exit: ; CHECK: [[META17]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]} ; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]} -; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META21:![0-9]+]]} -; CHECK: [[META21]] = !{!"llvm.loop.vectorize.enable", i1 true} -; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]} -; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]} ;. ; PGSO: [[PROF14]] = !{!"function_entry_count", i64 0} ; PGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]} @@ -1103,10 +1016,8 @@ exit: ; PGSO: [[META17]] = !{!"llvm.loop.unroll.runtime.disable"} ; PGSO: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]], [[META17]]} ; PGSO: [[LOOP19]] = distinct !{[[LOOP19]], [[META16]], [[META17]]} -; PGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META21:![0-9]+]]} -; PGSO: [[META21]] = !{!"llvm.loop.vectorize.enable", i1 true} -; PGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]} -; PGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]} +; PGSO: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]], [[META17]]} +; PGSO: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]} ;. 
; NPGSO: [[PROF14]] = !{!"function_entry_count", i64 0}
; NPGSO: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
@@ -1119,8 +1030,6 @@ exit:
; NPGSO: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]], [[META17]]}
; NPGSO: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META16]]}
; NPGSO: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]], [[META17]]}
-; NPGSO: [[LOOP25]] = distinct !{[[LOOP25]], [[META26:![0-9]+]]}
-; NPGSO: [[META26]] = !{!"llvm.loop.vectorize.enable", i1 true}
-; NPGSO: [[LOOP27]] = distinct !{[[LOOP27]], [[META16]], [[META17]]}
-; NPGSO: [[LOOP28]] = distinct !{[[LOOP28]], [[META16]]}
+; NPGSO: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]], [[META17]]}
+; NPGSO: [[LOOP26]] = distinct !{[[LOOP26]], [[META16]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/phi-cost.ll b/llvm/test/Transforms/LoopVectorize/phi-cost.ll
index bf5631c783fe9..7b5d0b69639fa 100644
--- a/llvm/test/Transforms/LoopVectorize/phi-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/phi-cost.ll
@@ -185,13 +185,9 @@ define i32 @red_phi_0(i32 %start, ptr %src) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH1:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: [[SCALAR_PH1]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH1:.*:]]
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: br i1 poison, label %[[EXIT]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[START]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP0]])
diff --git a/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll b/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll
index a25632562009c..f2d6834c91d53 100644
--- a/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll
@@ -29,22 +29,6 @@ define void @pr154045(ptr %p, i1 %c, i64 %x) {
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
-; CHECK-NEXT: br i1 [[C]], label %[[LATCH]], label %[[ELSE:.*]]
-; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[REM:%.*]] = srem i64 0, [[X]]
-; CHECK-NEXT: br label %[[LATCH]]
-; CHECK: [[LATCH]]:
-; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[REM]], %[[ELSE]] ], [ 0, %[[LOOP]] ]
-; CHECK-NEXT: [[PHI_TRUNC:%.*]] = trunc i64 [[PHI]] to i32
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[PHI_TRUNC]], 0
-; CHECK-NEXT: store i32 [[SHL]], ptr [[P]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll
index a29a6bd735feb..2d30e0c9ad10f 100644
--- a/llvm/test/Transforms/LoopVectorize/pr32859.ll
+++ 
b/llvm/test/Transforms/LoopVectorize/pr32859.ll @@ -10,7 +10,7 @@ ; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] ; Function Attrs: nounwind uwtable -define void @main() #0 { +define void @main(i32 %n) #0 { entry: br label %for.cond1.preheader.i @@ -21,7 +21,7 @@ for.cond1.preheader.i: ; preds = %if.end.2.i, %entry if.end.2.i: ; preds = %for.cond1.preheader.i %inc5.i = add nsw i32 %c.06.i, 1 - %cmp.i = icmp slt i32 %inc5.i, 16 + %cmp.i = icmp slt i32 %inc5.i, %n br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader for.cond.preheader: ; preds = %if.end.2.i diff --git a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll index b0e2ae6524491..98963a72c5ad0 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll @@ -20,18 +20,8 @@ define i16 @duplicate_lcssa(i16 %val) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RES:%.*]] = phi i16 [ [[VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = sub nsw i16 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i16 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[LCSSA_1:%.*]] = phi i16 [ [[RES]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI1]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[LCSSA_2:%.*]] = phi i16 [ [[RES]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI1]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i16 [[LCSSA_2]] +; CHECK-NEXT: ret i16 [[VECTOR_RECUR_EXTRACT_FOR_PHI1]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll index d1b912d47a0ce..a1cb361d20bee 100644 --- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll @@ -43,26 +43,7 @@ define i16 @test_true_and_false_branch_equal() { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_07:%.*]] = phi i16 [ 99, [[SCALAR_PH:%.*]] ], [ [[INC7:%.*]], [[FOR_LATCH:%.*]] ] -; CHECK-NEXT: [[LV:%.*]] = load i16, ptr @v_38, align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i16 [[LV]], 32767 -; CHECK-NEXT: br i1 [[CMP1]], label [[COND_END:%.*]], label [[COND_END]] -; CHECK: cond.end: -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i16 [[LV]], 0 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_LATCH]], label [[COND_FALSE4:%.*]] -; CHECK: cond.false4: -; CHECK-NEXT: [[REM:%.*]] = srem i16 5786, [[LV]] -; CHECK-NEXT: br label [[FOR_LATCH]] -; CHECK: for.latch: -; CHECK-NEXT: [[COND6:%.*]] = phi i16 [ [[REM]], [[COND_FALSE4]] ], [ 5786, [[COND_END]] ] -; CHECK-NEXT: store i16 [[COND6]], ptr @v_39, align 1 -; CHECK-NEXT: [[INC7]] = add nsw i16 [[I_07]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[INC7]], 111 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]] +; CHECK-NEXT: br label 
[[FOR_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: [[RV:%.*]] = load i16, ptr @v_39, align 1 ; CHECK-NEXT: ret i16 [[RV]] diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll index 8450db69ecb68..9ed35fb0a79e8 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -57,16 +57,7 @@ define void @pr45679(ptr %A) { ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]] -; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -118,16 +109,7 @@ define void @pr45679(ptr %A) { ; VF2UF2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; VF2UF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF2UF2: middle.block: -; VF2UF2-NEXT: br label [[EXIT:%.*]] -; VF2UF2: scalar.ph: ; VF2UF2-NEXT: br label [[LOOP:%.*]] -; VF2UF2: loop: -; VF2UF2-NEXT: [[RIV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] -; VF2UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]] -; VF2UF2-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 -; VF2UF2-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 -; VF2UF2-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14 -; VF2UF2-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]] ; VF2UF2: exit: ; VF2UF2-NEXT: ret void ; @@ -174,16 +156,7 @@ define void @pr45679(ptr %A) { ; VF1UF4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; VF1UF4-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF1UF4: middle.block: -; VF1UF4-NEXT: br label [[EXIT:%.*]] -; VF1UF4: scalar.ph: ; VF1UF4-NEXT: br label [[LOOP:%.*]] -; VF1UF4: loop: -; VF1UF4-NEXT: [[RIV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] -; VF1UF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]] -; VF1UF4-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 -; VF1UF4-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 -; VF1UF4-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14 -; VF1UF4-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]] ; VF1UF4: exit: ; VF1UF4-NEXT: ret void ; @@ -253,17 +226,7 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; 
CHECK-NEXT: store i64 [[V]], ptr [[B]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 14 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -319,17 +282,7 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; VF2UF2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; VF2UF2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF2UF2: middle.block: -; VF2UF2-NEXT: br label [[FOR_END:%.*]] -; VF2UF2: scalar.ph: ; VF2UF2-NEXT: br label [[FOR_BODY:%.*]] -; VF2UF2: for.body: -; VF2UF2-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; VF2UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; VF2UF2-NEXT: [[V:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; VF2UF2-NEXT: store i64 [[V]], ptr [[B]], align 8 -; VF2UF2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; VF2UF2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 14 -; VF2UF2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; VF2UF2: for.end: ; VF2UF2-NEXT: ret void ; @@ -380,17 +333,7 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; VF1UF4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; VF1UF4-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF1UF4: middle.block: -; VF1UF4-NEXT: br label [[FOR_END:%.*]] -; VF1UF4: scalar.ph: ; VF1UF4-NEXT: br label [[FOR_BODY:%.*]] -; VF1UF4: for.body: -; VF1UF4-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; VF1UF4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; VF1UF4-NEXT: [[V:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; VF1UF4-NEXT: store i64 [[V]], ptr [[B]], align 8 -; VF1UF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; VF1UF4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 14 -; VF1UF4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; VF1UF4: for.end: ; VF1UF4-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll index 673d582b2b177..01c6c3f23b5a4 100644 --- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll +++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll @@ -31,23 +31,13 @@ define void @test(i16 %x, i64 %y, ptr %ptr) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: store i32 0, ptr [[PTR]], align 4 -; CHECK-NEXT: [[V2:%.*]] = trunc i64 [[IV]] to i8 -; CHECK-NEXT: [[V3:%.*]] = add i8 [[V2]], 1 -; CHECK-NEXT: [[CMP15:%.*]] = icmp slt i8 [[V3]], 5 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[INC]] -; CHECK-NEXT: br i1 [[CMP15]], label [[LOOP]], label [[LOOP_EXIT]] ; CHECK: loop.exit: ; CHECK-NEXT: [[DIV_1:%.*]] = udiv i64 [[Y]], [[ADD]] ; CHECK-NEXT: [[V1:%.*]] = add i64 [[DIV_1]], 1 ; CHECK-NEXT: br label [[LOOP_2:%.*]] ; CHECK: loop.2: -; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[IV_NEXT_1:%.*]], 
[[LOOP_2]] ], [ 0, [[LOOP_EXIT]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[IV_NEXT_1:%.*]], [[LOOP_2]] ], [ 0, [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT_1]] = add i64 [[IV_1]], [[V1]] ; CHECK-NEXT: call void @use(i64 [[IV_NEXT_1]]) ; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV_NEXT_1]], 200 diff --git a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll index 75437fe01589b..615ea062afd53 100644 --- a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll @@ -61,24 +61,9 @@ define dso_local i16 @reverse_interleave_load_fold_mask() optsize { ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP26]]) -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 41, [[SCALAR_PH:%.*]] ], [ [[IVMINUS1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[PREVSUM:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IVMINUS1]] = add nsw i16 [[IV]], -1 -; CHECK-NEXT: [[GEPA0:%.*]] = getelementptr inbounds [40 x [4 x i16]], ptr @A, i16 0, i16 [[IVMINUS1]], i16 0 -; CHECK-NEXT: [[TMP29:%.*]] = load i16, ptr [[GEPA0]], align 1 -; CHECK-NEXT: [[GEPA3:%.*]] = getelementptr inbounds [40 x [4 x i16]], ptr @A, i16 0, i16 [[IVMINUS1]], i16 3 -; CHECK-NEXT: [[TMP30:%.*]] = load i16, ptr [[GEPA3]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i16 [[TMP29]], [[TMP30]] -; CHECK-NEXT: [[PREVSUM]] = add nsw i16 [[SUM]], [[ADD]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i16 [[IV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[PREVSUM_LCSSA:%.*]] = phi i16 [ [[PREVSUM]], [[LOOP]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i16 [[PREVSUM_LCSSA]] +; CHECK-NEXT: ret i16 [[TMP28]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index 637b4abf7b14f..7b3500933314a 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -33,31 +33,9 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK: middle.block: ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PREDPHI7]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 6, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[V_2:%.*]] = phi i32 [ 35902, [[SCALAR_PH]] ], [ [[P_2:%.*]], [[LOOP_LATCH]] ] -; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_LATCH]], label [[BODY_1:%.*]] -; CHECK: body.1: -; CHECK-NEXT: [[V_2_ADD:%.*]] = add i32 [[V_2]], 10 -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_LATCH]], label [[BODY_2:%.*]] -; CHECK: body.2: -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[V_2_ADD]], 20 -; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[A]], 1 -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[XOR]] -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[IV]], [[LOOP_HEADER]] ], [ 9, [[BODY_1]] ], [ 9, [[BODY_2]] ] -; 
CHECK-NEXT: [[P_2]] = phi i32 [ [[V_2]], [[LOOP_HEADER]] ], [ [[V_2_ADD]], [[BODY_1]] ], [ [[ADD_2]], [[BODY_2]] ] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[IV]], 181 -; CHECK-NEXT: br i1 [[EC]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: -; CHECK-NEXT: [[E_1:%.*]] = phi i32 [ [[P_1]], [[LOOP_LATCH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[E_2:%.*]] = phi i32 [ [[P_2]], [[LOOP_LATCH]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_1]], [[E_2]] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[TMP9]], [[TMP10]] ; CHECK-NEXT: ret i32 [[RES]] ; bb: diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll index d5b2519109385..1ef614ab32472 100644 --- a/llvm/test/Transforms/LoopVectorize/pr66616.ll +++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll @@ -18,41 +18,32 @@ define void @pr66616(ptr %ptr) { ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[PREHEADER:%.*]] -; CHECK: scalar.ph: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0]], [[VECTOR_BODY]] ] ; CHECK-NEXT: br label [[LOOP_1:%.*]] -; CHECK: loop.1: -; CHECK-NEXT: [[IV_1:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[INC:%.*]], [[LOOP_1]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[LOAD]], 1 -; CHECK-NEXT: [[INC]] = add i8 [[IV_1]], 1 -; CHECK-NEXT: [[COND1:%.*]] = icmp eq i8 [[INC]], 0 -; CHECK-NEXT: br i1 [[COND1]], label [[PREHEADER]], label [[LOOP_1]] ; CHECK: preheader: -; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi i32 [ [[ADD3]], [[LOOP_1]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[TMP4:%.*]] = sub i32 0, [[ADD3_LCSSA]] +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 -1, [[DOTLCSSA]] ; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH1:%.*]], label [[VECTOR_PH2:%.*]] -; CHECK: vector.ph2: +; CHECK: vector.ph1: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[ADD3_LCSSA]], [[DOTCAST]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP3]], [[DOTCAST]] ; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY3:%.*]] -; CHECK: vector.body3: +; CHECK: vector.body2: ; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ 0, [[VECTOR_PH2]] ], [ [[INDEX_NEXT9:%.*]], [[VECTOR_BODY3]] ] ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX8]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK6:%.*]], label [[VECTOR_BODY3]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: middle.block6: +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY3]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: middle.block5: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH1]] -; CHECK: scalar.ph1: -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK6]] ], [ [[ADD3_LCSSA]], [[PREHEADER]] ] -; 
CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK6]] ], [ [[PTR]], [[PREHEADER]] ] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK5]] ], [ [[TMP3]], [[LOOP_1]] ] +; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[IND_END5]], [[MIDDLE_BLOCK5]] ], [ [[PTR]], [[LOOP_1]] ] ; CHECK-NEXT: br label [[LOOP_2:%.*]] ; CHECK: loop.2: ; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_2_I:%.*]], [[LOOP_2]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH1]] ] diff --git a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll index 70428f0c07cac..565e203e68f72 100644 --- a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll @@ -425,20 +425,6 @@ define void @switch_all_to_default(ptr %start) { ; IC1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IC1: [[MIDDLE_BLOCK]]: ; IC1-NEXT: br label %[[EXIT:.*]] -; IC1: [[SCALAR_PH:.*]]: -; IC1-NEXT: br label %[[LOOP_HEADER:.*]] -; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IC1-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH]] [ -; IC1-NEXT: i64 120, label %[[LOOP_LATCH]] -; IC1-NEXT: i64 100, label %[[LOOP_LATCH]] -; IC1-NEXT: ] -; IC1: [[LOOP_LATCH]]: -; IC1-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]] -; IC1-NEXT: store i64 42, ptr [[GEP]], align 1 -; IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; @@ -459,20 +445,6 @@ define void @switch_all_to_default(ptr %start) { ; IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IC2: [[MIDDLE_BLOCK]]: ; IC2-NEXT: br label %[[EXIT:.*]] -; IC2: [[SCALAR_PH:.*]]: -; IC2-NEXT: br label %[[LOOP_HEADER:.*]] -; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IC2-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH]] [ -; IC2-NEXT: i64 120, label %[[LOOP_LATCH]] -; IC2-NEXT: i64 100, label %[[LOOP_LATCH]] -; IC2-NEXT: ] -; IC2: [[LOOP_LATCH]]: -; IC2-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]] -; IC2-NEXT: store i64 42, ptr [[GEP]], align 1 -; IC2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -513,21 +485,6 @@ define void @switch_unconditional(ptr %start) { ; IC1-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IC1: [[MIDDLE_BLOCK]]: ; IC1-NEXT: br label %[[EXIT:.*]] -; IC1: [[SCALAR_PH:.*]]: -; IC1-NEXT: br label %[[LOOP:.*]] -; IC1: [[LOOP]]: -; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] -; IC1-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[START]], i64 [[IV]] -; IC1-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4 -; IC1-NEXT: switch i32 [[X]], label %[[FOO:.*]] [ -; IC1-NEXT: ] -; IC1: [[FOO]]: -; IC1-NEXT: br label %[[LATCH]] -; IC1: [[LATCH]]: -; IC1-NEXT: store i32 0, ptr [[GEP]], align 4 -; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC1-NEXT: br i1 [[CMP]], label 
%[[EXIT]], label %[[LOOP]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; @@ -548,21 +505,6 @@ define void @switch_unconditional(ptr %start) { ; IC2-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IC2: [[MIDDLE_BLOCK]]: ; IC2-NEXT: br label %[[EXIT:.*]] -; IC2: [[SCALAR_PH:.*]]: -; IC2-NEXT: br label %[[LOOP:.*]] -; IC2: [[LOOP]]: -; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] -; IC2-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[START]], i64 [[IV]] -; IC2-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4 -; IC2-NEXT: switch i32 [[X]], label %[[FOO:.*]] [ -; IC2-NEXT: ] -; IC2: [[FOO]]: -; IC2-NEXT: br label %[[LATCH]] -; IC2: [[LATCH]]: -; IC2-NEXT: store i32 0, ptr [[GEP]], align 4 -; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; IC2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll index dfdaaf14114cc..52555d550f3d9 100644 --- a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll @@ -58,26 +58,6 @@ define void @loop_invariant_store(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[ADD]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i32 [[IV]], 2 -; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], 48 -; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[SHL]], 52 -; CHECK-NEXT: [[TRUNC_I32:%.*]] = trunc i64 [[ASHR]] to i32 -; CHECK-NEXT: br i1 [[CMP_SLT]], label %[[COND_FALSE:.*]], label %[[LOOP_LATCH]] -; CHECK: [[COND_FALSE]]: -; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B]] to i32 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TRUNC_I32]], %[[LOOP_HEADER]] ], [ [[ZEXT]], %[[COND_FALSE]] ] -; CHECK-NEXT: [[SHL_I32:%.*]] = shl i32 [[COND]], 8 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8 -; CHECK-NEXT: store i8 [[TRUNC]], ptr [[P]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV]], 8 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -174,28 +154,6 @@ define void @loop_invariant_srem(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 -; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i8 [[IV]], 2 -; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], 48 -; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[SHL]], 52 -; CHECK-NEXT: [[TRUNC_I32:%.*]] = trunc i64 [[ASHR]] to i32 -; CHECK-NEXT: br i1 [[CMP_SLT]], label %[[COND_FALSE:.*]], label %[[LOOP_LATCH]] -; CHECK: [[COND_FALSE]]: -; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 
[[B]] to i32 -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TRUNC_I32]], %[[LOOP_HEADER]] ], [ [[ZEXT]], %[[COND_FALSE]] ] -; CHECK-NEXT: [[SHL_I32:%.*]] = shl i32 [[COND]], 8 -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8 -; CHECK-NEXT: [[REM:%.*]] = srem i8 [[IV]], [[TRUNC]] -; CHECK-NEXT: [[GEP_P_REM:%.*]] = getelementptr i32, ptr [[P]], i8 [[REM]] -; CHECK-NEXT: store i32 4, ptr [[GEP_P_REM]], align 4 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV]], 8 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -245,19 +203,6 @@ define void @loop_invariant_float_store(ptr %p, i32 %a) { ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i32 [[IV]], 2 -; CHECK-NEXT: br i1 [[CMP_SLT]], label %[[COND_FALSE:.*]], label %[[LOOP_LATCH]] -; CHECK: [[COND_FALSE]]: -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: store float [[TMP10]], ptr [[P]], align 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[IV]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP_HEADER]], label %[[EXIT]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -315,19 +260,6 @@ define void @test_store_to_invariant_address_needs_mask_due_to_low_trip_count(pt ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] -; CHECK: [[ELSE]]: -; CHECK-NEXT: br label %[[LOOP_LATCH]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 1, %[[LOOP_HEADER]] ], [ 0, %[[ELSE]] ] -; CHECK-NEXT: store i32 [[MERGE]], ptr [[DST]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 3 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll index 14526afc46088..6542c42678cc5 100644 --- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll @@ -27,17 +27,6 @@ define void @_Z3fooPf(ptr %a) { ; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[P:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[P]], 2.000000e+00 -; CHECK-NEXT: store 
float [[MUL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; @@ -58,25 +47,8 @@ define void @_Z3fooPf(ptr %a) { ; DEBUGLOC-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG24]], !llvm.loop [[LOOP25:![0-9]+]] ; DEBUGLOC: [[MIDDLE_BLOCK]]: ; DEBUGLOC-NEXT: br label %[[FOR_END:.*]], !dbg [[DBG24]] -; DEBUGLOC: [[SCALAR_PH:.*]]: -; DEBUGLOC-NEXT: br label %[[FOR_BODY:.*]], !dbg [[DBG18]] -; DEBUGLOC: [[FOR_BODY]]: -; DEBUGLOC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], !dbg [[DBG19]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[INDVARS_IV]], [[META9:![0-9]+]], !DIExpression(), [[DBG19]]) -; DEBUGLOC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]], !dbg [[DBG20]] -; DEBUGLOC-NEXT: #dbg_value(ptr [[ARRAYIDX]], [[META11:![0-9]+]], !DIExpression(), [[DBG20]]) -; DEBUGLOC-NEXT: [[P:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !dbg [[DBG21]] -; DEBUGLOC-NEXT: #dbg_value(float [[P]], [[META12:![0-9]+]], !DIExpression(), [[DBG21]]) -; DEBUGLOC-NEXT: [[MUL:%.*]] = fmul float [[P]], 2.000000e+00, !dbg [[DBG22]] -; DEBUGLOC-NEXT: #dbg_value(float [[MUL]], [[META14:![0-9]+]], !DIExpression(), [[DBG22]]) -; DEBUGLOC-NEXT: store float [[MUL]], ptr [[ARRAYIDX]], align 4, !dbg [[DBG23]] -; DEBUGLOC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG28:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[INDVARS_IV_NEXT]], [[META15:![0-9]+]], !DIExpression(), [[DBG28]]) -; DEBUGLOC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024, !dbg [[DBG29:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i1 [[EXITCOND]], [[META16:![0-9]+]], !DIExpression(), [[DBG29]]) -; DEBUGLOC-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]], !dbg [[DBG24]], !llvm.loop [[LOOP30:![0-9]+]] ; DEBUGLOC: [[FOR_END]]: -; DEBUGLOC-NEXT: ret void, !dbg [[DBG32:![0-9]+]] +; DEBUGLOC-NEXT: ret void, !dbg [[DBG28:![0-9]+]] ; entry: br label %for.body @@ -122,7 +94,7 @@ define void @widen_ptr_induction_dbg(ptr %start, ptr %end) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 32 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -134,54 +106,54 @@ define void @widen_ptr_induction_dbg(ptr %start, ptr %end) { ; CHECK-NEXT: [[IV_NEXT]] = getelementptr inbounds ptr, ptr [[IV]], i64 1 ; CHECK-NEXT: store ptr [[IV]], ptr [[IV]], align 1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]] -; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; ; DEBUGLOC-LABEL: define void @widen_ptr_induction_dbg( -; DEBUGLOC-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) !dbg 
[[DBG33:![0-9]+]] {
+; DEBUGLOC-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) !dbg [[DBG29:![0-9]+]] {
; DEBUGLOC-NEXT: [[ENTRY:.*]]:
-; DEBUGLOC-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64, !dbg [[DBG38:![0-9]+]]
-; DEBUGLOC-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64, !dbg [[DBG38]]
-; DEBUGLOC-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8, !dbg [[DBG38]]
-; DEBUGLOC-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]], !dbg [[DBG38]]
-; DEBUGLOC-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3, !dbg [[DBG38]]
-; DEBUGLOC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1, !dbg [[DBG38]]
-; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4, !dbg [[DBG38]]
-; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !dbg [[DBG38]]
+; DEBUGLOC-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64, !dbg [[DBG34:![0-9]+]]
+; DEBUGLOC-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64, !dbg [[DBG34]]
+; DEBUGLOC-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8, !dbg [[DBG34]]
+; DEBUGLOC-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]], !dbg [[DBG34]]
+; DEBUGLOC-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3, !dbg [[DBG34]]
+; DEBUGLOC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1, !dbg [[DBG34]]
+; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4, !dbg [[DBG34]]
+; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !dbg [[DBG34]]
; DEBUGLOC: [[VECTOR_PH]]:
; DEBUGLOC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
; DEBUGLOC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; DEBUGLOC-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
; DEBUGLOC-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
-; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG38]]
+; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG34]]
; DEBUGLOC: [[VECTOR_BODY]]:
; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; DEBUGLOC-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG39:![0-9]+]]
-; DEBUGLOC-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 8, i64 16, i64 24>, !dbg [[DBG39]]
-; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0, !dbg [[DBG40:![0-9]+]]
-; DEBUGLOC-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 1, !dbg [[DBG40]]
+; DEBUGLOC-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG35:![0-9]+]]
+; DEBUGLOC-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 8, i64 16, i64 24>, !dbg [[DBG35]]
+; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 0, !dbg [[DBG36:![0-9]+]]
+; DEBUGLOC-NEXT: store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 1, !dbg [[DBG36]]
; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; DEBUGLOC-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 32, !dbg [[DBG39]]
-; DEBUGLOC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG41:![0-9]+]]
-; DEBUGLOC-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG41]], !llvm.loop [[LOOP42:![0-9]+]]
+; DEBUGLOC-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 32, !dbg [[DBG35]]
+; DEBUGLOC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG37:![0-9]+]]
+; DEBUGLOC-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG37]], !llvm.loop 
[[LOOP38:![0-9]+]] ; DEBUGLOC: [[MIDDLE_BLOCK]]: -; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]], !dbg [[DBG41]] -; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG41]] +; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]], !dbg [[DBG37]] +; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG37]] ; DEBUGLOC: [[SCALAR_PH]]: -; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ], !dbg [[DBG39]] -; DEBUGLOC-NEXT: br label %[[LOOP:.*]], !dbg [[DBG38]] +; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ], !dbg [[DBG35]] +; DEBUGLOC-NEXT: br label %[[LOOP:.*]], !dbg [[DBG34]] ; DEBUGLOC: [[LOOP]]: -; DEBUGLOC-NEXT: [[IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ], !dbg [[DBG39]] -; DEBUGLOC-NEXT: #dbg_value(ptr [[IV]], [[META35:![0-9]+]], !DIExpression(), [[DBG39]]) -; DEBUGLOC-NEXT: [[IV_NEXT]] = getelementptr inbounds ptr, ptr [[IV]], i64 1, !dbg [[DBG43:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(ptr [[IV_NEXT]], [[META36:![0-9]+]], !DIExpression(), [[DBG43]]) -; DEBUGLOC-NEXT: store ptr [[IV]], ptr [[IV]], align 1, !dbg [[DBG40]] -; DEBUGLOC-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]], !dbg [[DBG44:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i1 [[CMP_NOT]], [[META37:![0-9]+]], !DIExpression(), [[DBG44]]) -; DEBUGLOC-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG41]], !llvm.loop [[LOOP45:![0-9]+]] +; DEBUGLOC-NEXT: [[IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ], !dbg [[DBG35]] +; DEBUGLOC-NEXT: #dbg_value(ptr [[IV]], [[META31:![0-9]+]], !DIExpression(), [[DBG35]]) +; DEBUGLOC-NEXT: [[IV_NEXT]] = getelementptr inbounds ptr, ptr [[IV]], i64 1, !dbg [[DBG39:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(ptr [[IV_NEXT]], [[META32:![0-9]+]], !DIExpression(), [[DBG39]]) +; DEBUGLOC-NEXT: store ptr [[IV]], ptr [[IV]], align 1, !dbg [[DBG36]] +; DEBUGLOC-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[IV_NEXT]], [[END]], !dbg [[DBG40:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(i1 [[CMP_NOT]], [[META33:![0-9]+]], !DIExpression(), [[DBG40]]) +; DEBUGLOC-NEXT: br i1 [[CMP_NOT]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG37]], !llvm.loop [[LOOP41:![0-9]+]] ; DEBUGLOC: [[EXIT]]: -; DEBUGLOC-NEXT: ret void, !dbg [[DBG46:![0-9]+]] +; DEBUGLOC-NEXT: ret void, !dbg [[DBG42:![0-9]+]] ; entry: br label %loop @@ -254,7 +226,7 @@ define void @predicated_phi_dbg(i64 %n, ptr %x) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -274,96 +246,96 @@ define void @predicated_phi_dbg(i64 %n, ptr %x) { ; CHECK-NEXT: store i64 [[D]], ptr [[IDX]], align 8 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], 
label %[[FOR_BODY]], label %[[FOR_END]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret void ; ; DEBUGLOC-LABEL: define void @predicated_phi_dbg( -; DEBUGLOC-SAME: i64 [[N:%.*]], ptr [[X:%.*]]) !dbg [[DBG47:![0-9]+]] { +; DEBUGLOC-SAME: i64 [[N:%.*]], ptr [[X:%.*]]) !dbg [[DBG43:![0-9]+]] { ; DEBUGLOC-NEXT: [[ENTRY:.*]]: -; DEBUGLOC-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1), !dbg [[DBG56:![0-9]+]] -; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4, !dbg [[DBG56]] -; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !dbg [[DBG56]] +; DEBUGLOC-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1), !dbg [[DBG52:![0-9]+]] +; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4, !dbg [[DBG52]] +; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !dbg [[DBG52]] ; DEBUGLOC: [[VECTOR_PH]]: ; DEBUGLOC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 4 ; DEBUGLOC-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] -; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG56]] +; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG52]] ; DEBUGLOC: [[VECTOR_BODY]]: -; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE6:.*]] ], !dbg [[DBG57:![0-9]+]] -; DEBUGLOC-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_UDIV_CONTINUE6]] ], !dbg [[DBG57]] -; DEBUGLOC-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 5), !dbg [[DBG58:![0-9]+]] -; DEBUGLOC-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0, !dbg [[DBG58]] -; DEBUGLOC-NEXT: br i1 [[TMP1]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE6:.*]] ], !dbg [[DBG53:![0-9]+]] +; DEBUGLOC-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_UDIV_CONTINUE6]] ], !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], splat (i64 5), !dbg [[DBG54:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0, !dbg [[DBG54]] +; DEBUGLOC-NEXT: br i1 [[TMP1]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_IF]]: -; DEBUGLOC-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0, !dbg [[DBG57]] -; DEBUGLOC-NEXT: [[TMP3:%.*]] = udiv i64 [[N]], [[TMP2]], !dbg [[DBG59:![0-9]+]] -; DEBUGLOC-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i32 0, !dbg [[DBG59]] -; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0, !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[TMP3:%.*]] = udiv i64 [[N]], [[TMP2]], !dbg [[DBG55:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i32 0, !dbg [[DBG55]] +; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_CONTINUE]]: -; DEBUGLOC-NEXT: [[TMP5:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP4]], %[[PRED_UDIV_IF]] ], !dbg [[DBG59]] -; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1, !dbg [[DBG58]] -; DEBUGLOC-NEXT: br i1 [[TMP6]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2:.*]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[TMP5:%.*]] = phi <4 x i64> [ poison, %[[VECTOR_BODY]] ], [ [[TMP4]], %[[PRED_UDIV_IF]] ], !dbg 
[[DBG55]] +; DEBUGLOC-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1, !dbg [[DBG54]] +; DEBUGLOC-NEXT: br i1 [[TMP6]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2:.*]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_IF1]]: -; DEBUGLOC-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1, !dbg [[DBG57]] -; DEBUGLOC-NEXT: [[TMP8:%.*]] = udiv i64 [[N]], [[TMP7]], !dbg [[DBG59]] -; DEBUGLOC-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP8]], i32 1, !dbg [[DBG59]] -; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE2]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 1, !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[TMP8:%.*]] = udiv i64 [[N]], [[TMP7]], !dbg [[DBG55]] +; DEBUGLOC-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> [[TMP5]], i64 [[TMP8]], i32 1, !dbg [[DBG55]] +; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE2]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_CONTINUE2]]: -; DEBUGLOC-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP5]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], %[[PRED_UDIV_IF1]] ], !dbg [[DBG59]] -; DEBUGLOC-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2, !dbg [[DBG58]] -; DEBUGLOC-NEXT: br i1 [[TMP11]], label %[[PRED_UDIV_IF3:.*]], label %[[PRED_UDIV_CONTINUE4:.*]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP5]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP9]], %[[PRED_UDIV_IF1]] ], !dbg [[DBG55]] +; DEBUGLOC-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2, !dbg [[DBG54]] +; DEBUGLOC-NEXT: br i1 [[TMP11]], label %[[PRED_UDIV_IF3:.*]], label %[[PRED_UDIV_CONTINUE4:.*]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_IF3]]: -; DEBUGLOC-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 2, !dbg [[DBG57]] -; DEBUGLOC-NEXT: [[TMP13:%.*]] = udiv i64 [[N]], [[TMP12]], !dbg [[DBG59]] -; DEBUGLOC-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP13]], i32 2, !dbg [[DBG59]] -; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE4]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 2, !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[TMP13:%.*]] = udiv i64 [[N]], [[TMP12]], !dbg [[DBG55]] +; DEBUGLOC-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP13]], i32 2, !dbg [[DBG55]] +; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE4]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_CONTINUE4]]: -; DEBUGLOC-NEXT: [[TMP15:%.*]] = phi <4 x i64> [ [[TMP10]], %[[PRED_UDIV_CONTINUE2]] ], [ [[TMP14]], %[[PRED_UDIV_IF3]] ], !dbg [[DBG59]] -; DEBUGLOC-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3, !dbg [[DBG58]] -; DEBUGLOC-NEXT: br i1 [[TMP16]], label %[[PRED_UDIV_IF5:.*]], label %[[PRED_UDIV_CONTINUE6]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[TMP15:%.*]] = phi <4 x i64> [ [[TMP10]], %[[PRED_UDIV_CONTINUE2]] ], [ [[TMP14]], %[[PRED_UDIV_IF3]] ], !dbg [[DBG55]] +; DEBUGLOC-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3, !dbg [[DBG54]] +; DEBUGLOC-NEXT: br i1 [[TMP16]], label %[[PRED_UDIV_IF5:.*]], label %[[PRED_UDIV_CONTINUE6]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_IF5]]: -; DEBUGLOC-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 3, !dbg [[DBG57]] -; DEBUGLOC-NEXT: [[TMP18:%.*]] = udiv i64 [[N]], [[TMP17]], !dbg [[DBG59]] -; DEBUGLOC-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP18]], i32 3, !dbg [[DBG59]] -; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE6]], !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 3, !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[TMP18:%.*]] = udiv i64 [[N]], [[TMP17]], !dbg [[DBG55]] +; DEBUGLOC-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP15]], i64 
[[TMP18]], i32 3, !dbg [[DBG55]] +; DEBUGLOC-NEXT: br label %[[PRED_UDIV_CONTINUE6]], !dbg [[DBG54]] ; DEBUGLOC: [[PRED_UDIV_CONTINUE6]]: -; DEBUGLOC-NEXT: [[TMP20:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_UDIV_CONTINUE4]] ], [ [[TMP19]], %[[PRED_UDIV_IF5]] ], !dbg [[DBG59]] -; DEBUGLOC-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP20]], <4 x i64> zeroinitializer, !dbg [[DBG60:![0-9]+]] -; DEBUGLOC-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[X]], i64 [[INDEX]], !dbg [[DBG61:![0-9]+]] -; DEBUGLOC-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP21]], align 8, !dbg [[DBG62:![0-9]+]] -; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG57]] -; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4), !dbg [[DBG57]] -; DEBUGLOC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG63:![0-9]+]] -; DEBUGLOC-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG63]], !llvm.loop [[LOOP64:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP20:%.*]] = phi <4 x i64> [ [[TMP15]], %[[PRED_UDIV_CONTINUE4]] ], [ [[TMP19]], %[[PRED_UDIV_IF5]] ], !dbg [[DBG55]] +; DEBUGLOC-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> [[TMP20]], <4 x i64> zeroinitializer, !dbg [[DBG56:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[X]], i64 [[INDEX]], !dbg [[DBG57:![0-9]+]] +; DEBUGLOC-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP21]], align 8, !dbg [[DBG58:![0-9]+]] +; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4), !dbg [[DBG53]] +; DEBUGLOC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG59:![0-9]+]] +; DEBUGLOC-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG59]], !llvm.loop [[LOOP60:![0-9]+]] ; DEBUGLOC: [[MIDDLE_BLOCK]]: -; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]], !dbg [[DBG63]] -; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]], !dbg [[DBG63]] +; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]], !dbg [[DBG59]] +; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]], !dbg [[DBG59]] ; DEBUGLOC: [[SCALAR_PH]]: -; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG57]] -; DEBUGLOC-NEXT: br label %[[FOR_BODY:.*]], !dbg [[DBG56]] +; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], !dbg [[DBG53]] +; DEBUGLOC-NEXT: br label %[[FOR_BODY:.*]], !dbg [[DBG52]] ; DEBUGLOC: [[FOR_BODY]]: -; DEBUGLOC-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[FOR_INC:.*]] ], !dbg [[DBG57]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[I]], [[META49:![0-9]+]], !DIExpression(), [[DBG57]]) -; DEBUGLOC-NEXT: [[CMP:%.*]] = icmp ult i64 [[I]], 5, !dbg [[DBG58]] -; DEBUGLOC-NEXT: #dbg_value(i1 [[CMP]], [[META50:![0-9]+]], !DIExpression(), [[DBG58]]) -; DEBUGLOC-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[FOR_INC]], !dbg [[DBG65:![0-9]+]] +; DEBUGLOC-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[FOR_INC:.*]] ], !dbg [[DBG53]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[I]], [[META45:![0-9]+]], !DIExpression(), [[DBG53]]) +; DEBUGLOC-NEXT: [[CMP:%.*]] = icmp ult i64 [[I]], 5, !dbg [[DBG54]] +; DEBUGLOC-NEXT: #dbg_value(i1 [[CMP]], [[META46:![0-9]+]], !DIExpression(), [[DBG54]]) +; DEBUGLOC-NEXT: br i1 [[CMP]], label 
%[[IF_THEN:.*]], label %[[FOR_INC]], !dbg [[DBG61:![0-9]+]] ; DEBUGLOC: [[IF_THEN]]: -; DEBUGLOC-NEXT: [[TMP4:%.*]] = udiv i64 [[N]], [[I]], !dbg [[DBG59]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[TMP4]], [[META51:![0-9]+]], !DIExpression(), [[DBG59]]) -; DEBUGLOC-NEXT: br label %[[FOR_INC]], !dbg [[DBG66:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP4:%.*]] = udiv i64 [[N]], [[I]], !dbg [[DBG55]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[TMP4]], [[META47:![0-9]+]], !DIExpression(), [[DBG55]]) +; DEBUGLOC-NEXT: br label %[[FOR_INC]], !dbg [[DBG62:![0-9]+]] ; DEBUGLOC: [[FOR_INC]]: -; DEBUGLOC-NEXT: [[D:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[TMP4]], %[[IF_THEN]] ], !dbg [[DBG60]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[D]], [[META52:![0-9]+]], !DIExpression(), [[DBG60]]) -; DEBUGLOC-NEXT: [[IDX:%.*]] = getelementptr i64, ptr [[X]], i64 [[I]], !dbg [[DBG61]] -; DEBUGLOC-NEXT: #dbg_value(ptr [[IDX]], [[META53:![0-9]+]], !DIExpression(), [[DBG61]]) -; DEBUGLOC-NEXT: store i64 [[D]], ptr [[IDX]], align 8, !dbg [[DBG62]] -; DEBUGLOC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1, !dbg [[DBG67:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[I_NEXT]], [[META54:![0-9]+]], !DIExpression(), [[DBG67]]) -; DEBUGLOC-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]], !dbg [[DBG68:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i1 [[COND]], [[META55:![0-9]+]], !DIExpression(), [[DBG68]]) -; DEBUGLOC-NEXT: br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !dbg [[DBG63]], !llvm.loop [[LOOP69:![0-9]+]] +; DEBUGLOC-NEXT: [[D:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[TMP4]], %[[IF_THEN]] ], !dbg [[DBG56]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[D]], [[META48:![0-9]+]], !DIExpression(), [[DBG56]]) +; DEBUGLOC-NEXT: [[IDX:%.*]] = getelementptr i64, ptr [[X]], i64 [[I]], !dbg [[DBG57]] +; DEBUGLOC-NEXT: #dbg_value(ptr [[IDX]], [[META49:![0-9]+]], !DIExpression(), [[DBG57]]) +; DEBUGLOC-NEXT: store i64 [[D]], ptr [[IDX]], align 8, !dbg [[DBG58]] +; DEBUGLOC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1, !dbg [[DBG63:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[I_NEXT]], [[META50:![0-9]+]], !DIExpression(), [[DBG63]]) +; DEBUGLOC-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]], !dbg [[DBG64:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(i1 [[COND]], [[META51:![0-9]+]], !DIExpression(), [[DBG64]]) +; DEBUGLOC-NEXT: br i1 [[COND]], label %[[FOR_BODY]], label %[[FOR_END]], !dbg [[DBG59]], !llvm.loop [[LOOP65:![0-9]+]] ; DEBUGLOC: [[FOR_END]]: -; DEBUGLOC-NEXT: ret void, !dbg [[DBG70:![0-9]+]] +; DEBUGLOC-NEXT: ret void, !dbg [[DBG66:![0-9]+]] ; entry: br label %for.body @@ -415,7 +387,7 @@ define void @scalar_cast_dbg(ptr nocapture %a, i32 %start, i64 %k) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -429,57 +401,57 @@ define void @scalar_cast_dbg(ptr nocapture %a, i32 %start, i64 %k) { ; CHECK-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[K]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], 
!llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; ; DEBUGLOC-LABEL: define void @scalar_cast_dbg( -; DEBUGLOC-SAME: ptr captures(none) [[A:%.*]], i32 [[START:%.*]], i64 [[K:%.*]]) !dbg [[DBG71:![0-9]+]] { +; DEBUGLOC-SAME: ptr captures(none) [[A:%.*]], i32 [[START:%.*]], i64 [[K:%.*]]) !dbg [[DBG67:![0-9]+]] { ; DEBUGLOC-NEXT: [[ENTRY:.*]]: -; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[K]], 4, !dbg [[DBG78:![0-9]+]] -; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]], !dbg [[DBG78]] +; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[K]], 4, !dbg [[DBG74:![0-9]+]] +; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]], !dbg [[DBG74]] ; DEBUGLOC: [[VECTOR_SCEVCHECK]]: -; DEBUGLOC-NEXT: [[TMP0:%.*]] = add i64 [[K]], -1, !dbg [[DBG78]] -; DEBUGLOC-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32, !dbg [[DBG78]] -; DEBUGLOC-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0, !dbg [[DBG78]] -; DEBUGLOC-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP0]], 4294967295, !dbg [[DBG78]] -; DEBUGLOC-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]], !dbg [[DBG78]] -; DEBUGLOC-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]], !dbg [[DBG79:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP0:%.*]] = add i64 [[K]], -1, !dbg [[DBG74]] +; DEBUGLOC-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32, !dbg [[DBG74]] +; DEBUGLOC-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0, !dbg [[DBG74]] +; DEBUGLOC-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP0]], 4294967295, !dbg [[DBG74]] +; DEBUGLOC-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]], !dbg [[DBG74]] +; DEBUGLOC-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]], !dbg [[DBG75:![0-9]+]] ; DEBUGLOC: [[VECTOR_PH]]: ; DEBUGLOC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[K]], 4 ; DEBUGLOC-NEXT: [[N_VEC:%.*]] = sub i64 [[K]], [[N_MOD_VF]] -; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG79]] +; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG75]] ; DEBUGLOC: [[VECTOR_BODY]]: -; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG79]] -; DEBUGLOC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG80:![0-9]+]] -; DEBUGLOC-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32, !dbg [[DBG80]] -; DEBUGLOC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP5]], !dbg [[DBG81:![0-9]+]] -; DEBUGLOC-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP6]], align 4, !dbg [[DBG82:![0-9]+]] -; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG79]] -; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4), !dbg [[DBG80]] -; DEBUGLOC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG83:![0-9]+]] -; DEBUGLOC-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG83]], !llvm.loop [[LOOP84:![0-9]+]] +; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG75]] +; DEBUGLOC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG76:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32, !dbg [[DBG76]] +; DEBUGLOC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP5]], !dbg [[DBG77:![0-9]+]] +; 
DEBUGLOC-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP6]], align 4, !dbg [[DBG78:![0-9]+]] +; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG75]] +; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4), !dbg [[DBG76]] +; DEBUGLOC-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG79:![0-9]+]] +; DEBUGLOC-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG79]], !llvm.loop [[LOOP80:![0-9]+]] ; DEBUGLOC: [[MIDDLE_BLOCK]]: -; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]], !dbg [[DBG83]] -; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG83]] +; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]], !dbg [[DBG79]] +; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG79]] ; DEBUGLOC: [[SCALAR_PH]]: -; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], !dbg [[DBG79]] -; DEBUGLOC-NEXT: br label %[[LOOP:.*]], !dbg [[DBG78]] +; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], !dbg [[DBG75]] +; DEBUGLOC-NEXT: br label %[[LOOP:.*]], !dbg [[DBG74]] ; DEBUGLOC: [[LOOP]]: -; DEBUGLOC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ], !dbg [[DBG79]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[IV]], [[META73:![0-9]+]], !DIExpression(), [[DBG79]]) -; DEBUGLOC-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32, !dbg [[DBG80]] -; DEBUGLOC-NEXT: #dbg_value(i32 [[TRUNC_IV]], [[META74:![0-9]+]], !DIExpression(), [[DBG80]]) -; DEBUGLOC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TRUNC_IV]], !dbg [[DBG81]] -; DEBUGLOC-NEXT: #dbg_value(ptr [[ARRAYIDX]], [[META75:![0-9]+]], !DIExpression(), [[DBG81]]) -; DEBUGLOC-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4, !dbg [[DBG82]] -; DEBUGLOC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1, !dbg [[DBG85:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[IV_NEXT]], [[META76:![0-9]+]], !DIExpression(), [[DBG85]]) -; DEBUGLOC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[K]], !dbg [[DBG86:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i1 [[EXITCOND]], [[META77:![0-9]+]], !DIExpression(), [[DBG86]]) -; DEBUGLOC-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG83]], !llvm.loop [[LOOP87:![0-9]+]] +; DEBUGLOC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ], !dbg [[DBG75]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[IV]], [[META69:![0-9]+]], !DIExpression(), [[DBG75]]) +; DEBUGLOC-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32, !dbg [[DBG76]] +; DEBUGLOC-NEXT: #dbg_value(i32 [[TRUNC_IV]], [[META70:![0-9]+]], !DIExpression(), [[DBG76]]) +; DEBUGLOC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TRUNC_IV]], !dbg [[DBG77]] +; DEBUGLOC-NEXT: #dbg_value(ptr [[ARRAYIDX]], [[META71:![0-9]+]], !DIExpression(), [[DBG77]]) +; DEBUGLOC-NEXT: store i32 [[TRUNC_IV]], ptr [[ARRAYIDX]], align 4, !dbg [[DBG78]] +; DEBUGLOC-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1, !dbg [[DBG81:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[IV_NEXT]], [[META72:![0-9]+]], !DIExpression(), [[DBG81]]) +; DEBUGLOC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[K]], !dbg [[DBG82:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(i1 [[EXITCOND]], [[META73:![0-9]+]], !DIExpression(), [[DBG82]]) +; DEBUGLOC-NEXT: br i1 [[EXITCOND]], label 
%[[EXIT]], label %[[LOOP]], !dbg [[DBG79]], !llvm.loop [[LOOP83:![0-9]+]] ; DEBUGLOC: [[EXIT]]: -; DEBUGLOC-NEXT: ret void, !dbg [[DBG88:![0-9]+]] +; DEBUGLOC-NEXT: ret void, !dbg [[DBG84:![0-9]+]] ; entry: br label %loop @@ -522,7 +494,7 @@ define void @widen_intrinsic_dbg(i64 %n, ptr %y, ptr %x) { ; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -538,60 +510,60 @@ define void @widen_intrinsic_dbg(i64 %n, ptr %y, ptr %x) { ; CHECK-NEXT: store float [[CALL]], ptr [[GEP_X]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; ; DEBUGLOC-LABEL: define void @widen_intrinsic_dbg( -; DEBUGLOC-SAME: i64 [[N:%.*]], ptr [[Y:%.*]], ptr [[X:%.*]]) !dbg [[DBG89:![0-9]+]] { +; DEBUGLOC-SAME: i64 [[N:%.*]], ptr [[Y:%.*]], ptr [[X:%.*]]) !dbg [[DBG85:![0-9]+]] { ; DEBUGLOC-NEXT: [[ENTRY:.*]]: -; DEBUGLOC-NEXT: [[Y2:%.*]] = ptrtoint ptr [[Y]] to i64, !dbg [[DBG98:![0-9]+]] -; DEBUGLOC-NEXT: [[X1:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG98]] -; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4, !dbg [[DBG98]] -; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]], !dbg [[DBG98]] +; DEBUGLOC-NEXT: [[Y2:%.*]] = ptrtoint ptr [[Y]] to i64, !dbg [[DBG94:![0-9]+]] +; DEBUGLOC-NEXT: [[X1:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG94]] +; DEBUGLOC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4, !dbg [[DBG94]] +; DEBUGLOC-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]], !dbg [[DBG94]] ; DEBUGLOC: [[VECTOR_MEMCHECK]]: -; DEBUGLOC-NEXT: [[TMP0:%.*]] = sub i64 [[X1]], [[Y2]], !dbg [[DBG98]] -; DEBUGLOC-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16, !dbg [[DBG98]] -; DEBUGLOC-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]], !dbg [[DBG99:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP0:%.*]] = sub i64 [[X1]], [[Y2]], !dbg [[DBG94]] +; DEBUGLOC-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16, !dbg [[DBG94]] +; DEBUGLOC-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]], !dbg [[DBG95:![0-9]+]] ; DEBUGLOC: [[VECTOR_PH]]: ; DEBUGLOC-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 ; DEBUGLOC-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG99]] +; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG95]] ; DEBUGLOC: [[VECTOR_BODY]]: -; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG99]] -; DEBUGLOC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[INDEX]], !dbg [[DBG100:![0-9]+]] -; DEBUGLOC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !dbg [[DBG101:![0-9]+]] -; DEBUGLOC-NEXT: [[TMP2:%.*]] = 
call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]), !dbg [[DBG102:![0-9]+]] -; DEBUGLOC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]], !dbg [[DBG103:![0-9]+]] -; DEBUGLOC-NEXT: store <4 x float> [[TMP2]], ptr [[TMP3]], align 4, !dbg [[DBG104:![0-9]+]] -; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG99]] -; DEBUGLOC-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG105:![0-9]+]] -; DEBUGLOC-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG105]], !llvm.loop [[LOOP106:![0-9]+]] +; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ], !dbg [[DBG95]] +; DEBUGLOC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[INDEX]], !dbg [[DBG96:![0-9]+]] +; DEBUGLOC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4, !dbg [[DBG97:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]), !dbg [[DBG98:![0-9]+]] +; DEBUGLOC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[INDEX]], !dbg [[DBG99:![0-9]+]] +; DEBUGLOC-NEXT: store <4 x float> [[TMP2]], ptr [[TMP3]], align 4, !dbg [[DBG100:![0-9]+]] +; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG95]] +; DEBUGLOC-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG101:![0-9]+]] +; DEBUGLOC-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !dbg [[DBG101]], !llvm.loop [[LOOP102:![0-9]+]] ; DEBUGLOC: [[MIDDLE_BLOCK]]: -; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]], !dbg [[DBG105]] -; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG105]] +; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]], !dbg [[DBG101]] +; DEBUGLOC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !dbg [[DBG101]] ; DEBUGLOC: [[SCALAR_PH]]: -; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ], !dbg [[DBG99]] -; DEBUGLOC-NEXT: br label %[[LOOP:.*]], !dbg [[DBG98]] +; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ], !dbg [[DBG95]] +; DEBUGLOC-NEXT: br label %[[LOOP:.*]], !dbg [[DBG94]] ; DEBUGLOC: [[LOOP]]: -; DEBUGLOC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ], !dbg [[DBG99]] -; DEBUGLOC-NEXT: #dbg_value(i64 [[IV]], [[META91:![0-9]+]], !DIExpression(), [[DBG99]]) -; DEBUGLOC-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[IV]], !dbg [[DBG100]] -; DEBUGLOC-NEXT: #dbg_value(ptr [[GEP_Y]], [[META92:![0-9]+]], !DIExpression(), [[DBG100]]) -; DEBUGLOC-NEXT: [[LOAD:%.*]] = load float, ptr [[GEP_Y]], align 4, !dbg [[DBG101]] -; DEBUGLOC-NEXT: #dbg_value(float [[LOAD]], [[META93:![0-9]+]], !DIExpression(), [[DBG101]]) -; DEBUGLOC-NEXT: [[CALL:%.*]] = call float @llvm.sqrt.f32(float [[LOAD]]), !dbg [[DBG102]] -; DEBUGLOC-NEXT: #dbg_value(float [[CALL]], [[META94:![0-9]+]], !DIExpression(), [[DBG102]]) -; DEBUGLOC-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[IV]], !dbg [[DBG103]] -; DEBUGLOC-NEXT: #dbg_value(ptr [[GEP_X]], [[META95:![0-9]+]], !DIExpression(), [[DBG103]]) -; DEBUGLOC-NEXT: store float [[CALL]], ptr [[GEP_X]], align 4, !dbg [[DBG104]] -; DEBUGLOC-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG107:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i64 
[[IV_NEXT]], [[META96:![0-9]+]], !DIExpression(), [[DBG107]]) -; DEBUGLOC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]], !dbg [[DBG108:![0-9]+]] -; DEBUGLOC-NEXT: #dbg_value(i1 [[EXITCOND]], [[META97:![0-9]+]], !DIExpression(), [[DBG108]]) -; DEBUGLOC-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG105]], !llvm.loop [[LOOP109:![0-9]+]] +; DEBUGLOC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ], !dbg [[DBG95]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[IV]], [[META87:![0-9]+]], !DIExpression(), [[DBG95]]) +; DEBUGLOC-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, ptr [[Y]], i64 [[IV]], !dbg [[DBG96]] +; DEBUGLOC-NEXT: #dbg_value(ptr [[GEP_Y]], [[META88:![0-9]+]], !DIExpression(), [[DBG96]]) +; DEBUGLOC-NEXT: [[LOAD:%.*]] = load float, ptr [[GEP_Y]], align 4, !dbg [[DBG97]] +; DEBUGLOC-NEXT: #dbg_value(float [[LOAD]], [[META89:![0-9]+]], !DIExpression(), [[DBG97]]) +; DEBUGLOC-NEXT: [[CALL:%.*]] = call float @llvm.sqrt.f32(float [[LOAD]]), !dbg [[DBG98]] +; DEBUGLOC-NEXT: #dbg_value(float [[CALL]], [[META90:![0-9]+]], !DIExpression(), [[DBG98]]) +; DEBUGLOC-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, ptr [[X]], i64 [[IV]], !dbg [[DBG99]] +; DEBUGLOC-NEXT: #dbg_value(ptr [[GEP_X]], [[META91:![0-9]+]], !DIExpression(), [[DBG99]]) +; DEBUGLOC-NEXT: store float [[CALL]], ptr [[GEP_X]], align 4, !dbg [[DBG100]] +; DEBUGLOC-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG103:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(i64 [[IV_NEXT]], [[META92:![0-9]+]], !DIExpression(), [[DBG103]]) +; DEBUGLOC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]], !dbg [[DBG104:![0-9]+]] +; DEBUGLOC-NEXT: #dbg_value(i1 [[EXITCOND]], [[META93:![0-9]+]], !DIExpression(), [[DBG104]]) +; DEBUGLOC-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !dbg [[DBG101]], !llvm.loop [[LOOP105:![0-9]+]] ; DEBUGLOC: [[EXIT]]: -; DEBUGLOC-NEXT: ret void, !dbg [[DBG110:![0-9]+]] +; DEBUGLOC-NEXT: ret void, !dbg [[DBG106:![0-9]+]] ; entry: br label %loop @@ -618,23 +590,21 @@ exit: ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} -; CHECK: [[META4]] = !{!"llvm.loop.vectorize.width", i32 4} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} ; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} ;. 
; DEBUGLOC: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C, file: [[META1:![0-9]+]], producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) ; DEBUGLOC: [[META1]] = !DIFile(filename: "{{.*}}", directory: {{.*}}) ; DEBUGLOC: [[DBG5]] = distinct !DISubprogram(name: "_Z3fooPf", linkageName: "_Z3fooPf", scope: null, file: [[META1]], line: 1, type: [[META6:![0-9]+]], scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META8:![0-9]+]]) ; DEBUGLOC: [[META6]] = !DISubroutineType(types: [[META7:![0-9]+]]) ; DEBUGLOC: [[META7]] = !{} -; DEBUGLOC: [[META8]] = !{[[META9]], [[META11]], [[META12]], [[META14]], [[META15]], [[META16]]} +; DEBUGLOC: [[META8]] = !{[[META9:![0-9]+]], [[META11:![0-9]+]], [[META12:![0-9]+]], [[META14:![0-9]+]], [[META15:![0-9]+]], [[META16:![0-9]+]]} ; DEBUGLOC: [[META9]] = !DILocalVariable(name: "1", scope: [[DBG5]], file: [[META1]], line: 2, type: [[META10:![0-9]+]]) ; DEBUGLOC: [[META10]] = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) ; DEBUGLOC: [[META11]] = !DILocalVariable(name: "2", scope: [[DBG5]], file: [[META1]], line: 3, type: [[META10]]) @@ -654,87 +624,83 @@ exit: ; DEBUGLOC: [[LOOP25]] = distinct !{[[LOOP25]], [[META26:![0-9]+]], [[META27:![0-9]+]]} ; DEBUGLOC: [[META26]] = !{!"llvm.loop.isvectorized", i32 1} ; DEBUGLOC: [[META27]] = !{!"llvm.loop.unroll.runtime.disable"} -; DEBUGLOC: [[DBG28]] = !DILocation(line: 7, column: 1, scope: [[DBG5]]) -; DEBUGLOC: [[DBG29]] = !DILocation(line: 8, column: 1, scope: [[DBG5]]) -; DEBUGLOC: [[LOOP30]] = distinct !{[[LOOP30]], [[META31:![0-9]+]]} -; DEBUGLOC: [[META31]] = !{!"llvm.loop.vectorize.width", i32 4} -; DEBUGLOC: [[DBG32]] = !DILocation(line: 10, column: 1, scope: [[DBG5]]) -; DEBUGLOC: [[DBG33]] = distinct !DISubprogram(name: "widen_ptr_induction_dbg", linkageName: "widen_ptr_induction_dbg", scope: null, file: [[META1]], line: 11, type: [[META6]], scopeLine: 11, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META34:![0-9]+]]) -; DEBUGLOC: [[META34]] = !{[[META35]], [[META36]], [[META37]]} -; DEBUGLOC: [[META35]] = !DILocalVariable(name: "7", scope: [[DBG33]], file: [[META1]], line: 12, type: [[META10]]) -; DEBUGLOC: [[META36]] = !DILocalVariable(name: "8", scope: [[DBG33]], file: [[META1]], line: 13, type: [[META10]]) -; DEBUGLOC: [[META37]] = !DILocalVariable(name: "9", scope: [[DBG33]], file: [[META1]], line: 15, type: [[META17]]) -; DEBUGLOC: [[DBG38]] = !DILocation(line: 11, column: 1, scope: [[DBG33]]) -; DEBUGLOC: [[DBG39]] = !DILocation(line: 12, column: 1, scope: [[DBG33]]) -; DEBUGLOC: [[DBG40]] = !DILocation(line: 14, column: 1, scope: [[DBG33]]) -; DEBUGLOC: [[DBG41]] = !DILocation(line: 16, column: 1, scope: [[DBG33]]) -; DEBUGLOC: [[LOOP42]] = distinct !{[[LOOP42]], [[META26]], [[META27]]} -; DEBUGLOC: [[DBG43]] = !DILocation(line: 13, column: 1, scope: [[DBG33]]) -; DEBUGLOC: [[DBG44]] = !DILocation(line: 15, column: 1, scope: [[DBG33]]) -; DEBUGLOC: [[LOOP45]] = distinct !{[[LOOP45]], [[META27]], [[META26]]} -; DEBUGLOC: [[DBG46]] = !DILocation(line: 17, column: 1, scope: [[DBG33]]) -; DEBUGLOC: [[DBG47]] = distinct !DISubprogram(name: "predicated_phi_dbg", linkageName: "predicated_phi_dbg", scope: null, file: [[META1]], line: 18, type: [[META6]], scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META48:![0-9]+]]) -; DEBUGLOC: [[META48]] = !{[[META49]], [[META50]], [[META51]], [[META52]], [[META53]], 
[[META54]], [[META55]]} -; DEBUGLOC: [[META49]] = !DILocalVariable(name: "10", scope: [[DBG47]], file: [[META1]], line: 19, type: [[META10]]) -; DEBUGLOC: [[META50]] = !DILocalVariable(name: "11", scope: [[DBG47]], file: [[META1]], line: 20, type: [[META17]]) -; DEBUGLOC: [[META51]] = !DILocalVariable(name: "12", scope: [[DBG47]], file: [[META1]], line: 22, type: [[META10]]) -; DEBUGLOC: [[META52]] = !DILocalVariable(name: "13", scope: [[DBG47]], file: [[META1]], line: 24, type: [[META10]]) -; DEBUGLOC: [[META53]] = !DILocalVariable(name: "14", scope: [[DBG47]], file: [[META1]], line: 25, type: [[META10]]) -; DEBUGLOC: [[META54]] = !DILocalVariable(name: "15", scope: [[DBG47]], file: [[META1]], line: 27, type: [[META10]]) -; DEBUGLOC: [[META55]] = !DILocalVariable(name: "16", scope: [[DBG47]], file: [[META1]], line: 28, type: [[META17]]) -; DEBUGLOC: [[DBG56]] = !DILocation(line: 18, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG57]] = !DILocation(line: 19, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG58]] = !DILocation(line: 20, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG59]] = !DILocation(line: 22, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG60]] = !DILocation(line: 24, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG61]] = !DILocation(line: 25, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG62]] = !DILocation(line: 26, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG63]] = !DILocation(line: 29, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[LOOP64]] = distinct !{[[LOOP64]], [[META26]], [[META27]]} -; DEBUGLOC: [[DBG65]] = !DILocation(line: 21, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG66]] = !DILocation(line: 23, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG67]] = !DILocation(line: 27, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG68]] = !DILocation(line: 28, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[LOOP69]] = distinct !{[[LOOP69]], [[META27]], [[META26]]} -; DEBUGLOC: [[DBG70]] = !DILocation(line: 30, column: 1, scope: [[DBG47]]) -; DEBUGLOC: [[DBG71]] = distinct !DISubprogram(name: "scalar_cast_dbg", linkageName: "scalar_cast_dbg", scope: null, file: [[META1]], line: 31, type: [[META6]], scopeLine: 31, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META72:![0-9]+]]) -; DEBUGLOC: [[META72]] = !{[[META73]], [[META74]], [[META75]], [[META76]], [[META77]]} -; DEBUGLOC: [[META73]] = !DILocalVariable(name: "17", scope: [[DBG71]], file: [[META1]], line: 32, type: [[META10]]) -; DEBUGLOC: [[META74]] = !DILocalVariable(name: "18", scope: [[DBG71]], file: [[META1]], line: 33, type: [[META13]]) -; DEBUGLOC: [[META75]] = !DILocalVariable(name: "19", scope: [[DBG71]], file: [[META1]], line: 34, type: [[META10]]) -; DEBUGLOC: [[META76]] = !DILocalVariable(name: "20", scope: [[DBG71]], file: [[META1]], line: 36, type: [[META10]]) -; DEBUGLOC: [[META77]] = !DILocalVariable(name: "21", scope: [[DBG71]], file: [[META1]], line: 37, type: [[META17]]) -; DEBUGLOC: [[DBG78]] = !DILocation(line: 31, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[DBG79]] = !DILocation(line: 32, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[DBG80]] = !DILocation(line: 33, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[DBG81]] = !DILocation(line: 34, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[DBG82]] = !DILocation(line: 35, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[DBG83]] = !DILocation(line: 38, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[LOOP84]] = distinct !{[[LOOP84]], [[META26]], [[META27]]} -; DEBUGLOC: [[DBG85]] = !DILocation(line: 36, column: 1, scope: [[DBG71]]) -; 
DEBUGLOC: [[DBG86]] = !DILocation(line: 37, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[LOOP87]] = distinct !{[[LOOP87]], [[META26]]} -; DEBUGLOC: [[DBG88]] = !DILocation(line: 39, column: 1, scope: [[DBG71]]) -; DEBUGLOC: [[DBG89]] = distinct !DISubprogram(name: "widen_intrinsic_dbg", linkageName: "widen_intrinsic_dbg", scope: null, file: [[META1]], line: 40, type: [[META6]], scopeLine: 40, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META90:![0-9]+]]) -; DEBUGLOC: [[META90]] = !{[[META91]], [[META92]], [[META93]], [[META94]], [[META95]], [[META96]], [[META97]]} -; DEBUGLOC: [[META91]] = !DILocalVariable(name: "22", scope: [[DBG89]], file: [[META1]], line: 41, type: [[META10]]) -; DEBUGLOC: [[META92]] = !DILocalVariable(name: "23", scope: [[DBG89]], file: [[META1]], line: 42, type: [[META10]]) -; DEBUGLOC: [[META93]] = !DILocalVariable(name: "24", scope: [[DBG89]], file: [[META1]], line: 43, type: [[META13]]) -; DEBUGLOC: [[META94]] = !DILocalVariable(name: "25", scope: [[DBG89]], file: [[META1]], line: 44, type: [[META13]]) -; DEBUGLOC: [[META95]] = !DILocalVariable(name: "26", scope: [[DBG89]], file: [[META1]], line: 45, type: [[META10]]) -; DEBUGLOC: [[META96]] = !DILocalVariable(name: "27", scope: [[DBG89]], file: [[META1]], line: 47, type: [[META10]]) -; DEBUGLOC: [[META97]] = !DILocalVariable(name: "28", scope: [[DBG89]], file: [[META1]], line: 48, type: [[META17]]) -; DEBUGLOC: [[DBG98]] = !DILocation(line: 40, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG99]] = !DILocation(line: 41, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG100]] = !DILocation(line: 42, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG101]] = !DILocation(line: 43, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG102]] = !DILocation(line: 44, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG103]] = !DILocation(line: 45, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG104]] = !DILocation(line: 46, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG105]] = !DILocation(line: 49, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[LOOP106]] = distinct !{[[LOOP106]], [[META26]], [[META27]]} -; DEBUGLOC: [[DBG107]] = !DILocation(line: 47, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[DBG108]] = !DILocation(line: 48, column: 1, scope: [[DBG89]]) -; DEBUGLOC: [[LOOP109]] = distinct !{[[LOOP109]], [[META26]]} -; DEBUGLOC: [[DBG110]] = !DILocation(line: 50, column: 1, scope: [[DBG89]]) +; DEBUGLOC: [[DBG28]] = !DILocation(line: 10, column: 1, scope: [[DBG5]]) +; DEBUGLOC: [[DBG29]] = distinct !DISubprogram(name: "widen_ptr_induction_dbg", linkageName: "widen_ptr_induction_dbg", scope: null, file: [[META1]], line: 11, type: [[META6]], scopeLine: 11, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META30:![0-9]+]]) +; DEBUGLOC: [[META30]] = !{[[META31]], [[META32]], [[META33]]} +; DEBUGLOC: [[META31]] = !DILocalVariable(name: "7", scope: [[DBG29]], file: [[META1]], line: 12, type: [[META10]]) +; DEBUGLOC: [[META32]] = !DILocalVariable(name: "8", scope: [[DBG29]], file: [[META1]], line: 13, type: [[META10]]) +; DEBUGLOC: [[META33]] = !DILocalVariable(name: "9", scope: [[DBG29]], file: [[META1]], line: 15, type: [[META17]]) +; DEBUGLOC: [[DBG34]] = !DILocation(line: 11, column: 1, scope: [[DBG29]]) +; DEBUGLOC: [[DBG35]] = !DILocation(line: 12, column: 1, scope: [[DBG29]]) +; DEBUGLOC: [[DBG36]] = !DILocation(line: 14, column: 1, scope: [[DBG29]]) +; DEBUGLOC: [[DBG37]] = !DILocation(line: 16, column: 1, scope: [[DBG29]]) +; DEBUGLOC: [[LOOP38]] = distinct !{[[LOOP38]], 
[[META26]], [[META27]]} +; DEBUGLOC: [[DBG39]] = !DILocation(line: 13, column: 1, scope: [[DBG29]]) +; DEBUGLOC: [[DBG40]] = !DILocation(line: 15, column: 1, scope: [[DBG29]]) +; DEBUGLOC: [[LOOP41]] = distinct !{[[LOOP41]], [[META27]], [[META26]]} +; DEBUGLOC: [[DBG42]] = !DILocation(line: 17, column: 1, scope: [[DBG29]]) +; DEBUGLOC: [[DBG43]] = distinct !DISubprogram(name: "predicated_phi_dbg", linkageName: "predicated_phi_dbg", scope: null, file: [[META1]], line: 18, type: [[META6]], scopeLine: 18, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META44:![0-9]+]]) +; DEBUGLOC: [[META44]] = !{[[META45]], [[META46]], [[META47]], [[META48]], [[META49]], [[META50]], [[META51]]} +; DEBUGLOC: [[META45]] = !DILocalVariable(name: "10", scope: [[DBG43]], file: [[META1]], line: 19, type: [[META10]]) +; DEBUGLOC: [[META46]] = !DILocalVariable(name: "11", scope: [[DBG43]], file: [[META1]], line: 20, type: [[META17]]) +; DEBUGLOC: [[META47]] = !DILocalVariable(name: "12", scope: [[DBG43]], file: [[META1]], line: 22, type: [[META10]]) +; DEBUGLOC: [[META48]] = !DILocalVariable(name: "13", scope: [[DBG43]], file: [[META1]], line: 24, type: [[META10]]) +; DEBUGLOC: [[META49]] = !DILocalVariable(name: "14", scope: [[DBG43]], file: [[META1]], line: 25, type: [[META10]]) +; DEBUGLOC: [[META50]] = !DILocalVariable(name: "15", scope: [[DBG43]], file: [[META1]], line: 27, type: [[META10]]) +; DEBUGLOC: [[META51]] = !DILocalVariable(name: "16", scope: [[DBG43]], file: [[META1]], line: 28, type: [[META17]]) +; DEBUGLOC: [[DBG52]] = !DILocation(line: 18, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG53]] = !DILocation(line: 19, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG54]] = !DILocation(line: 20, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG55]] = !DILocation(line: 22, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG56]] = !DILocation(line: 24, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG57]] = !DILocation(line: 25, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG58]] = !DILocation(line: 26, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG59]] = !DILocation(line: 29, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[LOOP60]] = distinct !{[[LOOP60]], [[META26]], [[META27]]} +; DEBUGLOC: [[DBG61]] = !DILocation(line: 21, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG62]] = !DILocation(line: 23, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG63]] = !DILocation(line: 27, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG64]] = !DILocation(line: 28, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[LOOP65]] = distinct !{[[LOOP65]], [[META27]], [[META26]]} +; DEBUGLOC: [[DBG66]] = !DILocation(line: 30, column: 1, scope: [[DBG43]]) +; DEBUGLOC: [[DBG67]] = distinct !DISubprogram(name: "scalar_cast_dbg", linkageName: "scalar_cast_dbg", scope: null, file: [[META1]], line: 31, type: [[META6]], scopeLine: 31, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META68:![0-9]+]]) +; DEBUGLOC: [[META68]] = !{[[META69]], [[META70]], [[META71]], [[META72]], [[META73]]} +; DEBUGLOC: [[META69]] = !DILocalVariable(name: "17", scope: [[DBG67]], file: [[META1]], line: 32, type: [[META10]]) +; DEBUGLOC: [[META70]] = !DILocalVariable(name: "18", scope: [[DBG67]], file: [[META1]], line: 33, type: [[META13]]) +; DEBUGLOC: [[META71]] = !DILocalVariable(name: "19", scope: [[DBG67]], file: [[META1]], line: 34, type: [[META10]]) +; DEBUGLOC: [[META72]] = !DILocalVariable(name: "20", scope: [[DBG67]], file: [[META1]], line: 36, type: [[META10]]) +; DEBUGLOC: [[META73]] = 
!DILocalVariable(name: "21", scope: [[DBG67]], file: [[META1]], line: 37, type: [[META17]]) +; DEBUGLOC: [[DBG74]] = !DILocation(line: 31, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[DBG75]] = !DILocation(line: 32, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[DBG76]] = !DILocation(line: 33, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[DBG77]] = !DILocation(line: 34, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[DBG78]] = !DILocation(line: 35, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[DBG79]] = !DILocation(line: 38, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[LOOP80]] = distinct !{[[LOOP80]], [[META26]], [[META27]]} +; DEBUGLOC: [[DBG81]] = !DILocation(line: 36, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[DBG82]] = !DILocation(line: 37, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[LOOP83]] = distinct !{[[LOOP83]], [[META26]]} +; DEBUGLOC: [[DBG84]] = !DILocation(line: 39, column: 1, scope: [[DBG67]]) +; DEBUGLOC: [[DBG85]] = distinct !DISubprogram(name: "widen_intrinsic_dbg", linkageName: "widen_intrinsic_dbg", scope: null, file: [[META1]], line: 40, type: [[META6]], scopeLine: 40, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META86:![0-9]+]]) +; DEBUGLOC: [[META86]] = !{[[META87]], [[META88]], [[META89]], [[META90]], [[META91]], [[META92]], [[META93]]} +; DEBUGLOC: [[META87]] = !DILocalVariable(name: "22", scope: [[DBG85]], file: [[META1]], line: 41, type: [[META10]]) +; DEBUGLOC: [[META88]] = !DILocalVariable(name: "23", scope: [[DBG85]], file: [[META1]], line: 42, type: [[META10]]) +; DEBUGLOC: [[META89]] = !DILocalVariable(name: "24", scope: [[DBG85]], file: [[META1]], line: 43, type: [[META13]]) +; DEBUGLOC: [[META90]] = !DILocalVariable(name: "25", scope: [[DBG85]], file: [[META1]], line: 44, type: [[META13]]) +; DEBUGLOC: [[META91]] = !DILocalVariable(name: "26", scope: [[DBG85]], file: [[META1]], line: 45, type: [[META10]]) +; DEBUGLOC: [[META92]] = !DILocalVariable(name: "27", scope: [[DBG85]], file: [[META1]], line: 47, type: [[META10]]) +; DEBUGLOC: [[META93]] = !DILocalVariable(name: "28", scope: [[DBG85]], file: [[META1]], line: 48, type: [[META17]]) +; DEBUGLOC: [[DBG94]] = !DILocation(line: 40, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG95]] = !DILocation(line: 41, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG96]] = !DILocation(line: 42, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG97]] = !DILocation(line: 43, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG98]] = !DILocation(line: 44, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG99]] = !DILocation(line: 45, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG100]] = !DILocation(line: 46, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG101]] = !DILocation(line: 49, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[LOOP102]] = distinct !{[[LOOP102]], [[META26]], [[META27]]} +; DEBUGLOC: [[DBG103]] = !DILocation(line: 47, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[DBG104]] = !DILocation(line: 48, column: 1, scope: [[DBG85]]) +; DEBUGLOC: [[LOOP105]] = distinct !{[[LOOP105]], [[META26]]} +; DEBUGLOC: [[DBG106]] = !DILocation(line: 50, column: 1, scope: [[DBG85]]) ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll index 57f0dc205dba1..787fa31751b6a 100644 --- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-reduction-inloop.ll @@ -22,7 +22,7 @@ loop: %load = load i32, ptr %gep, align 4 %red.next = add i32 %red, %load %iv.next = add i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 256 + %exitcond = icmp eq i64 %iv.next, 257 br i1 %exitcond, label %exit, label %loop exit: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll index f20d4922b475e..73ddddc69a7c7 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-min-max.ll @@ -20,10 +20,6 @@ define i32 @reduction_smin(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[RDX_MINMAX]] ; @@ -66,10 +62,6 @@ define i32 @reduction_smin_select_ops_flipped(ptr nocapture %A, ptr nocapture %B ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[RDX_MINMAX]] ; @@ -111,10 +103,6 @@ define i32 @reduction_smin_intrinsic(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP3]] @@ -159,10 +147,6 @@ define i32 @reduction_umax(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[RDX_MINMAX]] ; @@ -205,10 +189,6 @@ define i32 @reduction_umax_select_ops_flipped(ptr nocapture %A, ptr nocapture %B ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[RDX_MINMAX]] ; @@ -250,10 +230,6 @@ define i32 @reduction_umax_intrinsic(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: ret i32 [[TMP3]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index 925290b10b35e..1b9dcadbbfc39 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -61,11 +61,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 [[TMP26]] ; @@ -170,11 +166,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 [[TMP48]] ; @@ -263,11 +255,7 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 [[TMP29]] ; @@ -373,11 +361,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 [[TMP48]] ; @@ -485,11 +469,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 [[TMP46]] ; @@ -594,11 +574,7 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 
 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
-; CHECK: .lr.ph:
-; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
 ; CHECK: ._crit_edge:
 ; CHECK-NEXT: ret i32 [[TMP45]]
 ;
@@ -701,11 +677,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 [[TMP45]]
 ;
@@ -806,11 +778,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 [[TMP43]]
 ;
@@ -911,11 +879,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 [[TMP43]]
 ;
@@ -1016,11 +980,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret float [[TMP43]]
 ;
@@ -1123,11 +1083,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret float [[TMP45]]
 ;
@@ -1211,11 +1167,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 [[RDX_MINMAX]]
 ;
@@ -1297,11 +1249,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret i32 [[RDX_MINMAX]]
 ;
@@ -1356,21 +1304,7 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: br i1 poison, label [[IF_THEN8:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then8:
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: if.else:
-; CHECK-NEXT: br i1 poison, label [[IF_THEN16:%.*]], label [[FOR_INC]]
-; CHECK: if.then16:
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]]
+; CHECK-NEXT: br label [[FOR_INC:%.*]]
 ; CHECK: for.end:
 ; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
 ; CHECK-NEXT: ret float [[SUM_1_LCSSA]]
 ;
@@ -1478,11 +1412,7 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
-; CHECK: .lr.ph:
-; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
 ; CHECK: ._crit_edge:
 ; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP30]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT: [[TMP33:%.*]] = trunc <4 x i32> [[TMP32]] to <4 x i8>
@@ -1572,11 +1502,7 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
-; CHECK: .lr.ph:
-; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
 ; CHECK: ._crit_edge:
 ; CHECK-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP29]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT: [[TMP32:%.*]] = trunc <4 x i32> [[TMP31]] to <4 x i8>
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
index cad3ca1394bb9..183462f71d480 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -35,11 +35,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
-; CHECK: .lr.ph:
-; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
 ; CHECK: ._crit_edge:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP7]], [[TMP5]]
 ; CHECK-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP9]], [[BIN_RDX]]
@@ -114,11 +110,7 @@ define i64 @reduction_sum_chain(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add i64 [[TMP19]], [[TMP17]]
 ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add i64 [[TMP21]], [[BIN_RDX]]
@@ -345,11 +337,7 @@ define i32 @predicated(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP111:%.*]] = icmp eq i64 [[INDEX_NEXT]], 272
 ; CHECK-NEXT: br i1 [[TMP111]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
-; CHECK: .lr.ph:
-; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: ._crit_edge:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP104]], [[TMP101]]
 ; CHECK-NEXT: [[BIN_RDX34:%.*]] = add i32 [[TMP107]], [[BIN_RDX]]
@@ -581,17 +569,9 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 16)
 ; CHECK-NEXT: [[TMP119:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP119]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP119]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5]]
+; CHECK-NEXT: br label [[FOR_INC:%.*]]
 ; CHECK: for.end:
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP112]], [[TMP109]]
 ; CHECK-NEXT: [[BIN_RDX36:%.*]] = mul i32 [[TMP115]], [[BIN_RDX]]
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
index f0183047f694b..ec7fde81b205b 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -23,21 +23,8 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L7:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-NEXT: [[L7]] = add i32 [[SUM_02]], [[L3]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L7]], %[[DOTLR_PH]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP2]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum_single(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]]) {
@@ -61,22 +48,9 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP5]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = add i32 [[TMP5]], [[TMP3]]
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L7:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L7]] = add i32 [[SUM_02]], [[L3]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L7]], %[[DOTLR_PH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -125,26 +99,8 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-NEXT: [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[L7:%.*]] = add i32 [[SUM_02]], [[L6]]
-; CHECK-NEXT: [[L8:%.*]] = add i32 [[L7]], [[L3]]
-; CHECK-NEXT: [[L9]] = add i32 [[L8]], [[L5]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP7]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
@@ -183,27 +139,9 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = add i32 [[TMP15]], [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-INTERLEAVED-NEXT: [[L7:%.*]] = add i32 [[SUM_02]], [[L6]]
-; CHECK-INTERLEAVED-NEXT: [[L8:%.*]] = add i32 [[L7]], [[L3]]
-; CHECK-INTERLEAVED-NEXT: [[L9]] = add i32 [[L8]], [[L5]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -251,22 +189,8 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) {
 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-NEXT: [[L7:%.*]] = add i32 [[SUM_02]], [[L3]]
-; CHECK-NEXT: [[L9]] = add i32 [[L7]], 3
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP3]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum_const(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]]) {
@@ -294,23 +218,9 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = add i32 [[TMP7]], [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L7:%.*]] = add i32 [[SUM_02]], [[L3]]
-; CHECK-INTERLEAVED-NEXT: [[L9]] = add i32 [[L7]], 3
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -360,26 +270,8 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[PROD_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 1, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-NEXT: [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[L7:%.*]] = mul i32 [[PROD_02]], [[L6]]
-; CHECK-NEXT: [[L8:%.*]] = mul i32 [[L7]], [[L3]]
-; CHECK-NEXT: [[L9]] = mul i32 [[L8]], [[L5]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP7]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_prod(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
@@ -418,27 +310,9 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[PROD_0_LCSSA:%.*]] = mul i32 [[TMP15]], [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[PROD_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 1, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-INTERLEAVED-NEXT: [[L7:%.*]] = mul i32 [[PROD_02]], [[L6]]
-; CHECK-INTERLEAVED-NEXT: [[L8:%.*]] = mul i32 [[L7]], [[L3]]
-; CHECK-INTERLEAVED-NEXT: [[L9]] = mul i32 [[L8]], [[L5]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[PROD_0_LCSSA]]
 ;
 entry:
@@ -491,26 +365,8 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-NEXT: [[L6:%.*]] = mul nsw i32 [[L5]], [[L3]]
-; CHECK-NEXT: [[L7:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[L8:%.*]] = add i32 [[SUM_02]], [[L7]]
-; CHECK-NEXT: [[L9]] = add i32 [[L8]], [[L6]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP6]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_mix(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
@@ -547,27 +403,9 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = add i32 [[TMP13]], [[TMP10]]
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L6:%.*]] = mul nsw i32 [[L5]], [[L3]]
-; CHECK-INTERLEAVED-NEXT: [[L7:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-INTERLEAVED-NEXT: [[L8:%.*]] = add i32 [[SUM_02]], [[L7]]
-; CHECK-INTERLEAVED-NEXT: [[L9]] = add i32 [[L8]], [[L6]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -617,24 +455,8 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L7:%.*]], %[[DOTLR_PH]] ], [ 19, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-NEXT: [[L6:%.*]] = mul i32 [[SUM_02]], [[L3]]
-; CHECK-NEXT: [[L7]] = mul i32 [[L6]], [[L5]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L7]], %[[DOTLR_PH]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP5]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_mul(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
@@ -666,25 +488,9 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP11]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = mul i32 [[TMP11]], [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L7:%.*]], %[[DOTLR_PH]] ], [ 19, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L6:%.*]] = mul i32 [[SUM_02]], [[L3]]
-; CHECK-INTERLEAVED-NEXT: [[L7]] = mul i32 [[L6]], [[L5]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L7]], %[[DOTLR_PH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -731,24 +537,8 @@ define i32 @start_at_non_zero(ptr nocapture %in, ptr nocapture %coeff, ptr nocap
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_09:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 120, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[COEFF]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[SUM_09]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP4]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @start_at_non_zero(
 ; CHECK-INTERLEAVED-SAME: ptr captures(none) [[IN:%.*]], ptr captures(none) [[COEFF:%.*]], ptr captures(none) [[OUT:%.*]]) {
@@ -780,24 +570,8 @@ define i32 @start_at_non_zero(ptr nocapture %in, ptr nocapture %coeff, ptr nocap
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
 ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP9]], [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_09:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 120, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[COEFF]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = mul nsw i32 [[L1]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[MUL]], [[SUM_09]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVED-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]]
 ;
 entry:
   br label %for.body
@@ -844,24 +618,8 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], %[[FOR_BODY]] ], [ -1, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = and i32 [[RESULT_08]], [[L0]]
-; CHECK-NEXT: [[AND]] = and i32 [[ADD]], [[L1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[AND]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP5]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_and(
 ; CHECK-INTERLEAVED-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
@@ -893,25 +651,9 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = and i32 [[TMP11]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = and i32 [[TMP11]], [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], %[[FOR_BODY]] ], [ -1, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = and i32 [[RESULT_08]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: [[AND]] = and i32 [[ADD]], [[L1]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[AND]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -958,24 +700,8 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[OR]], %[[FOR_BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP4]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_or(
 ; CHECK-INTERLEAVED-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
@@ -1005,25 +731,9 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = or i32 [[TMP9]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = or i32 [[TMP9]], [[TMP7]]
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[OR]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1070,24 +780,8 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[XOR]], %[[FOR_BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP4]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_xor(
 ; CHECK-INTERLEAVED-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
@@ -1117,25 +811,9 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = xor i32 [[TMP9]], [[TMP7]]
+; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = xor i32 [[TMP9]], [[TMP7]]
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[XOR]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1183,24 +861,8 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[L0]]
-; CHECK-NEXT: [[FADD]] = fadd fast float [[ADD]], [[L1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FADD]], %[[FOR_BODY]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[RESULT_0_LCSSA]]
+; CHECK-NEXT: ret float [[TMP3]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define float @reduction_fadd(
 ; CHECK-INTERLEAVED-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
@@ -1232,25 +894,9 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd fast float [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = fadd fast float [[TMP7]], [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: [[FADD]] = fadd fast float [[ADD]], [[L1]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FADD]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret float [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1298,24 +944,8 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[L0]]
-; CHECK-NEXT: [[FMUL]] = fmul fast float [[ADD]], [[L1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FMUL]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[RESULT_0_LCSSA]]
+; CHECK-NEXT: ret float [[TMP5]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define float @reduction_fmul(
 ; CHECK-INTERLEAVED-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) {
@@ -1347,25 +977,9 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fmul fast float [[TMP11]], [[TMP9]]
+; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = fmul fast float [[TMP11]], [[TMP9]]
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], %[[FOR_BODY]] ], [ 0.000000e+00, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: [[FMUL]] = fmul fast float [[ADD]], [[L1]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FMUL]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret float [[RESULT_0_LCSSA]]
 ;
 entry:
@@ -1410,21 +1024,8 @@ define i32 @reduction_sub_lhs(ptr noalias nocapture %A) {
 ; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ 3, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[SUB]] = sub nsw i32 [[X_05]], [[L0]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[X_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP5]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sub_lhs(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]]) {
@@ -1450,21 +1051,8 @@ define i32 @reduction_sub_lhs(ptr noalias nocapture %A) {
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
 ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP5]], [[TMP7]]
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ 3, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[SUB]] = sub nsw i32 [[X_05]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVED-NEXT: ret i32 [[X_0_LCSSA]]
+; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]]
 ;
 entry:
   br label %for.body
@@ -1519,38 +1107,8 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]])
 ; CHECK-NEXT: br label %[[FOR_END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-NEXT: [[SUM_033:%.*]] = phi float [ [[S]], %[[SCALAR_PH]] ], [ [[SUM_1:%.*]], %[[FOR_INC]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[CMP3:%.*]] = fcmp ogt float [[L0]], [[L1]]
-; CHECK-NEXT: br i1 [[CMP3]], label %[[IF_THEN:.*]], label %[[FOR_INC]]
-; CHECK: [[IF_THEN]]:
-; CHECK-NEXT: [[CMP6:%.*]] = fcmp ogt float [[L1]], 1.000000e+00
-; CHECK-NEXT: br i1 [[CMP6]], label %[[IF_THEN8:.*]], label %[[IF_ELSE:.*]]
-; CHECK: [[IF_THEN8]]:
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[SUM_033]], [[L0]]
-; CHECK-NEXT: br label %[[FOR_INC]]
-; CHECK: [[IF_ELSE]]:
-; CHECK-NEXT: [[CMP14:%.*]] = fcmp ogt float [[L0]], 2.000000e+00
-; CHECK-NEXT: br i1 [[CMP14]], label %[[IF_THEN16:.*]], label %[[FOR_INC]]
-; CHECK: [[IF_THEN16]]:
-; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[SUM_033]], [[L1]]
-; CHECK-NEXT: br label %[[FOR_INC]]
-; CHECK: [[FOR_INC]]:
-; CHECK-NEXT: [[SUM_1]] = phi float [ [[ADD]], %[[IF_THEN8]] ], [ [[ADD19]], %[[IF_THEN16]] ], [ [[SUM_033]], %[[IF_ELSE]] ], [ [[SUM_033]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
-; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_END]]
 ; CHECK: [[FOR_END]]:
-; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ [[SUM_1]], %[[FOR_INC]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[SUM_1_LCSSA]]
+; CHECK-NEXT: ret float [[TMP13]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define float @reduction_conditional(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], float [[S:%.*]]) {
@@ -1602,38 +1160,8 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) {
 ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[PREDPHI9]], [[PREDPHI6]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK-INTERLEAVED: [[FOR_BODY]]:
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_033:%.*]] = phi float [ [[S]], %[[SCALAR_PH]] ], [ [[SUM_1:%.*]], %[[FOR_INC]] ]
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[CMP3:%.*]] = fcmp ogt float [[L0]], [[L1]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP3]], label %[[IF_THEN:.*]], label %[[FOR_INC]]
-; CHECK-INTERLEAVED: [[IF_THEN]]:
-; CHECK-INTERLEAVED-NEXT: [[CMP6:%.*]] = fcmp ogt float [[L1]], 1.000000e+00
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP6]], label %[[IF_THEN8:.*]], label %[[IF_ELSE:.*]]
-; CHECK-INTERLEAVED: [[IF_THEN8]]:
-; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = fadd fast float [[SUM_033]], [[L0]]
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_INC]]
-; CHECK-INTERLEAVED: [[IF_ELSE]]:
-; CHECK-INTERLEAVED-NEXT: [[CMP14:%.*]] = fcmp ogt float [[L0]], 2.000000e+00
-; CHECK-INTERLEAVED-NEXT: br i1 [[CMP14]], label %[[IF_THEN16:.*]], label %[[FOR_INC]]
-; CHECK-INTERLEAVED: [[IF_THEN16]]:
-; CHECK-INTERLEAVED-NEXT: [[ADD19:%.*]] = fadd fast float [[SUM_033]], [[L1]]
-; CHECK-INTERLEAVED-NEXT: br label %[[FOR_INC]]
-; CHECK-INTERLEAVED: [[FOR_INC]]:
-; CHECK-INTERLEAVED-NEXT: [[SUM_1]] = phi float [ [[ADD]], %[[IF_THEN8]] ], [ [[ADD19]], %[[IF_THEN16]] ], [ [[SUM_033]], %[[IF_ELSE]] ], [ [[SUM_033]], %[[FOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], 128
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_END]]
 ; CHECK-INTERLEAVED: [[FOR_END]]:
-; CHECK-INTERLEAVED-NEXT: [[SUM_1_LCSSA:%.*]] = phi float [ [[SUM_1]], %[[FOR_INC]] ], [ [[TMP24]], %[[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVED-NEXT: ret float [[SUM_1_LCSSA]]
+; CHECK-INTERLEAVED-NEXT: ret float [[TMP24]]
 ;
 entry:
   br label %for.body
@@ -1679,11 +1207,11 @@ for.end:
 define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-LABEL: define i32 @reduction_sum_multiuse(
 ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[_LR_PH1:.*]]:
 ; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
 ; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
+; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
 ; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
 ; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
@@ -1703,11 +1231,11 @@ define i32 @reduction_sum_multiuse(ptr noalias nocapture %A, ptr noalias nocaptu
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sum_multiuse(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
-; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]:
+; CHECK-INTERLEAVED-NEXT: [[_LR_PH1:.*]]:
 ; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
 ; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[ENTRY]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[ENTRY]] ]
+; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
+; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L10:%.*]], %[[DOTLR_PH]] ], [ 0, %[[_LR_PH1]] ]
 ; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
 ; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
 ; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
@@ -1778,26 +1306,8 @@ define i32 @reduction_predicated(ptr noalias nocapture %A, ptr noalias nocapture
 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-NEXT: [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[L7:%.*]] = add i32 [[SUM_02]], [[L6]]
-; CHECK-NEXT: [[L8:%.*]] = add i32 [[L7]], [[L3]]
-; CHECK-NEXT: [[L9]] = add i32 [[L8]], [[L5]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK-NEXT: ret i32 [[TMP7]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_predicated(
 ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) {
@@ -1836,27 +1346,9 @@ define i32 @reduction_predicated(ptr noalias nocapture %A, ptr noalias nocapture
 ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP15]], [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = add i32 [[TMP15]], [[TMP13]]
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i32, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L5:%.*]] = load i32, ptr [[L4]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-INTERLEAVED-NEXT: [[L7:%.*]] = add i32 [[SUM_02]], [[L6]]
-; CHECK-INTERLEAVED-NEXT: [[L8:%.*]] = add i32 [[L7]], [[L3]]
-; CHECK-INTERLEAVED-NEXT: [[L9]] = add i32 [[L8]], [[L5]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-INTERLEAVED-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -1902,27 +1394,13 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP5]] = zext <4 x i8> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP4]])
 ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32
 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK: [[_LR_PH:.*:]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 255, %[[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i8, ptr [[L2]], align 4
-; CHECK-NEXT: [[L3E:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT: [[L9]] = add i32 [[SUM_02]], [[L3E]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK: [[__CRIT_EDGE:.*:]]
-; CHECK-NEXT: [[SUM_0_LCSSA1:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = trunc i32 [[SUM_0_LCSSA1]] to i8
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = trunc i32 [[TMP8]] to i8
 ; CHECK-NEXT: ret i8 [[SUM_0_LCSSA]]
 ;
 ; CHECK-INTERLEAVED-LABEL: define i8 @reduction_add_trunc(
@@ -1951,28 +1429,14 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
 ; CHECK-INTERLEAVED-NEXT: [[TMP11]] = zext <4 x i8> [[TMP9]] to <4 x i32>
 ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]:
 ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i8> [[TMP9]], [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[BIN_RDX]])
 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext i8 [[TMP13]] to i32
 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]]
-; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]:
-; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]]
-; CHECK-INTERLEAVED: [[_LR_PH:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 255, %[[SCALAR_PH]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDVARS_IV]]
-; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i8, ptr [[L2]], align 4
-; CHECK-INTERLEAVED-NEXT: [[L3E:%.*]] = zext i8 [[L3]] to i32
-; CHECK-INTERLEAVED-NEXT: [[L9]] = add i32 [[SUM_02]], [[L3E]]
-; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 256
-; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]]
 ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA1:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ]
-; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = trunc i32 [[SUM_0_LCSSA1]] to i8
+; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = trunc i32 [[TMP14]] to i8
 ; CHECK-INTERLEAVED-NEXT: ret i8 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -2016,27 +1480,13 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP5]] = zext <4 x i8> [[TMP4]] to <4 x i32>
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop
[[LOOP18:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: br [[DOT_CRIT_EDGE:label %.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[DOTLR_PH:.*]] -; CHECK: [[_LR_PH:.*:]] -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 255, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255 -; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L3:%.*]] = load i8, ptr [[L2]], align 4 -; CHECK-NEXT: [[L3E:%.*]] = zext i8 [[L3]] to i32 -; CHECK-NEXT: [[L9]] = and i32 [[SUM_02]], [[L3E]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]] ; CHECK: [[__CRIT_EDGE:.*:]] -; CHECK-NEXT: [[SUM_0_LCSSA1:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = trunc i32 [[SUM_0_LCSSA1]] to i8 +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = trunc i32 [[TMP8]] to i8 ; CHECK-NEXT: ret i8 [[SUM_0_LCSSA]] ; ; CHECK-INTERLEAVED-LABEL: define i8 @reduction_and_trunc( @@ -2065,28 +1515,14 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) { ; CHECK-INTERLEAVED-NEXT: [[TMP11]] = zext <4 x i8> [[TMP9]] to <4 x i32> ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = and <4 x i8> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext i8 [[TMP13]] to i32 ; CHECK-INTERLEAVED-NEXT: br [[DOT_CRIT_EDGE:label %.*]] -; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]: -; CHECK-INTERLEAVED-NEXT: br label %[[DOTLR_PH:.*]] -; CHECK-INTERLEAVED: [[_LR_PH:.*:]] -; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], %[[DOTLR_PH]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-INTERLEAVED-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[L9:%.*]], %[[DOTLR_PH]] ], [ 255, %[[SCALAR_PH]] ] -; CHECK-INTERLEAVED-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255 -; CHECK-INTERLEAVED-NEXT: [[L2:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 [[INDVARS_IV]] -; CHECK-INTERLEAVED-NEXT: [[L3:%.*]] = load i8, ptr [[L2]], align 4 -; CHECK-INTERLEAVED-NEXT: [[L3E:%.*]] = zext i8 [[L3]] to i32 -; CHECK-INTERLEAVED-NEXT: [[L9]] = and i32 [[SUM_02]], [[L3E]] -; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 256 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], [[DOT_CRIT_EDGE]], label %[[DOTLR_PH]] ; CHECK-INTERLEAVED: [[__CRIT_EDGE:.*:]] -; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA1:%.*]] = phi i32 [ [[L9]], %[[DOTLR_PH]] ], [ [[TMP14]], %[[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = trunc i32 [[SUM_0_LCSSA1]] to i8 +; CHECK-INTERLEAVED-NEXT: [[SUM_0_LCSSA:%.*]] = trunc 
i32 [[TMP14]] to i8 ; CHECK-INTERLEAVED-NEXT: ret i8 [[SUM_0_LCSSA]] ; entry: @@ -2133,7 +1569,7 @@ define float @reduction_fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[TMP4]] = fadd float [[VEC_PHI]], [[TMP3]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[SCALAR_PH]] @@ -2151,7 +1587,7 @@ define float @reduction_fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP7]], float [[SUM_07]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], %[[FOR_BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MULADD_LCSSA]] @@ -2185,7 +1621,7 @@ define float @reduction_fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = fadd float [[VEC_PHI1]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP9]], [[TMP7]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2204,7 +1640,7 @@ define float @reduction_fmuladd(ptr %a, ptr %b, i64 %n) { ; CHECK-INTERLEAVED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP11]], float [[TMP12]], float [[SUM_07]]) ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: [[FOR_END]]: ; CHECK-INTERLEAVED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret float [[MULADD_LCSSA]] @@ -2373,7 +1809,7 @@ define float @reduction_fmuladd_blend(ptr %a, ptr %b, i64 %n, i1 %c) { ; CHECK-NEXT: [[TMP7]] = fadd float [[VEC_PHI]], [[TMP6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], 
[[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -2388,17 +1824,17 @@ define float @reduction_fmuladd_blend(ptr %a, ptr %b, i64 %n, i1 %c) { ; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br i1 [[C]], label %[[FOO:.*]], label %[[BAR:.*]] -; CHECK: [[FOO]]: +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: ; CHECK-NEXT: br label %[[LATCH]] -; CHECK: [[BAR]]: +; CHECK: [[ELSE]]: ; CHECK-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP9]], float [[TMP10]], float [[SUM]]) ; CHECK-NEXT: br label %[[LATCH]] ; CHECK: [[LATCH]]: -; CHECK-NEXT: [[SUM_NEXT]] = phi float [ [[SUM]], %[[FOO]] ], [ [[MULADD]], %[[BAR]] ] +; CHECK-NEXT: [[SUM_NEXT]] = phi float [ [[SUM]], %[[IF]] ], [ [[MULADD]], %[[ELSE]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi float [ [[SUM_NEXT]], %[[LATCH]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[SUM_NEXT_LCSSA]] @@ -2437,7 +1873,7 @@ define float @reduction_fmuladd_blend(ptr %a, ptr %b, i64 %n, i1 %c) { ; CHECK-INTERLEAVED-NEXT: [[TMP13]] = fadd float [[VEC_PHI1]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP13]], [[TMP10]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2463,7 +1899,7 @@ define float @reduction_fmuladd_blend(ptr %a, ptr %b, i64 %n, i1 %c) { ; CHECK-INTERLEAVED-NEXT: [[SUM_NEXT]] = phi float [ [[SUM]], %[[IF]] ], [ [[MULADD]], %[[ELSE]] ] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: [[EXIT]]: ; CHECK-INTERLEAVED-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi float [ [[SUM_NEXT]], %[[LATCH]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret float [[SUM_NEXT_LCSSA]] @@ -2524,7 +1960,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: [[TMP7]] = add i32 [[VEC_PHI]], [[TMP6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; 
CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] @@ -2550,7 +1986,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-NEXT: [[G_1]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[G_016]], %[[FOR_BODY2]] ] ; CHECK-NEXT: [[INC6]] = add nuw nsw i32 [[A_117]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC6]], [[I]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: [[FOR_END7]]: ; CHECK-NEXT: [[G_1_LCSSA:%.*]] = phi i32 [ [[G_1]], %[[FOR_INC5]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[G_1_LCSSA]] @@ -2590,7 +2026,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-INTERLEAVED-NEXT: [[TMP14]] = add i32 [[VEC_PHI1]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP14]], [[TMP11]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I]], [[N_VEC]] @@ -2617,7 +2053,7 @@ define i32 @predicated_not_dominates_reduction(ptr nocapture noundef readonly %h ; CHECK-INTERLEAVED-NEXT: [[G_1]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[G_016]], %[[FOR_BODY2]] ] ; CHECK-INTERLEAVED-NEXT: [[INC6]] = add nuw nsw i32 [[A_117]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC6]], [[I]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: [[FOR_END7]]: ; CHECK-INTERLEAVED-NEXT: [[G_1_LCSSA:%.*]] = phi i32 [ [[G_1]], %[[FOR_INC5]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i32 [[G_1_LCSSA]] @@ -2680,7 +2116,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END7:.*]], label %[[SCALAR_PH]] @@ -2707,7 +2143,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[G_1]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[G_016]], %[[FOR_BODY2]] ] ; CHECK-NEXT: [[INC6]] = add nuw nsw i32 [[A_117]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC6]], [[I]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
%[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: [[FOR_END7]]: ; CHECK-NEXT: [[G_1_LCSSA:%.*]] = phi i32 [ [[G_1]], %[[FOR_INC5]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[G_1_LCSSA]] @@ -2753,7 +2189,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-INTERLEAVED-NEXT: [[TMP20]] = add i32 [[TMP14]], [[TMP19]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP20]], [[TMP17]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[I]], [[N_VEC]] @@ -2781,7 +2217,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-INTERLEAVED-NEXT: [[G_1]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[G_016]], %[[FOR_BODY2]] ] ; CHECK-INTERLEAVED-NEXT: [[INC6]] = add nuw nsw i32 [[A_117]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC6]], [[I]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END7]], label %[[FOR_BODY2]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK-INTERLEAVED: [[FOR_END7]]: ; CHECK-INTERLEAVED-NEXT: [[G_1_LCSSA:%.*]] = phi i32 [ [[G_1]], %[[FOR_INC5]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i32 [[G_1_LCSSA]] @@ -2890,34 +2326,11 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK-NEXT: [[TMP48]] = add i32 [[VEC_PHI]], [[TMP47]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP49]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP49]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_COND_CLEANUP]]: -; CHECK-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], %[[FOR_INC:.*]] ], [ [[TMP48]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[A_1_LCSSA]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[G_09:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC3:%.*]], %[[FOR_INC]] ] -; CHECK-NEXT: [[A_08:%.*]] = phi i32 [ undef, %[[SCALAR_PH]] ], [ [[A_1]], %[[FOR_INC]] ] -; CHECK-NEXT: [[D:%.*]] = getelementptr inbounds [0 x %struct.e], ptr [[B]], i32 0, i32 [[G_09]], i32 1 -; CHECK-NEXT: [[TMP45:%.*]] = load i32, ptr [[D]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP45]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[LOR_LHS_FALSE:.*]], label %[[IF_THEN:.*]] -; CHECK: [[LOR_LHS_FALSE]]: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x %struct.e], ptr [[B]], i32 0, i32 [[G_09]] -; CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL2_NOT:%.*]] = icmp eq i32 [[TMP46]], 0 -; CHECK-NEXT: br i1 [[TOBOOL2_NOT]], label %[[FOR_INC]], label %[[IF_THEN]] -; CHECK: [[IF_THEN]]: -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[A_08]], 1 -; CHECK-NEXT: br label 
%[[FOR_INC]] -; CHECK: [[FOR_INC]]: -; CHECK-NEXT: [[A_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[A_08]], %[[LOR_LHS_FALSE]] ] -; CHECK-NEXT: [[INC3]] = add nuw nsw i32 [[G_09]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC3]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; CHECK-NEXT: ret i32 [[TMP48]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @predicated_or_dominates_reduction( ; CHECK-INTERLEAVED-SAME: ptr [[B:%.*]]) { @@ -3051,35 +2464,12 @@ define i32 @predicated_or_dominates_reduction(ptr %b) { ; CHECK-INTERLEAVED-NEXT: [[TMP98]] = add i32 [[VEC_PHI1]], [[TMP97]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP99:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP99]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP99]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP98]], [[TMP94]] ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_COND_CLEANUP:.*]] -; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]: -; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-INTERLEAVED: [[FOR_COND_CLEANUP]]: -; CHECK-INTERLEAVED-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], %[[FOR_INC:.*]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVED-NEXT: ret i32 [[A_1_LCSSA]] -; CHECK-INTERLEAVED: [[FOR_BODY]]: -; CHECK-INTERLEAVED-NEXT: [[G_09:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC3:%.*]], %[[FOR_INC]] ] -; CHECK-INTERLEAVED-NEXT: [[A_08:%.*]] = phi i32 [ undef, %[[SCALAR_PH]] ], [ [[A_1]], %[[FOR_INC]] ] -; CHECK-INTERLEAVED-NEXT: [[D:%.*]] = getelementptr inbounds [0 x %struct.e], ptr [[B]], i32 0, i32 [[G_09]], i32 1 -; CHECK-INTERLEAVED-NEXT: [[TMP100:%.*]] = load i32, ptr [[D]], align 4 -; CHECK-INTERLEAVED-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP100]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TOBOOL_NOT]], label %[[LOR_LHS_FALSE:.*]], label %[[IF_THEN:.*]] -; CHECK-INTERLEAVED: [[LOR_LHS_FALSE]]: -; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x %struct.e], ptr [[B]], i32 0, i32 [[G_09]] -; CHECK-INTERLEAVED-NEXT: [[TMP101:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-INTERLEAVED-NEXT: [[TOBOOL2_NOT:%.*]] = icmp eq i32 [[TMP101]], 0 -; CHECK-INTERLEAVED-NEXT: br i1 [[TOBOOL2_NOT]], label %[[FOR_INC]], label %[[IF_THEN]] -; CHECK-INTERLEAVED: [[IF_THEN]]: -; CHECK-INTERLEAVED-NEXT: [[INC:%.*]] = add nsw i32 [[A_08]], 1 -; CHECK-INTERLEAVED-NEXT: br label %[[FOR_INC]] -; CHECK-INTERLEAVED: [[FOR_INC]]: -; CHECK-INTERLEAVED-NEXT: [[A_1]] = phi i32 [ [[INC]], %[[IF_THEN]] ], [ [[A_08]], %[[LOR_LHS_FALSE]] ] -; CHECK-INTERLEAVED-NEXT: [[INC3]] = add nuw nsw i32 [[G_09]], 1 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC3]], 1000 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]] ; entry: br label %for.body @@ -3135,27 +2525,11 @@ define i32 @reduction_add_sub(ptr noalias nocapture %A, ptr noalias nocapture %B ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label 
%[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[L0_B:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[X_05]], [[L0]] -; CHECK-NEXT: [[SUB]] = sub nsw i32 [[ADD]], [[L0_B]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[X_0_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP6]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_add_sub( ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) { @@ -3187,28 +2561,12 @@ define i32 @reduction_add_sub(ptr noalias nocapture %A, ptr noalias nocapture %B ; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add i32 [[TMP9]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP11]] ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]] -; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]: -; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-INTERLEAVED: [[FOR_BODY]]: -; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-INTERLEAVED-NEXT: [[X_05:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-INTERLEAVED-NEXT: [[L0_B:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = add nsw i32 [[X_05]], [[L0]] -; CHECK-INTERLEAVED-NEXT: [[SUB]] = sub nsw i32 [[ADD]], [[L0_B]] -; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 -; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK-INTERLEAVED: [[FOR_END]]: -; CHECK-INTERLEAVED-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ [[SUB]], %[[FOR_BODY]] ], [ [[BIN_RDX]], 
%[[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVED-NEXT: ret i32 [[X_0_LCSSA]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]] ; entry: br label %for.body @@ -3254,27 +2612,11 @@ define i32 @reduction_sub_add(ptr noalias nocapture %A, ptr noalias nocapture %B ; CHECK-NEXT: [[TMP6]] = add i32 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[X_05:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[L0_B:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[X_05]], [[L0]] -; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[L0_B]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[X_0_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP6]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @reduction_sub_add( ; CHECK-INTERLEAVED-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) { @@ -3306,28 +2648,12 @@ define i32 @reduction_sub_add(ptr noalias nocapture %A, ptr noalias nocapture %B ; CHECK-INTERLEAVED-NEXT: [[TMP13]] = add i32 [[TMP9]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP11]] ; CHECK-INTERLEAVED-NEXT: br label %[[FOR_END:.*]] -; CHECK-INTERLEAVED: [[SCALAR_PH:.*]]: -; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-INTERLEAVED: [[FOR_BODY]]: -; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-INTERLEAVED-NEXT: [[X_05:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-INTERLEAVED-NEXT: [[L0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-INTERLEAVED-NEXT: [[L0_B:%.*]] = load i32, ptr [[ARRAYIDX_B]], align 4 -; CHECK-INTERLEAVED-NEXT: 
[[SUB:%.*]] = sub nsw i32 [[X_05]], [[L0]] -; CHECK-INTERLEAVED-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[L0_B]] -; CHECK-INTERLEAVED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 -; CHECK-INTERLEAVED-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 256 -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]] ; CHECK-INTERLEAVED: [[FOR_END]]: -; CHECK-INTERLEAVED-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ [[ADD]], %[[FOR_BODY]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ] -; CHECK-INTERLEAVED-NEXT: ret i32 [[X_0_LCSSA]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]] ; entry: br label %for.body @@ -3374,7 +2700,7 @@ define i64 @reduction_expression_same_operands(ptr nocapture readonly %x, ptr no ; CHECK-NEXT: [[TMP6]] = add i64 [[VEC_PHI]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] @@ -3394,7 +2720,7 @@ define i64 @reduction_expression_same_operands(ptr nocapture readonly %x, ptr no ; CHECK-NEXT: [[RDX_NEXT]] = add nsw i64 [[RDX]], [[MUL]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] @@ -3426,7 +2752,7 @@ define i64 @reduction_expression_same_operands(ptr nocapture readonly %x, ptr no ; CHECK-INTERLEAVED-NEXT: [[TMP12]] = add i64 [[VEC_PHI1]], [[TMP11]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i64 [[TMP12]], [[TMP7]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] @@ -3447,7 +2773,7 @@ define i64 @reduction_expression_same_operands(ptr nocapture readonly %x, ptr no ; CHECK-INTERLEAVED-NEXT: [[RDX_NEXT]] = add nsw i64 [[RDX]], [[MUL]] ; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 ; CHECK-INTERLEAVED-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] -; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] ; CHECK-INTERLEAVED: [[EXIT]]: ; CHECK-INTERLEAVED-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT]], %[[LOOP]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ] ; CHECK-INTERLEAVED-NEXT: ret i64 [[R_0_LCSSA]] diff --git 
a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll index 7d35ad0095c8f..855a0ce56f2c7 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -60,11 +60,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] @@ -162,11 +158,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP43]]) ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] @@ -267,11 +259,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] -; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP42]]) ; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]] @@ -371,11 +359,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP42]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] @@ -475,11 +459,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP42]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] @@ -579,11 +559,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr 
nocapture %B) { ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP42]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] @@ -683,11 +659,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP42]]) ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] @@ -787,11 +759,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP42]]) ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] @@ -874,11 +842,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] @@ -959,11 +923,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll index 916a83a727f89..65d57015b0140 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction.ll @@ -775,21 +775,7 @@ define float @reduction_conditional(ptr %A, ptr %B, ptr %C, float %S) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
128 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] -; CHECK: if.then: -; CHECK-NEXT: br i1 poison, label [[IF_THEN8:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then8: -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: if.else: -; CHECK-NEXT: br i1 poison, label [[IF_THEN16:%.*]], label [[FOR_INC]] -; CHECK: if.then16: -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]] +; CHECK-NEXT: br label [[FOR_INC:%.*]] ; CHECK: for.end: ; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[PREDPHI3]]) ; CHECK-NEXT: ret float [[SUM_1_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll index e6ad5937dc5e2..e621b804d5633 100644 --- a/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll @@ -24,20 +24,8 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[BODY:.*]] -; CHECK: [[BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM_TMP:%.*]] = phi i32 [ [[SUM:%.*]], %[[BODY]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[GEP0]], align 4 -; CHECK-NEXT: [[SUM]] = add i32 [[SUM_TMP]], [[LOAD0]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT]], label %[[BODY]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM]], %[[BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP4]] ; entry: br label %body diff --git a/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll b/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll index 826696fcdc452..0896848905c6c 100644 --- a/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/reverse-induction-gep-nowrap-flags.ll @@ -25,22 +25,8 @@ define i32 @preserve_inbounds(i64 %start, ptr %ptr) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label %[[END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1 -; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 
[[REV_IND_NEXT]] -; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4 -; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]] ; CHECK: [[END]]: -; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: br label %loop @@ -85,22 +71,8 @@ define i32 @preserve_nusw(i64 %start, ptr %ptr) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label %[[END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1 -; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr nusw i32, ptr [[PTR]], i64 [[REV_IND_NEXT]] -; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4 -; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]] ; CHECK: [[END]]: -; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: br label %loop @@ -145,22 +117,8 @@ define i32 @drop_nuw(i64 %start, ptr %ptr) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br label %[[END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REV_IND:%.*]] = phi i64 [ [[START]], %[[SCALAR_PH]] ], [ [[REV_IND_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REDUX:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[REDUX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[REV_IND_NEXT]] = add i64 [[REV_IND]], -1 -; CHECK-NEXT: [[GEP_PTR_IND:%.*]] = getelementptr nuw i32, ptr [[PTR]], i64 [[REV_IND_NEXT]] -; CHECK-NEXT: [[LD_PTR:%.*]] = load i32, ptr [[GEP_PTR_IND]], align 4 -; CHECK-NEXT: [[REDUX_NEXT]] = add i32 [[LD_PTR]], [[REDUX]] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i32 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[END]] ; CHECK: [[END]]: -; CHECK-NEXT: [[REDUX_NEXT_LCSSA:%.*]] = phi i32 [ [[REDUX_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[REDUX_NEXT_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll index 579092136d651..31129d3bcc2f4 100644 --- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll +++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll @@ -37,22 +37,8 @@ define i32 @reverse_induction_i64(i64 %startval, ptr %ptr) { ; CHECK-NEXT: [[BIN_RDX:%.*]] = 
add <4 x i32> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label %[[LOOPEND:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[ADD_I7:%.*]] = phi i64 [ [[STARTVAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC4:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD_I]] = add i64 [[ADD_I7]], -1 -; CHECK-NEXT: [[KIND__I:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD_I]] -; CHECK-NEXT: [[TMP_I1:%.*]] = load i32, ptr [[KIND__I]], align 4 -; CHECK-NEXT: [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]] -; CHECK-NEXT: [[INC4]] = add i32 [[I_06]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[LOOPEND]] ; CHECK: [[LOOPEND]]: -; CHECK-NEXT: [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], %[[FOR_BODY]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[INC_REDUX_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP13]] ; entry: br label %for.body @@ -105,22 +91,8 @@ define i32 @reverse_induction_i128(i128 %startval, ptr %ptr) { ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label %[[LOOPEND:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[FOR_BODY:.*]] -; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[ADD_I7:%.*]] = phi i128 [ [[STARTVAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC4:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD_I]] = add i128 [[ADD_I7]], -1 -; CHECK-NEXT: [[KIND__I:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i128 [[ADD_I]] -; CHECK-NEXT: [[TMP_I1:%.*]] = load i32, ptr [[KIND__I]], align 4 -; CHECK-NEXT: [[INC_REDUX]] = add i32 [[TMP_I1]], [[REDUX5]] -; CHECK-NEXT: [[INC4]] = add i32 [[I_06]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC4]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[LOOPEND]] ; CHECK: [[LOOPEND]]: -; CHECK-NEXT: [[INC_REDUX_LCSSA:%.*]] = phi i32 [ [[INC_REDUX]], %[[FOR_BODY]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[INC_REDUX_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP13]] ; entry: br label %for.body @@ -263,19 +235,6 @@ define void @reverse_forward_induction_i64_i8() { ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[WHILE_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[WHILE_BODY:.*]] -; CHECK: [[WHILE_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] -; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[WHILE_BODY]] ] -; CHECK-NEXT: [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[INC]] to i32 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: 
[[TMP13:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[CMP]], label %[[WHILE_BODY]], label %[[WHILE_END]] ; CHECK: [[WHILE_END]]: ; CHECK-NEXT: ret void ; @@ -329,19 +288,6 @@ define void @reverse_forward_induction_i64_i8_signed() { ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[WHILE_END:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[WHILE_BODY:.*]] -; CHECK: [[WHILE_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] -; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ -127, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[WHILE_BODY]] ] -; CHECK-NEXT: [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[INC]] to i32 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[CMP]], label %[[WHILE_BODY]], label %[[WHILE_END]] ; CHECK: [[WHILE_END]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll index 79fdc07042525..f87be5a115044 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -429,13 +429,9 @@ define dso_local void @forced_optsize(ptr noalias nocapture readonly %x_p, ptr n ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; ; FORCED_OPTSIZE-LABEL: @forced_optsize( ; FORCED_OPTSIZE-NEXT: entry: diff --git a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll index a43ea07d0c7af..c7b27040d6484 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll @@ -19,60 +19,49 @@ define void @test_pr63368(i1 %c, ptr %A) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP0]], [[VECTOR_BODY]] ] ; CHECK-NEXT: br label [[EXIT_1:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]] -; CHECK: loop.1.header: -; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1_LATCH:%.*]] ] -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_1_LATCH]], label [[LOOP_1_LATCH]] -; CHECK: loop.1.latch: -; CHECK-NEXT: [[L_LCSSA:%.*]] = phi i32 [ [[L]], [[LOOP_1_HEADER]] ], [ [[L]], [[LOOP_1_HEADER]] ] -; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 1 -; CHECK-NEXT: 
[[EC_1:%.*]] = icmp eq i32 [[IV_1_NEXT]], 100 -; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT_1]], label [[LOOP_1_HEADER]] ; CHECK: exit.1: -; CHECK-NEXT: [[L_LCSSA_LCSSA:%.*]] = phi i32 [ [[L_LCSSA]], [[LOOP_1_LATCH]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[L_LCSSA_LCSSA]], i32 -1) +; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[DOTLCSSA]], i32 -1) ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SMAX1]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH2:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: -; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[L_LCSSA_LCSSA]], i32 -1) +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 poison, i32 -1) ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[SMAX]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 ; CHECK-NEXT: [[TMP5:%.*]] = add i8 1, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i8 [[TMP5]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP3]], 255 ; CHECK-NEXT: [[TMP8:%.*]] = or i1 [[TMP6]], [[TMP7]] -; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH2]], label [[VECTOR_PH3:%.*]] -; CHECK: vector.ph3: +; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH]], label [[VECTOR_PH2:%.*]] +; CHECK: vector.ph2: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[N_VEC]] to i8 -; CHECK-NEXT: br label [[VECTOR_BODY4:%.*]] -; CHECK: vector.body4: -; CHECK-NEXT: [[INDEX5:%.*]] = phi i32 [ 0, [[VECTOR_PH3]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY4]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX5]] to i8 +; CHECK-NEXT: br label [[VECTOR_BODY3:%.*]] +; CHECK: vector.body3: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ 0, [[VECTOR_PH2]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY3]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX4]] to i8 ; CHECK-NEXT: [[TMP10:%.*]] = add i8 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[A]], i8 [[TMP10]] ; CHECK-NEXT: store <4 x i8> zeroinitializer, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX5]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY4]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: middle.block7: +; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT5]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK6:%.*]], label [[VECTOR_BODY3]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: middle.block6: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_2:%.*]], label [[SCALAR_PH2]] -; CHECK: scalar.ph2: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP9]], [[MIDDLE_BLOCK7]] ], [ 0, [[EXIT_1]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_2:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP9]], [[MIDDLE_BLOCK6]] ], [ 0, [[EXIT_1]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP_2:%.*]] ; CHECK: loop.2: -; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH2]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[IV_2_NEXT:%.*]], [[LOOP_2]] ] ; CHECK-NEXT: [[IV_2_NEXT]] = add i8 [[IV_2]], 1 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i8 [[IV_2_NEXT]] ; CHECK-NEXT: store i8 0, ptr [[GEP_A]], align 1 ; CHECK-NEXT: [[IV_2_SEXT:%.*]] = sext i8 [[IV_2]] to i32 -; CHECK-NEXT: [[EC_2:%.*]] = icmp sge i32 [[L_LCSSA_LCSSA]], [[IV_2_SEXT]] +; CHECK-NEXT: [[EC_2:%.*]] = icmp sge i32 [[DOTLCSSA]], [[IV_2_SEXT]] ; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP_2]], label [[EXIT_2]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit.2: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll b/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll index d620b92115a60..92af82868ad1e 100644 --- a/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll +++ b/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll @@ -20,21 +20,6 @@ define void @neg_cond(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[P_GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P_GEP]], align 4 -; CHECK-NEXT: [[Q_GEP:%.*]] = getelementptr i32, ptr [[Q]], i32 [[IV]] -; CHECK-NEXT: [[Y:%.*]] = load i32, ptr [[Q_GEP]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 42 -; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[CMP]], true -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[NOT]], i32 42, i32 43 -; CHECK-NEXT: store i32 [[SEL]], ptr [[P_GEP]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll index b87cf904c897c..f4d5a84fe67c8 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll @@ -25,21 +25,8 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) { ; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 undef ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ undef, [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 1 -; CHECK-NEXT: [[SELECT]] = select i1 [[C]], i64 [[RED]], i64 [[A]] -; CHECK-NEXT: [[ADD]] = add nuw i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 32 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[PHI]] +; CHECK-NEXT: ret i64 [[RDX_SELECT]] ; entry: br label %loop @@ -83,21 +70,8 @@ define i64 
@pr62565_incoming_value_known_poison(i64 %a, ptr %src) { ; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 poison ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ poison, [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 1 -; CHECK-NEXT: [[SELECT]] = select i1 [[C]], i64 [[RED]], i64 [[A]] -; CHECK-NEXT: [[ADD]] = add nuw i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 32 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[PHI]] +; CHECK-NEXT: ret i64 [[RDX_SELECT]] ; entry: br label %loop @@ -141,21 +115,8 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) { ; CHECK-NEXT: [[TMP5:%.*]] = freeze i1 [[TMP4]] ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 [[START]] ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[SCALAR_PH:%.*]] ], [ [[ADD:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[START]], [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 1 -; CHECK-NEXT: [[SELECT]] = select i1 [[C]], i64 [[RED]], i64 [[A]] -; CHECK-NEXT: [[ADD]] = add nuw i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 32 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[PHI]] +; CHECK-NEXT: ret i64 [[RDX_SELECT]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll index 0fd780e7b44bc..1f5646d2a3090 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll @@ -36,22 +36,11 @@ define i32 @test(i64 %N, i32 %x) { ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) -; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[EXTRA_ITER]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SEL_COND:%.*]] = icmp sgt i32 [[NEXT]], 10 -; CHECK-NEXT: [[SEL]] = select i1 [[SEL_COND]], i32 [[NEXT]], i32 10 -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]] ; CHECK: exit.loopexit: -; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; 
CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ 0, [[CHECK]] ], [ [[SEL_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ 0, [[CHECK]] ], [ [[TMP5]], [[LOOP]] ] ; CHECK-NEXT: ret i32 [[RESULT]] ; entry: @@ -90,19 +79,9 @@ define i32 @pr66895_tail_fold_reduction_exit_inst_gets_simplified(i32 %n) { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_PHI]]) -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 12, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], -1 -; CHECK-NEXT: [[RED_NEXT]] = mul i32 [[RED]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[RED_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[RED_LCSSA]] +; CHECK-NEXT: ret i32 [[TMP3]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll index edad0b59cf9ae..794e274a2628c 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll @@ -40,20 +40,8 @@ define noundef i32 @f(i32 noundef %g) { ; VF4IC2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 ; VF4IC2-NEXT: [[TMP16:%.*]] = add i32 0, [[TMP15]] ; VF4IC2-NEXT: br label %[[RETURN]] -; VF4IC2: [[SCALAR_PH:.*]]: -; VF4IC2-NEXT: br label %[[LOOP_HEADER:.*]] -; VF4IC2: [[LOOP_HEADER]]: -; VF4IC2-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF4IC2-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IV]], 3 -; VF4IC2-NEXT: [[SHR:%.*]] = ashr i32 [[G]], [[MUL]] -; VF4IC2-NEXT: [[EARLY_COND:%.*]] = icmp eq i32 [[SHR]], 0 -; VF4IC2-NEXT: br i1 [[EARLY_COND]], label %[[LOOP_LATCH]], label %[[RETURN]] -; VF4IC2: [[LOOP_LATCH]]: -; VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; VF4IC2-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 8 -; VF4IC2-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]] ; VF4IC2: [[RETURN]]: -; VF4IC2-NEXT: [[RES:%.*]] = phi i32 [ [[SHR]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[TMP16]], %[[VECTOR_EARLY_EXIT]] ] +; VF4IC2-NEXT: [[RES:%.*]] = phi i32 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[TMP16]], %[[VECTOR_EARLY_EXIT]] ] ; VF4IC2-NEXT: ret i32 [[RES]] ; ; VF8IC1-LABEL: define noundef i32 @f( @@ -80,20 +68,8 @@ define noundef i32 @f(i32 noundef %g) { ; VF8IC1-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 ; VF8IC1-NEXT: [[TMP7:%.*]] = add i32 0, [[TMP6]] ; VF8IC1-NEXT: br label %[[RETURN]] -; VF8IC1: [[SCALAR_PH:.*]]: -; VF8IC1-NEXT: br label %[[LOOP_HEADER:.*]] -; VF8IC1: [[LOOP_HEADER]]: -; VF8IC1-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF8IC1-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IV]], 3 -; VF8IC1-NEXT: [[SHR:%.*]] = ashr i32 [[G]], [[MUL]] -; VF8IC1-NEXT: [[EARLY_COND:%.*]] = icmp eq i32 [[SHR]], 0 -; VF8IC1-NEXT: br i1 [[EARLY_COND]], label %[[LOOP_LATCH]], label %[[RETURN]] -; VF8IC1: [[LOOP_LATCH]]: -; 
VF8IC1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; VF8IC1-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 8 -; VF8IC1-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]] ; VF8IC1: [[RETURN]]: -; VF8IC1-NEXT: [[RES:%.*]] = phi i32 [ [[SHR]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ] +; VF8IC1-NEXT: [[RES:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ] ; VF8IC1-NEXT: ret i32 [[RES]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll index b1b3a3feb007a..644900d2a42c5 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll @@ -9,9 +9,9 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[P2]], i64 4), "dereferenceable"(ptr [[P2]], i64 1024) ] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX1]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX1]] @@ -22,7 +22,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_SPLIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_SPLIT]]: ; CHECK-NEXT: br i1 [[TMP5]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -31,22 +31,8 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] ; CHECK-NEXT: br label %[[LOOP_END]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP1:.*]] -; CHECK: [[LOOP1]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[LOOP_INC:.*]] ], [ 0, %[[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label %[[LOOP_INC]], label %[[LOOP_END]] -; CHECK: [[LOOP_INC]]: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP1]], label %[[LOOP_END]] ; CHECK: 
[[LOOP_END]]: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], %[[LOOP1]] ], [ -1, %[[LOOP_INC]] ], [ -1, %[[MIDDLE_BLOCK]] ], [ [[TMP9]], %[[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ [[TMP9]], %[[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -331,9 +317,9 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_p ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 2 ; CHECK-NEXT: [[IV_NEXT1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: br label %[[LOOP_HEADER1:.*]] -; CHECK: [[LOOP_HEADER1]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP_HEADER1]] ] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[NEXT_GEP]], align 2 @@ -343,10 +329,10 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_p ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_SPLIT:.*]], label %[[LOOP_HEADER1]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_SPLIT]]: -; CHECK-NEXT: br i1 [[TMP7]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[LOOP_LATCH1:.*]] -; CHECK: [[LOOP_LATCH1]]: +; CHECK-NEXT: br i1 [[TMP7]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[VECTOR_EARLY_EXIT]]: @@ -356,10 +342,10 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_p ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]] ; CHECK-NEXT: br label %[[EXIT_LOOPEXIT]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[IV1:%.*]] = phi ptr [ [[IV_NEXT1]], %[[LOOP_LATCH1]] ], [ [[A]], %[[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IV_NEXT1]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[LOOP_HEADER_PREHEADER]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[IV1]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[IV]], align 2 ; CHECK-NEXT: [[C_0:%.*]] = icmp eq i16 [[L]], 0 ; CHECK-NEXT: br i1 [[C_0]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_LATCH]] @@ -368,7 +354,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_p ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[IV_NEXT]], [[A_END]] ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[EXIT_LOOPEXIT]]: -; CHECK-NEXT: [[P_PH:%.*]] = phi ptr [ [[A_END]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[A_END]], %[[LOOP_LATCH1]] ], [ [[TMP13]], %[[VECTOR_EARLY_EXIT]] ] +; 
CHECK-NEXT: [[P_PH:%.*]] = phi ptr [ [[A_END]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[A_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[P:%.*]] = phi ptr [ [[A]], %[[ENTRY]] ], [ [[P_PH]], %[[EXIT_LOOPEXIT]] ] diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll index b630557eb2cfe..d8e62c7b3b8d4 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll @@ -43,24 +43,10 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4-NEXT: br label %[[E2:.*]] ; VF4IC4: [[VECTOR_EARLY_EXIT]]: ; VF4IC4-NEXT: br label %[[E1:.*]] -; VF4IC4: [[SCALAR_PH:.*]]: -; VF4IC4-NEXT: br label %[[LOOP_HEADER:.*]] -; VF4IC4: [[LOOP_HEADER]]: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[SCALAR_PH]] ] -; VF4IC4-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] -; VF4IC4-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 -; VF4IC4-NEXT: [[C_1:%.*]] = icmp eq i32 [[L]], 10 -; VF4IC4-NEXT: br i1 [[C_1]], label %[[E1]], label %[[LOOP_LATCH]] -; VF4IC4: [[LOOP_LATCH]]: -; VF4IC4-NEXT: [[INC]] = add nuw i64 [[IV]], 1 -; VF4IC4-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128 -; VF4IC4-NEXT: br i1 [[C_2]], label %[[E2]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; VF4IC4: [[E1]]: -; VF4IC4-NEXT: [[P1:%.*]] = phi i64 [ 0, %[[LOOP_HEADER]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] -; VF4IC4-NEXT: ret i64 [[P1]] +; VF4IC4-NEXT: ret i64 0 ; VF4IC4: [[E2]]: -; VF4IC4-NEXT: [[P2:%.*]] = phi i64 [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ] -; VF4IC4-NEXT: ret i64 [[P2]] +; VF4IC4-NEXT: ret i64 1 ; entry: %src = alloca [128 x i32] @@ -94,6 +80,4 @@ e2: ; VF4IC4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; VF4IC4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; VF4IC4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; VF4IC4: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} -; VF4IC4: [[META4]] = !{!"llvm.loop.interleave.count", i32 4} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll index 6836f7b90ad19..a50ce969da7f4 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll @@ -46,21 +46,9 @@ define i8 @iv_used_in_exit_with_math(i8 noundef %g) { ; CHECK-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8 ; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP19]] to i8 ; CHECK-NEXT: br label %[[RETURN]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[S:%.*]] = shl nuw i8 1, [[IV]] -; CHECK-NEXT: [[A:%.*]] = and i8 [[S]], [[G]] -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A]], 0 -; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[RETURN]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 4 -; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]] ; CHECK: [[RETURN]]: -; CHECK-NEXT: [[RES_IV1:%.*]] = phi i8 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP20]], %[[VECTOR_EARLY_EXIT]] ] -; CHECK-NEXT: [[RES_IV2:%.*]] = phi i8 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP23]], %[[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RES_IV1:%.*]] = phi i8 [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP20]], %[[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RES_IV2:%.*]] = phi i8 [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP23]], %[[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: [[RES:%.*]] = add i8 [[RES_IV1]], [[RES_IV2]] ; CHECK-NEXT: ret i8 [[RES]] ; @@ -125,21 +113,9 @@ define i32 @iv_used_in_exit_with_loads(ptr align 4 dereferenceable(128) %src) { ; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 ; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], [[TMP28]] ; CHECK-NEXT: br label %[[RETURN]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] -; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 0 -; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[RETURN]] -; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 32 -; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]] ; CHECK: [[RETURN]]: -; CHECK-NEXT: [[RES_IV1:%.*]] = phi i32 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP29]], %[[VECTOR_EARLY_EXIT]] ] -; CHECK-NEXT: [[RES_IV2:%.*]] = phi i32 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP29]], %[[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RES_IV1:%.*]] = phi i32 [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP29]], %[[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RES_IV2:%.*]] = phi i32 [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP29]], %[[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: [[RES:%.*]] = add i32 [[RES_IV1]], [[RES_IV2]] ; CHECK-NEXT: ret i32 [[RES]] ; diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll index 
a4ce68f0453ae..ed5dcc78eeb78 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll @@ -42,25 +42,11 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4: middle.block: ; VF4IC4-NEXT: br label [[E2:%.*]] ; VF4IC4: vector.early.exit: -; VF4IC4-NEXT: br label [[E1:%.*]] -; VF4IC4: scalar.ph: ; VF4IC4-NEXT: br label [[LOOP_HEADER:%.*]] -; VF4IC4: loop.header: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[SCALAR_PH:%.*]] ] -; VF4IC4-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] -; VF4IC4-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 -; VF4IC4-NEXT: [[C_1:%.*]] = icmp eq i32 [[L]], 10 -; VF4IC4-NEXT: br i1 [[C_1]], label [[E1]], label [[LOOP_LATCH]] -; VF4IC4: loop.latch: -; VF4IC4-NEXT: [[INC]] = add nuw i64 [[IV]], 1 -; VF4IC4-NEXT: [[C_2:%.*]] = icmp eq i64 [[INC]], 128 -; VF4IC4-NEXT: br i1 [[C_2]], label [[E2]], label [[LOOP_HEADER]] ; VF4IC4: e1: -; VF4IC4-NEXT: [[P1:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 0, [[VECTOR_EARLY_EXIT]] ] -; VF4IC4-NEXT: ret i64 [[P1]] +; VF4IC4-NEXT: ret i64 0 ; VF4IC4: e2: -; VF4IC4-NEXT: [[P2:%.*]] = phi i64 [ 1, [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ] -; VF4IC4-NEXT: ret i64 [[P2]] +; VF4IC4-NEXT: ret i64 1 ; entry: %src = alloca [128 x i32] @@ -155,22 +141,8 @@ define i64 @same_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_END]] -; VF4IC4: scalar.ph: -; VF4IC4-NEXT: br label [[LOOP:%.*]] -; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] -; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 -; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] -; VF4IC4-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1 -; VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; VF4IC4-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; VF4IC4: loop.inc: -; VF4IC4-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67 -; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; VF4IC4: loop.end: -; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] +; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] ; VF4IC4-NEXT: ret i64 [[RETVAL]] ; entry: @@ -256,19 +228,8 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; VF4IC4-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], [[TMP6]] ; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]] ; VF4IC4-NEXT: br label [[LOOP_END]] -; VF4IC4: scalar.ph: -; VF4IC4-NEXT: br label [[LOOP:%.*]] -; VF4IC4: loop: -; VF4IC4-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[P1]], [[SCALAR_PH:%.*]] ] -; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1 -; VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 72 -; VF4IC4-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; VF4IC4: loop.inc: -; VF4IC4-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 1 -; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp ne ptr [[PTR_NEXT]], [[PTREND]] -; VF4IC4-NEXT: br i1 [[EXITCOND]], label 
[[LOOP]], label [[LOOP_END]] ; VF4IC4: loop.end: -; VF4IC4-NEXT: [[RETVAL:%.*]] = phi ptr [ [[PTR]], [[LOOP]] ], [ [[PTREND]], [[LOOP_INC]] ], [ [[PTREND]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VECTOR_EARLY_EXIT]] ] +; VF4IC4-NEXT: [[RETVAL:%.*]] = phi ptr [ [[PTREND]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VECTOR_EARLY_EXIT]] ] ; VF4IC4-NEXT: ret ptr [[RETVAL]] ; entry: @@ -360,22 +321,8 @@ define i64 @same_exit_block_post_inc_use() { ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_END]] -; VF4IC4: scalar.ph: -; VF4IC4-NEXT: br label [[LOOP:%.*]] -; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] -; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 -; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] -; VF4IC4-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1 -; VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; VF4IC4-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; VF4IC4: loop.inc: -; VF4IC4-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67 -; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; VF4IC4: loop.end: -; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ [[IV_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] +; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] ; VF4IC4-NEXT: ret i64 [[RETVAL]] ; entry: @@ -470,27 +417,11 @@ define i64 @diff_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] -; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] -; VF4IC4: scalar.ph: ; VF4IC4-NEXT: br label [[LOOP:%.*]] -; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] -; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 -; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] -; VF4IC4-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1 -; VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; VF4IC4-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] -; VF4IC4: loop.inc: -; VF4IC4-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67 -; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; VF4IC4: loop.early.exit: -; VF4IC4-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] -; VF4IC4-NEXT: ret i64 [[RETVAL1]] +; VF4IC4-NEXT: ret i64 [[TMP10]] ; VF4IC4: loop.end: -; VF4IC4-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ] -; VF4IC4-NEXT: ret i64 [[RETVAL2]] +; VF4IC4-NEXT: ret i64 67 ; entry: %p1 = alloca [1024 x i8] @@ -588,27 +519,11 @@ define i64 @diff_exit_block_post_inc_use1() { ; VF4IC4-NEXT: [[TMP8:%.*]] = select i1 [[TMP32]], i64 [[TMP31]], i64 [[TMP29]] ; VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] -; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] -; VF4IC4: scalar.ph: ; 
VF4IC4-NEXT: br label [[LOOP:%.*]] -; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] -; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 -; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] -; VF4IC4-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1 -; VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; VF4IC4-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] -; VF4IC4: loop.inc: -; VF4IC4-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 67 -; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; VF4IC4: loop.early.exit: -; VF4IC4-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[IV]], [[LOOP]] ], [ [[TMP10]], [[VECTOR_EARLY_EXIT]] ] -; VF4IC4-NEXT: ret i64 [[RETVAL1]] +; VF4IC4-NEXT: ret i64 [[TMP10]] ; VF4IC4: loop.end: -; VF4IC4-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[IV_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ] -; VF4IC4-NEXT: ret i64 [[RETVAL2]] +; VF4IC4-NEXT: ret i64 67 ; entry: %p1 = alloca [1024 x i8] @@ -847,22 +762,8 @@ define i8 @same_exit_block_use_loaded_value() { ; VF4IC4-NEXT: [[TMP41:%.*]] = icmp uge i64 [[TMP8]], 12 ; VF4IC4-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i8 [[TMP40]], i8 [[TMP38]] ; VF4IC4-NEXT: br label [[LOOP_END]] -; VF4IC4: scalar.ph: -; VF4IC4-NEXT: br label [[LOOP:%.*]] -; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 0, [[SCALAR_PH:%.*]] ] -; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] -; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 -; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] -; VF4IC4-NEXT: [[LD2:%.*]] = load i8, ptr [[GEP_P2]], align 1 -; VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; VF4IC4-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; VF4IC4: loop.inc: -; VF4IC4-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF4IC4-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; VF4IC4-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]] ; VF4IC4: loop.end: -; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i8 [ [[LD1]], [[LOOP]] ], [ -1, [[LOOP_INC]] ], [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP42]], [[VECTOR_EARLY_EXIT]] ] +; VF4IC4-NEXT: [[RETVAL:%.*]] = phi i8 [ -1, [[MIDDLE_BLOCK]] ], [ [[TMP42]], [[VECTOR_EARLY_EXIT]] ] ; VF4IC4-NEXT: ret i8 [[RETVAL]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index 219c66f7a68a4..3bb39b95235ed 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -29,28 +29,7 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 -; CHECK-NEXT: br label [[LOOP_COND:%.*]] -; CHECK: loop.cond: -; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], 
[[LOOP_HEADER]] ] -; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[BLEND]] -; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[SRC_PTR]], align 1 -; CHECK-NEXT: [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]] -; CHECK-NEXT: br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.next: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[RES:%.*]] = phi i16 [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ] -; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[IV]] -; CHECK-NEXT: store i16 [[RES]], ptr [[DST_PTR]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31 -; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -112,29 +91,7 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 -; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]] -; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_COND:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.cond: -; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ] -; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[BLEND]] -; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[SRC_PTR]], align 1 -; CHECK-NEXT: [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]] -; CHECK-NEXT: br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.next: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[RES:%.*]] = phi i16 [ 0, [[LOOP_HEADER]] ], [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ] -; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[IV]] -; CHECK-NEXT: store i16 [[RES]], ptr [[DST_PTR]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31 -; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -201,26 +158,7 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 -; CHECK-NEXT: [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16 -; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]] -; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.next: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ], [ 
[[IV_TRUNC_2]], [[LOOP_NEXT]] ] -; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[BLEND]] -; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[SRC_PTR]], align 1 -; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store i16 [[LV]], ptr [[DST_PTR]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31 -; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -297,29 +235,7 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 -; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]] -; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_COND:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.cond: -; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ] -; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @src, i16 0, i16 [[BLEND]] -; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[SRC_PTR]], align 1 -; CHECK-NEXT: [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]] -; CHECK-NEXT: br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]] -; CHECK: loop.next: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[RES:%.*]] = phi i16 [ 0, [[LOOP_HEADER]] ], [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ] -; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[IV]] -; CHECK-NEXT: store i16 [[RES]], ptr [[DST_PTR]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 63 -; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -371,20 +287,7 @@ define void @duplicated_incoming_blocks_blend(i32 %x, ptr %ptr) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[ADD_I:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[C_0:%.*]] = icmp ugt i32 [[IV]], [[X:%.*]] -; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[IV]], [[LOOP_HEADER]] ], [ [[IV]], [[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr i32, ptr [[PTR]], i32 [[P]] -; CHECK-NEXT: store i32 [[P]], ptr [[GEP_PTR]], align 4 -; CHECK-NEXT: [[ADD_I]] = add nsw i32 [[P]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD_I]], 1000 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_HEADER]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 
04f04a8a08fc2..3500c5c9d81cd 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -34,22 +34,8 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ], [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -108,21 +94,7 @@ define i64 @diff_exit_block_phi_of_consts() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.early.exit: ; CHECK-NEXT: ret i64 0 ; CHECK: loop.end: @@ -292,16 +264,7 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: vector.early.exit: -; CHECK-NEXT: br label [[EARLY_EXIT:%.*]] -; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[SCALAR_PH:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT]] -; CHECK: for.inc: -; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: early.exit: ; CHECK-NEXT: tail call void @abort() ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 54408b24db114..79821b8be1734 100644 --- 
a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -36,22 +36,8 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -116,24 +102,8 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() { ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[DOTCAST]], 2 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i32 9, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ 9, [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[INDEX2_NEXT]] = add i32 [[INDEX2]], 2 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ [[INDEX2]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL]] ; entry: @@ -197,23 +167,8 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() { ; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[DOTCAST]], 2 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i32 9, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i128 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ 
[[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ 9, [[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC]] ], [ [[P1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3 -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i128 [[INDEX]], 1 -; CHECK-NEXT: [[INDEX2_NEXT]] = add i32 [[INDEX2]], 2 -; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i128 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ [[INDEX2]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i32 [[RETVAL]] ; entry: @@ -277,24 +232,8 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() { ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float 1.000000e+00, [[DOTCAST]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = fadd fast float 9.000000e+00, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi float [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ 9.000000e+00, [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[INDEX2_NEXT]] = fadd fast float [[INDEX2]], 1.000000e+00 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi float [ [[INDEX2]], [[LOOP]] ], [ 1.230000e+02, [[LOOP_INC]] ], [ 1.230000e+02, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi float [ 1.230000e+02, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret float [[RETVAL]] ; entry: @@ -360,24 +299,8 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() { ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 5 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP20]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi ptr [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[P2]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 
[[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[INDEX2_NEXT]] = getelementptr i8, ptr [[INDEX2]], i64 5 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi ptr [ [[INDEX2]], [[LOOP]] ], [ [[P1]], [[LOOP_INC]] ], [ [[P1]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi ptr [ [[P1]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret ptr [[RETVAL]] ; entry: @@ -438,19 +361,8 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[P1]], [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 72 -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne ptr [[PTR_NEXT]], [[PTREND]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi ptr [ [[PTR]], [[LOOP]] ], [ [[PTREND]], [[LOOP_INC]] ], [ [[PTREND]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi ptr [ [[PTREND]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret ptr [[RETVAL]] ; entry: @@ -512,23 +424,8 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]] ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false -; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] ; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ] ; CHECK-NEXT: ret i64 [[RETVAL]] ; entry: @@ -592,22 +489,8 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 
[[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -670,22 +553,8 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() {
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -745,22 +614,8 @@ define i64 @same_exit_block_pre_inc_use2() {
 ; CHECK-NEXT: br label [[LOOP_END:%.*]]
 ; CHECK: vector.early.exit:
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 66, [[MIDDLE_BLOCK]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -823,22 +678,8 @@ define i64 @same_exit_block_pre_inc_use3() {
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ], [ 66, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ 66, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[INDEX_LCSSA]]
 ;
 entry:
@@ -902,20 +743,8 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -976,22 +805,8 @@ define i64 @same_exit_block_post_inc_use() {
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -1051,19 +866,8 @@ define ptr @same_exit_block_post_inc_use1_ivptr() {
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP9]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[P1]], [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1
-; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 72
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne ptr [[PTR_NEXT]], [[PTREND]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi ptr [ [[PTR_NEXT]], [[LOOP]] ], [ [[PTREND]], [[LOOP_INC]] ], [ [[PTREND]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi ptr [ [[PTREND]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret ptr [[RETVAL]]
 ;
 entry:
@@ -1123,22 +927,8 @@ define i64 @same_exit_block_post_inc_use2() {
 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 1
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 66, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -1200,27 +990,11 @@ define i64 @diff_exit_block_pre_inc_use1() {
 ; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
-; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: ret i64 [[RETVAL1]]
+; CHECK-NEXT: ret i64 [[EARLY_EXIT_VALUE]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RETVAL2]]
+; CHECK-NEXT: ret i64 67
 ;
 entry:
 %p1 = alloca [1024 x i8]
@@ -1282,27 +1056,11 @@ define i64 @diff_exit_block_pre_inc_use2() {
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[LOOP_END:%.*]]
 ; CHECK: vector.early.exit:
-; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ], [ 67, [[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: ret i64 [[RETVAL1]]
+; CHECK-NEXT: ret i64 67
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RETVAL2]]
+; CHECK-NEXT: ret i64 66
 ;
 entry:
 %p1 = alloca [1024 x i8]
@@ -1367,27 +1125,11 @@ define i64 @diff_exit_block_pre_inc_use3() {
 ; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
-; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.early.exit:
-; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: ret i64 [[INDEX_LCSSA]]
+; CHECK-NEXT: ret i64 [[EARLY_EXIT_VALUE]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]]
+; CHECK-NEXT: ret i64 66
 ;
 entry:
 %p1 = alloca [1024 x i8]
@@ -1450,27 +1192,11 @@ define i64 @diff_exit_block_post_inc_use1() {
 ; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP13]], i1 true)
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
-; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: ret i64 [[RETVAL1]]
+; CHECK-NEXT: ret i64 [[EARLY_EXIT_VALUE]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RETVAL2]]
+; CHECK-NEXT: ret i64 67
 ;
 entry:
 %p1 = alloca [1024 x i8]
@@ -1536,27 +1262,11 @@ define i64 @diff_exit_block_post_inc_use2() {
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 1
 ; CHECK-NEXT: [[TMP21:%.*]] = add i64 3, [[TMP11]]
-; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[TMP21]], [[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: ret i64 [[RETVAL1]]
+; CHECK-NEXT: ret i64 [[TMP21]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ 66, [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RETVAL2]]
+; CHECK-NEXT: ret i64 66
 ;
 entry:
 %p1 = alloca [1024 x i8]
@@ -1624,29 +1334,11 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) {
 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 1
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 [[START]], [[TMP12]]
-; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[START]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[INDEX2_NEXT]] = add i64 [[INDEX2]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.early.exit:
-; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX2_NEXT]], [[LOOP]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
-; CHECK-NEXT: ret i64 [[RETVAL1]]
+; CHECK-NEXT: ret i64 [[EARLY_EXIT_VALUE]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX2]], [[LOOP_INC]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RETVAL2]]
+; CHECK-NEXT: ret i64 [[IND_ESCAPE]]
 ;
 entry:
 %p1 = alloca [1024 x i8]
@@ -1713,21 +1405,8 @@ define i64 @loop_contains_safe_call() {
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]])
-; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -1788,21 +1467,8 @@ define i64 @loop_contains_safe_div() {
 ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -1864,22 +1530,8 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
@@ -2071,22 +1723,8 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p
 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
 ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]]
 ; CHECK-NEXT: br label [[LOOP_END]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
 ; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ], [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[MIDDLE_BLOCK]] ], [ [[EARLY_EXIT_VALUE]], [[VECTOR_EARLY_EXIT]] ]
 ; CHECK-NEXT: ret i64 [[RETVAL]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
index 66300ed6024c6..19ab96dd822b6 100644
--- a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll
@@ -41,18 +41,7 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT: store i32 [[TMP6]], ptr @a, align 4
-; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_START]], [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr @c, align 4
-; CHECK-NEXT: [[RED_NEXT]] = xor i32 [[RED]], [[L]]
-; CHECK-NEXT: store i32 [[RED_NEXT]], ptr @a, align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 7
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT_LOOPEXIT]], label [[LOOP]]
 ; CHECK: exit.loopexit:
 ; CHECK-NEXT: br label [[EXIT]]
 ; CHECK: exit:
diff --git a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
index 7027d857fd040..ca32808bc482a 100644
--- a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll
@@ -23,19 +23,9 @@ define float @pr70988() {
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 1022
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RDX_NEXT]] = fadd contract float [[RDX]], 1.000000e+00
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[INDEX_NEXT]], 1021
-; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[RDX_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTLCSSA]]
+; CHECK-NEXT: ret float [[TMP5]]
 ;
 ; CHECK-ALM-LABEL: define float @pr70988() {
 ; CHECK-ALM-NEXT: entry:
@@ -56,19 +46,9 @@ define float @pr70988() {
 ; CHECK-ALM-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 1022
 ; CHECK-ALM-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-ALM: middle.block:
-; CHECK-ALM-NEXT: br label [[EXIT:%.*]]
-; CHECK-ALM: scalar.ph:
 ; CHECK-ALM-NEXT: br label [[LOOP:%.*]]
-; CHECK-ALM: loop:
-; CHECK-ALM-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ALM-NEXT: [[RDX:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ALM-NEXT: [[RDX_NEXT]] = fadd contract float [[RDX]], 1.000000e+00
-; CHECK-ALM-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 1
-; CHECK-ALM-NEXT: [[COND:%.*]] = icmp ult i32 [[INDEX_NEXT]], 1021
-; CHECK-ALM-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT]]
 ; CHECK-ALM: exit:
-; CHECK-ALM-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[RDX_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-ALM-NEXT: ret float [[DOTLCSSA]]
+; CHECK-ALM-NEXT: ret float [[TMP5]]
 ;
 entry:
 br label %loop
@@ -123,21 +103,9 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) {
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[NARROW:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[NARROW]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[SRC]], i32 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
-; CHECK-NEXT: [[RDX_NEXT]] = fadd contract float [[RDX]], [[L]]
-; CHECK-NEXT: [[EC:%.*]] = icmp ult i32 [[NARROW]], 15
-; CHECK-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[RDX_NEXT]], [[LOOP]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTLCSSA]]
+; CHECK-NEXT: ret float [[TMP13]]
 ;
 ; CHECK-ALM-LABEL: define float @pr72720reduction_using_active_lane_mask(
 ; CHECK-ALM-SAME: ptr [[SRC:%.*]]) {
@@ -173,21 +141,9 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) {
 ; CHECK-ALM-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
 ; CHECK-ALM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-ALM: middle.block:
-; CHECK-ALM-NEXT: br label [[EXIT:%.*]]
-; CHECK-ALM: scalar.ph:
 ; CHECK-ALM-NEXT: br label [[LOOP:%.*]]
-; CHECK-ALM: loop:
-; CHECK-ALM-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[NARROW:%.*]], [[LOOP]] ]
-; CHECK-ALM-NEXT: [[RDX:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ALM-NEXT: [[NARROW]] = add nuw nsw i32 [[IV]], 1
-; CHECK-ALM-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[SRC]], i32 [[IV]]
-; CHECK-ALM-NEXT: [[L:%.*]] = load float, ptr [[GEP]], align 4
-; CHECK-ALM-NEXT: [[RDX_NEXT]] = fadd contract float [[RDX]], [[L]]
-; CHECK-ALM-NEXT: [[EC:%.*]] = icmp ult i32 [[NARROW]], 15
-; CHECK-ALM-NEXT: br i1 [[EC]], label [[LOOP]], label [[EXIT]]
 ; CHECK-ALM: exit:
-; CHECK-ALM-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[RDX_NEXT]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ALM-NEXT: ret float [[DOTLCSSA]]
+; CHECK-ALM-NEXT: ret float [[TMP11]]
 ;
 entry:
 br label %loop
@@ -229,19 +185,9 @@ define float @fadd_reduction_with_live_in(float %inc) {
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1002
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[SUM_NEXT]] = fadd float [[SUM]], [[INC]]
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
-; CHECK-NEXT: [[LCSSA:%.*]] = phi float [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[LCSSA]]
+; CHECK-NEXT: ret float [[TMP5]]
 ;
 ; CHECK-ALM-LABEL: define float @fadd_reduction_with_live_in(
 ; CHECK-ALM-SAME: float [[INC:%.*]]) {
@@ -263,19 +209,9 @@ define float @fadd_reduction_with_live_in(float %inc) {
 ; CHECK-ALM-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1002
 ; CHECK-ALM-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-ALM: middle.block:
-; CHECK-ALM-NEXT: br label [[EXIT:%.*]]
-; CHECK-ALM: scalar.ph:
 ; CHECK-ALM-NEXT: br label [[LOOP:%.*]]
-; CHECK-ALM: loop:
-; CHECK-ALM-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ALM-NEXT: [[SUM:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ALM-NEXT: [[SUM_NEXT]] = fadd float [[SUM]], [[INC]]
-; CHECK-ALM-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-ALM-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 1000
-; CHECK-ALM-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK-ALM: exit:
-; CHECK-ALM-NEXT: [[LCSSA:%.*]] = phi float [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; CHECK-ALM-NEXT: ret float [[LCSSA]]
+; CHECK-ALM-NEXT: ret float [[TMP5]]
 ;
 entry:
 br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll
index 97f686c9c025a..dcab18fd93ed2 100644
--- a/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll
@@ -22,16 +22,6 @@ define void @test_variable_stride(ptr %dst, i32 %scale) {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IDX:%.*]] = mul i32 [[IV]], [[SCALE]]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i32 [[IDX]]
-; CHECK-NEXT: store i32 [[IV]], ptr [[GEP]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
index 87eebb7baf880..a852b731ea13b 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll
@@ -54,16 +54,6 @@ define i32 @test(ptr %vf1, i64 %n) {
 ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[FOR_BODY:.*]]
-; CHECK: [[FOR_BODY]]:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = alloca i8, i64 [[N]], align 16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store ptr [[TMP18]], ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 200
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret i32 0
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
index 4bc4e54ae60fa..00e04c7daee51 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll
@@ -34,15 +34,6 @@ define void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 15
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -94,15 +85,6 @@ define void @canonical_upper_limit_i8(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 255
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -154,15 +136,6 @@ define void @canonical_lower_limit_i16(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -214,15 +187,6 @@ define void @canonical_upper_limit_i16(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 65535
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -274,15 +238,6 @@ define void @canonical_lower_limit_i32(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 65537
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -334,15 +289,6 @@ define void @canonical_upper_limit_i32(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 4294967295
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -394,15 +340,6 @@ define void @canonical_lower_limit_i64(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 4294967297
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -454,15 +391,6 @@ define void @canonical_upper_limit_i64(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], -1
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
@@ -514,15 +442,6 @@ define void @canonical_lower_limit_i128(ptr nocapture noundef writeonly %p) {
 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[END:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP:.*]]
-; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i256 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[P_IV:%.*]] = getelementptr inbounds i16, ptr [[P]], i256 [[IV]]
-; CHECK-NEXT: store i16 1, ptr [[P_IV]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i256 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i256 [[IV_NEXT]], 18446744073709551617
-; CHECK-NEXT: br i1 [[COND]], label %[[END]], label %[[LOOP]]
 ; CHECK: [[END]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
index 6fd7c709a0442..b6f43aaa86e33 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll
@@ -55,22 +55,6 @@ define void @tail_fold_switch(ptr %dst, i32 %0) {
 ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: switch i32 [[TMP0]], label %[[LOOP_LATCH]] [
-; CHECK-NEXT: i32 0, label %[[LOOP_LATCH]]
-; CHECK-NEXT: i32 1, label %[[IF_THEN:.*]]
-; CHECK-NEXT: ]
-; CHECK: [[IF_THEN]]:
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 4
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
index 45c56a0d7b79d..3bc5da155b351 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
@@ -53,18 +53,9 @@ define void @VF1-VPlanExe(ptr %dst) {
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 0, ptr [[DST_PTR]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
 ;
 entry:
 br label %for.body
@@ -132,17 +123,9 @@ define void @VF1-VPWidenCanonicalIVRecipeExe(ptr %ptr1) {
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[PTR:%.*]], [[FOR_BODY]] ], [ [[PTR1]], [[SCALAR_PH:%.*]] ]
-; CHECK-NEXT: store double 0.000000e+00, ptr [[ADDR]], align 8
-; CHECK-NEXT: [[PTR]] = getelementptr inbounds double, ptr [[ADDR]], i64 1
-; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[PTR]], [[PTR2]]
-; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
 ;
 entry:
 %ptr2 = getelementptr inbounds double, ptr %ptr1, i64 15
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll b/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll
index 387a02e63fe59..8a162930ffd99 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll
@@ -133,26 +133,7 @@ define void @ext_cmp(ptr %src.1, ptr %src.2, ptr noalias %dst) {
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i16, ptr [[SRC_1]], i64 [[IV]]
-; CHECK-NEXT: [[I2:%.*]] = load i16, ptr [[GEP_SRC_1]], align 2
-; CHECK-NEXT: [[I3:%.*]] = sext i16 [[I2]] to i32
-; CHECK-NEXT: [[C_1:%.*]] = icmp sgt i32 0, [[I3]]
-; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds i8, ptr [[SRC_2]], i64 [[IV]]
-; CHECK-NEXT: [[I4:%.*]] = load i8, ptr [[GEP_SRC_2]], align 2
-; CHECK-NEXT: [[I5:%.*]] = zext i8 [[I4]] to i32
-; CHECK-NEXT: [[I6:%.*]] = select i1 [[C_1]], i32 0, i32 [[I5]]
-; CHECK-NEXT: [[I7:%.*]] = and i32 [[I6]], 0
-; CHECK-NEXT: [[I8:%.*]] = trunc nuw nsw i32 [[I7]] to i16
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i16 [[I8]], ptr [[GEP_DST]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
index 83ecf1adc80b5..6e7cdba1cd3ce 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll
@@ -26,21 +26,7 @@ define void @pr77468(ptr noalias %src, ptr noalias %dst, i1 %x) {
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i16 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
-; CHECK-NEXT: [[X_EXT:%.*]] = zext i1 [[X]] to i32
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[X_EXT]], [[L]]
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i16 [[IV]]
-; CHECK-NEXT: [[T:%.*]] = trunc i32 [[AND]] to i16
-; CHECK-NEXT: store i16 [[T]], ptr [[GEP_DST]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i16 [[IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll b/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
index 2f5f157e55f63..2aebb73081364 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-reductions.ll
@@ -18,11 +18,7 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %ptr) {
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: [[AND_LCSSA_OFF0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> [[TMP2]])
 ; CHECK-NEXT: ret i8 [[AND_LCSSA_OFF0]]
@@ -64,11 +60,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) {
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: [[XOR_LCSSA_OFF0:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP2]])
 ; CHECK-NEXT: ret i16 [[XOR_LCSSA_OFF0]]
@@ -110,11 +102,7 @@ define i16 @reduction_xor_trunc(ptr noalias nocapture %ptr) {
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]]
 ; CHECK: for.end:
 ; CHECK-NEXT: [[XOR_LCSSA_OFF0:%.*]] = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> [[TMP2]])
 ; CHECK-NEXT: ret i16 [[XOR_LCSSA_OFF0]]
diff --git a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
index 4a372b5f786e6..498c58d1bfd82 100644
--- a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
+++ b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll
@@ -24,20 +24,7 @@ define void @test_pr47927_lshr_const_shift_ops(ptr %dst, i32 %f) {
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[L:%.*]] = lshr i32 [[F]], 18
-; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8
-; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV_EXT]]
-; CHECK-NEXT: store i8 [[L_T]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IV_NEXT]] to i32
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[CONV]], 100
-; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -81,20 +68,7 @@ define void @test_shl_const_shift_ops(ptr %dst, i32 %f) {
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[L:%.*]] = shl i32 [[F]], 18
-; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8
-; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV_EXT]]
-; CHECK-NEXT: store i8 [[L_T]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IV_NEXT]] to i32
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[CONV]], 100
-; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -138,20 +112,7 @@ define void @test_ashr_const_shift_ops(ptr %dst, i32 %f) {
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[L:%.*]] = ashr i32 [[F]], 18
-; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8
-; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV_EXT]]
-; CHECK-NEXT: store i8 [[L_T]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IV_NEXT]] to i32
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[CONV]], 100
-; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -195,22 +156,7 @@ define void @test_shl_const_shifted_op(ptr %dst, i32 %f) {
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV_EXT]]
-; CHECK-NEXT: [[LV:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[LV]] to i32
-; CHECK-NEXT: [[L:%.*]] = shl i32 19, [[ZEXT]]
-; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8
-; CHECK-NEXT: store i8 [[L_T]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IV_NEXT]] to i32
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[CONV]], 100
-; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -257,22 +203,7 @@ define void @test_lshr_by_18(ptr %A) {
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_EXT]]
-; CHECK-NEXT: [[LV:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT: [[LV_EXT:%.*]] = zext i8 [[LV]] to i32
-; CHECK-NEXT: [[L:%.*]] = lshr i32 [[LV_EXT]], 18
-; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8
-; CHECK-NEXT: store i8 [[L_T]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IV_NEXT]] to i32
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[CONV]], 100
-; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
@@ -318,22 +249,7 @@ define void @test_lshr_by_4(ptr %A) {
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 100
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_EXT]]
-; CHECK-NEXT: [[LV:%.*]] = load i8, ptr [[GEP]], align 1
-; CHECK-NEXT: [[LV_EXT:%.*]] = zext i8 [[LV]] to i32
-; CHECK-NEXT: [[L:%.*]] = lshr i32 [[LV_EXT]], 4
-; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8
-; CHECK-NEXT: store i8 [[L_T]], ptr [[GEP]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IV_NEXT]] to i32
-; CHECK-NEXT: [[C:%.*]] = icmp ne i32 [[CONV]], 100
-; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
index d6273e015f24c..b85f2746a0b14 100644
--- a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
+++ b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll
@@ -22,19 +22,7 @@ define void @uitofp_preserve_nneg(ptr %result, i32 %size, float %y) {
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
-; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER4:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[TMP4]] to float
-; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[CONV]], [[Y]]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = zext nneg i32 [[TMP4]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[RESULT]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[TMP4]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 256
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_EXIT]]
 ; CHECK: for.exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index ccb301f4a3f79..985a9a2c2d155 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -21,21 +21,6 @@ define void @blend_uniform_iv_trunc(i1 %c) {
 ; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_NEXT]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ poison, %[[LOOP_HEADER]] ], [ [[IV_TRUNC_2]], %[[LOOP_NEXT]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i16 [[BLEND]]
-; CHECK-NEXT: store i16 0, ptr [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -79,20 +64,6 @@ define void @blend_uniform_iv(i1 %c) {
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_NEXT]]:
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i64 [ poison, %[[LOOP_HEADER]] ], [ [[IV]], %[[LOOP_NEXT]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[BLEND]]
-; CHECK-NEXT: store i16 0, ptr [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -150,25 +121,6 @@ define void @blend_chain_iv(i1 %c) {
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT:.*]], label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_NEXT]]:
-; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT_2:.*]], label %[[LOOP_NEXT_3:.*]]
-; CHECK: [[LOOP_NEXT_2]]:
-; CHECK-NEXT: br label %[[LOOP_NEXT_3]]
-; CHECK: [[LOOP_NEXT_3]]:
-; CHECK-NEXT: [[BLEND_1:%.*]] = phi i64 [ undef, %[[LOOP_NEXT]] ], [ [[IV]], %[[LOOP_NEXT_2]] ]
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i64 [ undef, %[[LOOP_HEADER]] ], [ [[BLEND_1]], %[[LOOP_NEXT_3]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[BLEND]]
-; CHECK-NEXT: store i16 0, ptr [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label %[[LOOP_HEADER]], label %[[EXIT]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
@@ -275,22 +227,6 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) {
 ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: br label %[[EXIT:.*]]
-; CHECK: [[SCALAR_PH:.*]]:
-; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
-; CHECK: [[LOOP_HEADER]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
-; CHECK-NEXT: [[GEP_IV:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_IV]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[L]], 10
-; CHECK-NEXT: br label %[[LOOP_LATCH]]
-; CHECK: [[LOOP_LATCH]]:
-; CHECK-NEXT: [[P_1:%.*]] = phi i32 [ [[L]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[P_2:%.*]] = phi i32 [ [[ADD]], %[[LOOP_HEADER]] ]
-; CHECK-NEXT: [[RES:%.*]] = add i32 [[P_1]], [[P_2]]
-; CHECK-NEXT: store i32 [[RES]], ptr [[GEP_IV]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]]
 ; CHECK: [[EXIT]]:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index 2c49fda1ad520..571c55c276dd5 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -24,7 +24,8 @@ define void @ld_div1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: scalar.ph:
+; CHECK: exit:
+; CHECK-NEXT: ret void
 ;
 entry:
 br label %loop
@@ -64,10 +65,11 @@ define void @ld_div2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:
middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -112,10 +114,11 @@ define void @ld_div3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -167,10 +170,11 @@ define void @ld_div1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -214,10 +218,11 @@ define void @ld_div2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -269,10 +274,11 @@ define void @ld_div3_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -324,7 +330,7 @@ define void @ld_div1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -379,7 +385,7 @@ define void @ld_div2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) 
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -426,7 +432,7 @@ define void @ld_div3_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -467,7 +473,7 @@ define void @ld_div1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -516,7 +522,7 @@ define void @ld_div2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -565,7 +571,7 @@ define void @ld_div3_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -621,7 +627,7 @@ define void @ld_div1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -669,7 +675,7 @@ define void @ld_div2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], 498 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -725,7 +731,7 @@ define void @ld_div3_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -781,7 +787,7 @@ define void @ld_div1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -837,7 +843,7 @@ define void @ld_div2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -885,7 +891,7 @@ define void @ld_div3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -933,10 +939,11 @@ define void @test_step_is_not_invariant(ptr %A) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 56 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll index c7525fb684d83..6cf82fc2c9d48 100644 --- 
a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll @@ -24,7 +24,8 @@ define void @ld_and_neg1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -64,10 +65,11 @@ define void @ld_and_neg2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -112,10 +114,11 @@ define void @ld_and_neg3_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -167,10 +170,11 @@ define void @ld_and_neg1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -212,10 +216,11 @@ define void @ld_and_neg2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP7]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -267,7 +272,7 @@ define void @ld_and_neg1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -322,7 +327,7 @@ define void @ld_and_neg2_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -371,7 +376,7 @@ define void @ld_and_neg2_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -427,7 +432,7 @@ define void @ld_and_neg2_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -483,7 +488,7 @@ define void @ld_and_neg2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: @@ -539,7 +544,7 @@ define void @ld_and_neg3_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll index 27cefa2d41927..9ed22400b7055 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll @@ -58,7 +58,8 @@ define void 
@ld_div2_urem3_1(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -130,10 +131,11 @@ define void @ld_div2_urem3_2(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -203,10 +205,11 @@ define void @ld_div4(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop @@ -247,10 +250,11 @@ define void @ld_div8_urem3(ptr noalias %A, ptr noalias %B) { ; CHECK-NEXT: store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: +; CHECK: exit: +; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll index cee53b5b1d2f9..2b5d0f3cb0125 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll @@ -25,7 +25,8 @@ define void @ld_lshr0_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_lshr0_step1_start0_ind1 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -46,7 +47,8 @@ define void @ld_lshr0_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -86,10 +88,11 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 2 ; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_lshr1_step1_start0_ind1 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -123,10 +126,11 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -166,10 +170,11 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_lshr2_step1_start0_ind1 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -189,10 +194,11 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF4-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF4-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -244,10 +250,11 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_lshr0_step2_start0_ind1 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -296,10 +303,11 @@ define void @ld_lshr0_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF4-NEXT: br i1 [[TMP30]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -343,10 +351,11 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP8]], ptr [[TMP6]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_lshr1_step2_start0_ind1 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -379,10 +388,11 @@ define void @ld_lshr1_step2_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP14]], ptr [[TMP10]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF4-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF4-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -434,7 +444,7 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -486,7 +496,7 @@ define void @ld_lshr0_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -541,7 +551,7 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; VF2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -593,7 +603,7 @@ define void @ld_lshr1_step3_start0_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: 
[[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; VF4-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -643,7 +653,7 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; VF2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; VF2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -681,7 +691,7 @@ define void @ld_lshr1_step1_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 -; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; VF4-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -729,7 +739,7 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store i64 [[TMP9]], ptr [[TMP7]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 -; VF2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; VF2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -766,7 +776,7 @@ define void @ld_lshr1_step2_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: store i64 [[TMP15]], ptr [[TMP11]], align 8 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 -; VF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; VF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -822,7 +832,7 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -875,7 +885,7 @@ define void @ld_lshr1_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 
[[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -931,7 +941,7 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; VF2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -984,7 +994,7 @@ define void @ld_lshr2_step3_start1_ind1(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; VF4-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll index 0f8289d06d761..12851d7d91cc7 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll @@ -35,7 +35,8 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_div1_step1_start0_ind2 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -76,7 +77,8 @@ define void @ld_div1_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -121,10 +123,11 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 ; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF2-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_div2_step1_start0_ind2 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -162,10 +165,11 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: 
[[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -218,10 +222,11 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_div3_step1_start0_ind2 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -259,10 +264,11 @@ define void @ld_div3_step1_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -322,10 +328,11 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_div1_step2_start0_ind2 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -378,10 +385,11 @@ define void @ld_div1_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -441,10 +449,11 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF2-NEXT: br i1 [[TMP18]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_div2_step2_start0_ind2 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -497,10 +506,11 @@ define void @ld_div2_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -560,10 +570,11 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[EXIT:%.*]] -; VF2: scalar.ph: +; VF2: exit: +; VF2-NEXT: ret void ; ; VF4-LABEL: define void @ld_div3_step2_start0_ind2 ; VF4-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { @@ -616,10 +627,11 @@ define void @ld_div3_step2_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 -; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[EXIT:%.*]] -; VF4: scalar.ph: +; VF4: exit: +; VF4-NEXT: ret void ; entry: br label %loop @@ -679,7 +691,7 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -735,7 +747,7 @@ define void @ld_div1_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -798,7 +810,7 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -854,7 +866,7 @@ define void @ld_div2_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -917,7 +929,7 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; VF2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -973,7 +985,7 @@ define void @ld_div3_step3_start0_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; VF4-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1030,7 +1042,7 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1072,7 +1084,7 @@ define void @ld_div1_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 -; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; VF4-NEXT: br i1 
[[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1129,7 +1141,7 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1171,7 +1183,7 @@ define void @ld_div2_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 -; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1228,7 +1240,7 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 998 -; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; VF2-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1270,7 +1282,7 @@ define void @ld_div3_step1_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996 -; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; VF4-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1334,7 +1346,7 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 -; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1391,7 +1403,7 @@ define void @ld_div1_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 -; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1455,7 +1467,7 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 -; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1512,7 +1524,7 @@ define void @ld_div2_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 -; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1576,7 +1588,7 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 4) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 498 -; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1633,7 +1645,7 @@ define void @ld_div3_step2_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 496 -; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1697,7 +1709,7 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1754,7 +1766,7 @@ define void @ld_div1_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; 
VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1818,7 +1830,7 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1875,7 +1887,7 @@ define void @ld_div2_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: @@ -1939,7 +1951,7 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 6) ; VF2-NEXT: [[VEC_IND_NEXT2]] = add <2 x i64> [[VEC_IND1]], splat (i64 2) ; VF2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; VF2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; VF2: middle.block: ; VF2-NEXT: br label [[SCALAR_PH:%.*]] ; VF2: scalar.ph: @@ -1996,7 +2008,7 @@ define void @ld_div3_step3_start1_ind2(ptr noalias %A, ptr noalias %B) { ; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 12) ; VF4-NEXT: [[VEC_IND_NEXT2]] = add <4 x i64> [[VEC_IND1]], splat (i64 4) ; VF4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], 332 -; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; VF4-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; VF4: middle.block: ; VF4-NEXT: br label [[SCALAR_PH:%.*]] ; VF4: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll index 5f83e39200644..5d07341263bc2 100644 --- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll @@ -23,26 +23,7 @@ define void @test_not_first_lane_only_constant(ptr %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ 
[[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[IV]] -; CHECK-NEXT: br i1 false, label [[LOOP_LATCH]], label [[ELSE_1:%.*]] -; CHECK: else.1: -; CHECK-NEXT: br i1 false, label [[THEN_2:%.*]], label [[ELSE_2:%.*]] -; CHECK: then.2: -; CHECK-NEXT: br label [[ELSE_2]] -; CHECK: else.2: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[MERGE:%.*]] = phi ptr [ [[B]], [[ELSE_2]] ], [ poison, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[MERGE]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 -; CHECK-NEXT: store i16 [[L]], ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[C_2:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[C_2]], label [[EXIT]], label [[LOOP_HEADER]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -98,29 +79,7 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load i16, ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[C_0:%.*]] = icmp ult i16 [[L_0]], [[X]] -; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[ELSE_1:%.*]] -; CHECK: else.1: -; CHECK-NEXT: [[C_1:%.*]] = icmp ult i16 [[L_0]], [[Y]] -; CHECK-NEXT: br i1 [[C_1]], label [[THEN_2:%.*]], label [[ELSE_2:%.*]] -; CHECK: then.2: -; CHECK-NEXT: br label [[ELSE_2]] -; CHECK: else.2: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[MERGE:%.*]] = phi ptr [ [[B]], [[ELSE_2]] ], [ poison, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[MERGE]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 -; CHECK-NEXT: store i16 [[L]], ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[C_2:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[C_2]], label [[EXIT]], label [[LOOP_HEADER]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -179,29 +138,7 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[IV]] -; CHECK-NEXT: [[L_0:%.*]] = load i16, ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[C_0:%.*]] = icmp ult i16 [[L_0]], [[X]] -; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[ELSE_1:%.*]] -; CHECK: else.1: -; CHECK-NEXT: [[C_1:%.*]] = icmp ult i16 [[L_0]], [[Y]] -; CHECK-NEXT: br i1 [[C_1]], label [[THEN_2:%.*]], label [[ELSE_2:%.*]] -; CHECK: then.2: -; CHECK-NEXT: br label [[ELSE_2]] -; CHECK: else.2: -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[MERGE:%.*]] = phi ptr [ poison, 
[[LOOP_HEADER]] ], [ [[B]], [[ELSE_2]] ] -; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[MERGE]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 -; CHECK-NEXT: store i16 [[L]], ptr [[GEP_A]], align 2 -; CHECK-NEXT: [[C_2:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[C_2]], label [[EXIT]], label [[LOOP_HEADER]] +; CHECK-NEXT: br label [[LOOP_LATCH:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index 462865d11507a..8da1dca52e87b 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -31,20 +31,8 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF1-NEXT: br label %[[EXIT:.*]] ; VF8UF1: [[VECTOR_EARLY_EXIT]]: ; VF8UF1-NEXT: br label %[[EXIT]] -; VF8UF1: [[SCALAR_PH:.*]]: -; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]] -; VF8UF1: [[LOOP_HEADER]]: -; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF8UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] -; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 -; VF8UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 -; VF8UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] -; VF8UF1: [[LOOP_LATCH]]: -; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 -; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; VF8UF1: [[EXIT]]: -; VF8UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF8UF1-NEXT: [[RES:%.*]] = phi i8 [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] ; VF8UF1-NEXT: ret i8 [[RES]] ; ; VF8UF2-LABEL: define i8 @test_early_exit_max_tc_less_than_16( @@ -70,20 +58,8 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2-NEXT: br label %[[EXIT:.*]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: ; VF8UF2-NEXT: br label %[[EXIT]] -; VF8UF2: [[SCALAR_PH:.*]]: -; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]] -; VF8UF2: [[LOOP_HEADER]]: -; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] -; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 -; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 -; VF8UF2-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] -; VF8UF2: [[LOOP_LATCH]]: -; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; VF8UF2: [[EXIT]]: -; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] ; VF8UF2-NEXT: ret i8 [[RES]] ; ; VF16UF1-LABEL: define i8 @test_early_exit_max_tc_less_than_16( @@ -104,20 +80,8 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF16UF1-NEXT: br label %[[EXIT:.*]] ; VF16UF1: [[VECTOR_EARLY_EXIT]]: ; VF16UF1-NEXT: br label %[[EXIT]] -; VF16UF1: [[SCALAR_PH:.*]]: -; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]] -; 
VF16UF1: [[LOOP_HEADER]]: -; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF16UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] -; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 -; VF16UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 -; VF16UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] -; VF16UF1: [[LOOP_LATCH]]: -; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 -; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; VF16UF1: [[EXIT]]: -; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] ; VF16UF1-NEXT: ret i8 [[RES]] ; entry: @@ -166,20 +130,8 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true) ; VF8UF1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; VF8UF1-NEXT: br label %[[EXIT]] -; VF8UF1: [[SCALAR_PH:.*]]: -; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]] -; VF8UF1: [[LOOP_HEADER]]: -; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF8UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] -; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 -; VF8UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 -; VF8UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] -; VF8UF1: [[LOOP_LATCH]]: -; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 -; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; VF8UF1: [[EXIT]]: -; VF8UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ] +; VF8UF1-NEXT: [[RES:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ] ; VF8UF1-NEXT: ret i64 [[RES]] ; ; VF8UF2-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside( @@ -212,20 +164,8 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 [[TMP7]] ; VF8UF2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]] ; VF8UF2-NEXT: br label %[[EXIT]] -; VF8UF2: [[SCALAR_PH:.*]]: -; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]] -; VF8UF2: [[LOOP_HEADER]]: -; VF8UF2-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF8UF2-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] -; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 -; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 -; VF8UF2-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] -; VF8UF2: [[LOOP_LATCH]]: -; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 -; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; VF8UF2: [[EXIT]]: -; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP12]], %[[VECTOR_EARLY_EXIT]] ] +; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP12]], %[[VECTOR_EARLY_EXIT]] ] ; VF8UF2-NEXT: ret i64 [[RES]] ; ; 
VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside( @@ -248,20 +188,8 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 true) ; VF16UF1-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]] ; VF16UF1-NEXT: br label %[[EXIT]] -; VF16UF1: [[SCALAR_PH:.*]]: -; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]] -; VF16UF1: [[LOOP_HEADER]]: -; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] -; VF16UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] -; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 -; VF16UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 -; VF16UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] -; VF16UF1: [[LOOP_LATCH]]: -; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 -; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]] ; VF16UF1: [[EXIT]]: -; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ] +; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ] ; VF16UF1-NEXT: ret i64 [[RES]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll index d01358407f02f..2317af5619749 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll @@ -17,18 +17,8 @@ define i64 @remove_loop_region_int_iv_used_outside(ptr %dst) { ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store ptr null, ptr [[GEP]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ], [ 15, %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 15 ; entry: br label %loop @@ -60,18 +50,8 @@ define i64 @remove_loop_region_int_iv_inc_used_outside(ptr %dst) { ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store ptr null, ptr [[GEP]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi i64 [ [[IV_NEXT]], %[[LOOP]] ], [ 16, %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 16 ; entry: br label %loop @@ -105,19 +85,8 @@ define ptr 
@remove_loop_region_ptr_iv_used_outside(ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -8 ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[DST]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[INT_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INT_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: store ptr null, ptr [[PTR_IV]], align 8 -; CHECK-NEXT: [[INT_IV_NEXT]] = add i64 [[INT_IV]], 1 -; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[INT_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP]] ], [ [[IND_ESCAPE]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret ptr [[RES]] +; CHECK-NEXT: ret ptr [[IND_ESCAPE]] ; entry: br label %loop @@ -151,19 +120,8 @@ define ptr @remove_loop_region_ptr_iv_inc_used_outside(ptr %dst) { ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[DST]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[INT_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INT_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: store ptr null, ptr [[PTR_IV]], align 8 -; CHECK-NEXT: [[INT_IV_NEXT]] = add i64 [[INT_IV]], 1 -; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[INT_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[PTR_IV_NEXT]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret ptr [[RES]] +; CHECK-NEXT: ret ptr [[TMP0]] ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index 5f8646925bf6d..e160a15ece47d 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -176,15 +176,6 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: br label %[[EXIT:.*]] -; VF8UF1: [[SCALAR_PH:.*]]: -; VF8UF1-NEXT: br label %[[LOOP:.*]] -; VF8UF1: [[LOOP]]: -; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ 2, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF8UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]] -; VF8UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2 -; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF8UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; VF8UF1: [[EXIT]]: ; VF8UF1-NEXT: ret void ; @@ -316,15 +307,6 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br label %[[EXIT:.*]] -; VF8UF2: [[SCALAR_PH:.*]]: -; VF8UF2-NEXT: br label %[[LOOP:.*]] -; VF8UF2: [[LOOP]]: -; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ 2, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF8UF2-NEXT: [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]] -; 
VF8UF2-NEXT: store i16 0, ptr [[GEP_DST]], align 2 -; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF8UF2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF8UF2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: ret void ; @@ -455,15 +437,6 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br label %[[EXIT:.*]] -; VF16UF1: [[SCALAR_PH:.*]]: -; VF16UF1-NEXT: br label %[[LOOP:.*]] -; VF16UF1: [[LOOP]]: -; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ 2, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF16UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]] -; VF16UF1-NEXT: store i16 0, ptr [[GEP_DST]], align 2 -; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; VF16UF1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; VF16UF1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: ret void ; @@ -728,23 +701,14 @@ define void @scev_expand_step(i64 %x, ptr %dst) { ; VF8UF1: [[PRED_STORE_IF13]]: ; VF8UF1-NEXT: [[TMP40:%.*]] = mul i64 7, [[STEP]] ; VF8UF1-NEXT: [[TMP41:%.*]] = add i64 0, [[TMP40]] -; VF8UF1-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], [[STEP]] -; VF8UF1-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP42]] -; VF8UF1-NEXT: store i8 0, ptr [[TMP43]], align 1 +; VF8UF1-NEXT: [[IV_NEXT:%.*]] = add i64 [[TMP41]], [[STEP]] +; VF8UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] +; VF8UF1-NEXT: store i8 0, ptr [[GEP_DST]], align 1 ; VF8UF1-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; VF8UF1: [[PRED_STORE_CONTINUE14]]: ; VF8UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF1: [[MIDDLE_BLOCK]]: ; VF8UF1-NEXT: br label %[[EXIT:.*]] -; VF8UF1: [[SCALAR_PH:.*]]: -; VF8UF1-NEXT: br label %[[LOOP:.*]] -; VF8UF1: [[LOOP]]: -; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF8UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[STEP]] -; VF8UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] -; VF8UF1-NEXT: store i8 0, ptr [[GEP_DST]], align 1 -; VF8UF1-NEXT: [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16 -; VF8UF1-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]] ; VF8UF1: [[EXIT]]: ; VF8UF1-NEXT: ret void ; @@ -922,22 +886,13 @@ define void @scev_expand_step(i64 %x, ptr %dst) { ; VF8UF2-NEXT: [[TMP81:%.*]] = mul i64 15, [[STEP]] ; VF8UF2-NEXT: [[TMP82:%.*]] = add i64 0, [[TMP81]] ; VF8UF2-NEXT: [[TMP83:%.*]] = add i64 [[TMP82]], [[STEP]] -; VF8UF2-NEXT: [[TMP84:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP83]] -; VF8UF2-NEXT: store i8 0, ptr [[TMP84]], align 1 +; VF8UF2-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP83]] +; VF8UF2-NEXT: store i8 0, ptr [[GEP_DST]], align 1 ; VF8UF2-NEXT: br label %[[PRED_STORE_CONTINUE30]] ; VF8UF2: [[PRED_STORE_CONTINUE30]]: ; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br label %[[EXIT:.*]] -; VF8UF2: [[SCALAR_PH:.*]]: -; VF8UF2-NEXT: br label %[[LOOP:.*]] -; VF8UF2: [[LOOP]]: -; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF8UF2-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[STEP]] -; VF8UF2-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] -; VF8UF2-NEXT: store i8 0, ptr [[GEP_DST]], align 1 -; VF8UF2-NEXT: [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]] ; VF8UF2: [[EXIT]]: ; 
VF8UF2-NEXT: ret void ; @@ -1114,22 +1069,13 @@ define void @scev_expand_step(i64 %x, ptr %dst) { ; VF16UF1-NEXT: [[TMP80:%.*]] = mul i64 15, [[STEP]] ; VF16UF1-NEXT: [[TMP81:%.*]] = add i64 0, [[TMP80]] ; VF16UF1-NEXT: [[TMP82:%.*]] = add i64 [[TMP81]], [[STEP]] -; VF16UF1-NEXT: [[TMP83:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP82]] -; VF16UF1-NEXT: store i8 0, ptr [[TMP83]], align 1 +; VF16UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP82]] +; VF16UF1-NEXT: store i8 0, ptr [[GEP_DST]], align 1 ; VF16UF1-NEXT: br label %[[PRED_STORE_CONTINUE30]] ; VF16UF1: [[PRED_STORE_CONTINUE30]]: ; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br label %[[EXIT:.*]] -; VF16UF1: [[SCALAR_PH:.*]]: -; VF16UF1-NEXT: br label %[[LOOP:.*]] -; VF16UF1: [[LOOP]]: -; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VF16UF1-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[STEP]] -; VF16UF1-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV_NEXT]] -; VF16UF1-NEXT: store i8 0, ptr [[GEP_DST]], align 1 -; VF16UF1-NEXT: [[EC:%.*]] = icmp slt i64 [[IV_NEXT]], 16 -; VF16UF1-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll index 06b7bd8c9f84d..d08ca8c99e8ba 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll @@ -21,19 +21,6 @@ define void @pr63340(ptr %A, ptr %B) { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: br label [[LOOP_LATCH]] -; CHECK: loop.latch: -; CHECK-NEXT: [[F_0_I:%.*]] = phi ptr [ [[A]], [[LOOP_HEADER]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[F_0_I]], i64 1 -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds ptr, ptr [[B]], i8 [[IV]] -; CHECK-NEXT: store ptr [[GEP]], ptr [[GEP_B]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add i8 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], -128 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -78,17 +65,6 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 -; CHECK-NEXT: [[GEP_L:%.*]] = getelementptr float, ptr [[L]], i64 [[N]] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store ptr [[GEP_L]], ptr [[GEP_DST]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -131,17 +107,6 @@ define void 
@wide_gep_multiple_indices_some_invariant(ptr noalias %dst, ptr noal ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 -; CHECK-NEXT: [[GEP_L:%.*]] = getelementptr [10 x float], ptr [[L]], i32 [[X]], i64 [[IV]] -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store ptr [[GEP_L]], ptr [[GEP_DST]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll b/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll index 055f2fdb84834..922ebe7211b6e 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll @@ -20,17 +20,6 @@ define void @powi_only_first_lane_used_of_second_arg(ptr %p, i32 %pow) { ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] -; CHECK: [[SCALAR_PH:.*]]: -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[P_GEP:%.*]] = getelementptr float, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[X:%.*]] = load float, ptr [[P_GEP]], align 4 -; CHECK-NEXT: [[Y:%.*]] = call float @llvm.powi.f32.i32(float [[X]], i32 [[POW]]) -; CHECK-NEXT: store float [[Y]], ptr [[P_GEP]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i32 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll index 435e6fccd620c..5e9fe8c4135ac 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mean_q7.ll @@ -34,8 +34,8 @@ define void @arm_mean_q7(ptr noundef %pSrc, i32 noundef %blockSize, ptr noundef ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP2]], [[WHILE_END_LOOPEXIT]] ] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[BLOCKSIZE]], 15 ; CHECK-NEXT: [[CMP2_NOT15:%.*]] = icmp eq i32 [[AND]], 0 -; CHECK-NEXT: br i1 [[CMP2_NOT15]], label [[WHILE_END5:%.*]], label [[MIDDLE_BLOCK:%.*]] -; CHECK: middle.block: +; CHECK-NEXT: br i1 [[CMP2_NOT15]], label [[WHILE_END5:%.*]], label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = tail call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 0, i32 [[AND]]) ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[PSRC_ADDR_0_LCSSA]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> @@ -44,7 +44,7 @@ define void @arm_mean_q7(ptr noundef %pSrc, i32 noundef %blockSize, ptr noundef ; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SUM_0_LCSSA]], [[TMP6]] ; CHECK-NEXT: br label [[WHILE_END5]] ; CHECK: while.end5: -; CHECK-NEXT: 
[[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_0_LCSSA]], [[WHILE_END]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_0_LCSSA]], [[WHILE_END]] ], [ [[TMP7]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUM_1_LCSSA]], [[BLOCKSIZE]] ; CHECK-NEXT: [[CONV6:%.*]] = trunc i32 [[DIV]] to i8 ; CHECK-NEXT: store i8 [[CONV6]], ptr [[PRESULT:%.*]], align 1 From 240b73e10f5c6549776cfd3847db2be14dc42776 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 1 Oct 2025 09:54:30 -0700 Subject: [PATCH 405/878] [SimplifyCFG][PGO] Reuse existing `setBranchWeights` (#160629) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main difference between SimplifyCFG's `setBranchWeights`​ and the ProfDataUtils' is that the former doesn't propagate all-zero weights. That seems like a sensible thing to do, so updated the latter accordingly, and added a flag to control the behavior. Also moved to ProfDataUtils the logic fitting 64-bit weights to 32-bit. As side-effect, this fixes some profcheck failures. --- llvm/include/llvm/IR/Instructions.h | 7 +- llvm/include/llvm/IR/ProfDataUtils.h | 8 +- llvm/lib/IR/Instructions.cpp | 17 --- llvm/lib/IR/ProfDataUtils.cpp | 38 ++++- llvm/lib/Transforms/IPO/SampleProfile.cpp | 8 +- .../Instrumentation/IndirectCallPromotion.cpp | 4 +- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 134 ++++++------------ llvm/utils/profcheck-xfail.txt | 2 - 8 files changed, 100 insertions(+), 118 deletions(-) diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 95a0a7fd2f97e..de7a237098594 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -32,6 +32,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/OperandTraits.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/Support/AtomicOrdering.h" @@ -3536,8 +3537,6 @@ class SwitchInstProfUpdateWrapper { bool Changed = false; protected: - LLVM_ABI MDNode *buildProfBranchWeightsMD(); - LLVM_ABI void init(); public: @@ -3549,8 +3548,8 @@ class SwitchInstProfUpdateWrapper { SwitchInstProfUpdateWrapper(SwitchInst &SI) : SI(SI) { init(); } ~SwitchInstProfUpdateWrapper() { - if (Changed) - SI.setMetadata(LLVMContext::MD_prof, buildProfBranchWeightsMD()); + if (Changed && Weights.has_value() && Weights->size() >= 2) + setBranchWeights(SI, Weights.value(), /*IsExpected=*/false); } /// Delegate the call to the underlying SwitchInst::removeCase() and remove diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h index e97160e59c795..a0876b169e0b8 100644 --- a/llvm/include/llvm/IR/ProfDataUtils.h +++ b/llvm/include/llvm/IR/ProfDataUtils.h @@ -145,7 +145,13 @@ LLVM_ABI bool extractProfTotalWeight(const Instruction &I, /// \param Weights an array of weights to set on instruction I. /// \param IsExpected were these weights added from an llvm.expect* intrinsic. LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef Weights, - bool IsExpected); + bool IsExpected, bool ElideAllZero = false); + +/// Variant of `setBranchWeights` where the `Weights` will be fit first to +/// uint32_t by shifting right. +LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef Weights, + bool IsExpected, + bool ElideAllZero = false); /// downscale the given weights preserving the ratio. 
If the maximum value is /// not already known and not provided via \param KnownMaxCount , it will be diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index dd83168ab3c6e..941e41f3127d5 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4141,23 +4141,6 @@ void SwitchInst::growOperands() { growHungoffUses(ReservedSpace); } -MDNode *SwitchInstProfUpdateWrapper::buildProfBranchWeightsMD() { - assert(Changed && "called only if metadata has changed"); - - if (!Weights) - return nullptr; - - assert(SI.getNumSuccessors() == Weights->size() && - "num of prof branch_weights must accord with num of successors"); - - bool AllZeroes = all_of(*Weights, [](uint32_t W) { return W == 0; }); - - if (AllZeroes || Weights->size() < 2) - return nullptr; - - return MDBuilder(SI.getParent()->getContext()).createBranchWeights(*Weights); -} - void SwitchInstProfUpdateWrapper::init() { MDNode *ProfileData = getBranchWeightMDNode(SI); if (!ProfileData) diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp index 99029c1719507..edeca976d293e 100644 --- a/llvm/lib/IR/ProfDataUtils.cpp +++ b/llvm/lib/IR/ProfDataUtils.cpp @@ -12,6 +12,7 @@ #include "llvm/IR/ProfDataUtils.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -19,6 +20,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; @@ -84,10 +86,31 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData, } } +/// Push the weights right to fit in uint32_t. +static SmallVector fitWeights(ArrayRef Weights) { + SmallVector Ret; + Ret.reserve(Weights.size()); + uint64_t Max = *llvm::max_element(Weights); + if (Max > UINT_MAX) { + unsigned Offset = 32 - llvm::countl_zero(Max); + for (const uint64_t &Value : Weights) + Ret.push_back(static_cast(Value >> Offset)); + } else { + append_range(Ret, Weights); + } + return Ret; +} + } // namespace namespace llvm { - +cl::opt ElideAllZeroBranchWeights("elide-all-zero-branch-weights", +#if defined(LLVM_ENABLE_PROFCHECK) + cl::init(false) +#else + cl::init(true) +#endif +); const char *MDProfLabels::BranchWeights = "branch_weights"; const char *MDProfLabels::ExpectedBranchWeights = "expected"; const char *MDProfLabels::ValueProfile = "VP"; @@ -282,12 +305,23 @@ bool hasExplicitlyUnknownBranchWeights(const Instruction &I) { } void setBranchWeights(Instruction &I, ArrayRef Weights, - bool IsExpected) { + bool IsExpected, bool ElideAllZero) { + if ((ElideAllZeroBranchWeights && ElideAllZero) && + llvm::all_of(Weights, [](uint32_t V) { return V == 0; })) { + I.setMetadata(LLVMContext::MD_prof, nullptr); + return; + } + MDBuilder MDB(I.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(Weights, IsExpected); I.setMetadata(LLVMContext::MD_prof, BranchWeights); } +void setFittedBranchWeights(Instruction &I, ArrayRef Weights, + bool IsExpected, bool ElideAllZero) { + setBranchWeights(I, fitWeights(Weights), IsExpected, ElideAllZero); +} + SmallVector downscaleWeights(ArrayRef Weights, std::optional KnownMaxCount) { uint64_t MaxCount = KnownMaxCount.has_value() ? 
KnownMaxCount.value() diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 5bc7e34938127..99b8b88ebedbb 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1664,8 +1664,9 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { else if (OverwriteExistingWeights) I.setMetadata(LLVMContext::MD_prof, nullptr); } else if (!isa(&I)) { - setBranchWeights(I, {static_cast(BlockWeights[BB])}, - /*IsExpected=*/false); + setBranchWeights( + I, ArrayRef{static_cast(BlockWeights[BB])}, + /*IsExpected=*/false); } } } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) { @@ -1676,7 +1677,8 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { if (cast(I).isIndirectCall()) { I.setMetadata(LLVMContext::MD_prof, nullptr); } else { - setBranchWeights(I, {uint32_t(0)}, /*IsExpected=*/false); + setBranchWeights(I, ArrayRef{uint32_t(0)}, + /*IsExpected=*/false); } } } diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index f451c2b471aa6..0249f210f4754 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -672,8 +672,8 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee, createBranchWeights(CB.getContext(), Count, TotalCount - Count)); if (AttachProfToDirectCall) - setBranchWeights(NewInst, {static_cast(Count)}, - /*IsExpected=*/false); + setFittedBranchWeights(NewInst, {Count}, + /*IsExpected=*/false); using namespace ore; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 216bdf4eb9efb..4d1f768e2177a 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -955,33 +955,6 @@ static bool valuesOverlap(std::vector &C1, return false; } -// Set branch weights on SwitchInst. This sets the metadata if there is at -// least one non-zero weight. -static void setBranchWeights(SwitchInst *SI, ArrayRef Weights, - bool IsExpected) { - // Check that there is at least one non-zero weight. Otherwise, pass - // nullptr to setMetadata which will erase the existing metadata. - MDNode *N = nullptr; - if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; })) - N = MDBuilder(SI->getParent()->getContext()) - .createBranchWeights(Weights, IsExpected); - SI->setMetadata(LLVMContext::MD_prof, N); -} - -// Similar to the above, but for branch and select instructions that take -// exactly 2 weights. -static void setBranchWeights(Instruction *I, uint32_t TrueWeight, - uint32_t FalseWeight, bool IsExpected) { - assert(isa(I) || isa(I)); - // Check that there is at least one non-zero weight. Otherwise, pass - // nullptr to setMetadata which will erase the existing metadata. - MDNode *N = nullptr; - if (TrueWeight || FalseWeight) - N = MDBuilder(I->getParent()->getContext()) - .createBranchWeights(TrueWeight, FalseWeight, IsExpected); - I->setMetadata(LLVMContext::MD_prof, N); -} - /// If TI is known to be a terminator instruction and its block is known to /// only have a single predecessor block, check to see if that predecessor is /// also a value comparison with the same value, and if that comparison @@ -1181,16 +1154,6 @@ static void getBranchWeights(Instruction *TI, } } -/// Keep halving the weights until all can fit in uint32_t. 
-static void fitWeights(MutableArrayRef Weights) { - uint64_t Max = *llvm::max_element(Weights); - if (Max > UINT_MAX) { - unsigned Offset = 32 - llvm::countl_zero(Max); - for (uint64_t &I : Weights) - I >>= Offset; - } -} - static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( BasicBlock *BB, BasicBlock *PredBlock, ValueToValueMapTy &VMap) { Instruction *PTI = PredBlock->getTerminator(); @@ -1446,14 +1409,9 @@ bool SimplifyCFGOpt::performValueComparisonIntoPredecessorFolding( for (ValueEqualityComparisonCase &V : PredCases) NewSI->addCase(V.Value, V.Dest); - if (PredHasWeights || SuccHasWeights) { - // Halve the weights if any of them cannot fit in an uint32_t - fitWeights(Weights); - - SmallVector MDWeights(Weights.begin(), Weights.end()); - - setBranchWeights(NewSI, MDWeights, /*IsExpected=*/false); - } + if (PredHasWeights || SuccHasWeights) + setFittedBranchWeights(*NewSI, Weights, /*IsExpected=*/false, + /*ElideAllZero=*/true); eraseTerminatorAndDCECond(PTI); @@ -4053,39 +4011,34 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, // Try to update branch weights. uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight; - SmallVector MDWeights; + SmallVector MDWeights; if (extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight)) { - SmallVector NewWeights; if (PBI->getSuccessor(0) == BB) { // PBI: br i1 %x, BB, FalseDest // BI: br i1 %y, UniqueSucc, FalseDest // TrueWeight is TrueWeight for PBI * TrueWeight for BI. - NewWeights.push_back(PredTrueWeight * SuccTrueWeight); + MDWeights.push_back(PredTrueWeight * SuccTrueWeight); // FalseWeight is FalseWeight for PBI * TotalWeight for BI + // TrueWeight for PBI * FalseWeight for BI. // We assume that total weights of a BranchInst can fit into 32 bits. // Therefore, we will not have overflow using 64-bit arithmetic. - NewWeights.push_back(PredFalseWeight * - (SuccFalseWeight + SuccTrueWeight) + - PredTrueWeight * SuccFalseWeight); + MDWeights.push_back(PredFalseWeight * (SuccFalseWeight + SuccTrueWeight) + + PredTrueWeight * SuccFalseWeight); } else { // PBI: br i1 %x, TrueDest, BB // BI: br i1 %y, TrueDest, UniqueSucc // TrueWeight is TrueWeight for PBI * TotalWeight for BI + // FalseWeight for PBI * TrueWeight for BI. - NewWeights.push_back(PredTrueWeight * (SuccFalseWeight + SuccTrueWeight) + - PredFalseWeight * SuccTrueWeight); + MDWeights.push_back(PredTrueWeight * (SuccFalseWeight + SuccTrueWeight) + + PredFalseWeight * SuccTrueWeight); // FalseWeight is FalseWeight for PBI * FalseWeight for BI. - NewWeights.push_back(PredFalseWeight * SuccFalseWeight); + MDWeights.push_back(PredFalseWeight * SuccFalseWeight); } - // Halve the weights if any of them cannot fit in an uint32_t - fitWeights(NewWeights); - - append_range(MDWeights, NewWeights); - setBranchWeights(PBI, MDWeights[0], MDWeights[1], /*IsExpected=*/false); + setFittedBranchWeights(*PBI, MDWeights, /*IsExpected=*/false, + /*ElideAllZero=*/true); // TODO: If BB is reachable from all paths through PredBlock, then we // could replace PBI's branch probabilities with BI's. 
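[Editor's aside, not part of the patch] The hunks above and below replace SimplifyCFG's local weight handling with the shared ProfDataUtils helpers. The fitting step that `setFittedBranchWeights` now performs can be modeled in isolation; the sketch below is illustrative only, and `fitWeightsSketch` is a hypothetical stand-in for the `fitWeights` helper added to ProfDataUtils.cpp. It shows how all weights are shifted right by one common amount so the largest fits in uint32_t, preserving their ratios up to truncation.

#include <algorithm>
#include <bit>
#include <cassert>
#include <climits>
#include <cstdint>
#include <vector>

// Sketch of the weight-fitting step: shift every weight right by the same
// amount so the largest fits in uint32_t, keeping ratios up to truncation.
static std::vector<uint32_t>
fitWeightsSketch(const std::vector<uint64_t> &Weights) {
  assert(!Weights.empty() && "need at least one weight");
  std::vector<uint32_t> Ret;
  Ret.reserve(Weights.size());
  uint64_t Max = *std::max_element(Weights.begin(), Weights.end());
  if (Max > UINT_MAX) {
    // The bit width of Max is 64 - countl_zero(Max); shifting right by
    // (width - 32), i.e. 32 - countl_zero(Max), leaves 32 significant bits.
    unsigned Offset = 32 - static_cast<unsigned>(std::countl_zero(Max));
    for (uint64_t W : Weights)
      Ret.push_back(static_cast<uint32_t>(W >> Offset));
  } else {
    for (uint64_t W : Weights)
      Ret.push_back(static_cast<uint32_t>(W));
  }
  return Ret;
}

int main() {
  // A 3:1 ratio with values too large for uint32_t survives the fit.
  auto Fitted = fitWeightsSketch({3ull << 40, 1ull << 40});
  assert(Fitted[0] == 3u << 30 && Fitted[1] == 1u << 30);
  return 0;
}

Note that the fitting step is separate from the all-zero handling the commit message describes: callers that pass `ElideAllZero=true` drop the `branch_weights` metadata entirely when every weight is zero, under the new `ElideAllZeroBranchWeights` option, rather than emitting an all-zero node.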
@@ -4125,8 +4078,8 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI, if (auto *SI = dyn_cast(PBI->getCondition())) if (!MDWeights.empty()) { assert(isSelectInRoleOfConjunctionOrDisjunction(SI)); - setBranchWeights(SI, MDWeights[0], MDWeights[1], - /*IsExpected=*/false); + setFittedBranchWeights(*SI, {MDWeights[0], MDWeights[1]}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } ++NumFoldBranchToCommonDest; @@ -4478,9 +4431,9 @@ static bool mergeConditionalStoreToAddress( if (InvertQCond) std::swap(QWeights[0], QWeights[1]); auto CombinedWeights = getDisjunctionWeights(PWeights, QWeights); - setBranchWeights(PostBB->getTerminator(), CombinedWeights[0], - CombinedWeights[1], - /*IsExpected=*/false); + setFittedBranchWeights(*PostBB->getTerminator(), + {CombinedWeights[0], CombinedWeights[1]}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } QB.SetInsertPoint(T); @@ -4836,10 +4789,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) + PredOther * SuccCommon, PredOther * SuccOther}; - // Halve the weights if any of them cannot fit in an uint32_t - fitWeights(NewWeights); - setBranchWeights(PBI, NewWeights[0], NewWeights[1], /*IsExpected=*/false); + setFittedBranchWeights(*PBI, NewWeights, /*IsExpected=*/false, + /*ElideAllZero=*/true); // Cond may be a select instruction with the first operand set to "true", or // the second to "false" (see how createLogicalOp works for `and` and `or`) if (!ProfcheckDisableMetadataFixes) @@ -4849,8 +4801,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, assert(dyn_cast(SI)->getCondition() == PBICond); // The corresponding probabilities are what was referred to above as // PredCommon and PredOther. - setBranchWeights(SI, PredCommon, PredOther, - /*IsExpected=*/false); + setFittedBranchWeights(*SI, {PredCommon, PredOther}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } } @@ -4876,8 +4828,8 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, if (HasWeights) { uint64_t TrueWeight = PBIOp ? PredFalseWeight : PredTrueWeight; uint64_t FalseWeight = PBIOp ? PredTrueWeight : PredFalseWeight; - setBranchWeights(NV, TrueWeight, FalseWeight, - /*IsExpected=*/false); + setFittedBranchWeights(*NV, {TrueWeight, FalseWeight}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } } } @@ -4940,7 +4892,8 @@ bool SimplifyCFGOpt::simplifyTerminatorOnSelect(Instruction *OldTerm, // Create a conditional branch sharing the condition of the select. BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB); if (TrueWeight != FalseWeight) - setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false); + setBranchWeights(*NewBI, {TrueWeight, FalseWeight}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) { // Neither of the selected blocks were successors, so this @@ -5889,7 +5842,8 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, TrueWeight /= 2; FalseWeight /= 2; } - setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false); + setFittedBranchWeights(*NewBI, {TrueWeight, FalseWeight}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } } @@ -6364,9 +6318,9 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, // BranchWeights. We want the probability and negative probability of // Condition == SecondCase. 
assert(BranchWeights.size() == 3); - setBranchWeights(SI, BranchWeights[2], - BranchWeights[0] + BranchWeights[1], - /*IsExpected=*/false); + setBranchWeights( + *SI, {BranchWeights[2], BranchWeights[0] + BranchWeights[1]}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } } Value *ValueCompare = @@ -6381,9 +6335,10 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, size_t FirstCasePos = (Condition != nullptr); size_t SecondCasePos = FirstCasePos + 1; uint32_t DefaultCase = (Condition != nullptr) ? BranchWeights[0] : 0; - setBranchWeights(SI, BranchWeights[FirstCasePos], - DefaultCase + BranchWeights[SecondCasePos], - /*IsExpected=*/false); + setBranchWeights(*SI, + {BranchWeights[FirstCasePos], + DefaultCase + BranchWeights[SecondCasePos]}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } return Ret; } @@ -6427,8 +6382,10 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, // We know there's a Default case. We base the resulting branch // weights off its probability. assert(BranchWeights.size() >= 2); - setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0), - BranchWeights[0], /*IsExpected=*/false); + setBranchWeights( + *SI, + {accumulate(drop_begin(BranchWeights), 0U), BranchWeights[0]}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } return Ret; } @@ -6451,8 +6408,10 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); if (auto *SI = dyn_cast(Ret); SI && HasBranchWeights) { assert(BranchWeights.size() >= 2); - setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0), - BranchWeights[0], /*IsExpected=*/false); + setBranchWeights( + *SI, + {accumulate(drop_begin(BranchWeights), 0U), BranchWeights[0]}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } return Ret; } @@ -6469,8 +6428,9 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); if (auto *SI = dyn_cast(Ret); SI && HasBranchWeights) { assert(BranchWeights.size() >= 2); - setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0), - BranchWeights[0], /*IsExpected=*/false); + setBranchWeights( + *SI, {accumulate(drop_begin(BranchWeights), 0U), BranchWeights[0]}, + /*IsExpected=*/false, /*ElideAllZero=*/true); } return Ret; } @@ -8152,8 +8112,8 @@ static bool mergeNestedCondBranch(BranchInst *BI, DomTreeUpdater *DTU) { if (HasWeight) { uint64_t Weights[2] = {BBTWeight * BB1FWeight + BBFWeight * BB2TWeight, BBTWeight * BB1TWeight + BBFWeight * BB2FWeight}; - fitWeights(Weights); - setBranchWeights(BI, Weights[0], Weights[1], /*IsExpected=*/false); + setFittedBranchWeights(*BI, Weights, /*IsExpected=*/false, + /*ElideAllZero=*/true); } return true; } diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 08c89441ec855..77e6ab7c5a6ea 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -1414,7 +1414,6 @@ Transforms/SimplifyCFG/merge-cond-stores.ll Transforms/SimplifyCFG/multiple-phis.ll Transforms/SimplifyCFG/PhiBlockMerge.ll Transforms/SimplifyCFG/pr48641.ll -Transforms/SimplifyCFG/preserve-branchweights.ll Transforms/SimplifyCFG/preserve-store-alignment.ll Transforms/SimplifyCFG/rangereduce.ll Transforms/SimplifyCFG/RISCV/select-trunc-i64.ll @@ -1424,7 +1423,6 @@ Transforms/SimplifyCFG/safe-abs.ll Transforms/SimplifyCFG/SimplifyEqualityComparisonWithOnlyPredecessor-domtree-preservation-edgecase.ll 
Transforms/SimplifyCFG/speculate-blocks.ll Transforms/SimplifyCFG/speculate-derefable-load.ll -Transforms/SimplifyCFG/suppress-zero-branch-weights.ll Transforms/SimplifyCFG/switch_create-custom-dl.ll Transforms/SimplifyCFG/switch_create.ll Transforms/SimplifyCFG/switch-dup-bbs.ll From 42ab473f518c5f180455c674cbaba70a0b2634b7 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Wed, 1 Oct 2025 18:00:40 +0100 Subject: [PATCH 406/878] [RISCV] Xqci with Short Forward Branches (#161407) This change implements support for the combination of Xqci and the Short Forward Branch optimisation. In particular, we want to prioritise `Branch+ALU` (short forward branches) over the equivalent `ALU+CMov`, when the compared values are both registers, and the selected values come from registers (as this is what `PseudoCCMOVGPR` supports). However, when expanding `PseudoCCMOVGPR` (i.e., `Branch+MV`), we instead want to expand it to a conditional move (for code size reasons), so I have added `RISCVExpandPseudo::expandCCOpToCMov` to try to do so. This mostly works, except if `PseudoCCMOVGPR` is comparing against zero and gets commuted - as can be seen in one example in `foo` in `select-cc.ll`. This change: - updates the attributes used for the XQCI RUN lines for the select tests. - modifies the CodeGen patterns and predicates to prioritise selecting the SFB Pseudo. - adds CodeGen patterns for MVLTI/MVLTUI/MVGEI/MVGEUI with imm=zero, to prioritise over the equivalent `Select_GPR_Using_CC_GPR` patterns for rhs=X0. - adds a hook to attempt to turn the predicated-mov Pseudo back into a Conditional Move from Xqcicm (which matches the pseudo in terms of tied register operands). --- .../Target/RISCV/RISCVExpandPseudoInsts.cpp | 85 +++++ llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 37 +- llvm/test/CodeGen/RISCV/cmov-branch-opt.ll | 109 ++++++ llvm/test/CodeGen/RISCV/select-bare.ll | 2 +- llvm/test/CodeGen/RISCV/select-cc.ll | 59 ++-- llvm/test/CodeGen/RISCV/select-cond.ll | 2 +- llvm/test/CodeGen/RISCV/select-const.ll | 137 ++++---- llvm/test/CodeGen/RISCV/select.ll | 322 +++++++++++------- llvm/test/CodeGen/RISCV/xqcicli.ll | 2 +- llvm/test/CodeGen/RISCV/xqcicm.ll | 2 +- llvm/test/CodeGen/RISCV/xqcics.ll | 2 +- 11 files changed, 514 insertions(+), 245 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index d4d9e5430d390..410561855e181 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -46,6 +46,8 @@ class RISCVExpandPseudo : public MachineFunctionPass { MachineBasicBlock::iterator &NextMBBI); bool expandCCOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandCCOpToCMov(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); bool expandVMSET_VMCLR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned Opcode); bool expandMV_FPR16INX(MachineBasicBlock &MBB, @@ -178,6 +180,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI) { + // First try expanding to a Conditional Move rather than a branch+mv + if (expandCCOpToCMov(MBB, MBBI)) + return true; MachineFunction *MF = MBB.getParent(); MachineInstr &MI = *MBBI; @@ -277,6 +282,86 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, return true; } +bool RISCVExpandPseudo::expandCCOpToCMov(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + if (MI.getOpcode() != RISCV::PseudoCCMOVGPR && + MI.getOpcode() != RISCV::PseudoCCMOVGPRNoX0) + return false; + + if (!STI->hasVendorXqcicm()) + return false; + + // FIXME: Would be wonderful to support LHS=X0, but not very easy. + if (MI.getOperand(1).getReg() == RISCV::X0 || + MI.getOperand(4).getReg() == RISCV::X0 || + MI.getOperand(5).getReg() == RISCV::X0) + return false; + + auto CC = static_cast(MI.getOperand(3).getImm()); + + unsigned CMovOpcode, CMovIOpcode; + switch (CC) { + default: + llvm_unreachable("Unhandled CC"); + case RISCVCC::COND_EQ: + CMovOpcode = RISCV::QC_MVEQ; + CMovIOpcode = RISCV::QC_MVEQI; + break; + case RISCVCC::COND_NE: + CMovOpcode = RISCV::QC_MVNE; + CMovIOpcode = RISCV::QC_MVNEI; + break; + case RISCVCC::COND_LT: + CMovOpcode = RISCV::QC_MVLT; + CMovIOpcode = RISCV::QC_MVLTI; + break; + case RISCVCC::COND_GE: + CMovOpcode = RISCV::QC_MVGE; + CMovIOpcode = RISCV::QC_MVGEI; + break; + case RISCVCC::COND_LTU: + CMovOpcode = RISCV::QC_MVLTU; + CMovIOpcode = RISCV::QC_MVLTUI; + break; + case RISCVCC::COND_GEU: + CMovOpcode = RISCV::QC_MVGEU; + CMovIOpcode = RISCV::QC_MVGEUI; + break; + } + + if (MI.getOperand(2).getReg() == RISCV::X0) { + // $dst = PseudoCCMOVGPR $lhs, X0, $cc, $falsev (=$dst), $truev + // $dst = PseudoCCMOVGPRNoX0 $lhs, X0, $cc, $falsev (=$dst), $truev + // => + // $dst = QC_MVccI $falsev (=$dst), $lhs, 0, $truev + BuildMI(MBB, MBBI, DL, TII->get(CMovIOpcode)) + .addDef(MI.getOperand(0).getReg()) + .addReg(MI.getOperand(4).getReg()) + .addReg(MI.getOperand(1).getReg()) + .addImm(0) + .addReg(MI.getOperand(5).getReg()); + + MI.eraseFromParent(); + return true; + } + + // $dst = PseudoCCMOVGPR $lhs, $rhs, $cc, $falsev (=$dst), $truev + // $dst = PseudoCCMOVGPRNoX0 $lhs, $rhs, $cc, $falsev (=$dst), $truev + // => + // $dst = QC_MVcc $falsev (=$dst), $lhs, $rhs, $truev + BuildMI(MBB, MBBI, DL, TII->get(CMovOpcode)) + .addDef(MI.getOperand(0).getReg()) + .addReg(MI.getOperand(4).getReg()) + .addReg(MI.getOperand(1).getReg()) + .addReg(MI.getOperand(2).getReg()) + .addReg(MI.getOperand(5).getReg()); + MI.eraseFromParent(); + return true; +} + bool RISCVExpandPseudo::expandVMSET_VMCLR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned Opcode) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 540786851e2d5..efdbd1298aec6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1350,6 +1350,10 @@ class QCIMVCCIPat : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), InTyImm:$imm, Cond, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))), (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, GPRNoX0:$rs3)>; +class QCIMVCCIZeroPat + : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), Cond, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>; + class QCISELECTCCIPat : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rd), simm5:$imm, Cond, (i32 GPRNoX0:$rs2), (i32 GPRNoX0:$rs3))), (Inst GPRNoX0:$rd, simm5:$imm, GPRNoX0:$rs2, GPRNoX0:$rs3)>; @@ -1538,14 +1542,7 @@ def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>; let Predicates = [HasVendorXqciint, IsRV32] in def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>; -let Predicates = [HasVendorXqcicm, IsRV32] in { -// (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`. 
-// This exists to prioritise over the `Select_GPR_Using_CC_GPR` pattern. -def : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), SETNE, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))), - (QC_MVNEI GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>; -def : Pat<(i32 (riscv_selectcc (i32 GPRNoX0:$rs1), (i32 0), SETEQ, (i32 GPRNoX0:$rs3), (i32 GPRNoX0:$rd))), - (QC_MVEQI GPRNoX0:$rd, GPRNoX0:$rs1, 0, GPRNoX0:$rs3)>; - +let Predicates = [HasVendorXqcicm, NoShortForwardBranchOpt, IsRV32] in { def : QCIMVCCPat; def : QCIMVCCPat; def : QCIMVCCPat; @@ -1553,12 +1550,24 @@ def : QCIMVCCPat; def : QCIMVCCPat; def : QCIMVCCPat; -def : QCIMVCCIPat; -def : QCIMVCCIPat; -def : QCIMVCCIPat; -def : QCIMVCCIPat; -def : QCIMVCCIPat; -def : QCIMVCCIPat; +// These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern for X0. +def : QCIMVCCIZeroPat; +def : QCIMVCCIZeroPat; +def : QCIMVCCIZeroPat; +def : QCIMVCCIZeroPat; +def : QCIMVCCIZeroPat; +def : QCIMVCCIZeroPat; +} + +let Predicates = [HasVendorXqcicm, IsRV32] in { +// These all use *imm5nonzero because we want to use PseudoCCMOVGPR with X0 when SFB is enabled. +// When SFB is not enabled, the `QCIMVCCIZeroPat`s above will be used if RHS=0. +def : QCIMVCCIPat; +def : QCIMVCCIPat; +def : QCIMVCCIPat; +def : QCIMVCCIPat; +def : QCIMVCCIPat; +def : QCIMVCCIPat; } let Predicates = [HasVendorXqcicli, IsRV32] in { diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll index f8b1d505f4e81..edec1d0b649ce 100644 --- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll @@ -11,6 +11,8 @@ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND,SFB-NOZICOND-C %s ; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-ZICOND %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32IXQCI ; The conditional move optimization in sifive-p450 requires that only a ; single c.mv instruction appears in the branch shadow. 
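[Editor's aside, not part of the patch] A brief note on why `PseudoCCMOVGPR` maps cleanly onto Xqcicm in the tests that follow: the qc.mvCC instructions treat the destination as a tied input that is overwritten only when the comparison holds, which is exactly the false-value/true-value shape of the pseudo. The C++ model below is an illustrative sketch with placeholder names, shown for the signed-less-than variant; the other qc.mvCC variants differ only in the comparison used.

#include <cassert>
#include <cstdint>

// Model of Xqcicm "qc.mvlt rd, rs1, rs2, rs3": rd is a tied input/output
// that is overwritten with rs3 only when rs1 < rs2 (signed); otherwise it
// keeps its old (false) value.
static uint32_t qcMvltModel(uint32_t rd, int32_t rs1, int32_t rs2,
                            uint32_t rs3) {
  return (rs1 < rs2) ? rs3 : rd;
}

int main() {
  // select (slt a1, a2), a3, a0  ->  qc.mvlt a0, a1, a2, a3
  assert(qcMvltModel(/*rd=*/7, /*rs1=*/1, /*rs2=*/2, /*rs3=*/9) == 9);
  assert(qcMvltModel(/*rd=*/7, /*rs1=*/5, /*rs2=*/2, /*rs3=*/9) == 7);
  return 0;
}

With that shape, `expandCCOpToCMov` can forward the pseudo's operands directly, falling back to the immediate forms (qc.mvCCi with immediate 0) when the right-hand side is X0, as the expansion code above shows.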
@@ -42,6 +44,14 @@ define signext i32 @test1(i32 signext %x, i32 signext %y, i32 signext %z) { ; SHORT_FORWARD-NEXT: xor a0, a0, a1 ; SHORT_FORWARD-NEXT: .LBB0_2: ; SHORT_FORWARD-NEXT: ret +; +; RV32IXQCI-LABEL: test1: +; RV32IXQCI: # %bb.0: +; RV32IXQCI-NEXT: bnez a2, .LBB0_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB0_2: +; RV32IXQCI-NEXT: ret %c = icmp eq i32 %z, 0 %a = xor i32 %x, %y %b = select i1 %c, i32 %a, i32 %x @@ -73,6 +83,14 @@ define signext i32 @test2(i32 signext %x, i32 signext %y, i32 signext %z) { ; SHORT_FORWARD-NEXT: xor a0, a0, a1 ; SHORT_FORWARD-NEXT: .LBB1_2: ; SHORT_FORWARD-NEXT: ret +; +; RV32IXQCI-LABEL: test2: +; RV32IXQCI: # %bb.0: +; RV32IXQCI-NEXT: beqz a2, .LBB1_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB1_2: +; RV32IXQCI-NEXT: ret %c = icmp eq i32 %z, 0 %a = xor i32 %x, %y %b = select i1 %c, i32 %x, i32 %a @@ -120,6 +138,19 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si ; SHORT_FORWARD-NEXT: .LBB2_4: ; SHORT_FORWARD-NEXT: addw a0, a0, a2 ; SHORT_FORWARD-NEXT: ret +; +; RV32IXQCI-LABEL: test3: +; RV32IXQCI: # %bb.0: +; RV32IXQCI-NEXT: beqz a4, .LBB2_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB2_2: +; RV32IXQCI-NEXT: beqz a4, .LBB2_4 +; RV32IXQCI-NEXT: # %bb.3: +; RV32IXQCI-NEXT: xor a2, a2, a3 +; RV32IXQCI-NEXT: .LBB2_4: +; RV32IXQCI-NEXT: add a0, a0, a2 +; RV32IXQCI-NEXT: ret %c = icmp eq i32 %z, 0 %a = xor i32 %v, %w %b = select i1 %c, i32 %v, i32 %a @@ -167,6 +198,12 @@ define signext i32 @test4(i32 signext %x, i32 signext %y, i32 signext %z) { ; SFB-ZICOND-NEXT: li a0, 3 ; SFB-ZICOND-NEXT: czero.nez a0, a0, a2 ; SFB-ZICOND-NEXT: ret +; +; RV32IXQCI-LABEL: test4: +; RV32IXQCI: # %bb.0: +; RV32IXQCI-NEXT: li a0, 0 +; RV32IXQCI-NEXT: qc.lieqi a0, a2, 0, 3 +; RV32IXQCI-NEXT: ret %c = icmp eq i32 %z, 0 %a = select i1 %c, i32 3, i32 0 ret i32 %a @@ -199,6 +236,15 @@ define i16 @select_xor_1(i16 %A, i8 %cond) { ; SHORT_FORWARD-NEXT: xori a0, a0, 43 ; SHORT_FORWARD-NEXT: .LBB4_2: # %entry ; SHORT_FORWARD-NEXT: ret +; +; RV32IXQCI-LABEL: select_xor_1: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a1, a1, 1 +; RV32IXQCI-NEXT: beqz a1, .LBB4_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xori a0, a0, 43 +; RV32IXQCI-NEXT: .LBB4_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 %cmp10 = icmp eq i8 %and, 0 @@ -236,6 +282,15 @@ define i16 @select_xor_1b(i16 %A, i8 %cond) { ; SHORT_FORWARD-NEXT: xori a0, a0, 43 ; SHORT_FORWARD-NEXT: .LBB5_2: # %entry ; SHORT_FORWARD-NEXT: ret +; +; RV32IXQCI-LABEL: select_xor_1b: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a1, a1, 1 +; RV32IXQCI-NEXT: beqz a1, .LBB5_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xori a0, a0, 43 +; RV32IXQCI-NEXT: .LBB5_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 %cmp10 = icmp ne i8 %and, 1 @@ -289,6 +344,15 @@ define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) { ; SFB-ZICOND-NEXT: xor a0, a1, a0 ; SFB-ZICOND-NEXT: .LBB6_2: # %entry ; SFB-ZICOND-NEXT: ret +; +; RV32IXQCI-LABEL: select_xor_2: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB6_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB6_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 %cmp10 = icmp eq i8 %and, 0 @@ -344,6 +408,15 @@ define i32 @select_xor_2b(i32 %A, i32 %B, i8 %cond) { ; 
SFB-ZICOND-NEXT: xor a0, a1, a0 ; SFB-ZICOND-NEXT: .LBB7_2: # %entry ; SFB-ZICOND-NEXT: ret +; +; RV32IXQCI-LABEL: select_xor_2b: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB7_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB7_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 %cmp10 = icmp ne i8 %and, 1 @@ -397,6 +470,15 @@ define i32 @select_or(i32 %A, i32 %B, i8 %cond) { ; SFB-ZICOND-NEXT: or a0, a1, a0 ; SFB-ZICOND-NEXT: .LBB8_2: # %entry ; SFB-ZICOND-NEXT: ret +; +; RV32IXQCI-LABEL: select_or: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB8_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB8_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 %cmp10 = icmp eq i8 %and, 0 @@ -452,6 +534,15 @@ define i32 @select_or_b(i32 %A, i32 %B, i8 %cond) { ; SFB-ZICOND-NEXT: or a0, a1, a0 ; SFB-ZICOND-NEXT: .LBB9_2: # %entry ; SFB-ZICOND-NEXT: ret +; +; RV32IXQCI-LABEL: select_or_b: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB9_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB9_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 %cmp10 = icmp ne i8 %and, 1 @@ -505,6 +596,15 @@ define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) { ; SFB-ZICOND-NEXT: or a0, a1, a0 ; SFB-ZICOND-NEXT: .LBB10_2: # %entry ; SFB-ZICOND-NEXT: ret +; +; RV32IXQCI-LABEL: select_or_1: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB10_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB10_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i32 %cond, 1 %cmp10 = icmp eq i32 %and, 0 @@ -560,6 +660,15 @@ define i32 @select_or_1b(i32 %A, i32 %B, i32 %cond) { ; SFB-ZICOND-NEXT: or a0, a1, a0 ; SFB-ZICOND-NEXT: .LBB11_2: # %entry ; SFB-ZICOND-NEXT: ret +; +; RV32IXQCI-LABEL: select_or_1b: +; RV32IXQCI: # %bb.0: # %entry +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB11_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB11_2: # %entry +; RV32IXQCI-NEXT: ret entry: %and = and i32 %cond, 1 %cmp10 = icmp ne i32 %and, 1 diff --git a/llvm/test/CodeGen/RISCV/select-bare.ll b/llvm/test/CodeGen/RISCV/select-bare.ll index 44028a7651b95..550eb94724ff2 100644 --- a/llvm/test/CodeGen/RISCV/select-bare.ll +++ b/llvm/test/CodeGen/RISCV/select-bare.ll @@ -3,7 +3,7 @@ ; RUN: | FileCheck %s -check-prefix=RV32I ; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I-CCMOV %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCI define i32 @bare_select(i1 %a, i32 %b, i32 %c) nounwind { diff --git a/llvm/test/CodeGen/RISCV/select-cc.ll b/llvm/test/CodeGen/RISCV/select-cc.ll index b57f625cb867f..95f5a9d0925de 100644 --- a/llvm/test/CodeGen/RISCV/select-cc.ll +++ b/llvm/test/CodeGen/RISCV/select-cc.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc 
-mtriple=riscv32 -disable-block-placement -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32I %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCI ; RUN: llc -mtriple=riscv64 -disable-block-placement -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64I %s @@ -88,39 +88,38 @@ define signext i32 @foo(i32 signext %a, ptr %b) nounwind { ; RV32IXQCI-LABEL: foo: ; RV32IXQCI: # %bb.0: ; RV32IXQCI-NEXT: lw a2, 0(a1) -; RV32IXQCI-NEXT: lw a4, 0(a1) -; RV32IXQCI-NEXT: lw t5, 0(a1) -; RV32IXQCI-NEXT: lw t4, 0(a1) -; RV32IXQCI-NEXT: lw t3, 0(a1) -; RV32IXQCI-NEXT: lw t2, 0(a1) -; RV32IXQCI-NEXT: lw t0, 0(a1) -; RV32IXQCI-NEXT: lw a7, 0(a1) -; RV32IXQCI-NEXT: lw a6, 0(a1) ; RV32IXQCI-NEXT: lw a3, 0(a1) -; RV32IXQCI-NEXT: lw t1, 0(a1) +; RV32IXQCI-NEXT: lw a4, 0(a1) ; RV32IXQCI-NEXT: lw a5, 0(a1) -; RV32IXQCI-NEXT: bltz t1, .LBB0_2 +; RV32IXQCI-NEXT: qc.mvne a0, a0, a2, a2 +; RV32IXQCI-NEXT: qc.mveq a0, a0, a3, a3 +; RV32IXQCI-NEXT: lw a2, 0(a1) +; RV32IXQCI-NEXT: qc.mvgeu a0, a4, a0, a4 +; RV32IXQCI-NEXT: lw a3, 0(a1) +; RV32IXQCI-NEXT: qc.mvltu a0, a0, a5, a5 +; RV32IXQCI-NEXT: lw a4, 0(a1) +; RV32IXQCI-NEXT: qc.mvgeu a0, a0, a2, a2 +; RV32IXQCI-NEXT: lw a2, 0(a1) +; RV32IXQCI-NEXT: qc.mvltu a0, a3, a0, a3 +; RV32IXQCI-NEXT: lw a3, 0(a1) +; RV32IXQCI-NEXT: qc.mvge a0, a4, a0, a4 +; RV32IXQCI-NEXT: lw a4, 0(a1) +; RV32IXQCI-NEXT: qc.mvlt a0, a0, a2, a2 +; RV32IXQCI-NEXT: lw a2, 0(a1) +; RV32IXQCI-NEXT: qc.mvge a0, a0, a3, a3 +; RV32IXQCI-NEXT: lw a3, 0(a1) +; RV32IXQCI-NEXT: qc.mvlt a0, a4, a0, a4 +; RV32IXQCI-NEXT: lw a4, 0(a1) +; RV32IXQCI-NEXT: lw a1, 0(a1) +; RV32IXQCI-NEXT: blez a2, .LBB0_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: li a5, 0 -; RV32IXQCI-NEXT: qc.mveq a2, a0, a2, a0 -; RV32IXQCI-NEXT: qc.mvne a4, a2, a4, a2 -; RV32IXQCI-NEXT: qc.mvltu t5, t5, a4, a4 -; RV32IXQCI-NEXT: qc.mvgeu t4, t5, t4, t5 -; RV32IXQCI-NEXT: qc.mvltu t3, t4, t3, t4 -; RV32IXQCI-NEXT: qc.mvgeu t2, t2, t3, t3 -; RV32IXQCI-NEXT: qc.mvlt t0, t0, t2, t2 -; RV32IXQCI-NEXT: qc.mvge a7, t0, a7, t0 -; RV32IXQCI-NEXT: qc.mvlt a6, a7, a6, a7 -; RV32IXQCI-NEXT: qc.mvge a3, a3, a6, a6 -; RV32IXQCI-NEXT: qc.mvlt a3, a5, t1, t1 -; RV32IXQCI-NEXT: mv a5, a3 +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: .LBB0_2: -; RV32IXQCI-NEXT: lw a2, 0(a1) -; RV32IXQCI-NEXT: lw a0, 0(a1) -; RV32IXQCI-NEXT: li a1, 1024 -; RV32IXQCI-NEXT: qc.mvlt a2, a1, a2, a5 -; RV32IXQCI-NEXT: li a1, 2046 -; RV32IXQCI-NEXT: qc.mvltu a0, a1, t1, a2 +; RV32IXQCI-NEXT: qc.mvlti a0, a2, 0, a3 +; RV32IXQCI-NEXT: li a3, 1024 +; RV32IXQCI-NEXT: qc.mvge a0, a3, a4, a4 +; RV32IXQCI-NEXT: li a3, 2046 +; RV32IXQCI-NEXT: qc.mvgeu a0, a3, a2, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: foo: diff --git a/llvm/test/CodeGen/RISCV/select-cond.ll b/llvm/test/CodeGen/RISCV/select-cond.ll index 3ca0f46e8c02f..a3c48737edc3c 100644 --- a/llvm/test/CodeGen/RISCV/select-cond.ll +++ b/llvm/test/CodeGen/RISCV/select-cond.ll @@ -7,7 +7,7 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-XQCICM ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32-XQCICS -; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli 
-verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCI ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV64 diff --git a/llvm/test/CodeGen/RISCV/select-const.ll b/llvm/test/CodeGen/RISCV/select-const.ll index 65d10bb823418..dfac6e1630d25 100644 --- a/llvm/test/CodeGen/RISCV/select-const.ll +++ b/llvm/test/CodeGen/RISCV/select-const.ll @@ -5,7 +5,7 @@ ; RUN: | FileCheck -check-prefixes=RV32,RV32IF %s ; RUN: llc -mtriple=riscv32 -mattr=+zicond -target-abi=ilp32 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV32,RV32ZICOND %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCI ; RUN: llc -mtriple=riscv64 -target-abi=lp64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=RV64,RV64I %s @@ -579,9 +579,9 @@ define i32 @select_slt_zero_constant1_constant2(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_slt_zero_constant1_constant2: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srai a0, a0, 31 -; RV32IXQCI-NEXT: andi a0, a0, 10 -; RV32IXQCI-NEXT: addi a0, a0, -3 +; RV32IXQCI-NEXT: li a1, -3 +; RV32IXQCI-NEXT: qc.lilti a1, a0, 0, 7 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_slt_zero_constant1_constant2: @@ -605,9 +605,9 @@ define i32 @select_sgt_negative_one_constant1_constant2(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_sgt_negative_one_constant1_constant2: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srai a0, a0, 31 -; RV32IXQCI-NEXT: andi a0, a0, -10 -; RV32IXQCI-NEXT: addi a0, a0, 7 +; RV32IXQCI-NEXT: li a1, -3 +; RV32IXQCI-NEXT: qc.ligei a1, a0, 0, 7 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_sgt_negative_one_constant1_constant2: @@ -653,12 +653,10 @@ define i32 @select_nonnegative_lui_addi(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_nonnegative_lui_addi: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: mv a1, a0 -; RV32IXQCI-NEXT: lui a0, 4 -; RV32IXQCI-NEXT: bgez a1, .LBB21_2 -; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: li a0, 25 -; RV32IXQCI-NEXT: .LBB21_2: +; RV32IXQCI-NEXT: lui a2, 4 +; RV32IXQCI-NEXT: li a1, 25 +; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: select_nonnegative_lui_addi: @@ -726,12 +724,10 @@ define i32 @select_nonnegative_lui_addi_swapped(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_nonnegative_lui_addi_swapped: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: bgez a0, .LBB22_2 -; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: lui a0, 4 -; RV32IXQCI-NEXT: ret -; RV32IXQCI-NEXT: .LBB22_2: -; RV32IXQCI-NEXT: li a0, 25 +; RV32IXQCI-NEXT: li a2, 25 +; RV32IXQCI-NEXT: lui a1, 4 +; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: select_nonnegative_lui_addi_swapped: @@ -801,13 +797,13 @@ define i32 @diff_shl_addi(i32 signext %x) { ; ; RV32IXQCI-LABEL: diff_shl_addi: ; RV32IXQCI: # %bb.0: +; RV32IXQCI-NEXT: lui a2, 4 +; RV32IXQCI-NEXT: li a1, 25 ; RV32IXQCI-NEXT: bgez a0, .LBB23_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: lui a0, 4 
-; RV32IXQCI-NEXT: addi a0, a0, 25 -; RV32IXQCI-NEXT: ret +; RV32IXQCI-NEXT: addi a1, a2, 25 ; RV32IXQCI-NEXT: .LBB23_2: -; RV32IXQCI-NEXT: li a0, 25 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: diff_shl_addi: @@ -876,13 +872,13 @@ define i32 @diff_shl_addi2(i32 signext %x) { ; ; RV32IXQCI-LABEL: diff_shl_addi2: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: bgez a0, .LBB24_2 +; RV32IXQCI-NEXT: lui a2, 4 +; RV32IXQCI-NEXT: li a1, 25 +; RV32IXQCI-NEXT: bltz a0, .LBB24_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: li a0, 25 -; RV32IXQCI-NEXT: ret +; RV32IXQCI-NEXT: addi a1, a2, 25 ; RV32IXQCI-NEXT: .LBB24_2: -; RV32IXQCI-NEXT: lui a0, 4 -; RV32IXQCI-NEXT: addi a0, a0, 25 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: diff_shl_addi2: @@ -929,9 +925,10 @@ define i32 @diff_pow2_24_16(i32 signext %x) { ; ; RV32IXQCI-LABEL: diff_pow2_24_16: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srai a0, a0, 31 -; RV32IXQCI-NEXT: andi a0, a0, -8 -; RV32IXQCI-NEXT: addi a0, a0, 24 +; RV32IXQCI-NEXT: li a2, 24 +; RV32IXQCI-NEXT: li a1, 16 +; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: diff_pow2_24_16: @@ -955,9 +952,10 @@ define i32 @diff_pow2_16_24(i32 signext %x) { ; ; RV32IXQCI-LABEL: diff_pow2_16_24: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srli a0, a0, 28 -; RV32IXQCI-NEXT: andi a0, a0, 8 -; RV32IXQCI-NEXT: addi a0, a0, 16 +; RV32IXQCI-NEXT: li a2, 16 +; RV32IXQCI-NEXT: li a1, 24 +; RV32IXQCI-NEXT: qc.mvgei a1, a0, 0, a2 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: diff_pow2_16_24: @@ -1008,14 +1006,14 @@ define i32 @zext_or_constant(i32 signext %x) { ; ; RV32IXQCI-LABEL: zext_or_constant: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: bgez a0, .LBB27_2 +; RV32IXQCI-NEXT: srli a2, a0, 31 +; RV32IXQCI-NEXT: lui a1, 140 +; RV32IXQCI-NEXT: addi a1, a1, 417 +; RV32IXQCI-NEXT: bltz a0, .LBB27_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: lui a0, 140 -; RV32IXQCI-NEXT: addi a0, a0, 417 -; RV32IXQCI-NEXT: ret +; RV32IXQCI-NEXT: xori a1, a2, 1 ; RV32IXQCI-NEXT: .LBB27_2: -; RV32IXQCI-NEXT: srli a0, a0, 31 -; RV32IXQCI-NEXT: xori a0, a0, 1 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: zext_or_constant: @@ -1095,14 +1093,14 @@ define i32 @zext_or_constant2(i32 signext %x) { ; ; RV32IXQCI-LABEL: zext_or_constant2: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: bltz a0, .LBB28_2 +; RV32IXQCI-NEXT: srli a2, a0, 31 +; RV32IXQCI-NEXT: lui a1, 140 +; RV32IXQCI-NEXT: addi a1, a1, 417 +; RV32IXQCI-NEXT: bgez a0, .LBB28_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: lui a0, 140 -; RV32IXQCI-NEXT: addi a0, a0, 417 -; RV32IXQCI-NEXT: ret +; RV32IXQCI-NEXT: xori a1, a2, 1 ; RV32IXQCI-NEXT: .LBB28_2: -; RV32IXQCI-NEXT: srli a0, a0, 31 -; RV32IXQCI-NEXT: xori a0, a0, 1 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: zext_or_constant2: @@ -1183,14 +1181,14 @@ define i32 @sext_or_constant(i32 signext %x) { ; ; RV32IXQCI-LABEL: sext_or_constant: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: bgez a0, .LBB29_2 +; RV32IXQCI-NEXT: srli a2, a0, 31 +; RV32IXQCI-NEXT: lui a1, 140 +; RV32IXQCI-NEXT: addi a1, a1, 417 +; RV32IXQCI-NEXT: bltz a0, .LBB29_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: lui a0, 140 -; RV32IXQCI-NEXT: addi a0, a0, 417 -; RV32IXQCI-NEXT: ret +; RV32IXQCI-NEXT: addi a1, a2, -1 ; RV32IXQCI-NEXT: .LBB29_2: -; RV32IXQCI-NEXT: srli a0, a0, 31 -; RV32IXQCI-NEXT: addi a0, a0, -1 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: sext_or_constant: @@ -1271,14 
+1269,14 @@ define i32 @sext_or_constant2(i32 signext %x) { ; ; RV32IXQCI-LABEL: sext_or_constant2: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: bltz a0, .LBB30_2 +; RV32IXQCI-NEXT: srli a2, a0, 31 +; RV32IXQCI-NEXT: lui a1, 140 +; RV32IXQCI-NEXT: addi a1, a1, 417 +; RV32IXQCI-NEXT: bgez a0, .LBB30_2 ; RV32IXQCI-NEXT: # %bb.1: -; RV32IXQCI-NEXT: lui a0, 140 -; RV32IXQCI-NEXT: addi a0, a0, 417 -; RV32IXQCI-NEXT: ret +; RV32IXQCI-NEXT: addi a1, a2, -1 ; RV32IXQCI-NEXT: .LBB30_2: -; RV32IXQCI-NEXT: srli a0, a0, 31 -; RV32IXQCI-NEXT: addi a0, a0, -1 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64I-LABEL: sext_or_constant2: @@ -1332,9 +1330,9 @@ define i32 @select_0_6(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_0_6: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srai a0, a0, 2 -; RV32IXQCI-NEXT: srli a0, a0, 30 -; RV32IXQCI-NEXT: slli a0, a0, 1 +; RV32IXQCI-NEXT: li a1, 6 +; RV32IXQCI-NEXT: qc.ligei a1, a0, 0, 0 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_0_6: @@ -1358,9 +1356,9 @@ define i32 @select_6_0(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_6_0: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srli a0, a0, 31 -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: andi a0, a0, 6 +; RV32IXQCI-NEXT: li a1, 0 +; RV32IXQCI-NEXT: qc.ligei a1, a0, 0, 6 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_6_0: @@ -1383,8 +1381,9 @@ define i32 @select_0_394(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_0_394: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srai a0, a0, 31 -; RV32IXQCI-NEXT: andi a0, a0, 394 +; RV32IXQCI-NEXT: li a1, 394 +; RV32IXQCI-NEXT: qc.ligei a1, a0, 0, 0 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_0_394: @@ -1407,9 +1406,9 @@ define i32 @select_394_0(i32 signext %x) { ; ; RV32IXQCI-LABEL: select_394_0: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: srli a0, a0, 31 -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: andi a0, a0, 394 +; RV32IXQCI-NEXT: li a1, 394 +; RV32IXQCI-NEXT: qc.lilti a1, a0, 0, 0 +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret ; ; RV64-LABEL: select_394_0: diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll index 8273c65bf512e..1eb47e4c0ede2 100644 --- a/llvm/test/CodeGen/RISCV/select.ll +++ b/llvm/test/CodeGen/RISCV/select.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+m,+xventanacondops -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64IMXVTCONDOPS %s ; RUN: llc -mtriple=riscv32 -mattr=+m,+zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV32IMZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,CHECKZICOND,RV64IMZICOND %s -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCI define i16 @select_xor_1(i16 %A, i8 %cond) { @@ -44,10 +44,11 @@ define i16 @select_xor_1(i16 %A, i8 %cond) { ; ; RV32IXQCI-LABEL: select_xor_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a1, a1, 31 -; RV32IXQCI-NEXT: srai a1, a1, 31 -; RV32IXQCI-NEXT: andi a1, a1, 43 -; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: andi a1, a1, 1 +; RV32IXQCI-NEXT: beqz a1, .LBB0_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xori a0, a0, 43 
+; RV32IXQCI-NEXT: .LBB0_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -102,10 +103,11 @@ define i16 @select_xor_1b(i16 %A, i8 %cond) { ; ; RV32IXQCI-LABEL: select_xor_1b: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a1, a1, 31 -; RV32IXQCI-NEXT: srai a1, a1, 31 -; RV32IXQCI-NEXT: andi a1, a1, 43 -; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: andi a1, a1, 1 +; RV32IXQCI-NEXT: beqz a1, .LBB1_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xori a0, a0, 43 +; RV32IXQCI-NEXT: .LBB1_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -148,10 +150,11 @@ define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) { ; ; RV32IXQCI-LABEL: select_xor_2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a2, a2, 31 -; RV32IXQCI-NEXT: srai a2, a2, 31 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB2_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB2_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -196,10 +199,11 @@ define i32 @select_xor_2b(i32 %A, i32 %B, i8 %cond) { ; ; RV32IXQCI-LABEL: select_xor_2b: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a2, a2, 31 -; RV32IXQCI-NEXT: srai a2, a2, 31 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB3_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB3_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -221,9 +225,10 @@ define i16 @select_xor_3(i16 %A, i8 %cond) { ; RV32IXQCI-LABEL: select_xor_3: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a1, a1, 1 -; RV32IXQCI-NEXT: addi a1, a1, -1 -; RV32IXQCI-NEXT: andi a1, a1, 43 -; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: bnez a1, .LBB4_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xori a0, a0, 43 +; RV32IXQCI-NEXT: .LBB4_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -247,9 +252,10 @@ define i16 @select_xor_3b(i16 %A, i8 %cond) { ; RV32IXQCI-LABEL: select_xor_3b: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a1, a1, 1 -; RV32IXQCI-NEXT: addi a1, a1, -1 -; RV32IXQCI-NEXT: andi a1, a1, 43 -; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: bnez a1, .LBB5_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: xori a0, a0, 43 +; RV32IXQCI-NEXT: .LBB5_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -293,9 +299,10 @@ define i32 @select_xor_4(i32 %A, i32 %B, i8 %cond) { ; RV32IXQCI-LABEL: select_xor_4: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a2, a2, 1 -; RV32IXQCI-NEXT: addi a2, a2, -1 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: bnez a2, .LBB6_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB6_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -341,9 +348,10 @@ define i32 @select_xor_4b(i32 %A, i32 %B, i8 %cond) { ; RV32IXQCI-LABEL: select_xor_4b: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a2, a2, 1 -; RV32IXQCI-NEXT: addi a2, a2, -1 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: bnez a2, .LBB7_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: xor a0, a0, a1 +; RV32IXQCI-NEXT: .LBB7_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -382,9 +390,12 @@ define i32 @select_xor_5(i1 zeroext %cond, i32 %x) { ; ; RV32IXQCI-LABEL: select_xor_5: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a1 -; 
RV32IXQCI-NEXT: xori a0, a0, 128 +; RV32IXQCI-NEXT: li a2, 128 +; RV32IXQCI-NEXT: bnez a0, .LBB8_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: xori a2, a1, 128 +; RV32IXQCI-NEXT: .LBB8_2: +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret %add = xor i32 %x, 128 %sel = select i1 %cond, i32 128, i32 %add @@ -424,10 +435,11 @@ define i32 @select_or(i32 %A, i32 %B, i8 %cond) { ; ; RV32IXQCI-LABEL: select_or: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a2, a2, 31 -; RV32IXQCI-NEXT: srai a2, a2, 31 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB9_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB9_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -472,10 +484,11 @@ define i32 @select_or_b(i32 %A, i32 %B, i8 %cond) { ; ; RV32IXQCI-LABEL: select_or_b: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a2, a2, 31 -; RV32IXQCI-NEXT: srai a2, a2, 31 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB10_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB10_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -518,10 +531,11 @@ define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) { ; ; RV32IXQCI-LABEL: select_or_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a2, a2, 31 -; RV32IXQCI-NEXT: srai a2, a2, 31 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB11_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB11_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i32 %cond, 1 @@ -566,10 +580,11 @@ define i32 @select_or_1b(i32 %A, i32 %B, i32 %cond) { ; ; RV32IXQCI-LABEL: select_or_1b: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: slli a2, a2, 31 -; RV32IXQCI-NEXT: srai a2, a2, 31 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: andi a2, a2, 1 +; RV32IXQCI-NEXT: beqz a2, .LBB12_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB12_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i32 %cond, 1 @@ -613,9 +628,10 @@ define i32 @select_or_2(i32 %A, i32 %B, i8 %cond) { ; RV32IXQCI-LABEL: select_or_2: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a2, a2, 1 -; RV32IXQCI-NEXT: addi a2, a2, -1 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: bnez a2, .LBB13_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB13_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -661,9 +677,10 @@ define i32 @select_or_2b(i32 %A, i32 %B, i8 %cond) { ; RV32IXQCI-LABEL: select_or_2b: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a2, a2, 1 -; RV32IXQCI-NEXT: addi a2, a2, -1 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: bnez a2, .LBB14_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB14_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i8 %cond, 1 @@ -707,9 +724,10 @@ define i32 @select_or_3(i32 %A, i32 %B, i32 %cond) { ; RV32IXQCI-LABEL: select_or_3: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a2, a2, 1 -; RV32IXQCI-NEXT: addi a2, a2, -1 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: bnez a2, .LBB15_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB15_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i32 %cond, 1 @@ -755,9 +773,10 @@ define i32 @select_or_3b(i32 
%A, i32 %B, i32 %cond) { ; RV32IXQCI-LABEL: select_or_3b: ; RV32IXQCI: # %bb.0: # %entry ; RV32IXQCI-NEXT: andi a2, a2, 1 -; RV32IXQCI-NEXT: addi a2, a2, -1 -; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: bnez a2, .LBB16_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry ; RV32IXQCI-NEXT: or a0, a0, a1 +; RV32IXQCI-NEXT: .LBB16_2: # %entry ; RV32IXQCI-NEXT: ret entry: %and = and i32 %cond, 1 @@ -796,9 +815,12 @@ define i32 @select_or_4(i1 zeroext %cond, i32 %x) { ; ; RV32IXQCI-LABEL: select_or_4: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a1 -; RV32IXQCI-NEXT: ori a0, a0, 128 +; RV32IXQCI-NEXT: li a2, 128 +; RV32IXQCI-NEXT: bnez a0, .LBB17_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: ori a2, a1, 128 +; RV32IXQCI-NEXT: .LBB17_2: +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret %add = or i32 %x, 128 %sel = select i1 %cond, i32 128, i32 %add @@ -840,9 +862,11 @@ define i32 @select_add_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_add_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: neg a0, a0 -; RV32IXQCI-NEXT: and a0, a0, a1 -; RV32IXQCI-NEXT: add a0, a0, a2 +; RV32IXQCI-NEXT: beqz a0, .LBB18_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: add a2, a2, a1 +; RV32IXQCI-NEXT: .LBB18_2: # %entry +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %c = add i32 %a, %b @@ -885,9 +909,11 @@ define i32 @select_add_2(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_add_2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a2 -; RV32IXQCI-NEXT: add a0, a0, a1 +; RV32IXQCI-NEXT: bnez a0, .LBB19_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: add a1, a1, a2 +; RV32IXQCI-NEXT: .LBB19_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = add i32 %a, %b @@ -933,9 +959,11 @@ define i32 @select_add_3(i1 zeroext %cond, i32 %a) { ; ; RV32IXQCI-LABEL: select_add_3: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: andi a0, a0, 42 -; RV32IXQCI-NEXT: add a0, a0, a1 +; RV32IXQCI-NEXT: bnez a0, .LBB20_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: addi a1, a1, 42 +; RV32IXQCI-NEXT: .LBB20_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = add i32 %a, 42 @@ -978,9 +1006,12 @@ define i32 @select_add_4(i1 zeroext %cond, i32 %x) { ; ; RV32IXQCI-LABEL: select_add_4: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a1 -; RV32IXQCI-NEXT: addi a0, a0, 128 +; RV32IXQCI-NEXT: li a2, 128 +; RV32IXQCI-NEXT: bnez a0, .LBB21_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: addi a2, a1, 128 +; RV32IXQCI-NEXT: .LBB21_2: +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret %add = add i32 %x, 128 %sel = select i1 %cond, i32 128, i32 %add @@ -1029,12 +1060,14 @@ define i64 @select_add_5(i1 zeroext %cond, i64 %x) { ; ; RV32IXQCI-LABEL: select_add_5: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: addi a3, a0, -1 -; RV32IXQCI-NEXT: and a1, a1, a3 -; RV32IXQCI-NEXT: addi a0, a1, 128 -; RV32IXQCI-NEXT: sltu a1, a0, a1 -; RV32IXQCI-NEXT: and a2, a2, a3 -; RV32IXQCI-NEXT: add a1, a1, a2 +; RV32IXQCI-NEXT: mv a3, a0 +; RV32IXQCI-NEXT: addi a4, a1, 128 +; RV32IXQCI-NEXT: sltu a0, a4, a1 +; RV32IXQCI-NEXT: add a2, a2, a0 +; RV32IXQCI-NEXT: li a0, 128 +; RV32IXQCI-NEXT: qc.mveqi a0, a3, 0, a4 +; RV32IXQCI-NEXT: qc.selectieqi a3, 0, a2, 0 +; RV32IXQCI-NEXT: mv a1, a3 ; RV32IXQCI-NEXT: ret %add = add i64 %x, 128 %sel = select i1 %cond, i64 128, i64 %add @@ -1093,14 +1126,15 
@@ define i64 @select_add_6(i1 zeroext %cond, i64 %x) { ; ; RV32IXQCI-LABEL: select_add_6: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: addi a3, a0, -1 +; RV32IXQCI-NEXT: mv a3, a0 ; RV32IXQCI-NEXT: lui a0, 14 -; RV32IXQCI-NEXT: and a1, a1, a3 -; RV32IXQCI-NEXT: addi a0, a0, 1005 -; RV32IXQCI-NEXT: add a0, a0, a1 +; RV32IXQCI-NEXT: addi a4, a0, 1005 +; RV32IXQCI-NEXT: add a0, a1, a4 ; RV32IXQCI-NEXT: sltu a1, a0, a1 -; RV32IXQCI-NEXT: and a2, a2, a3 ; RV32IXQCI-NEXT: add a1, a1, a2 +; RV32IXQCI-NEXT: qc.mvnei a0, a3, 0, a4 +; RV32IXQCI-NEXT: qc.selectieqi a3, 0, a1, 0 +; RV32IXQCI-NEXT: mv a1, a3 ; RV32IXQCI-NEXT: ret %add = add i64 %x, 58349 %sel = select i1 %cond, i64 58349, i64 %add @@ -1152,9 +1186,11 @@ define i32 @select_sub_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_sub_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: sub a1, a1, a2 -; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: beqz a0, .LBB24_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: sub a2, a1, a2 +; RV32IXQCI-NEXT: .LBB24_2: # %entry +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %c = sub i32 %a, %b @@ -1197,9 +1233,11 @@ define i32 @select_sub_2(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_sub_2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a2 -; RV32IXQCI-NEXT: sub a0, a1, a0 +; RV32IXQCI-NEXT: bnez a0, .LBB25_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: sub a1, a1, a2 +; RV32IXQCI-NEXT: .LBB25_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = sub i32 %a, %b @@ -1245,9 +1283,11 @@ define i32 @select_sub_3(i1 zeroext %cond, i32 %a) { ; ; RV32IXQCI-LABEL: select_sub_3: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: andi a0, a0, 42 -; RV32IXQCI-NEXT: sub a0, a1, a0 +; RV32IXQCI-NEXT: bnez a0, .LBB26_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: addi a1, a1, -42 +; RV32IXQCI-NEXT: .LBB26_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = sub i32 %a, 42 @@ -1301,10 +1341,12 @@ define i32 @select_sub_4(i1 zeroext %cond, i32 %x) { ; ; RV32IXQCI-LABEL: select_sub_4: ; RV32IXQCI: # %bb.0: +; RV32IXQCI-NEXT: li a2, 128 +; RV32IXQCI-NEXT: bnez a0, .LBB27_2 +; RV32IXQCI-NEXT: # %bb.1: ; RV32IXQCI-NEXT: addi a2, a1, -128 -; RV32IXQCI-NEXT: li a1, 128 -; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: .LBB27_2: +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret %add = sub i32 %x, 128 %sel = select i1 %cond, i32 128, i32 %add @@ -1347,9 +1389,11 @@ define i32 @select_and_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_and_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: and a1, a1, a2 -; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: beqz a0, .LBB28_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: and a2, a2, a1 +; RV32IXQCI-NEXT: .LBB28_2: # %entry +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %c = and i32 %a, %b @@ -1392,9 +1436,11 @@ define i32 @select_and_2(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_and_2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: and a2, a2, a1 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: bnez a0, .LBB29_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: and a1, a1, a2 +; RV32IXQCI-NEXT: .LBB29_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; 
RV32IXQCI-NEXT: ret entry: %c = and i32 %a, %b @@ -1437,9 +1483,11 @@ define i32 @select_and_3(i1 zeroext %cond, i32 %a) { ; ; RV32IXQCI-LABEL: select_and_3: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: andi a2, a1, 42 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: bnez a0, .LBB30_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: andi a1, a1, 42 +; RV32IXQCI-NEXT: .LBB30_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = and i32 %a, 42 @@ -1626,9 +1674,11 @@ define i32 @select_udiv_3(i1 zeroext %cond, i32 %a) { ; RV32IXQCI-NEXT: lui a3, 199729 ; RV32IXQCI-NEXT: addi a3, a3, -975 ; RV32IXQCI-NEXT: mulhu a2, a2, a3 -; RV32IXQCI-NEXT: srli a2, a2, 2 -; RV32IXQCI-NEXT: qc.mvnei a2, a0, 0, a1 -; RV32IXQCI-NEXT: mv a0, a2 +; RV32IXQCI-NEXT: bnez a0, .LBB33_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: srli a1, a2, 2 +; RV32IXQCI-NEXT: .LBB33_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = udiv i32 %a, 42 @@ -1681,9 +1731,11 @@ define i32 @select_shl_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_shl_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: sll a1, a1, a2 -; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: beqz a0, .LBB34_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: sll a2, a1, a2 +; RV32IXQCI-NEXT: .LBB34_2: # %entry +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %c = shl i32 %a, %b @@ -1726,9 +1778,11 @@ define i32 @select_shl_2(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_shl_2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a2 -; RV32IXQCI-NEXT: sll a0, a1, a0 +; RV32IXQCI-NEXT: bnez a0, .LBB35_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: sll a1, a1, a2 +; RV32IXQCI-NEXT: .LBB35_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = shl i32 %a, %b @@ -1797,9 +1851,11 @@ define i32 @select_ashr_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_ashr_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: sra a1, a1, a2 -; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: beqz a0, .LBB37_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: sra a2, a1, a2 +; RV32IXQCI-NEXT: .LBB37_2: # %entry +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %c = ashr i32 %a, %b @@ -1842,9 +1898,11 @@ define i32 @select_ashr_2(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_ashr_2: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a2 -; RV32IXQCI-NEXT: sra a0, a1, a0 +; RV32IXQCI-NEXT: bnez a0, .LBB38_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: sra a1, a1, a2 +; RV32IXQCI-NEXT: .LBB38_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = ashr i32 %a, %b @@ -1913,9 +1971,11 @@ define i32 @select_lshr_1(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_lshr_1: ; RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: srl a1, a1, a2 -; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 -; RV32IXQCI-NEXT: mv a0, a1 +; RV32IXQCI-NEXT: beqz a0, .LBB40_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: srl a2, a1, a2 +; RV32IXQCI-NEXT: .LBB40_2: # %entry +; RV32IXQCI-NEXT: mv a0, a2 ; RV32IXQCI-NEXT: ret entry: %c = lshr i32 %a, %b @@ -1958,9 +2018,11 @@ define i32 @select_lshr_2(i1 zeroext %cond, i32 %a, i32 %b) { ; ; RV32IXQCI-LABEL: select_lshr_2: ; 
RV32IXQCI: # %bb.0: # %entry -; RV32IXQCI-NEXT: addi a0, a0, -1 -; RV32IXQCI-NEXT: and a0, a0, a2 -; RV32IXQCI-NEXT: srl a0, a1, a0 +; RV32IXQCI-NEXT: bnez a0, .LBB41_2 +; RV32IXQCI-NEXT: # %bb.1: # %entry +; RV32IXQCI-NEXT: srl a1, a1, a2 +; RV32IXQCI-NEXT: .LBB41_2: # %entry +; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret entry: %c = lshr i32 %a, %b @@ -2304,11 +2366,13 @@ define i32 @select_cst3(i1 zeroext %cond) { ; ; RV32IXQCI-LABEL: select_cst3: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: lui a1, 7 -; RV32IXQCI-NEXT: lui a2, 5 -; RV32IXQCI-NEXT: addi a3, a1, 1328 -; RV32IXQCI-NEXT: addi a1, a2, -480 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a3 +; RV32IXQCI-NEXT: lui a2, 7 +; RV32IXQCI-NEXT: lui a1, 5 +; RV32IXQCI-NEXT: addi a1, a1, -480 +; RV32IXQCI-NEXT: beqz a0, .LBB51_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: addi a1, a2, 1328 +; RV32IXQCI-NEXT: .LBB51_2: ; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %ret = select i1 %cond, i32 30000, i32 20000 @@ -2370,10 +2434,12 @@ define i32 @select_cst5(i1 zeroext %cond) { ; ; RV32IXQCI-LABEL: select_cst5: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: lui a1, 1 -; RV32IXQCI-NEXT: addi a2, a1, -2047 +; RV32IXQCI-NEXT: lui a2, 1 ; RV32IXQCI-NEXT: li a1, 2047 -; RV32IXQCI-NEXT: qc.mveqi a1, a0, 0, a2 +; RV32IXQCI-NEXT: bnez a0, .LBB53_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: addi a1, a2, -2047 +; RV32IXQCI-NEXT: .LBB53_2: ; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %ret = select i1 %cond, i32 2047, i32 2049 @@ -2417,10 +2483,12 @@ define i32 @select_cst5_invert(i1 zeroext %cond) { ; ; RV32IXQCI-LABEL: select_cst5_invert: ; RV32IXQCI: # %bb.0: -; RV32IXQCI-NEXT: lui a1, 1 -; RV32IXQCI-NEXT: addi a2, a1, -2047 +; RV32IXQCI-NEXT: lui a2, 1 ; RV32IXQCI-NEXT: li a1, 2047 -; RV32IXQCI-NEXT: qc.mvnei a1, a0, 0, a2 +; RV32IXQCI-NEXT: beqz a0, .LBB54_2 +; RV32IXQCI-NEXT: # %bb.1: +; RV32IXQCI-NEXT: addi a1, a2, -2047 +; RV32IXQCI-NEXT: .LBB54_2: ; RV32IXQCI-NEXT: mv a0, a1 ; RV32IXQCI-NEXT: ret %ret = select i1 %cond, i32 2049, i32 2047 diff --git a/llvm/test/CodeGen/RISCV/xqcicli.ll b/llvm/test/CodeGen/RISCV/xqcicli.ll index 8b976163351ae..8d4caa177513b 100644 --- a/llvm/test/CodeGen/RISCV/xqcicli.ll +++ b/llvm/test/CodeGen/RISCV/xqcicli.ll @@ -4,7 +4,7 @@ ; RUN: | FileCheck %s --check-prefixes=RV32I ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicli -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCICLI -; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCI define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) { diff --git a/llvm/test/CodeGen/RISCV/xqcicm.ll b/llvm/test/CodeGen/RISCV/xqcicm.ll index fb48301b1d8e8..8e934963c258b 100644 --- a/llvm/test/CodeGen/RISCV/xqcicm.ll +++ b/llvm/test/CodeGen/RISCV/xqcicm.ll @@ -6,7 +6,7 @@ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCICM ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=RV32IXQCICM -; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 
-mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s --check-prefixes=RV32IXQCI
 
 define i32 @select_example(i32 %cond, i32 %x, i32 %y) {
diff --git a/llvm/test/CodeGen/RISCV/xqcics.ll b/llvm/test/CodeGen/RISCV/xqcics.ll
index 60fc98c5de663..c0839c98c1348 100644
--- a/llvm/test/CodeGen/RISCV/xqcics.ll
+++ b/llvm/test/CodeGen/RISCV/xqcics.ll
@@ -6,7 +6,7 @@
 ; RUN:   | FileCheck %s --check-prefixes=RV32IXQCICS
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcics,+experimental-xqcicm -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s --check-prefixes=RV32IXQCICM
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcicm,+experimental-xqcics,+experimental-xqcicli,+zca,+short-forward-branch-opt,+conditional-cmv-fusion -verify-machineinstrs < %s \
 ; RUN:   | FileCheck %s --check-prefixes=RV32IXQCI
 
 define i32 @select_cc_example_eq_s1(i32 %a, i32 %b, i32 %x, i32 %y) {

From 57a9f79336abebbdfffd52270bb615f7189758d2 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Thu, 2 Oct 2025 01:11:51 +0800
Subject: [PATCH 407/878] [ValueTracking] Take PHI's poison-generating flags
 into account (#161530)

ninf/nnan in the phi node may produce poison values. They should be
considered in `isGuaranteedNotToBeUndefOrPoison`.
Closes https://github.com/llvm/llvm-project/issues/161524.

---
 llvm/lib/Analysis/ValueTracking.cpp           | 35 ++++++++++---------
 .../test/Transforms/InstCombine/freeze-phi.ll | 28 +++++++++++++++
 2 files changed, 46 insertions(+), 17 deletions(-)
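Note: the sketch below is a reduced, hypothetical illustration of the bug, not
part of the change itself (the function and value names are invented; text in
this slot between the diffstat and the diff is ignored by `git am`). Both
incoming values of the phi are well defined, yet the `ninf` flag makes the phi
itself yield poison whenever %d is infinite, so the `freeze` cannot simply be
folded away unless that flag is dropped:

  define float @sketch(i1 %c, float noundef %a) {
  entry:
    br i1 %c, label %then, label %exit

  then:
    ; may overflow to +inf for a large %a
    %d = fmul float %a, 2.0
    br label %exit

  exit:
    ; with ninf, this phi is poison whenever %d is +inf
    %p = phi ninf float [ %d, %then ], [ %a, %entry ]
    %p.fr = freeze float %p
    ret float %p.fr
  }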
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 6f11b250cf21f..09a8fbea065ac 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7651,25 +7651,26 @@ static bool isGuaranteedNotToBeUndefOrPoison(
     return true;
   }
 
-  if (const auto *PN = dyn_cast<PHINode>(V)) {
-    unsigned Num = PN->getNumIncomingValues();
-    bool IsWellDefined = true;
-    for (unsigned i = 0; i < Num; ++i) {
-      if (PN == PN->getIncomingValue(i))
-        continue;
-      auto *TI = PN->getIncomingBlock(i)->getTerminator();
-      if (!isGuaranteedNotToBeUndefOrPoison(PN->getIncomingValue(i), AC, TI,
-                                            DT, Depth + 1, Kind)) {
-        IsWellDefined = false;
-        break;
+  if (!::canCreateUndefOrPoison(Opr, Kind,
+                                /*ConsiderFlagsAndMetadata=*/true)) {
+    if (const auto *PN = dyn_cast<PHINode>(V)) {
+      unsigned Num = PN->getNumIncomingValues();
+      bool IsWellDefined = true;
+      for (unsigned i = 0; i < Num; ++i) {
+        if (PN == PN->getIncomingValue(i))
+          continue;
+        auto *TI = PN->getIncomingBlock(i)->getTerminator();
+        if (!isGuaranteedNotToBeUndefOrPoison(PN->getIncomingValue(i), AC, TI,
+                                              DT, Depth + 1, Kind)) {
+          IsWellDefined = false;
+          break;
+        }
       }
-    }
-    if (IsWellDefined)
+      if (IsWellDefined)
+        return true;
+    } else if (all_of(Opr->operands(), OpCheck))
       return true;
-  } else if (!::canCreateUndefOrPoison(Opr, Kind,
-                                       /*ConsiderFlagsAndMetadata*/ true) &&
-             all_of(Opr->operands(), OpCheck))
-    return true;
+  }
 
   if (auto *I = dyn_cast<Instruction>(V))
diff --git a/llvm/test/Transforms/InstCombine/freeze-phi.ll b/llvm/test/Transforms/InstCombine/freeze-phi.ll
index cdc9a5efe5933..62bb9dc31b76b 100644
--- a/llvm/test/Transforms/InstCombine/freeze-phi.ll
+++ b/llvm/test/Transforms/InstCombine/freeze-phi.ll
@@ -212,3 +212,31 @@ D:
   %y.fr = freeze i32 %y
   ret i32 %y.fr
 }
+
+; Make sure that the fmf in the phi node is dropped when the freeze gets folded.
+
+define float @pr161524(float noundef %arg) {
+; CHECK-LABEL: @pr161524(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[ARG:%.*]], i32 144)
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_EXIT:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[FADD:%.*]] = fadd float [[ARG]], 1.000000e+00
+; CHECK-NEXT:    br label [[IF_EXIT]]
+; CHECK:       if.exit:
+; CHECK-NEXT:    [[RET:%.*]] = phi float [ [[FADD]], [[IF_THEN]] ], [ [[ARG]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret float [[RET]]
+;
+entry:
+  %cond = tail call i1 @llvm.is.fpclass.f32(float %arg, i32 144)
+  br i1 %cond, label %if.then, label %if.exit
+
+if.then:
+  %fadd = fadd float %arg, 1.0
+  br label %if.exit
+
+if.exit:
+  %ret = phi ninf float [ %fadd, %if.then ], [ %arg, %entry ]
+  %ret.fr = freeze float %ret
+  ret float %ret.fr
+}

From e1bd9afd648f7d697da64775a79b5bd0d434b909 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Wed, 1 Oct 2025 17:16:34 +0000
Subject: [PATCH 408/878] [gn build] Port 2936a2c882d7

---
 llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn       | 2 ++
 llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn | 1 +
 2 files changed, 3 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn
index 2f692d752ee18..c37f43c637767 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn
@@ -4,9 +4,11 @@ static_library("CAS") {
     "ActionCache.cpp",
     "ActionCaches.cpp",
     "BuiltinCAS.cpp",
+    "DatabaseFile.cpp",
     "InMemoryCAS.cpp",
     "MappedFileRegionArena.cpp",
     "ObjectStore.cpp",
     "OnDiskCommon.cpp",
+    "OnDiskTrieRawHashMap.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn
index de6de0b119e9e..ccb447f1b7254 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn
@@ -10,6 +10,7 @@ unittest("CASTests") {
     "ActionCacheTest.cpp",
    "CASTestConfig.cpp",
     "ObjectStoreTest.cpp",
+    "OnDiskTrieRawHashMapTest.cpp",
     "ProgramTest.cpp",
   ]
 }

From 03cb514abe018985387d3e39e9e78bcfe339e874 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Wed, 1 Oct 2025 17:16:35 +0000
Subject: [PATCH 409/878] [gn build] Port da315a352880

---
 llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn
index d4ec80b3d5be8..c143acfc915bc 100644
--- a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn
@@ -36,6 +36,7 @@ static_library("Core") {
     "GDBIndex.cpp",
     "HashUtilities.cpp",
     "JumpTable.cpp",
+    "MCInstUtils.cpp",
     "MCPlusBuilder.cpp",
     "ParallelUtilities.cpp",
     "Relocation.cpp",

From 5a80fb9177e3c831c9c574400a13d77393397f2a Mon Sep 17 00:00:00 2001
From: Augusto Noronha
Date: Wed, 1 Oct 2025 10:46:36 -0700
Subject: [PATCH 410/878] Revert "[lldb][MachO][NFC] Extract ObjC metadata
 symbol parsing into helper function (#161536)"

This reverts commit 23e081524fd9f64fb3430822e879b6dc36a1d3f1.
--- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 178 ++++++++++++------ 1 file changed, 124 insertions(+), 54 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index e8bbefe27f47f..91c93be1b8cfd 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -2067,43 +2067,6 @@ static bool ParseTrieEntries(DataExtractor &data, lldb::offset_t offset, return true; } -static bool -TryParseV2ObjCMetadataSymbol(const char *&symbol_name, - const char *&symbol_name_non_abi_mangled, - SymbolType &type) { - static constexpr llvm::StringLiteral g_objc_v2_prefix_class("_OBJC_CLASS_$_"); - static constexpr llvm::StringLiteral g_objc_v2_prefix_metaclass( - "_OBJC_METACLASS_$_"); - static constexpr llvm::StringLiteral g_objc_v2_prefix_ivar("_OBJC_IVAR_$_"); - - llvm::StringRef symbol_name_ref(symbol_name); - if (symbol_name_ref.empty()) - return false; - - if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = symbol_name + g_objc_v2_prefix_class.size(); - type = eSymbolTypeObjCClass; - return true; - } - - if (symbol_name_ref.starts_with(g_objc_v2_prefix_metaclass)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = symbol_name + g_objc_v2_prefix_metaclass.size(); - type = eSymbolTypeObjCMetaClass; - return true; - } - - if (symbol_name_ref.starts_with(g_objc_v2_prefix_ivar)) { - symbol_name_non_abi_mangled = symbol_name + 1; - symbol_name = symbol_name + g_objc_v2_prefix_ivar.size(); - type = eSymbolTypeObjCIVar; - return true; - } - - return false; -} - static SymbolType GetSymbolType(const char *&symbol_name, bool &demangled_is_synthesized, const SectionSP &text_section_sp, @@ -2220,6 +2183,9 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { lldb::offset_t offset = MachHeaderSizeFromMagic(m_header.magic); uint32_t i; FileSpecList dylib_files; + llvm::StringRef g_objc_v2_prefix_class("_OBJC_CLASS_$_"); + llvm::StringRef g_objc_v2_prefix_metaclass("_OBJC_METACLASS_$_"); + llvm::StringRef g_objc_v2_prefix_ivar("_OBJC_IVAR_$_"); UUID image_uuid; for (i = 0; i < m_header.ncmds; ++i) { @@ -2839,13 +2805,33 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { is_gsym = true; sym[sym_idx].SetExternal(true); - if (TryParseV2ObjCMetadataSymbol( - symbol_name, symbol_name_non_abi_mangled, - type)) { + llvm::StringRef symbol_name_ref(symbol_name); + if (symbol_name_ref.starts_with( + g_objc_v2_prefix_class)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = + symbol_name + g_objc_v2_prefix_class.size(); + type = eSymbolTypeObjCClass; demangled_is_synthesized = true; - } else if (nlist.n_value != 0) { - symbol_section = section_info.GetSection( - nlist.n_sect, nlist.n_value); + + } else if (symbol_name_ref.starts_with( + g_objc_v2_prefix_metaclass)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = + symbol_name + g_objc_v2_prefix_metaclass.size(); + type = eSymbolTypeObjCMetaClass; + demangled_is_synthesized = true; + } else if (symbol_name_ref.starts_with( + g_objc_v2_prefix_ivar)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = + symbol_name + g_objc_v2_prefix_ivar.size(); + type = eSymbolTypeObjCIVar; + demangled_is_synthesized = true; + } else { + if (nlist.n_value != 0) + symbol_section = section_info.GetSection( + nlist.n_sect, nlist.n_value); type = eSymbolTypeData; } break; @@ -3330,10 +3316,49 @@ 
void ObjectFileMachO::ParseSymtab(Symtab &symtab) { ::strstr(symbol_sect_name, "__objc") == symbol_sect_name) { type = eSymbolTypeRuntime; - demangled_is_synthesized = - TryParseV2ObjCMetadataSymbol( - symbol_name, - symbol_name_non_abi_mangled, type); + + if (symbol_name) { + llvm::StringRef symbol_name_ref(symbol_name); + if (symbol_name_ref.starts_with("_OBJC_")) { + llvm::StringRef + g_objc_v2_prefix_class( + "_OBJC_CLASS_$_"); + llvm::StringRef + g_objc_v2_prefix_metaclass( + "_OBJC_METACLASS_$_"); + llvm::StringRef + g_objc_v2_prefix_ivar("_OBJC_IVAR_$_"); + if (symbol_name_ref.starts_with( + g_objc_v2_prefix_class)) { + symbol_name_non_abi_mangled = + symbol_name + 1; + symbol_name = + symbol_name + + g_objc_v2_prefix_class.size(); + type = eSymbolTypeObjCClass; + demangled_is_synthesized = true; + } else if ( + symbol_name_ref.starts_with( + g_objc_v2_prefix_metaclass)) { + symbol_name_non_abi_mangled = + symbol_name + 1; + symbol_name = + symbol_name + + g_objc_v2_prefix_metaclass.size(); + type = eSymbolTypeObjCMetaClass; + demangled_is_synthesized = true; + } else if (symbol_name_ref.starts_with( + g_objc_v2_prefix_ivar)) { + symbol_name_non_abi_mangled = + symbol_name + 1; + symbol_name = + symbol_name + + g_objc_v2_prefix_ivar.size(); + type = eSymbolTypeObjCIVar; + demangled_is_synthesized = true; + } + } + } } else if (symbol_sect_name && ::strstr(symbol_sect_name, "__gcc_except_tab") == @@ -3640,12 +3665,27 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { is_gsym = true; sym[sym_idx].SetExternal(true); - if (TryParseV2ObjCMetadataSymbol(symbol_name, - symbol_name_non_abi_mangled, type)) { + llvm::StringRef symbol_name_ref(symbol_name); + if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = symbol_name + g_objc_v2_prefix_class.size(); + type = eSymbolTypeObjCClass; demangled_is_synthesized = true; - } else if (nlist.n_value != 0) { - symbol_section = - section_info.GetSection(nlist.n_sect, nlist.n_value); + + } else if (symbol_name_ref.starts_with(g_objc_v2_prefix_metaclass)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = symbol_name + g_objc_v2_prefix_metaclass.size(); + type = eSymbolTypeObjCMetaClass; + demangled_is_synthesized = true; + } else if (symbol_name_ref.starts_with(g_objc_v2_prefix_ivar)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = symbol_name + g_objc_v2_prefix_ivar.size(); + type = eSymbolTypeObjCIVar; + demangled_is_synthesized = true; + } else { + if (nlist.n_value != 0) + symbol_section = + section_info.GetSection(nlist.n_sect, nlist.n_value); type = eSymbolTypeData; } } break; @@ -4083,9 +4123,39 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (symbol_sect_name && ::strstr(symbol_sect_name, "__objc") == symbol_sect_name) { type = eSymbolTypeRuntime; - demangled_is_synthesized = TryParseV2ObjCMetadataSymbol( - symbol_name, symbol_name_non_abi_mangled, type); + if (symbol_name) { + llvm::StringRef symbol_name_ref(symbol_name); + if (symbol_name_ref.starts_with("_OBJC_")) { + llvm::StringRef g_objc_v2_prefix_class( + "_OBJC_CLASS_$_"); + llvm::StringRef g_objc_v2_prefix_metaclass( + "_OBJC_METACLASS_$_"); + llvm::StringRef g_objc_v2_prefix_ivar( + "_OBJC_IVAR_$_"); + if (symbol_name_ref.starts_with(g_objc_v2_prefix_class)) { + symbol_name_non_abi_mangled = symbol_name + 1; + symbol_name = + symbol_name + g_objc_v2_prefix_class.size(); + type = eSymbolTypeObjCClass; + demangled_is_synthesized = true; + } else if 
(symbol_name_ref.starts_with(
+                               g_objc_v2_prefix_metaclass)) {
+                  symbol_name_non_abi_mangled = symbol_name + 1;
+                  symbol_name =
+                      symbol_name + g_objc_v2_prefix_metaclass.size();
+                  type = eSymbolTypeObjCMetaClass;
+                  demangled_is_synthesized = true;
+                } else if (symbol_name_ref.starts_with(
+                               g_objc_v2_prefix_ivar)) {
+                  symbol_name_non_abi_mangled = symbol_name + 1;
+                  symbol_name =
+                      symbol_name +
+                      g_objc_v2_prefix_ivar.size();
+                  type = eSymbolTypeObjCIVar;
+                  demangled_is_synthesized = true;
+                }
+              }
+            }
           } else if (symbol_sect_name &&
                      ::strstr(symbol_sect_name, "__gcc_except_tab") ==
                          symbol_sect_name) {

From 65fd44eb0d9211035c4aaee85bc78e49c4682fab Mon Sep 17 00:00:00 2001
From: Alexey Bader
Date: Wed, 1 Oct 2025 10:54:18 -0700
Subject: [PATCH 411/878] [clang-linker-wrapper][NFC] Invert condition for
 readability (#161557)

I find the positive expressions easier to read than negative ones.

---
 clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index be658acacb406..1419b8c90a625 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -608,10 +608,10 @@ Expected<StringRef> linkDevice(ArrayRef<StringRef> InputFiles,
 Error containerizeRawImage(std::unique_ptr<MemoryBuffer> &Img, OffloadKind Kind,
                            const ArgList &Args) {
   llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-  if (Kind != OFK_OpenMP || !Triple.isSPIRV() ||
-      Triple.getVendor() != llvm::Triple::Intel)
-    return Error::success();
-  return offloading::intel::containerizeOpenMPSPIRVImage(Img);
+  if (Kind == OFK_OpenMP && Triple.isSPIRV() &&
+      Triple.getVendor() == llvm::Triple::Intel)
+    return offloading::intel::containerizeOpenMPSPIRVImage(Img);
+  return Error::success();
 }
 
 Expected<StringRef> writeOffloadFile(const OffloadFile &File) {

From 2d6e7ef567a80b887904221c4eb1320b4d5684b9 Mon Sep 17 00:00:00 2001
From: Alexey Bader
Date: Wed, 1 Oct 2025 19:14:30 +0100

[LV] Add additional tests for replicating load/store costs.

Includes test for https://github.com/llvm/llvm-project/issues/161404

---
 .../AArch64/replicating-load-store-costs.ll   |  39 +++
 .../X86/replicating-load-store-costs.ll       | 231 ++++++++++++++++++
 2 files changed, 270 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
index c15e8d4252fba..ab9b48fb68f6b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll
@@ -616,6 +616,45 @@ exit:
   ret double %red.next
 }
 
+define i32 @test_ptr_iv_load_used_by_other_load(ptr %start, ptr %end) {
+; CHECK-LABEL: define i32 @test_ptr_iv_load_used_by_other_load(
+; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[IV]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT:    [[C_EXT:%.*]] = zext i1 [[C]] to i32
+; CHECK-NEXT:    [[RED_NEXT]] = or i32 [[RED]], [[C_EXT]]
+; CHECK-NEXT:    [[IV_NEXT]] = getelementptr nusw i8, ptr [[IV]], i64 32
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq ptr [[IV]], [[END]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RED_LCSSA:%.*]] = phi i32 [ [[RED]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[RED_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:                                             ; preds = %loop, %entry
+  %iv = phi ptr [ %iv.next, %loop ], [ null, %entry ]
+  %red = phi i32 [ %red.next, %loop ], [ 0, %entry ]
+  %0 = load ptr, ptr %iv, align 8
+  %1 = load i8, ptr %0, align 8
+  %c = icmp ne i8 %1, 0
+  %c.ext = zext i1 %c to i32
+  %red.next = or i32 %red, %c.ext
+  %iv.next = getelementptr nusw i8, ptr %iv, i64 32
+  %ec = icmp eq ptr %iv, %end
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %red
+}
+
 attributes #0 = { "target-cpu"="neoverse-512tvb" }
 
 !0 = !{!1, !2, i64 0}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
new file mode 100644
index 0000000000000..d93932585460f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
+; RUN: opt -p loop-vectorize -mtriple=x86_64-linux-gnu -S %s | FileCheck --check-prefix=I64 %s
+; RUN: opt -p loop-vectorize -mtriple=i386-pc-linux-gnu -S %s | FileCheck --check-prefix=I32 %s
+
+
+define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 {
+; I64-LABEL: define void @test_store_initially_interleave(
+; I64-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; I64-NEXT:  [[ENTRY:.*]]:
+; I64-NEXT:    br label %[[LOOP:.*]]
+; I64:       [[LOOP]]:
+; I64-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ]
+; I64-NEXT:    [[CONV:%.*]] = uitofp i32 [[IV]] to double
+; I64-NEXT:    [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]]
+; I64-NEXT:    [[TMP0:%.*]] = load ptr, ptr 
Includes test for https://github.com/llvm/llvm-project/issues/161404 --- .../AArch64/replicating-load-store-costs.ll | 39 +++ .../X86/replicating-load-store-costs.ll | 231 ++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index c15e8d4252fba..ab9b48fb68f6b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -616,6 +616,45 @@ exit: ret double %red.next } +define i32 @test_ptr_iv_load_used_by_other_load(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @test_ptr_iv_load_used_by_other_load( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi ptr [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ null, %[[ENTRY]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[RED_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IV]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[TMP1]], 0 +; CHECK-NEXT: [[C_EXT:%.*]] = zext i1 [[C]] to i32 +; CHECK-NEXT: [[RED_NEXT]] = or i32 [[RED]], [[C_EXT]] +; CHECK-NEXT: [[IV_NEXT]] = getelementptr nusw i8, ptr [[IV]], i64 32 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_LCSSA:%.*]] = phi i32 [ [[RED]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RED_LCSSA]] +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi ptr [ %iv.next, %loop ], [ null, %entry ] + %red = phi i32 [ %red.next, %loop ], [ 0, %entry ] + %0 = load ptr, ptr %iv, align 8 + %1 = load i8, ptr %0, align 8 + %c = icmp ne i8 %1, 0 + %c.ext = zext i1 %c to i32 + %red.next = or i32 %red, %c.ext + %iv.next = getelementptr nusw i8, ptr %iv, i64 32 + %ec = icmp eq ptr %iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red +} + attributes #0 = { "target-cpu"="neoverse-512tvb" } !0 = !{!1, !2, i64 0} diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll new file mode 100644 index 0000000000000..d93932585460f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -0,0 +1,231 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -mtriple=x86_64-linux-gnu -S %s | FileCheck --check-prefix=I64 %s +; RUN: opt -p loop-vectorize -mtriple=i386-pc-linux-gnu -S %s | FileCheck --check-prefix=I32 %s + + +define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { +; I64-LABEL: define void @test_store_initially_interleave( +; I64-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; I64-NEXT: [[ENTRY:.*]]: +; I64-NEXT: br label %[[LOOP:.*]] +; I64: [[LOOP]]: +; I64-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ] +; I64-NEXT: [[CONV:%.*]] = uitofp i32 [[IV]] to double +; I64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]] +; I64-NEXT: [[TMP0:%.*]] = load ptr, ptr 
[[ADD_PTR_I]], align 4 +; I64-NEXT: store double [[CONV]], ptr [[TMP0]], align 4 +; I64-NEXT: [[INC]] = add i32 [[IV]], 1 +; I64-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], [[N]] +; I64-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; I64: [[EXIT]]: +; I64-NEXT: ret void +; +; I32-LABEL: define void @test_store_initially_interleave( +; I32-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { +; I32-NEXT: [[ENTRY:.*:]] +; I32-NEXT: [[TMP0:%.*]] = add i32 [[N]], 1 +; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], 8 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I32: [[VECTOR_PH]]: +; I32-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 8 +; I32-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; I32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 8, i32 [[N_MOD_VF]] +; I32-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP2]] +; I32-NEXT: br label %[[VECTOR_BODY:.*]] +; I32: [[VECTOR_BODY]]: +; I32-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2) +; I32-NEXT: [[STEP_ADD_2:%.*]] = add <2 x i32> [[STEP_ADD]], splat (i32 2) +; I32-NEXT: [[STEP_ADD_3:%.*]] = add <2 x i32> [[STEP_ADD_2]], splat (i32 2) +; I32-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 +; I32-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 1 +; I32-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 2 +; I32-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 3 +; I32-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 4 +; I32-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 5 +; I32-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 6 +; I32-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 7 +; I32-NEXT: [[TMP11:%.*]] = uitofp <2 x i32> [[VEC_IND]] to <2 x double> +; I32-NEXT: [[TMP12:%.*]] = uitofp <2 x i32> [[STEP_ADD]] to <2 x double> +; I32-NEXT: [[TMP13:%.*]] = uitofp <2 x i32> [[STEP_ADD_2]] to <2 x double> +; I32-NEXT: [[TMP14:%.*]] = uitofp <2 x i32> [[STEP_ADD_3]] to <2 x double> +; I32-NEXT: [[TMP15:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] +; I32-NEXT: [[TMP16:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] +; I32-NEXT: [[TMP17:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I32-NEXT: [[TMP18:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] +; I32-NEXT: [[TMP19:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] +; I32-NEXT: [[TMP20:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I32-NEXT: [[TMP21:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] +; I32-NEXT: [[TMP22:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I32-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP15]], align 4 +; I32-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP16]], align 4 +; I32-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP17]], align 4 +; I32-NEXT: [[TMP26:%.*]] = load ptr, ptr [[TMP18]], align 4 +; I32-NEXT: [[TMP27:%.*]] = load ptr, ptr [[TMP19]], align 4 +; I32-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP20]], align 4 +; I32-NEXT: [[TMP29:%.*]] = load ptr, ptr [[TMP21]], align 4 +; I32-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP22]], align 4 +; I32-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 +; I32-NEXT: store double [[TMP31]], ptr [[TMP23]], align 4 +; I32-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 +; I32-NEXT: store double 
[[TMP32]], ptr [[TMP24]], align 4 +; I32-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP12]], i32 0 +; I32-NEXT: store double [[TMP33]], ptr [[TMP25]], align 4 +; I32-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP12]], i32 1 +; I32-NEXT: store double [[TMP34]], ptr [[TMP26]], align 4 +; I32-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP13]], i32 0 +; I32-NEXT: store double [[TMP35]], ptr [[TMP27]], align 4 +; I32-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[TMP13]], i32 1 +; I32-NEXT: store double [[TMP36]], ptr [[TMP28]], align 4 +; I32-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP14]], i32 0 +; I32-NEXT: store double [[TMP37]], ptr [[TMP29]], align 4 +; I32-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[TMP14]], i32 1 +; I32-NEXT: store double [[TMP38]], ptr [[TMP30]], align 4 +; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; I32-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD_3]], splat (i32 2) +; I32-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; I32-NEXT: br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I32: [[MIDDLE_BLOCK]]: +; I32-NEXT: br label %[[SCALAR_PH]] +; I32: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %inc, %loop ] + %conv = uitofp i32 %iv to double + %add.ptr.i = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 %iv + %0 = load ptr, ptr %add.ptr.i, align 4 + store double %conv, ptr %0, align 4 + %inc = add i32 %iv, 1 + %ec = icmp eq i32 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +define void @test_store_loaded_value(ptr noalias %src, ptr noalias %dst, i32 %n) #0 { +; I64-LABEL: define void @test_store_loaded_value( +; I64-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; I64-NEXT: [[BB:.*:]] +; I64-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1 +; I64-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]] +; I64: [[PH]]: +; I64-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4 +; I64-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]] +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; I64-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; I64-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; I64-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; I64-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]] +; I64-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; I64-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]] +; I64-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; I64-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8 +; I64-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8 +; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8 +; I64-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8 +; I64-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1 +; I64-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1 +; I64-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1 +; I64-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1 +; I64-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; I64-NEXT: 
[[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; I64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] +; I64-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] +; I64-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8 +; I64-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8 +; I64-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8 +; I64-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 +; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; I64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; I64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] +; I64-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; I64: [[SCALAR_PH]]: +; +; I32-LABEL: define void @test_store_loaded_value( +; I32-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; I32-NEXT: [[BB:.*:]] +; I32-NEXT: [[PRE:%.*]] = icmp slt i32 [[N]], 1 +; I32-NEXT: br i1 [[PRE]], [[EXIT:label %.*]], label %[[PH:.*]] +; I32: [[PH]]: +; I32-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_EXT]], 4 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I32: [[VECTOR_PH]]: +; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_EXT]], 4 +; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[N_EXT]], [[N_MOD_VF]] +; I32-NEXT: br label %[[VECTOR_BODY:.*]] +; I32: [[VECTOR_BODY]]: +; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; I32-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; I32-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; I32-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP0]] +; I32-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]] +; I32-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]] +; I32-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] +; I32-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP4]], align 8 +; I32-NEXT: [[TMP9:%.*]] = load double, ptr [[TMP5]], align 8 +; I32-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP6]], align 8 +; I32-NEXT: [[TMP11:%.*]] = load double, ptr [[TMP7]], align 8 +; I32-NEXT: [[TMP12:%.*]] = shl i64 [[TMP0]], 1 +; I32-NEXT: [[TMP13:%.*]] = shl i64 [[TMP1]], 1 +; I32-NEXT: [[TMP14:%.*]] = shl i64 [[TMP2]], 1 +; I32-NEXT: [[TMP15:%.*]] = shl i64 [[TMP3]], 1 +; I32-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP12]] +; I32-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]] +; I32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP14]] +; I32-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP15]] +; I32-NEXT: store double [[TMP8]], ptr [[TMP16]], align 8 +; I32-NEXT: store double [[TMP9]], ptr [[TMP17]], align 8 +; I32-NEXT: store double [[TMP10]], ptr [[TMP18]], align 8 +; I32-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 +; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; I32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; I32-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; I32: [[MIDDLE_BLOCK]]: +; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] +; I32-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] +; 
I32: [[SCALAR_PH]]: +; +bb: + %pre = icmp slt i32 %n, 1 + br i1 %pre, label %exit, label %ph + +ph: + %n.ext = zext i32 %n to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %ph ], [ %iv.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i8, ptr %src, i64 %iv + %l = load double, ptr %gep.src, align 8 + %sext = shl i64 %iv, 1 + %gep.dst = getelementptr i8, ptr %dst, i64 %sext + store double %l, ptr %gep.dst, align 8 + %ec = icmp eq i64 %iv.next, %n.ext + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + ret void +} + +attributes #0 = { "target-cpu"="znver2" } + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 true} From fc6cc4009ff2dabd8e47d48009ca18609765c872 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Wed, 1 Oct 2025 12:27:39 -0400 Subject: [PATCH 413/878] [AsmPrinter] Remove unnecessary casts. NFC --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 701a6a2f0f7a0..11efe492c57cc 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -473,11 +473,9 @@ bool AsmPrinter::doInitialization(Module &M) { AddrLabelSymbols = nullptr; // Initialize TargetLoweringObjectFile. - const_cast(getObjFileLowering()) - .Initialize(OutContext, TM); + TM.getObjFileLowering()->Initialize(OutContext, TM); - const_cast(getObjFileLowering()) - .getModuleMetadata(M); + TM.getObjFileLowering()->getModuleMetadata(M); // On AIX, we delay emitting any section information until // after emitting the .file pseudo-op. This allows additional From 62c50fd7955a1b2b0678b97038832a155ca97723 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 1 Oct 2025 19:32:53 +0100 Subject: [PATCH 414/878] [VPlan] Retrieve canonical IV directly in preparePlanForEpilogue (NFCI). Move code handling canonical IV out of the loop, simplifying the loop body. Preparation for follow-up changes --- .../Transforms/Vectorize/LoopVectorize.cpp | 87 +++++++++---------- 1 file changed, 42 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 12fb46da8e71a..fa5be21dc2b8a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9521,55 +9521,52 @@ static SmallVector preparePlanForEpilogueVectorLoop( VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); - DenseMap ToFrozen; - SmallVector InstsToMove; // Ensure that the start values for all header phi recipes are updated before // vectorizing the epilogue loop. - for (VPRecipeBase &R : Header->phis()) { - if (auto *IV = dyn_cast(&R)) { - // When vectorizing the epilogue loop, the canonical induction start - // value needs to be changed from zero to the value after the main - // vector loop. Find the resume value created during execution of the main - // VPlan. It must be the first phi in the loop preheader. - // FIXME: Improve modeling for canonical IV start values in the epilogue - // loop. 
- using namespace llvm::PatternMatch; - PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin(); - for (Value *Inc : EPResumeVal->incoming_values()) { - if (match(Inc, m_SpecificInt(0))) - continue; - assert(!EPI.VectorTripCount && - "Must only have a single non-zero incoming value"); - EPI.VectorTripCount = Inc; - } - // If we didn't find a non-zero vector trip count, all incoming values - // must be zero, which also means the vector trip count is zero. Pick the - // first zero as vector trip count. - // TODO: We should not choose VF * UF so the main vector loop is known to - // be dead. - if (!EPI.VectorTripCount) { - assert( - EPResumeVal->getNumIncomingValues() > 0 && - all_of(EPResumeVal->incoming_values(), - [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) && - "all incoming values must be 0"); - EPI.VectorTripCount = EPResumeVal->getOperand(0); - } - VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); - assert(all_of(IV->users(), - [](const VPUser *U) { - return isa(U) || - isa(U) || - cast(U)->isScalarCast() || - cast(U)->getOpcode() == - Instruction::Add; - }) && - "the canonical IV should only be used by its increment or " - "ScalarIVSteps when resetting the start value"); - IV->setOperand(0, VPV); + VPCanonicalIVPHIRecipe *IV = Plan.getCanonicalIV(); + // When vectorizing the epilogue loop, the canonical induction start + // value needs to be changed from zero to the value after the main + // vector loop. Find the resume value created during execution of the main + // VPlan. It must be the first phi in the loop preheader. + // FIXME: Improve modeling for canonical IV start values in the epilogue + // loop. + using namespace llvm::PatternMatch; + PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin(); + for (Value *Inc : EPResumeVal->incoming_values()) { + if (match(Inc, m_SpecificInt(0))) continue; - } + assert(!EPI.VectorTripCount && + "Must only have a single non-zero incoming value"); + EPI.VectorTripCount = Inc; + } + // If we didn't find a non-zero vector trip count, all incoming values + // must be zero, which also means the vector trip count is zero. Pick the + // first zero as vector trip count. + // TODO: We should not choose VF * UF so the main vector loop is known to + // be dead. + if (!EPI.VectorTripCount) { + assert(EPResumeVal->getNumIncomingValues() > 0 && + all_of(EPResumeVal->incoming_values(), + [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) && + "all incoming values must be 0"); + EPI.VectorTripCount = EPResumeVal->getOperand(0); + } + VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal); + assert(all_of(IV->users(), + [](const VPUser *U) { + return isa(U) || + isa(U) || + cast(U)->isScalarCast() || + cast(U)->getOpcode() == + Instruction::Add; + }) && + "the canonical IV should only be used by its increment or " + "ScalarIVSteps when resetting the start value"); + IV->setOperand(0, VPV); + DenseMap ToFrozen; + SmallVector InstsToMove; + for (VPRecipeBase &R : drop_begin(Header->phis())) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. 
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {

From 37637120af80672002a97c330077e31df7432261 Mon Sep 17 00:00:00 2001
From: Amr Hesham
Date: Wed, 1 Oct 2025 20:37:04 +0200
Subject: [PATCH 415/878] [CIR] Update ComplexRealOp to work on scalar type (#161080)

Update cir::ComplexRealOp to make it work on scalar types as well

Issue #160568
---
 .../clang/CIR/Dialect/Builder/CIRBaseBuilder.h | 7 ++++---
 clang/include/clang/CIR/Dialect/IR/CIROps.td | 10 ++++++----
 .../clang/CIR/Dialect/IR/CIRTypeConstraints.td | 6 ++++++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 6 ++++--
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 11 ++++++++++-
 .../lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 9 +++++++--
 clang/test/CIR/CodeGen/complex.cpp | 14 +++++++++-----
 7 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 3f83c302176c0..8a5bf0376ec98 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -148,9 +148,10 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
   }
 
   mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) {
-    auto operandTy = mlir::cast<cir::ComplexType>(operand.getType());
-    return cir::ComplexRealOp::create(*this, loc, operandTy.getElementType(),
-                                      operand);
+    auto resultType = operand.getType();
+    if (auto complexResultType = mlir::dyn_cast<cir::ComplexType>(resultType))
+      resultType = complexResultType.getElementType();
+    return cir::ComplexRealOp::create(*this, loc, resultType, operand);
   }
 
   mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index f857cf82a5192..0a78492aa9a86 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -3260,18 +3260,20 @@ def CIR_ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> {
 def CIR_ComplexRealOp : CIR_Op<"complex.real", [Pure]> {
   let summary = "Extract the real part of a complex value";
   let description = [{
-    `cir.complex.real` operation takes an operand of `!cir.complex` type and
-    yields the real part of it.
+    `cir.complex.real` operation takes an operand of `!cir.complex`, `!cir.int`
+    or `!cir.float`. If the operand is `!cir.complex`, the real part of it will
+    be returned, otherwise the value is returned unmodified.
Example: ```mlir - %1 = cir.complex.real %0 : !cir.complex -> !cir.float + %real = cir.complex.real %complex : !cir.complex -> !cir.float + %real = cir.complex.real %scalar : !cir.float -> !cir.float ``` }]; let results = (outs CIR_AnyIntOrFloatType:$result); - let arguments = (ins CIR_ComplexType:$operand); + let arguments = (ins CIR_AnyComplexOrIntOrFloatType:$operand); let assemblyFormat = [{ $operand `:` qualified(type($operand)) `->` qualified(type($result)) diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td index 82f6e1d33043e..da03a291a7690 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td @@ -165,6 +165,12 @@ def CIR_AnyIntOrFloatType : AnyTypeOf<[CIR_AnyFloatType, CIR_AnyIntType], def CIR_AnyComplexType : CIR_TypeBase<"::cir::ComplexType", "complex type">; +def CIR_AnyComplexOrIntOrFloatType : AnyTypeOf<[ + CIR_AnyComplexType, CIR_AnyFloatType, CIR_AnyIntType +], "complex, integer or floating point type"> { + let cppFunctionName = "isComplexOrIntegerOrFloatingPointType"; +} + //===----------------------------------------------------------------------===// // Array Type predicates //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index f4bbced781942..500007f6f241b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -2151,8 +2151,10 @@ mlir::Value ScalarExprEmitter::VisitRealImag(const UnaryOperator *e, } if (e->getOpcode() == UO_Real) { - return promotionTy.isNull() ? Visit(op) - : cgf.emitPromotedScalarExpr(op, promotionTy); + mlir::Value operand = promotionTy.isNull() + ? Visit(op) + : cgf.emitPromotedScalarExpr(op, promotionTy); + return builder.createComplexReal(loc, operand); } // __imag on a scalar returns zero. 
Emit the subexpr to ensure side diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index fb87036fdfe21..6b5cc808e9a29 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -2388,14 +2388,23 @@ OpFoldResult cir::ComplexCreateOp::fold(FoldAdaptor adaptor) { //===----------------------------------------------------------------------===// LogicalResult cir::ComplexRealOp::verify() { - if (getType() != getOperand().getType().getElementType()) { + mlir::Type operandTy = getOperand().getType(); + if (auto complexOperandTy = mlir::dyn_cast(operandTy)) { + operandTy = complexOperandTy.getElementType(); + } + + if (getType() != operandTy) { emitOpError() << ": result type does not match operand type"; return failure(); } + return success(); } OpFoldResult cir::ComplexRealOp::fold(FoldAdaptor adaptor) { + if (!mlir::isa(getOperand().getType())) + return nullptr; + if (auto complexCreateOp = getOperand().getDefiningOp()) return complexCreateOp.getOperand(0); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 22f069d9cead0..4bc7783175120 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -2999,8 +2999,13 @@ mlir::LogicalResult CIRToLLVMComplexRealOpLowering::matchAndRewrite( cir::ComplexRealOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { mlir::Type resultLLVMTy = getTypeConverter()->convertType(op.getType()); - rewriter.replaceOpWithNewOp( - op, resultLLVMTy, adaptor.getOperand(), llvm::ArrayRef{0}); + mlir::Value operand = adaptor.getOperand(); + if (mlir::isa(op.getOperand().getType())) { + operand = mlir::LLVM::ExtractValueOp::create( + rewriter, op.getLoc(), resultLLVMTy, operand, + llvm::ArrayRef{0}); + } + rewriter.replaceOp(op, operand); return mlir::success(); } diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 2d58c380c844a..ae69b2486efd0 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -1140,7 +1140,8 @@ void real_on_scalar_glvalue() { // CIR: %[[A_ADDR:.*]] = cir.alloca !cir.float, !cir.ptr, ["a"] // CIR: %[[B_ADDR:.*]] = cir.alloca !cir.float, !cir.ptr, ["b", init] // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !cir.float -// CIR: cir.store{{.*}} %[[TMP_A]], %[[B_ADDR]] : !cir.float, !cir.ptr +// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.float -> !cir.float +// CIR: cir.store{{.*}} %[[A_REAL]], %[[B_ADDR]] : !cir.float, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca float, i64 1, align 4 // LLVM: %[[B_ADDR:.*]] = alloca float, i64 1, align 4 @@ -1179,7 +1180,8 @@ void real_on_scalar_with_type_promotion() { // CIR: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr, ["b", init] // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !cir.f16 // CIR: %[[TMP_A_F32:.*]] = cir.cast floating %[[TMP_A]] : !cir.f16 -> !cir.float -// CIR: %[[TMP_A_F16:.*]] = cir.cast floating %[[TMP_A_F32]] : !cir.float -> !cir.f16 +// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A_F32]] : !cir.float -> !cir.float +// CIR: %[[TMP_A_F16:.*]] = cir.cast floating %[[A_REAL]] : !cir.float -> !cir.f16 // CIR: cir.store{{.*}} %[[TMP_A_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr // LLVM: %[[A_ADDR:.*]] = alloca half, i64 1, align 2 @@ -1248,7 +1250,8 @@ void real_on_scalar_from_real_with_type_promotion() { // CIR: %[[A_IMAG_F32:.*]] = cir.cast 
floating %[[A_IMAG]] : !cir.f16 -> !cir.float
 // CIR: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
 // CIR: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[A_IMAG_F16:.*]] = cir.cast floating %[[A_IMAG_F32]] : !cir.float -> !cir.f16
-// CIR: cir.store{{.*}} %[[A_IMAG_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
+// CIR: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_IMAG_F32]] : !cir.float -> !cir.float
+// CIR: %[[A_REAL_F16:.*]] = cir.cast floating %[[A_REAL_F32]] : !cir.float -> !cir.f16
+// CIR: cir.store{{.*}} %[[A_REAL_F16]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
 
 // LLVM: %[[A_ADDR:.*]] = alloca { half, half }, i64 1, align 2
 // LLVM: %[[B_ADDR]] = alloca half, i64 1, align 2

From e84dcba9246d696715fe1daa4ddb218182580a70 Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Wed, 1 Oct 2025 11:45:21 -0700
Subject: [PATCH 416/878] [MLIR] Remove leftover debug print code

This debug code was incorrectly left in and merged with #160615

---
 mlir/lib/IR/Builders.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp
index c84e760a3f363..8f199b60fccdc 100644
--- a/mlir/lib/IR/Builders.cpp
+++ b/mlir/lib/IR/Builders.cpp
@@ -489,13 +489,6 @@ OpBuilder::tryFold(Operation *op, SmallVectorImpl<Value> &results,
   SmallVector<OpFoldResult> foldResults;
   LDBG() << "Trying to fold: "
          << OpWithFlags(op, OpPrintingFlags().skipRegions());
-  if (op->getName().getStringRef() == "vector.extract") {
-    Operation *parent = op->getParentOp();
-    while (parent && parent->getName().getStringRef() != "spirv.func")
-      parent = parent->getParentOp();
-    if (parent)
-      parent->dump();
-  }
   if (failed(op->fold(foldResults)))
     return cleanupFailure();

From 9e04291fd20489882259625ec327b87ecef6fa8c Mon Sep 17 00:00:00 2001
From: Princeton Ferro
Date: Wed, 1 Oct 2025 12:08:49 -0700
Subject: [PATCH 417/878] [InstCombine] linearize complexity of `findDemandedEltsByAllUsers()` (#161436)

Each call to `findDemandedEltsBySingleUser()` returns a new APInt that
must be OR'd with the current APInt. For large vectors with many uses
this can be slow if the total number of operations is
`{# uses} x {size of vector}`. Instead of OR'ing, use `setBit()` on the
passed-in APInt.
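For illustration, the refactoring has roughly the following shape. This is
a simplified sketch, not the patched InstCombine functions themselves; the
helper names `demandedEltsOld`/`demandedEltsNew` are hypothetical stand-ins.

```cpp
#include "llvm/ADT/APInt.h"
using llvm::APInt;

// Old shape: every user materializes a fresh mask that the caller ORs into
// the union, costing O(VWidth) per user even for a single demanded element.
static APInt demandedEltsOld(unsigned VWidth, unsigned EltIdx) {
  APInt Used = APInt::getZero(VWidth);
  Used.setBit(EltIdx);
  return Used; // caller: UnionUsedElts |= demandedEltsOld(VWidth, Idx);
}

// New shape: set bits directly on the caller-owned union mask and report
// whether the demanded elements could be determined at all.
static bool demandedEltsNew(unsigned VWidth, unsigned EltIdx,
                            APInt &UnionUsedElts) {
  if (EltIdx >= VWidth)
    return false; // caller then falls back to APInt::getAllOnes(VWidth)
  UnionUsedElts.setBit(EltIdx); // O(1) per demanded element
  return true;
}
```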
---
 .../InstCombine/InstCombineVectorOps.cpp | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 6ef30663bf3ce..18a45c6799bac 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -319,20 +319,20 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
   return nullptr;
 }
 
-/// Find elements of V demanded by UserInstr.
-static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
+/// Find elements of V demanded by UserInstr. If it returns false, we were
+/// not able to determine all elements.
+static bool findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr,
+                                         APInt &UnionUsedElts) {
   unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
 
-  // Conservatively assume that all elements are needed.
-  APInt UsedElts(APInt::getAllOnes(VWidth));
-
   switch (UserInstr->getOpcode()) {
   case Instruction::ExtractElement: {
    ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
     assert(EEI->getVectorOperand() == V);
     ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
     if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
-      UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
+      UnionUsedElts.setBit(EEIIndexC->getZExtValue());
+      return true;
     }
     break;
   }
@@ -341,23 +341,23 @@ static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
     unsigned MaskNumElts =
         cast<FixedVectorType>(UserInstr->getType())->getNumElements();
 
-    UsedElts = APInt(VWidth, 0);
-    for (unsigned i = 0; i < MaskNumElts; i++) {
-      unsigned MaskVal = Shuffle->getMaskValue(i);
+    for (auto I : llvm::seq(MaskNumElts)) {
+      unsigned MaskVal = Shuffle->getMaskValue(I);
       if (MaskVal == -1u || MaskVal >= 2 * VWidth)
         continue;
       if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
-        UsedElts.setBit(MaskVal);
+        UnionUsedElts.setBit(MaskVal);
       if (Shuffle->getOperand(1) == V &&
           ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
-        UsedElts.setBit(MaskVal - VWidth);
+        UnionUsedElts.setBit(MaskVal - VWidth);
     }
-    break;
+    return true;
   }
   default:
     break;
   }
-  return UsedElts;
+
+  return false;
 }
 
 /// Find union of elements of V demanded by all its users.
@@ -370,7 +370,8 @@ static APInt findDemandedEltsByAllUsers(Value *V) {
   APInt UnionUsedElts(VWidth, 0);
   for (const Use &U : V->uses()) {
    if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
-      UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
+      if (!findDemandedEltsBySingleUser(V, I, UnionUsedElts))
+        return APInt::getAllOnes(VWidth);
     } else {
       UnionUsedElts = APInt::getAllOnes(VWidth);
       break;

From 78c65545d4694e043b593e4cca7c7281a639247b Mon Sep 17 00:00:00 2001
From: Justin Bogner
Date: Wed, 1 Oct 2025 12:15:46 -0700
Subject: [PATCH 418/878] [AST] Give `CharUnits::operator%` a consistent type. NFC (#160781)

Update the `operator%` overload that accepts `CharUnits` to return
`CharUnits` to match the other `operator%`. This is more logical than
returning an `int64` and cleans up users that want to continue to do
math with the result.

Many users of this were explicitly comparing against 0. I considered
updating these to compare against `CharUnits::Zero` or even introducing
an `explicit operator bool()`, but they all feel clearer if we update
them to use the existing `isMultipleOf()` function instead.
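As a usage sketch of the new semantics (written against the post-patch
header; `paddingFor` is an illustrative helper, not part of this change):

```cpp
#include "clang/AST/CharUnits.h"
using clang::CharUnits;

// Remainder arithmetic now stays in the CharUnits domain end to end, so no
// conversion from a raw quantity is needed when computing padding.
CharUnits paddingFor(CharUnits Size, CharUnits Align) {
  if (Size.isMultipleOf(Align)) // preferred over comparing `Size % Align` to 0
    return CharUnits::Zero();
  CharUnits Rem = Size % Align; // now CharUnits; previously a raw integer
  return Align - Rem;           // composes without any cast
}
```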
--- clang/include/clang/AST/CharUnits.h | 6 +++--- clang/lib/AST/APValue.cpp | 2 +- clang/lib/AST/RecordLayoutBuilder.cpp | 9 ++++----- clang/lib/CodeGen/CGAtomic.cpp | 2 +- clang/lib/CodeGen/CGExprConstant.cpp | 2 +- clang/lib/CodeGen/CGObjCMac.cpp | 2 +- clang/lib/CodeGen/CGRecordLayoutBuilder.cpp | 12 ++++++------ clang/lib/Sema/SemaChecking.cpp | 2 +- clang/lib/StaticAnalyzer/Core/Store.cpp | 2 +- 9 files changed, 19 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/AST/CharUnits.h b/clang/include/clang/AST/CharUnits.h index c06354451dfbe..e570bfae69524 100644 --- a/clang/include/clang/AST/CharUnits.h +++ b/clang/include/clang/AST/CharUnits.h @@ -141,7 +141,7 @@ namespace clang { /// Among other things, this promises that /// self.alignTo(N) will just return self. bool isMultipleOf(CharUnits N) const { - return (*this % N) == 0; + return (*this % N) == CharUnits::Zero(); } // Arithmetic operators. @@ -165,8 +165,8 @@ namespace clang { CharUnits operator% (QuantityType N) const { return CharUnits(Quantity % N); } - QuantityType operator% (const CharUnits &Other) const { - return Quantity % Other.Quantity; + CharUnits operator%(const CharUnits &Other) const { + return CharUnits(Quantity % Other.Quantity); } CharUnits operator+ (const CharUnits &Other) const { return CharUnits(Quantity + Other.Quantity); diff --git a/clang/lib/AST/APValue.cpp b/clang/lib/AST/APValue.cpp index 7173c2a0e1a2a..2e1c8eb3726cf 100644 --- a/clang/lib/AST/APValue.cpp +++ b/clang/lib/AST/APValue.cpp @@ -784,7 +784,7 @@ void APValue::printPretty(raw_ostream &Out, const PrintingPolicy &Policy, if (!O.isZero()) { if (IsReference) Out << "*("; - if (S.isZero() || O % S) { + if (S.isZero() || !O.isMultipleOf(S)) { Out << "(char*)"; S = CharUnits::One(); } diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 43f4e070748bb..00b938bdf308d 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -2087,9 +2087,8 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, if (InsertExtraPadding) { CharUnits ASanAlignment = CharUnits::fromQuantity(8); CharUnits ExtraSizeForAsan = ASanAlignment; - if (FieldSize % ASanAlignment) - ExtraSizeForAsan += - ASanAlignment - CharUnits::fromQuantity(FieldSize % ASanAlignment); + if (!FieldSize.isMultipleOf(ASanAlignment)) + ExtraSizeForAsan += ASanAlignment - (FieldSize % ASanAlignment); EffectiveFieldSize = FieldSize = FieldSize + ExtraSizeForAsan; } @@ -2119,10 +2118,10 @@ void ItaniumRecordLayoutBuilder::LayoutField(const FieldDecl *D, if (RD->hasAttr() || !MaxFieldAlignment.isZero()) if (FieldAlign < OriginalFieldAlign) if (D->getType()->isRecordType()) { - // If the offset is a multiple of the alignment of + // If the offset is not a multiple of the alignment of // the type, raise the warning. 
// TODO: Takes no account the alignment of the outer struct - if (FieldOffset % OriginalFieldAlign != 0) + if (!FieldOffset.isMultipleOf(OriginalFieldAlign)) Diag(D->getLocation(), diag::warn_unaligned_access) << Context.getCanonicalTagType(RD) << D->getName() << D->getType(); diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index eeb0fd6412946..4a3446abcc78f 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -880,7 +880,7 @@ RValue CodeGenFunction::EmitAtomicExpr(AtomicExpr *E) { CharUnits MaxInlineWidth = getContext().toCharUnitsFromBits(MaxInlineWidthInBits); DiagnosticsEngine &Diags = CGM.getDiags(); - bool Misaligned = (Ptr.getAlignment() % TInfo.Width) != 0; + bool Misaligned = !Ptr.getAlignment().isMultipleOf(TInfo.Width); bool Oversized = getContext().toBits(TInfo.Width) > MaxInlineWidthInBits; if (Misaligned) { Diags.Report(E->getBeginLoc(), diag::warn_atomic_op_misaligned) diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index b44dd9ecc717e..6407afc3d9447 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -433,7 +433,7 @@ llvm::Constant *ConstantAggregateBuilder::buildFrom( // All remaining elements must be the same type. if (Elems[I]->getType() != CommonType || - Offset(I) % ElemSize != 0) { + !Offset(I).isMultipleOf(ElemSize)) { CanEmitArray = false; break; } diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 60f30a1334d6d..dbcce9b86ad52 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -5367,7 +5367,7 @@ IvarLayoutBuilder::buildBitmap(CGObjCCommonMac &CGObjC, // Ignore scan requests that don't start at an even multiple of the // word size. We can't encode them. - if ((beginOfScan % WordSize) != 0) + if (!beginOfScan.isMultipleOf(WordSize)) continue; // Ignore scan requests that start before the instance start. diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 5f6136c917ac2..e9205c68c2812 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -369,11 +369,11 @@ void CGRecordLowering::lowerUnion(bool isNonVirtualBaseType) { appendPaddingBytes(LayoutSize - getSize(StorageType)); // Set packed if we need it. const auto StorageAlignment = getAlignment(StorageType); - assert((Layout.getSize() % StorageAlignment == 0 || - Layout.getDataSize() % StorageAlignment) && + assert((Layout.getSize().isMultipleOf(StorageAlignment) || + !Layout.getDataSize().isMultipleOf(StorageAlignment)) && "Union's standard layout and no_unique_address layout must agree on " "packedness"); - if (Layout.getDataSize() % StorageAlignment) + if (!Layout.getDataSize().isMultipleOf(StorageAlignment)) Packed = true; } @@ -977,7 +977,7 @@ void CGRecordLowering::determinePacked(bool NVBaseType) { continue; // If any member falls at an offset that it not a multiple of its alignment, // then the entire record must be packed. - if (Member.Offset % getAlignment(Member.Data)) + if (!Member.Offset.isMultipleOf(getAlignment(Member.Data))) Packed = true; if (Member.Offset < NVSize) NVAlignment = std::max(NVAlignment, getAlignment(Member.Data)); @@ -985,12 +985,12 @@ void CGRecordLowering::determinePacked(bool NVBaseType) { } // If the size of the record (the capstone's offset) is not a multiple of the // record's alignment, it must be packed. 
- if (Members.back().Offset % Alignment) + if (!Members.back().Offset.isMultipleOf(Alignment)) Packed = true; // If the non-virtual sub-object is not a multiple of the non-virtual // sub-object's alignment, it must be packed. We cannot have a packed // non-virtual sub-object and an unpacked complete object or vise versa. - if (NVSize % NVAlignment) + if (!NVSize.isMultipleOf(NVAlignment)) Packed = true; // Update the alignment of the sentinel. if (!Packed) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7b37e0b8d5430..8b9e132505094 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -15949,7 +15949,7 @@ void Sema::RefersToMemberWithReducedAlignment( } // Check if the synthesized offset fulfills the alignment. - if (Offset % ExpectedAlignment != 0 || + if (!Offset.isMultipleOf(ExpectedAlignment) || // It may fulfill the offset it but the effective alignment may still be // lower than the expected expression alignment. CompleteObjectAlignment < ExpectedAlignment) { diff --git a/clang/lib/StaticAnalyzer/Core/Store.cpp b/clang/lib/StaticAnalyzer/Core/Store.cpp index 971e6bc798837..b609f36ae7e89 100644 --- a/clang/lib/StaticAnalyzer/Core/Store.cpp +++ b/clang/lib/StaticAnalyzer/Core/Store.cpp @@ -210,7 +210,7 @@ std::optional StoreManager::castRegion(const MemRegion *R, // Is the offset a multiple of the size? If so, we can layer the // ElementRegion (with elementType == PointeeTy) directly on top of // the base region. - if (off % pointeeTySize == 0) { + if (off.isMultipleOf(pointeeTySize)) { newIndex = off / pointeeTySize; newSuperR = baseR; } From 0e14973f3cff922549b472aebc4c3b0dc1fc4d76 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 1 Oct 2025 13:06:16 -0700 Subject: [PATCH 419/878] [NFC][LLVM-Tests] Specialize test suite for LLVM unit tests (#161442) Remove `UnitTests` from LLVM_TEST_DEPENDS_COMMON and create a specialized lit suite for unit-tests and that depends only on `UnitTests`. --- llvm/test/CMakeLists.txt | 4 ++-- llvm/test/Unit/CMakeLists.txt | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Unit/CMakeLists.txt diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 4db7663045ecc..32c7c64451746 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -71,7 +71,6 @@ set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS_COMMON} BugpointPasses LLVMWindowsDriver - UnitTests bugpoint llc lli @@ -270,10 +269,11 @@ add_lit_testsuites(LLVM ${CMAKE_CURRENT_SOURCE_DIR} ${exclude_from_check_all} DEPENDS ${LLVM_TEST_DEPENDS} FOLDER "Tests/Subdirectories" - SKIP "^FileCheck" "^TableGen" + SKIP "^FileCheck" "^TableGen" "^Unit" ) add_subdirectory(FileCheck) add_subdirectory(TableGen) +add_subdirectory(Unit) # Setup an alias for 'check-all'. add_custom_target(check) diff --git a/llvm/test/Unit/CMakeLists.txt b/llvm/test/Unit/CMakeLists.txt new file mode 100644 index 0000000000000..6b0abe199673f --- /dev/null +++ b/llvm/test/Unit/CMakeLists.txt @@ -0,0 +1,5 @@ +add_lit_testsuite(check-llvm-unit "Running lit suite for LLVM unit tests" + ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS UnitTests + ) From ec982fac1d1d9968f5167c6dce244428bb590318 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Wed, 1 Oct 2025 16:07:19 -0400 Subject: [PATCH 420/878] [SLPVectorizer] Change arguments of 'isStridedLoad' (NFC) (#160401) This is needed to reduce the diff for the future work on widening strided loads. 
Also, with this change we'll be able to re-use this for the case when each pointer represents a start of a group of contiguous loads. --- .../Transforms/Vectorize/SLPVectorizer.cpp | 44 +++++++------------ 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f77d587bf5889..fedca65d241e8 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2241,10 +2241,9 @@ class BoUpSLP { /// TODO: If load combining is allowed in the IR optimizer, this analysis /// may not be necessary. bool isLoadCombineCandidate(ArrayRef Stores) const; - bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, - ArrayRef Order, const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - const int64_t Diff, StridedPtrInfo &SPtrInfo) const; + bool isStridedLoad(ArrayRef PointerOps, Type *ScalarTy, + Align Alignment, const int64_t Diff, Value *Ptr0, + Value *PtrN, StridedPtrInfo &SPtrInfo) const; /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. @@ -6824,13 +6823,10 @@ isMaskedLoadCompress(ArrayRef VL, ArrayRef PointerOps, /// 4. Any pointer operand is an instruction with the users outside of the /// current graph (for masked gathers extra extractelement instructions /// might be required). -bool BoUpSLP::isStridedLoad(ArrayRef VL, ArrayRef PointerOps, - ArrayRef Order, - const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, - const int64_t Diff, - StridedPtrInfo &SPtrInfo) const { - const size_t Sz = VL.size(); +bool BoUpSLP::isStridedLoad(ArrayRef PointerOps, Type *ScalarTy, + Align Alignment, const int64_t Diff, Value *Ptr0, + Value *PtrN, StridedPtrInfo &SPtrInfo) const { + const size_t Sz = PointerOps.size(); if (Diff % (Sz - 1) != 0) return false; @@ -6842,7 +6838,6 @@ bool BoUpSLP::isStridedLoad(ArrayRef VL, ArrayRef PointerOps, }); const uint64_t AbsoluteDiff = std::abs(Diff); - Type *ScalarTy = VL.front()->getType(); auto *VecTy = getWidenedType(ScalarTy, Sz); if (IsAnyPointerUsedOutGraph || (AbsoluteDiff > Sz && @@ -6853,20 +6848,9 @@ bool BoUpSLP::isStridedLoad(ArrayRef VL, ArrayRef PointerOps, int64_t Stride = Diff / static_cast(Sz - 1); if (Diff != Stride * static_cast(Sz - 1)) return false; - Align Alignment = - cast(Order.empty() ? VL.front() : VL[Order.front()]) - ->getAlign(); - if (!TTI.isLegalStridedLoadStore(VecTy, Alignment)) + if (!TTI->isLegalStridedLoadStore(VecTy, Alignment)) return false; - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; - } + // Iterate through all pointers and check if all distances are // unique multiple of Dist. SmallSet Dists; @@ -6875,14 +6859,14 @@ bool BoUpSLP::isStridedLoad(ArrayRef VL, ArrayRef PointerOps, if (Ptr == PtrN) Dist = Diff; else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); + Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); // If the strides are not the same or repeated, we can't // vectorize. 
if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) break; } if (Dists.size() == Sz) { - Type *StrideTy = DL.getIndexType(Ptr0->getType()); + Type *StrideTy = DL->getIndexType(Ptr0->getType()); SPtrInfo.StrideVal = ConstantInt::get(StrideTy, Stride); SPtrInfo.Ty = getWidenedType(ScalarTy, Sz); return true; @@ -6971,7 +6955,11 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( cast(V), UserIgnoreList); })) return LoadsState::CompressVectorize; - if (isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, *Diff, SPtrInfo)) + Align Alignment = + cast(Order.empty() ? VL.front() : VL[Order.front()]) + ->getAlign(); + if (isStridedLoad(PointerOps, ScalarTy, Alignment, *Diff, Ptr0, PtrN, + SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || From 15dc80fda748b094c28942e0b360a8fc2db8fd20 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim Date: Wed, 1 Oct 2025 16:09:45 -0400 Subject: [PATCH 421/878] [SLPVectorizer][NFC] A test for widening constant strided loads. (#160552) Precommit a test. --- .../RISCV/basic-strided-loads.ll | 118 ++++++++++++++++-- 1 file changed, 105 insertions(+), 13 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 4f52227c6511e..02e05b2e4138a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -527,23 +527,14 @@ define void @rt_stride_1_with_reordering(ptr %pl, i64 %stride, ptr %ps) { ret void } -; TODO: We want to generate this code: -; define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { -; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0 -; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 -; %strided_load = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 8, <4 x i1> splat (i1 true), i32 4) -; %bitcast_ = bitcast <4 x i32> %strided_load to <16 x i8> -; store <16 x i8> %bitcast_, ptr %gep_s0, align 1 -; ret void -; } -define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { -; CHECK-LABEL: define void @constant_stride_widen_no_reordering( +define void @constant_stride_masked_no_reordering(ptr %pl, i64 %stride, ptr %ps) { +; CHECK-LABEL: define void @constant_stride_masked_no_reordering( ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = call <28 x i8> @llvm.masked.load.v28i8.p0(ptr [[GEP_L0]], i32 1, <28 x i1> , <28 x i8> poison) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <28 x i8> [[TMP1]], <28 x i8> poison, <16 x i32> -; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <28 x i8> [[TMP1]], <28 x i8> poison, <16 x i32> +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 1 ; CHECK-NEXT: ret void ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 @@ -617,6 +608,107 @@ define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) ret void } +; TODO: We want to generate this code: +; define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) #0 { +; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 +; %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 +; %1 = call <4 x i32> 
@llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %gep_l0, i64 100, <4 x i1> splat (i1 true), i32 4) +; %2 = bitcast <4 x i32> %1 to <16 x i8> +; store <16 x i8> %2, ptr %gep_s0, align 1 +; ret void +; } +define void @constant_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { +; CHECK-LABEL: define void @constant_stride_widen_no_reordering( +; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 +; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 100 +; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 200 +; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 300 +; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 1 +; CHECK-NEXT: ret void +; + %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0 + %gep_l1 = getelementptr inbounds i8, ptr %pl, i64 1 + %gep_l2 = getelementptr inbounds i8, ptr %pl, i64 2 + %gep_l3 = getelementptr inbounds i8, ptr %pl, i64 3 + %gep_l4 = getelementptr inbounds i8, ptr %pl, i64 100 + %gep_l5 = getelementptr inbounds i8, ptr %pl, i64 101 + %gep_l6 = getelementptr inbounds i8, ptr %pl, i64 102 + %gep_l7 = getelementptr inbounds i8, ptr %pl, i64 103 + %gep_l8 = getelementptr inbounds i8, ptr %pl, i64 200 + %gep_l9 = getelementptr inbounds i8, ptr %pl, i64 201 + %gep_l10 = getelementptr inbounds i8, ptr %pl, i64 202 + %gep_l11 = getelementptr inbounds i8, ptr %pl, i64 203 + %gep_l12 = getelementptr inbounds i8, ptr %pl, i64 300 + %gep_l13 = getelementptr inbounds i8, ptr %pl, i64 301 + %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 302 + %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 303 + + %load0 = load i8, ptr %gep_l0 , align 1 + %load1 = load i8, ptr %gep_l1 , align 1 + %load2 = load i8, ptr %gep_l2 , align 1 + %load3 = load i8, ptr %gep_l3 , align 1 + %load4 = load i8, ptr %gep_l4 , align 1 + %load5 = load i8, ptr %gep_l5 , align 1 + %load6 = load i8, ptr %gep_l6 , align 1 + %load7 = load i8, ptr %gep_l7 , align 1 + %load8 = load i8, ptr %gep_l8 , align 1 + %load9 = load i8, ptr %gep_l9 , align 1 + %load10 = load i8, ptr %gep_l10, align 1 + %load11 = load i8, ptr %gep_l11, align 1 + %load12 = load i8, ptr %gep_l12, align 1 + %load13 = load i8, ptr %gep_l13, align 1 + %load14 = load i8, ptr %gep_l14, align 1 + %load15 = load i8, ptr %gep_l15, align 1 + + %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0 + %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1 + %gep_s2 = 
getelementptr inbounds i8, ptr %ps, i64 2 + %gep_s3 = getelementptr inbounds i8, ptr %ps, i64 3 + %gep_s4 = getelementptr inbounds i8, ptr %ps, i64 4 + %gep_s5 = getelementptr inbounds i8, ptr %ps, i64 5 + %gep_s6 = getelementptr inbounds i8, ptr %ps, i64 6 + %gep_s7 = getelementptr inbounds i8, ptr %ps, i64 7 + %gep_s8 = getelementptr inbounds i8, ptr %ps, i64 8 + %gep_s9 = getelementptr inbounds i8, ptr %ps, i64 9 + %gep_s10 = getelementptr inbounds i8, ptr %ps, i64 10 + %gep_s11 = getelementptr inbounds i8, ptr %ps, i64 11 + %gep_s12 = getelementptr inbounds i8, ptr %ps, i64 12 + %gep_s13 = getelementptr inbounds i8, ptr %ps, i64 13 + %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14 + %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15 + + store i8 %load0, ptr %gep_s0, align 1 + store i8 %load1, ptr %gep_s1, align 1 + store i8 %load2, ptr %gep_s2, align 1 + store i8 %load3, ptr %gep_s3, align 1 + store i8 %load4, ptr %gep_s4, align 1 + store i8 %load5, ptr %gep_s5, align 1 + store i8 %load6, ptr %gep_s6, align 1 + store i8 %load7, ptr %gep_s7, align 1 + store i8 %load8, ptr %gep_s8, align 1 + store i8 %load9, ptr %gep_s9, align 1 + store i8 %load10, ptr %gep_s10, align 1 + store i8 %load11, ptr %gep_s11, align 1 + store i8 %load12, ptr %gep_s12, align 1 + store i8 %load13, ptr %gep_s13, align 1 + store i8 %load14, ptr %gep_s14, align 1 + store i8 %load15, ptr %gep_s15, align 1 + + ret void +} ; TODO: We want to generate this code: ; define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 %offset0 From 9a30ada53d5ef8d651c75795af2f6e9c48a1eecb Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Wed, 1 Oct 2025 13:20:59 -0700 Subject: [PATCH 422/878] [CIR][NFC] Fix CIR build (#161577) This fixes the CIR build after recent changes to CharUnits. --- clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp index a7628816089d0..bf812c8a1793b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp @@ -615,7 +615,7 @@ void CIRRecordLowering::determinePacked(bool nvBaseType) { continue; // If any member falls at an offset that it not a multiple of its alignment, // then the entire record must be packed. - if (member.offset % getAlignment(member.data)) + if (!member.offset.isMultipleOf(getAlignment(member.data))) packed = true; if (member.offset < nvSize) nvAlignment = std::max(nvAlignment, getAlignment(member.data)); @@ -623,12 +623,12 @@ void CIRRecordLowering::determinePacked(bool nvBaseType) { } // If the size of the record (the capstone's offset) is not a multiple of the // record's alignment, it must be packed. - if (members.back().offset % alignment) + if (!members.back().offset.isMultipleOf(alignment)) packed = true; // If the non-virtual sub-object is not a multiple of the non-virtual // sub-object's alignment, it must be packed. We cannot have a packed // non-virtual sub-object and an unpacked complete object or vise versa. - if (nvSize % nvAlignment) + if (!nvSize.isMultipleOf(nvAlignment)) packed = true; // Update the alignment of the sentinel. 
if (!packed) From 71365c0b6b95eb6874b08f32fe58b5419979ff4c Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Wed, 1 Oct 2025 13:34:27 -0700 Subject: [PATCH 423/878] [CIR][NFC] Fix CIR build after CharUnits change (#161580) My previous attempt to fix this missed one case. --- clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp index bf812c8a1793b..2baeb43c48b2e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenRecordLayoutBuilder.cpp @@ -824,7 +824,7 @@ void CIRRecordLowering::lowerUnion() { appendPaddingBytes(layoutSize - getSize(storageType)); // Set packed if we need it. - if (layoutSize % getAlignment(storageType)) + if (!layoutSize.isMultipleOf(getAlignment(storageType))) packed = true; } From 56ca23c46dfd3b4a8cfd5cff05680ffcb20dde44 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Wed, 1 Oct 2025 13:42:38 -0700 Subject: [PATCH 424/878] [DirectX] Updating Root Signature Metadata to contain Static Sampler flags (#160210) Root Signature 1.2 adds flags to static samplers. This requires us to change the metadata representation to account for them when the metadata is generated. This patch focuses on the metadata changes required in the backend; frontend changes will come in a future PR. --- clang/test/CodeGenHLSL/RootSignature.hlsl | 4 +- .../llvm/Frontend/HLSL/HLSLRootSignature.h | 1 + .../Frontend/HLSL/RootSignatureMetadata.cpp | 14 ++++++- ...gnature-StaticSamplers-Invalid-AddressU.ll | 2 +- ...gnature-StaticSamplers-Invalid-AddressV.ll | 2 +- ...gnature-StaticSamplers-Invalid-AddressW.ll | 2 +- ...ture-StaticSamplers-Invalid-BorderColor.ll | 2 +- ...e-StaticSamplers-Invalid-ComparisonFunc.ll | 2 +- ...Signature-StaticSamplers-Invalid-Filter.ll | 2 +- ...otSignature-StaticSamplers-Invalid-Flag.ll | 19 +++++++++ ...re-StaticSamplers-Invalid-MaxAnisotropy.ll | 2 +- ...Signature-StaticSamplers-Invalid-MaxLod.ll | 2 +- ...Signature-StaticSamplers-Invalid-MinLod.ll | 2 +- ...ature-StaticSamplers-Invalid-MinLopBias.ll | 2 +- ...re-StaticSamplers-Invalid-RegisterSpace.ll | 2 +- ...e-StaticSamplers-Invalid-ShaderRegister.ll | 2 +- ...StaticSamplers-Invalid-ShaderVisibility.ll | 2 +- .../RootSignature-StaticSamplers.ll | 2 +- .../RootSignature-StaticSamplers_V3.ll | 42 +++++++++++++++++++ .../rootsignature-validation-fail-sampler.ll | 2 +- ...re-validation-fail-static-sampler-range.ll | 4 +- 21 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll diff --git a/clang/test/CodeGenHLSL/RootSignature.hlsl b/clang/test/CodeGenHLSL/RootSignature.hlsl index bc40bdd79ce59..bbab6a73a3658 100644 --- a/clang/test/CodeGenHLSL/RootSignature.hlsl +++ b/clang/test/CodeGenHLSL/RootSignature.hlsl @@ -82,8 +82,8 @@ void RootDescriptorsEntry() {} // checking minLOD, maxLOD // CHECK-SAME: float -1.280000e+02, float 1.280000e+02, -// checking register, space and visibility -// CHECK-SAME: i32 42, i32 0, i32 0} +// checking register, space, visibility and flags +// CHECK-SAME: i32 42, i32 0, i32 0, i32 0} #define SampleStaticSampler \ "StaticSampler(s42, " \ diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h index 87777fddc9157..37224d8a94527 100644 ---
a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h @@ -131,6 +131,7 @@ struct StaticSampler { float MaxLOD = std::numeric_limits::max(); uint32_t Space = 0; dxbc::ShaderVisibility Visibility = dxbc::ShaderVisibility::All; + dxbc::StaticSamplerFlags Flags = dxbc::StaticSamplerFlags::None; }; /// Models RootElement : RootFlags | RootConstants | RootParam diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 5785505ce2b0c..7a0cf408968de 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -218,6 +218,7 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) { ConstantAsMetadata::get(Builder.getInt32(Sampler.Space)), ConstantAsMetadata::get( Builder.getInt32(to_underlying(Sampler.Visibility))), + ConstantAsMetadata::get(Builder.getInt32(to_underlying(Sampler.Flags))), }; return MDNode::get(Ctx, Operands); } @@ -417,7 +418,7 @@ Error MetadataParser::parseDescriptorTable(mcdxbc::RootSignatureDesc &RSD, Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD, MDNode *StaticSamplerNode) { - if (StaticSamplerNode->getNumOperands() != 14) + if (StaticSamplerNode->getNumOperands() != 15) return make_error("Static Sampler"); mcdxbc::StaticSampler Sampler; @@ -501,6 +502,17 @@ Error MetadataParser::parseStaticSampler(mcdxbc::RootSignatureDesc &RSD, return Error(std::move(E)); Sampler.ShaderVisibility = *Visibility; + if (RSD.Version < 3) { + RSD.StaticSamplers.push_back(Sampler); + return Error::success(); + } + assert(RSD.Version >= 3); + + if (std::optional Val = extractMdIntValue(StaticSamplerNode, 14)) + Sampler.Flags = *Val; + else + return make_error("Static Sampler Flags"); + RSD.StaticSamplers.push_back(Sampler); return Error::success(); } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll index 288dea00b6e55..b043ea1418df6 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressU.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 666, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 666, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll index e9abcf9669999..8219ffdd679d2 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressV.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root 
signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 666, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 666, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll index 238f488ee78d6..31d8dd10f3e22 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-AddressW.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 666, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 666, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll index 8dc69eb1f9d7c..2bb4af5d9c0f2 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-BorderColor.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 666, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 666, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll index b2c8faf8d4a0a..62fda735b6860 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ComparisonFunc.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 666, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 666, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git 
a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll index 758d2629ed78e..7e8de14160ce9 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Filter.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 45, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 45, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll new file mode 100644 index 0000000000000..8f7ef8857ad15 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-Flag.ll @@ -0,0 +1,19 @@ +; RUN: not opt -passes='print' %s -S -o - 2>&1 | FileCheck %s + + +target triple = "dxil-unknown-shadermodel6.0-compute" + +; CHECK: error: Invalid value for Static Sampler Flag: 4 +; CHECK-NOT: Root Signature Definitions + +define void @main() #0 { +entry: + ret void +} +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + + +!dx.rootsignatures = !{!2} ; list of function/root signature pairs +!2 = !{ ptr @main, !3, i32 3 } ; function, root signature +!3 = !{ !5 } ; list of root signature elements +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 4 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll index 47d4b52d72e8e..312e7697d4f2a 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxAnisotropy.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 666, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 666, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll index 855e0c0cb6e51..80fd208a1bceb 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MaxLod.ll @@ -16,4 +16,4 @@ 
attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 0x7FF8000000000000, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 0x7FF8000000000000, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll index 812749b9ed824..5daaf69a40062 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLod.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float 0x7FF8000000000000, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float 0x7FF8000000000000, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll index 6898aec6f2e49..423987b0e2624 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-MinLopBias.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 6.660000e+02, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 6.660000e+02, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll index dc6ee4290b532..af630dcdd0300 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-RegisterSpace.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 4294967280, i32 0 } +!5 = !{ !"StaticSampler", i32 4, 
i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 4294967280, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll index 6cee1dd95fd81..bd752f0519da4 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderRegister.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 4294967295, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 4294967295, i32 0, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll index fa5bf12e2b8cd..ca0c02d64983b 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers-Invalid-ShaderVisibility.ll @@ -16,4 +16,4 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 666 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 666, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll index 1dd470d7fb822..77c5c7af66247 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers.ll @@ -15,7 +15,7 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature !3 = !{ !5 } ; list of root signature elements -!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } ; DXC: - Name: RTS0 ; DXC-NEXT: Size: 76 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll new file mode 100644 index 0000000000000..7e56f0408e3f3 --- /dev/null +++ 
b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-StaticSamplers_V3.ll @@ -0,0 +1,42 @@ +; RUN: opt %s -dxil-embed -dxil-globals -S -o - | FileCheck %s +; RUN: llc %s --filetype=obj -o - | obj2yaml | FileCheck %s --check-prefix=DXC + +target triple = "dxil-unknown-shadermodel6.0-compute" + +; CHECK: @dx.rts0 = private constant [248 x i8] c"{{.*}}", section "RTS0", align 4 + +define void @main() #0 { +entry: + ret void +} +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + + +!dx.rootsignatures = !{!2} ; list of function/root signature pairs +!2 = !{ ptr @main, !3, i32 3 } ; function, root signature +!3 = !{ !5, !6, !7, !8 } ; list of root signature elements +!5 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 1 } +!6 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 43, i32 0, i32 0, i32 2 } +!7 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 44, i32 0, i32 0, i32 0 } +!8 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 45, i32 0, i32 0, i32 3 } + +; DXC: - Name: RTS0 +; DXC-NEXT: Size: 248 +; DXC-NEXT: RootSignature: +; DXC-NEXT: Version: 3 +; DXC-NEXT: NumRootParameters: 0 +; DXC-NEXT: RootParametersOffset: 24 +; DXC-NEXT: NumStaticSamplers: 4 +; DXC-NEXT: StaticSamplersOffset: 24 +; DXC-NEXT: Parameters: [] +; DXC-NEXT: Samplers: +; DXC-LABEL: ShaderRegister: 42 +; DXC: SAMPLER_FLAG_UINT_BORDER_COLOR: true +; DXC-LABEL: ShaderRegister: 43 +; DXC: SAMPLER_FLAG_NON_NORMALIZED_COORDINATES: true +; DXC-LABEL: ShaderRegister: 44 +; DXC-NOT: SAMPLER_FLAG_NON_NORMALIZED_COORDINATES: +; DXC-NOT: SAMPLER_FLAG_UINT_BORDER_COLOR: +; DXC-LABEL: ShaderRegister: 45 +; DXC: SAMPLER_FLAG_UINT_BORDER_COLOR: true +; DXC-NEXT: SAMPLER_FLAG_NON_NORMALIZED_COORDINATES: true diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll index c244095520468..b68606d656d75 100644 --- a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler.ll @@ -10,6 +10,6 @@ entry: !0 = !{ptr @CSMain, !1, i32 2} !1 = !{!2, !3} -!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0 } +!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0, i32 0 } !3 = !{!"DescriptorTable", i32 0, !4} !4 = !{!"Sampler", i32 1, i32 42, i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll index 9ac02ebbc0965..7c836e2f85d68 100644 --- a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-static-sampler-range.ll @@ -10,5 +10,5 @@ entry: !0 = !{ptr @CSMain, !1, i32 2} !1 = !{!2, !3} -!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 
1.220000e+02, i32 42, i32 0, i32 0 } -!3 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0 } +!2 = !{ !"StaticSampler", i32 5, i32 4, i32 5, i32 3, float 0x3FF7CCCCC0000000, i32 10, i32 2, i32 1, float -1.270000e+02, float 1.220000e+02, i32 42, i32 0, i32 0, i32 0 } +!3 = !{ !"StaticSampler", i32 4, i32 2, i32 3, i32 5, float 0x3FF6CCCCC0000000, i32 9, i32 3, i32 2, float -1.280000e+02, float 1.280000e+02, i32 42, i32 0, i32 0, i32 0 } From c4788bff8266497aafd526f887722982127a2e11 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 1 Oct 2025 13:58:26 -0700 Subject: [PATCH 425/878] [NFC][LLVM][Support] Misc code cleanup in ScopedPrinter (#161462) Add missing file header to ScopedPrinter.cpp and adjust the code to conform to LLVM coding standards. --- llvm/lib/Support/ScopedPrinter.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Support/ScopedPrinter.cpp b/llvm/lib/Support/ScopedPrinter.cpp index a17e397c0aa58..efb61785d17b0 100644 --- a/llvm/lib/Support/ScopedPrinter.cpp +++ b/llvm/lib/Support/ScopedPrinter.cpp @@ -1,12 +1,17 @@ -#include "llvm/Support/ScopedPrinter.h" +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/Format.h" -using namespace llvm::support; +using namespace llvm; -namespace llvm { - -raw_ostream &operator<<(raw_ostream &OS, const HexNumber &Value) { +raw_ostream &llvm::operator<<(raw_ostream &OS, const HexNumber &Value) { OS << "0x" << utohexstr(Value.Value); return OS; } @@ -45,5 +50,3 @@ JSONScopedPrinter::JSONScopedPrinter( if (this->OuterScope) this->OuterScope->setPrinter(*this); } - -} // namespace llvm From b389adf56a1610cbdf0aa0a7b0b70b2c6b049f83 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 1 Oct 2025 14:00:28 -0700 Subject: [PATCH 426/878] [RISCV] Allow non-canonicalized splats in isProfitableToSinkOperands (#161586) This isn't an optimization change - IR transforms should have removed the operands and replaced them with poison. However, I noticed the non-canonical splat structure in a couple of llvm-reduce outputs. This results in us creating extremely atypical IR which is quite misleading about the true cause of what's going on. (Because the non-canonical splat doesn't get sunk, we then prune whatever was actually holding it outside the loop in the original example, eliminating insight as to the true cause of whatever issue we're debugging.)
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 4 +-- .../CodeGen/RISCV/rvv/sink-splat-operands.ll | 36 +++++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index d4124ae9aeff0..ee25f6918de8b 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -3139,8 +3139,8 @@ bool RISCVTTIImpl::isProfitableToSinkOperands( bool IsVPSplat = match(Op, m_Intrinsic( m_Value(), m_Value(), m_Value())); if (!IsVPSplat && - !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_ZeroMask()))) + !match(Op, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()), + m_Value(), m_ZeroMask()))) continue; // Don't sink i1 splats. diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll index 02825b2bda484..19a184148c0b6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -6018,3 +6018,39 @@ vector.latch: ; preds = %for.body419 for.cond.cleanup: ; preds = %vector.latch ret void } + +;; This is exactly like sink_add_splat except that the splat has operands +;; which haven't been converted to undef. +define void @sink_non_canonical_splat(ptr nocapture %a, i32 signext %x) { +; CHECK-LABEL: sink_non_canonical_splat: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 1 +; CHECK-NEXT: add a2, a0, a2 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: .LBB131_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: bne a0, a2, .LBB131_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32, ptr %a, i64 %index + %wide.load = load <4 x i32>, ptr %0, align 4 + %1 = add <4 x i32> %wide.load, %broadcast.splat + store <4 x i32> %1, ptr %0, align 4 + %index.next = add nuw i64 %index, 4 + %2 = icmp eq i64 %index.next, 1024 + br i1 %2, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} From 94b2617590c11b372a2161ccdfe52ecd73d58ec1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 1 Oct 2025 21:57:54 +0100 Subject: [PATCH 427/878] [VPlan] Remove VPIRPhis in exit blocks when deleting scalar loop BBs. DeleteDeadBlocks will remove single-entry phis. Remove them from the exit VPIRBBs in VPlan as well, otherwise we would retain references to deleted IR instructions. 
Fixes MSan failures after 8907b6d39 https://lab.llvm.org/buildbot/#/builders/164/builds/14013 --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index ffd2e59938510..02eb6375aac41 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -978,6 +978,16 @@ void VPlan::execute(VPTransformState *State) { // If the original loop is unreachable, delete it and all its blocks. if (!ScalarPhVPBB->hasPredecessors()) { + // DeleteDeadBlocks will remove single-entry phis. Remove them from the exit + // VPIRBBs in VPlan as well, otherwise we would retain references to deleted + // IR instructions. + for (VPIRBasicBlock *EB : getExitBlocks()) { + for (VPRecipeBase &R : make_early_inc_range(EB->phis())) { + if (R.getNumOperands() == 1) + R.eraseFromParent(); + } + } + Loop *OrigLoop = State->LI->getLoopFor(getScalarHeader()->getIRBasicBlock()); auto Blocks = OrigLoop->getBlocksVector(); From ba5141d27c66136d0dda866d30a495940474c528 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 1 Oct 2025 16:12:30 -0500 Subject: [PATCH 428/878] [flang][OpenMP] Check containing scoping unit in DECLARE_SIMD (#161556) Check if the name on DECLARE_SIMD is the name of the containing scoping unit. Fixes https://github.com/llvm/llvm-project/issues/161516 --- flang/include/flang/Semantics/openmp-utils.h | 2 ++ flang/lib/Semantics/check-omp-structure.cpp | 12 ++++++++++- flang/lib/Semantics/openmp-utils.cpp | 18 ++++++++++++++++ flang/lib/Semantics/resolve-directives.cpp | 22 ++------------------ flang/test/Semantics/OpenMP/declare-simd.f90 | 5 +++++ 5 files changed, 38 insertions(+), 21 deletions(-) diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h index 08b67167f5de2..2954a1c4769f7 100644 --- a/flang/include/flang/Semantics/openmp-utils.h +++ b/flang/include/flang/Semantics/openmp-utils.h @@ -37,6 +37,8 @@ template > U AsRvalue(T &t) { template T &&AsRvalue(T &&t) { return std::move(t); } +const Scope &GetScopingUnit(const Scope &scope); + // There is no consistent way to get the source of an ActionStmt, but there // is "source" in Statement. This structure keeps the ActionStmt with the // extracted source for further use.
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index e224e069abcef..1f059f747bad0 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1361,9 +1361,19 @@ void OmpStructureChecker::Enter(const parser::OpenMPDeclareSimdConstruct &x) { return; } + auto isValidSymbol{[](const Symbol *sym) { + if (IsProcedure(*sym) || IsFunction(*sym)) { + return true; + } + if (const Symbol *owner{GetScopingUnit(sym->owner()).symbol()}) { + return IsProcedure(*owner) || IsFunction(*owner); + } + return false; + }}; + const parser::OmpArgument &arg{args.v.front()}; if (auto *sym{GetArgumentSymbol(arg)}) { - if (!IsProcedure(*sym) && !IsFunction(*sym)) { + if (!isValidSymbol(sym)) { auto &msg{context_.Say(arg.source, "The name '%s' should refer to a procedure"_err_en_US, sym->name())}; if (sym->test(Symbol::Flag::Implicit)) { diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp index 35b7718715071..a8ec4d6c24beb 100644 --- a/flang/lib/Semantics/openmp-utils.cpp +++ b/flang/lib/Semantics/openmp-utils.cpp @@ -41,6 +41,24 @@ namespace Fortran::semantics::omp { using namespace Fortran::parser::omp; +const Scope &GetScopingUnit(const Scope &scope) { + const Scope *iter{&scope}; + for (; !iter->IsTopLevel(); iter = &iter->parent()) { + switch (iter->kind()) { + case Scope::Kind::BlockConstruct: + case Scope::Kind::BlockData: + case Scope::Kind::DerivedType: + case Scope::Kind::MainProgram: + case Scope::Kind::Module: + case Scope::Kind::Subprogram: + return *iter; + default: + break; + } + } + return *iter; +} + SourcedActionStmt GetActionStmt(const parser::ExecutionPartConstruct *x) { if (x == nullptr) { return SourcedActionStmt{}; diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index bd7b8ac552fab..25872be4af52e 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -379,24 +379,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { explicit OmpAttributeVisitor(SemanticsContext &context) : DirectiveAttributeVisitor(context) {} - static const Scope &scopingUnit(const Scope &scope) { - const Scope *iter{&scope}; - for (; !iter->IsTopLevel(); iter = &iter->parent()) { - switch (iter->kind()) { - case Scope::Kind::BlockConstruct: - case Scope::Kind::BlockData: - case Scope::Kind::DerivedType: - case Scope::Kind::MainProgram: - case Scope::Kind::Module: - case Scope::Kind::Subprogram: - return *iter; - default: - break; - } - } - return *iter; - } - template void Walk(const A &x) { parser::Walk(x, *this); } template bool Pre(const A &) { return true; } template void Post(const A &) {} @@ -3086,8 +3068,8 @@ void OmpAttributeVisitor::ResolveOmpDesignator( checkScope = ompFlag == Symbol::Flag::OmpExecutableAllocateDirective; } if (checkScope) { - if (scopingUnit(GetContext().scope) != - scopingUnit(symbol->GetUltimate().owner())) { + if (omp::GetScopingUnit(GetContext().scope) != + omp::GetScopingUnit(symbol->GetUltimate().owner())) { context_.Say(designator.source, // 2.15.3 "List items must be declared in the same scoping unit in which the %s directive appears"_err_en_US, parser::ToUpperCaseLetters( diff --git a/flang/test/Semantics/OpenMP/declare-simd.f90 b/flang/test/Semantics/OpenMP/declare-simd.f90 index ceed2c262555a..bb259b8722ca2 100644 --- a/flang/test/Semantics/OpenMP/declare-simd.f90 +++ b/flang/test/Semantics/OpenMP/declare-simd.f90 @@ -19,4 +19,9 @@ 
subroutine f00 subroutine f01 end +integer function f02 +!Ok, expect no diagnostics +!$omp declare_simd(f02) +end + end module From b66dfa7273f0d7953965e00af3999315a015a563 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Wed, 1 Oct 2025 14:14:23 -0700 Subject: [PATCH 429/878] [LLDB] Add load core time to target metrics (#161581) This patch adds a load core time, right now we don't have much insight into the performance of load core, especially for large coredumps. To start collecting information on this I've added some minor instrumentation code to measure the two call sites of `LoadCore`. I've also added a test to validate the new metric is output in statistics dump --- lldb/include/lldb/Target/Statistics.h | 2 ++ lldb/source/API/SBTarget.cpp | 1 + lldb/source/Commands/CommandObjectTarget.cpp | 6 +++- lldb/source/Target/Statistics.cpp | 5 ++++ .../stats_api/TestStatisticsAPI.py | 28 +++++++++++++++++++ .../stats_api/arm64-minidump-build-ids.yaml | 19 +++++++++++++ 6 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml diff --git a/lldb/include/lldb/Target/Statistics.h b/lldb/include/lldb/Target/Statistics.h index d6983bb0b9d24..2653835206ec7 100644 --- a/lldb/include/lldb/Target/Statistics.h +++ b/lldb/include/lldb/Target/Statistics.h @@ -322,12 +322,14 @@ class TargetStats { void IncreaseSourceRealpathCompatibleCount(uint32_t count); StatsDuration &GetCreateTime() { return m_create_time; } + StatsDuration &GetLoadCoreTime() { return m_load_core_time; } StatsSuccessFail &GetExpressionStats() { return m_expr_eval; } StatsSuccessFail &GetFrameVariableStats() { return m_frame_var; } void Reset(Target &target); protected: StatsDuration m_create_time; + StatsDuration m_load_core_time; std::optional m_launch_or_attach_time; std::optional m_first_private_stop_time; std::optional m_first_public_stop_time; diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index eb56337de3c44..90ffe786c696c 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -255,6 +255,7 @@ SBProcess SBTarget::LoadCore(const char *core_file, lldb::SBError &error) { ProcessSP process_sp(target_sp->CreateProcess( target_sp->GetDebugger().GetListener(), "", &filespec, false)); if (process_sp) { + ElapsedTime loadCoreTime(target_sp->GetStatistics().GetLoadCoreTime()); error.SetError(process_sp->LoadCore()); if (error.Success()) sb_process.SetSP(process_sp); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 940be42d1b6e3..b5fc49d58c1eb 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -418,7 +418,11 @@ class CommandObjectTargetCreate : public CommandObjectParsed { if (process_sp) { // Seems weird that we Launch a core file, but that is what we // do! 
- error = process_sp->LoadCore(); + { + ElapsedTime loadCoreTime( + target_sp->GetStatistics().GetLoadCoreTime()); + error = process_sp->LoadCore(); + } if (error.Fail()) { result.AppendError(error.AsCString("unknown core file format")); diff --git a/lldb/source/Target/Statistics.cpp b/lldb/source/Target/Statistics.cpp index 8ad8d507268e2..f7311a8b24416 100644 --- a/lldb/source/Target/Statistics.cpp +++ b/lldb/source/Target/Statistics.cpp @@ -148,6 +148,11 @@ TargetStats::ToJSON(Target &target, target_metrics_json.try_emplace("targetCreateTime", m_create_time.get().count()); + if (m_load_core_time.get().count() > 0) { + target_metrics_json.try_emplace("loadCoreTime", + m_load_core_time.get().count()); + } + json::Array breakpoints_array; double totalBreakpointResolveTime = 0.0; // Report both the normal breakpoint list and the internal breakpoint list. diff --git a/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py b/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py index f06c9ae14bb7a..d7249df350fc1 100644 --- a/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py +++ b/lldb/test/API/functionalities/stats_api/TestStatisticsAPI.py @@ -1,6 +1,7 @@ # Test the SBAPI for GetStatistics() import json + import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -54,6 +55,11 @@ def test_stats_api(self): stats_json, 'Make sure the "frameVariable" key in in target.GetStatistics()["targets"][0]', ) + self.assertNotIn( + "loadCoreTime", + stats_json, + "LoadCoreTime should not be present in a live, non-coredump target", + ) expressionEvaluation = stats_json["expressionEvaluation"] self.assertIn( "successes", @@ -157,3 +163,25 @@ def test_command_stats_force(self): stats_force.GetAsJSON(stream_force) debug_stats_force = json.loads(stream_force.GetData()) self.assertEqual(debug_stats_force["totalDebugInfoByteSize"], 445) + + def test_core_load_time(self): + """ + Test to see if the coredump path is included in statistics dump. + """ + yaml_file = "arm64-minidump-build-ids.yaml" + src_dir = self.getSourceDir() + minidump_path = self.getBuildArtifact(os.path.basename(yaml_file) + ".dmp") + self.yaml2obj(os.path.join(src_dir, yaml_file), minidump_path) + target = self.dbg.CreateTarget(None) + process = target.LoadCore(minidump_path) + self.assertTrue(process.IsValid()) + + stats_options = lldb.SBStatisticsOptions() + stats = target.GetStatistics(stats_options) + stream = lldb.SBStream() + stats.GetAsJSON(stream) + debug_stats = json.loads(stream.GetData()) + self.assertTrue("targets" in debug_stats) + target_info = debug_stats["targets"][0] + self.assertTrue("loadCoreTime" in target_info) + self.assertTrue(float(target_info["loadCoreTime"]) > 0.0) diff --git a/lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml b/lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml new file mode 100644 index 0000000000000..4acbc409d8082 --- /dev/null +++ b/lldb/test/API/functionalities/stats_api/arm64-minidump-build-ids.yaml @@ -0,0 +1,19 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: ARM + Platform ID: Linux + CSD Version: '15E216' + CPU: + CPUID: 0x00000000 + - Type: ModuleList + Modules: + - Base of Image: 0x0000000000001000 + Size of Image: 0x00001000 + Module Name: '/tmp/a' + CodeView Record: 4C4570420102030405060708090A0B0C0D0E0F1011121314 + - Base of Image: 0x0000000000001000 + Size of Image: 0x00001000 + Module Name: '/tmp/b' + CodeView Record: 4C4570420A141E28323C46505A646E78828C96A0AAB4BEC8 +... 
From f1986d9d4467108859cc7e5061b7ca9c48e2ec24 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 1 Oct 2025 14:22:56 -0700 Subject: [PATCH 430/878] [AMDGPU] Fix real and fake true16 v_cvt_f32_bf16 disasm (#161578) --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 11 +++-- .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 9 ++-- .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 45 ++++++++++++------- .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 12 +++-- 4 files changed, 51 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 77df72111605e..54f57e02ed47e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -314,9 +314,10 @@ let SubtargetPredicate = HasGFX950Insts, OtherPredicates = [HasBF16ConversionIns defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; } let SubtargetPredicate = isGFX1250Plus, OtherPredicates = [HasBF16ConversionInsts] in { - defm V_CVT_F32_BF16_gfx1250 : VOP1Inst_t16_with_profiles <"v_cvt_f32_bf16_gfx1250", VOP_F32_BF16, - VOPProfile_CVT_F32_BF16_gfx1250_t16, - VOPProfile_CVT_F32_BF16_gfx1250_fake16>; + let True16Predicate = UseRealTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_t16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_t16", VOPProfile_CVT_F32_BF16_gfx1250_t16>; + let True16Predicate = UseFakeTrue16Insts in + defm V_CVT_F32_BF16_gfx1250_fake16 : VOP1Inst <"V_CVT_F32_BF16_gfx1250_fake16", VOPProfile_CVT_F32_BF16_gfx1250_fake16>; } let ReadsModeReg = 0, mayRaiseFPException = 0 in { @@ -899,6 +900,7 @@ class VOP1_DPP16_Gen op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = let DecoderNamespace = Gen.DecoderNamespace; let OtherPredicates = !listconcat(ps.OtherPredicates, !if(p.HasExt64BitDPP, [HasDPALU_DPP], [])); + let True16Predicate = ps.True16Predicate; } class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : @@ -921,6 +923,7 @@ class VOP1_DPP8_Gen op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pf VOP1_DPP8 { let AssemblerPredicate = Gen.AssemblerPredicate; let DecoderNamespace = Gen.DecoderNamespace; + let True16Predicate = ps.True16Predicate; } //===----------------------------------------------------------------------===// @@ -1149,7 +1152,7 @@ defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_PRNG_B32 : VOP1_Real_FULL; -defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_CVT_F32_BF16_gfx1250 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16">; defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index 07dbbddcdc2f9..94edf22e36acf 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -720,10 +720,12 @@ # GFX1250: v_cvt_f32_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xe4,0x0a,0x7e] 0x01,0xe5,0x0a,0x7e -# GFX1250: v_cvt_f32_bf16_e32 v5, v1.l ; encoding: [0x01,0xe5,0x0a,0x7e] +# GFX1250-REAL16: v_cvt_f32_bf16_e32 v5, v1.l ; encoding: [0x01,0xe5,0x0a,0x7e] +# GFX1250-FAKE16: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] 
0x7f,0xe5,0x0a,0x7e -# GFX1250: v_cvt_f32_bf16_e32 v5, v127.l ; encoding: [0x7f,0xe5,0x0a,0x7e] +# GFX1250-REAL16: v_cvt_f32_bf16_e32 v5, v127.l ; encoding: [0x7f,0xe5,0x0a,0x7e] +# GFX1250-FAKE16: v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xe5,0x0a,0x7e] 0x6b,0xe4,0x0a,0x7e # GFX1250: v_cvt_f32_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xe4,0x0a,0x7e] @@ -732,7 +734,8 @@ # GFX1250: v_cvt_f32_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xe4,0x0a,0x7e] 0x81,0xe5,0x0a,0x7e -# GFX1250: v_cvt_f32_bf16_e32 v5, v1.h ; encoding: [0x81,0xe5,0x0a,0x7e] +# GFX1250-REAL16: v_cvt_f32_bf16_e32 v5, v1.h ; encoding: [0x81,0xe5,0x0a,0x7e] +# GFX1250-FAKE16: v_cvt_f32_bf16_e32 v5, v129/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0x81,0xe5,0x0a,0x7e] 0xff,0xf0,0x02,0x7e,0x34,0x12,0x00,0x00 # GFX1250-REAL16: v_cvt_f16_bf8_e32 v1.l, 0x1234 ; encoding: [0xff,0xf0,0x02,0x7e,0x34,0x12,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index c12ecb8d868aa..93286caa4fa2c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -615,49 +615,64 @@ # GFX1250-REAL16: v_cos_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff] 0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 -# GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] 0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01 -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01] 0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_shr:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13 -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13] 0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x81,0x1b,0x00,0xff] 0xfa,0xf0,0x02,0x7e,0x02,0x39,0x00,0xff # GFX1250-REAL16: v_cvt_f16_bf8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf0,0x02,0x7e,0x02,0x39,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index fa7b940132f0c..fb3f1b25c6c7f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -165,16 +165,20 @@ # GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] 0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] 0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05 -# GFX1250: v_cvt_f32_bf16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05] +# GFX1250-REAL16: v_cvt_f32_bf16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cvt_f32_bf16_dpp v5, v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05]

0xe9,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05
# GFX1250-REAL16: v_cvt_f16_bf8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05]

From f2c8c42821a8c6de8984a1e7a932233cf221d5c1 Mon Sep 17 00:00:00 2001
From: joaosaffran
Date: Wed, 1 Oct 2025 14:23:37 -0700
Subject: [PATCH 431/878] [HLSL] Update Frontend to support version 1.2 of
 root signature (#160616)

This patch updates the frontend to support version 1.2 of root
signatures; it adds parsing, metadata generation, and a few tests.

---------

Co-authored-by: joaosaffran
---
 clang/include/clang/Basic/LangOptions.h       |  3 +-
 clang/include/clang/Driver/Options.td         |  6 +-
 .../clang/Lex/HLSLRootSignatureTokenKinds.def |  8 +++
 .../clang/Parse/ParseHLSLRootSignature.h      |  3 +
 clang/lib/AST/TextNodeDumper.cpp              |  3 +
 clang/lib/Driver/ToolChains/HLSL.cpp          |  2 +-
 clang/lib/Parse/ParseHLSLRootSignature.cpp    | 61 +++++++++++++++++++
 .../AST/HLSL/RootSignature-Target-AST.hlsl    | 12 +++-
 clang/test/AST/HLSL/RootSignatures-AST.hlsl   | 27 ++++++++
 clang/test/CodeGenHLSL/RootSignature.hlsl     |  5 +-
 clang/test/SemaHLSL/RootSignature-err.hlsl    |  4 ++
 .../SemaHLSL/RootSignature-flags-err.hlsl     | 24 +++++---
 .../Lex/LexHLSLRootSignatureTest.cpp          |  3 +
 .../Parse/ParseHLSLRootSignatureTest.cpp      | 34 ++++++++++-
 llvm/include/llvm/BinaryFormat/DXContainer.h  |  1 +
 .../llvm/Frontend/HLSL/HLSLRootSignature.h    |  6 +-
 llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp  |  9 ++-
 .../HLSL/RootSignatureValidations.cpp         |  2 +-
 llvm/lib/ObjectYAML/DXContainerYAML.cpp       |  2 +-
 .../Frontend/HLSLRootSignatureDumpTest.cpp    |  7 ++-
 20 files changed, 194 insertions(+), 28 deletions(-)

diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index a8943df5b39aa..41595ec2a060d 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -549,8 +549,7 @@ class LangOptions : public LangOptionsBase {
   bool CheckNew = false;
 
   /// The HLSL root signature version for dxil.
-  llvm::dxbc::RootSignatureVersion HLSLRootSigVer =
-      llvm::dxbc::RootSignatureVersion::V1_1;
+  llvm::dxbc::RootSignatureVersion HLSLRootSigVer;
 
   /// The HLSL root signature that will be used to overide the root signature
   /// used for the shader entry point.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 6245cf33a0719..096df56d0f183 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -9473,7 +9473,7 @@ def target_profile : DXCJoinedOrSeparate<"T">, MetaVarName<"">,
                      "lib_6_3, lib_6_4, lib_6_5, lib_6_6, lib_6_7, lib_6_x,"
                      "ms_6_5, ms_6_6, ms_6_7,"
                      "as_6_5, as_6_6, as_6_7,"
-                     "rootsig_1_0, rootsig_1_1">;
+                     "rootsig_1_0, rootsig_1_1, rootsig_1_2">;
 def emit_pristine_llvm : DXCFlag<"emit-pristine-llvm">,
   HelpText<"Emit pristine LLVM IR from the frontend by not running any LLVM passes at all."
"Same as -S + -emit-llvm + -disable-llvm-passes.">; @@ -9486,9 +9486,9 @@ def fdx_rootsignature_version : Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Root Signature Version">, - Values<"rootsig_1_0,rootsig_1_1">, + Values<"rootsig_1_0,rootsig_1_1,rootsig_1_2">, NormalizedValuesScope<"llvm::dxbc::RootSignatureVersion">, - NormalizedValues<["V1_0", "V1_1"]>, + NormalizedValues<["V1_0", "V1_1", "V1_2"]>, MarshallingInfoEnum, "V1_1">; def dxc_rootsig_ver : Separate<["/", "-"], "force-rootsig-ver">, diff --git a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def index a5cfeb34b2b51..1d7f7adbe076f 100644 --- a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def +++ b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def @@ -65,6 +65,9 @@ #ifndef STATIC_BORDER_COLOR_ENUM #define STATIC_BORDER_COLOR_ENUM(NAME, LIT) ENUM(NAME, LIT) #endif +#ifndef STATIC_SAMPLER_FLAG_ENUM +#define STATIC_SAMPLER_FLAG_ENUM(NAME, LIT) ENUM(NAME, LIT) +#endif // General Tokens: TOK(invalid, "invalid identifier") @@ -228,6 +231,10 @@ STATIC_BORDER_COLOR_ENUM(OpaqueWhite, "STATIC_BORDER_COLOR_OPAQUE_WHITE") STATIC_BORDER_COLOR_ENUM(OpaqueBlackUint, "STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT") STATIC_BORDER_COLOR_ENUM(OpaqueWhiteUint, "STATIC_BORDER_COLOR_OPAQUE_WHITE_UINT") +// Root Descriptor Flag Enums: +STATIC_SAMPLER_FLAG_ENUM(UintBorderColor, "UINT_BORDER_COLOR") +STATIC_SAMPLER_FLAG_ENUM(NonNormalizedCoordinates, "NON_NORMALIZED_COORDINATES") + #undef STATIC_BORDER_COLOR_ENUM #undef COMPARISON_FUNC_ENUM #undef TEXTURE_ADDRESS_MODE_ENUM @@ -237,6 +244,7 @@ STATIC_BORDER_COLOR_ENUM(OpaqueWhiteUint, "STATIC_BORDER_COLOR_OPAQUE_WHITE_UINT #undef DESCRIPTOR_RANGE_FLAG_ENUM_OFF #undef DESCRIPTOR_RANGE_FLAG_ENUM_ON #undef ROOT_DESCRIPTOR_FLAG_ENUM +#undef STATIC_SAMPLER_FLAG_ENUM #undef ROOT_FLAG_ENUM #undef DESCRIPTOR_RANGE_OFFSET_ENUM #undef UNBOUNDED_ENUM diff --git a/clang/include/clang/Parse/ParseHLSLRootSignature.h b/clang/include/clang/Parse/ParseHLSLRootSignature.h index b06846fd83c09..8f91d7cd7b031 100644 --- a/clang/include/clang/Parse/ParseHLSLRootSignature.h +++ b/clang/include/clang/Parse/ParseHLSLRootSignature.h @@ -130,6 +130,7 @@ class RootSignatureParser { std::optional MaxLOD; std::optional Space; std::optional Visibility; + std::optional Flags; }; std::optional parseStaticSamplerParams(); @@ -153,6 +154,8 @@ class RootSignatureParser { parseRootDescriptorFlags(RootSignatureToken::Kind Context); std::optional parseDescriptorRangeFlags(RootSignatureToken::Kind Context); + std::optional + parseStaticSamplerFlags(RootSignatureToken::Kind Context); /// Use NumericLiteralParser to convert CurToken.NumSpelling into a unsigned /// 32-bit integer diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 8f7fe3bea4e8f..cf5e9147ad78b 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -3095,6 +3095,9 @@ void TextNodeDumper::VisitHLSLRootSignatureDecl( case llvm::dxbc::RootSignatureVersion::V1_1: OS << "1.1"; break; + case llvm::dxbc::RootSignatureVersion::V1_2: + OS << "1.2"; + break; } OS << ", "; llvm::hlsl::rootsig::dumpRootElements(OS, D->getRootElements()); diff --git a/clang/lib/Driver/ToolChains/HLSL.cpp b/clang/lib/Driver/ToolChains/HLSL.cpp index f4858e4c960de..2869549e6b3f0 100644 --- a/clang/lib/Driver/ToolChains/HLSL.cpp +++ b/clang/lib/Driver/ToolChains/HLSL.cpp @@ -64,7 +64,7 @@ bool isLegalShaderModel(Triple &T) { } break; case 
Triple::EnvironmentType::RootSignature: VersionTuple MinVer(1, 0); - VersionTuple MaxVer(1, 1); + VersionTuple MaxVer(1, 2); return MinVer <= Version && Version <= MaxVer; } return false; diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp index 3b16efb1f1199..7be6eecc520b1 100644 --- a/clang/lib/Parse/ParseHLSLRootSignature.cpp +++ b/clang/lib/Parse/ParseHLSLRootSignature.cpp @@ -485,6 +485,9 @@ std::optional RootSignatureParser::parseStaticSampler() { if (Params->Visibility.has_value()) Sampler.Visibility = Params->Visibility.value(); + if (Params->Flags.has_value()) + Sampler.Flags = Params->Flags.value(); + return Sampler; } @@ -926,6 +929,20 @@ RootSignatureParser::parseStaticSamplerParams() { if (!Visibility.has_value()) return std::nullopt; Params.Visibility = Visibility; + } else if (tryConsumeExpectedToken(TokenKind::kw_flags)) { + // `flags` `=` STATIC_SAMPLE_FLAGS + if (Params.Flags.has_value()) { + reportDiag(diag::err_hlsl_rootsig_repeat_param) << CurToken.TokKind; + return std::nullopt; + } + + if (consumeExpectedToken(TokenKind::pu_equal)) + return std::nullopt; + + auto Flags = parseStaticSamplerFlags(TokenKind::kw_flags); + if (!Flags.has_value()) + return std::nullopt; + Params.Flags = Flags; } else { consumeNextToken(); // let diagnostic be at the start of invalid token reportDiag(diag::err_hlsl_invalid_token) @@ -1255,6 +1272,50 @@ RootSignatureParser::parseDescriptorRangeFlags(TokenKind Context) { return Flags; } +std::optional +RootSignatureParser::parseStaticSamplerFlags(TokenKind Context) { + assert(CurToken.TokKind == TokenKind::pu_equal && + "Expects to only be invoked starting at given keyword"); + + // Handle the edge-case of '0' to specify no flags set + if (tryConsumeExpectedToken(TokenKind::int_literal)) { + if (!verifyZeroFlag()) { + reportDiag(diag::err_hlsl_rootsig_non_zero_flag); + return std::nullopt; + } + return llvm::dxbc::StaticSamplerFlags::None; + } + + TokenKind Expected[] = { +#define STATIC_SAMPLER_FLAG_ENUM(NAME, LIT) TokenKind::en_##NAME, +#include "clang/Lex/HLSLRootSignatureTokenKinds.def" + }; + + std::optional Flags; + + do { + if (tryConsumeExpectedToken(Expected)) { + switch (CurToken.TokKind) { +#define STATIC_SAMPLER_FLAG_ENUM(NAME, LIT) \ + case TokenKind::en_##NAME: \ + Flags = maybeOrFlag( \ + Flags, llvm::dxbc::StaticSamplerFlags::NAME); \ + break; +#include "clang/Lex/HLSLRootSignatureTokenKinds.def" + default: + llvm_unreachable("Switch for consumed enum token was not provided"); + } + } else { + consumeNextToken(); // consume token to point at invalid token + reportDiag(diag::err_hlsl_invalid_token) + << /*value=*/1 << /*value of*/ Context; + return std::nullopt; + } + } while (tryConsumeExpectedToken(TokenKind::pu_or)); + + return Flags; +} + std::optional RootSignatureParser::handleUIntLiteral() { // Parse the numeric value and do semantic checks on its specification clang::NumericLiteralParser Literal( diff --git a/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl b/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl index 91441e32e047d..129ab7022f361 100644 --- a/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl +++ b/clang/test/AST/HLSL/RootSignature-Target-AST.hlsl @@ -1,9 +1,15 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-rootsignature -ast-dump \ +// RUN: -fdx-rootsignature-version=rootsig_1_0 \ +// RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_0 + +// RUN: %clang_cc1 -triple 
dxil-pc-shadermodel6.0-rootsignature -ast-dump \ +// RUN: -fdx-rootsignature-version=rootsig_1_1 \ // RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_1 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-rootsignature -ast-dump \ -// RUN: -fdx-rootsignature-version=rootsig_1_0 \ -// RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_0 +// RUN: -fdx-rootsignature-version=rootsig_1_2 \ +// RUN: -hlsl-entry EntryRootSig -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_2 + // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-rootsignature -ast-dump \ // RUN: -D CmdRS='"UAV(u0)"'\ @@ -12,11 +18,13 @@ // CHECK: -HLSLRootSignatureDecl 0x{{.*}} {{.*}} implicit [[ENTRY_RS_DECL:__hlsl_rootsig_decl_\d*]] // CHECK-V1_0-SAME: version: 1.0, // CHECK-V1_1-SAME: version: 1.1, +// CHECK-V1_2-SAME: version: 1.2, // CHECK-SAME: RootElements{ // CHECK-SAME: RootCBV(b0, // CHECK-SAME: space = 0, visibility = All, // CHECK-V1_0-SAME: flags = DataVolatile // CHECK-V1_1-SAME: flags = DataStaticWhileSetAtExecute +// CHECK-V1_2-SAME: flags = DataStaticWhileSetAtExecute // CHECK-SAME: ) // CHECK-SAME: } #define EntryRootSig "CBV(b0)" diff --git a/clang/test/AST/HLSL/RootSignatures-AST.hlsl b/clang/test/AST/HLSL/RootSignatures-AST.hlsl index 32da1f14853b0..0f0f3a5ca706f 100644 --- a/clang/test/AST/HLSL/RootSignatures-AST.hlsl +++ b/clang/test/AST/HLSL/RootSignatures-AST.hlsl @@ -6,6 +6,9 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -ast-dump \ // RUN: -fdx-rootsignature-version=rootsig_1_1 \ // RUN: -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_1 +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -ast-dump \ +// RUN: -fdx-rootsignature-version=rootsig_1_2 \ +// RUN: -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-V1_2 // This test ensures that the sample root signature is parsed without error and // the Attr AST Node is created succesfully. 
If an invalid root signature was @@ -31,6 +34,7 @@ // CHECK: -HLSLRootSignatureDecl 0x{{.*}} {{.*}} implicit [[SAMPLE_RS_DECL:__hlsl_rootsig_decl_\d*]] // CHECK-V1_0: version: 1.0, // CHECK-V1_1: version: 1.1, +// CHECK-V1_2: version: 1.2, // CHECK-SAME: RootElements{ // CHECK-SAME: RootFlags(AllowInputAssemblerInputLayout | DenyVertexShaderRootAccess), // CHECK-SAME: RootCBV(b0, @@ -62,6 +66,7 @@ // CHECK-SAME: s0, numDescriptors = 4, space = 1, offset = DescriptorTableOffsetAppend, // CHECK-V1_0-SAME: flags = DescriptorsVolatile // CHECK-V1_1-SAME: flags = None +// CHECK-V1_2-SAME: flags = None // CHECK-SAME: ), // CHECK-SAME: DescriptorTable( // CHECK-SAME: numClauses = 1, visibility = All @@ -73,6 +78,7 @@ // CHECK-SAME: s1, filter = Anisotropic, addressU = Wrap, addressV = Wrap, addressW = Wrap, // CHECK-SAME: mipLODBias = 0.000000e+00, maxAnisotropy = 16, comparisonFunc = LessEqual, // CHECK-SAME: borderColor = OpaqueWhite, minLOD = 0.000000e+00, maxLOD = 3.402823e+38, space = 0, visibility = All +// CHECK-SAME: flags = None // CHECK-SAME: )} // CHECK: -RootSignatureAttr 0x{{.*}} {{.*}} [[SAMPLE_RS_DECL]] @@ -131,3 +137,24 @@ void same_rs_string_main() {} // CHECK: -RootSignatureAttr 0x{{.*}} {{.*}} [[DIFF_RS_DECL]] [RootSignature(SampleDifferentRS)] void different_rs_string_main() {} + +#define SampleStaticSamplerRS \ + "StaticSampler(s0, flags = NON_NORMALIZED_COORDINATES)" + +// Ensure that static samplers flags are correctly parsed in different versions + +// CHECK: -HLSLRootSignatureDecl 0x{{.*}} {{.*}} implicit [[DIFF_RS_DECL:__hlsl_rootsig_decl_\d*]] +// CHECK-V1_0: version: 1.0, +// CHECK-V1_1: version: 1.1, +// CHECK-V1_2: version: 1.2, +// CHECK-SAME: RootElements{ +// CHECK-SAME: StaticSampler( +// CHECK-SAME: s0, filter = Anisotropic, addressU = Wrap, addressV = Wrap, addressW = Wrap, +// CHECK-SAME: mipLODBias = 0.000000e+00, maxAnisotropy = 16, comparisonFunc = LessEqual, +// CHECK-SAME: borderColor = OpaqueWhite, minLOD = 0.000000e+00, maxLOD = 3.402823e+38, space = 0, visibility = All +// CHECK-SAME: flags = NonNormalizedCoordinates +// CHECK-SAME: )} + +// CHECK: -RootSignatureAttr 0x{{.*}} {{.*}} [[DIFF_RS_DECL]] +[RootSignature(SampleStaticSamplerRS)] +void statoc_sampler_v12_main() {} diff --git a/clang/test/CodeGenHLSL/RootSignature.hlsl b/clang/test/CodeGenHLSL/RootSignature.hlsl index bbab6a73a3658..eaff3a9e73305 100644 --- a/clang/test/CodeGenHLSL/RootSignature.hlsl +++ b/clang/test/CodeGenHLSL/RootSignature.hlsl @@ -82,8 +82,8 @@ void RootDescriptorsEntry() {} // checking minLOD, maxLOD // CHECK-SAME: float -1.280000e+02, float 1.280000e+02, -// checking register, space, visibility and flags -// CHECK-SAME: i32 42, i32 0, i32 0, i32 0} +// checking register, space, visibility and flag +// CHECK-SAME: i32 42, i32 0, i32 0, i32 1} #define SampleStaticSampler \ "StaticSampler(s42, " \ @@ -96,6 +96,7 @@ void RootDescriptorsEntry() {} " borderColor = STATIC_BORDER_COLOR_OPAQUE_WHITE, " \ " minLOD = -128.f, maxLOD = 128.f, " \ " space = 0, visibility = SHADER_VISIBILITY_ALL, " \ + " flags = UINT_BORDER_COLOR" \ ")" [shader("compute"), RootSignature(SampleStaticSampler)] [numthreads(1,1,1)] diff --git a/clang/test/SemaHLSL/RootSignature-err.hlsl b/clang/test/SemaHLSL/RootSignature-err.hlsl index 89c684cd8d11f..debeafe7ee446 100644 --- a/clang/test/SemaHLSL/RootSignature-err.hlsl +++ b/clang/test/SemaHLSL/RootSignature-err.hlsl @@ -191,6 +191,10 @@ void basic_validation_5() {} [RootSignature("StaticSampler(s0, mipLODBias = 15.990001)")] void basic_validation_6() {} 
+// expected-error@+1 {{invalid value of flags}} +[RootSignature("StaticSampler(s0, flags = FLAG_TYPO)")] +void basic_validation_7() {} + // expected-error@+1 {{sampler and non-sampler resource mixed in descriptor table}} [RootSignature("DescriptorTable(Sampler(s0), CBV(b0))")] void mixed_resource_table() {} diff --git a/clang/test/SemaHLSL/RootSignature-flags-err.hlsl b/clang/test/SemaHLSL/RootSignature-flags-err.hlsl index 9449d33cee1ad..c79e692202ded 100644 --- a/clang/test/SemaHLSL/RootSignature-flags-err.hlsl +++ b/clang/test/SemaHLSL/RootSignature-flags-err.hlsl @@ -2,7 +2,8 @@ // RUN: -fdx-rootsignature-version=rootsig_1_0 %s -verify=v10 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -fsyntax-only \ // RUN: -fdx-rootsignature-version=rootsig_1_1 %s -verify=v11 - +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -fsyntax-only \ +// RUN: -fdx-rootsignature-version=rootsig_1_2 %s -verify=v12 // Root Descriptor Flags: // v10-error@+1 {{invalid flags for version 1.0}} @@ -13,8 +14,9 @@ void bad_root_descriptor_flags_0() {} [RootSignature("CBV(b0, flags = DATA_STATIC_WHILE_SET_AT_EXECUTE)")] void bad_root_descriptor_flags_1() {} -// v10-error@+2 {{invalid flags for version 1.0}} -// v11-error@+1 {{invalid flags for version 1.1}} +// v10-error@+3 {{invalid flags for version 1.0}} +// v11-error@+2 {{invalid flags for version 1.1}} +// v12-error@+1 {{invalid flags for version 1.2}} [RootSignature("CBV(b0, flags = DATA_STATIC | DATA_VOLATILE)")] void bad_root_descriptor_flags_2() {} @@ -40,18 +42,20 @@ void bad_descriptor_range_flags_3() {} [RootSignature("DescriptorTable(CBV(b0, flags = DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS))")] void bad_descriptor_range_flags_4() {} -// v10-error@+2 {{invalid flags for version 1.0}} -// v11-error@+1 {{invalid flags for version 1.1}} +// v10-error@+3 {{invalid flags for version 1.0}} +// v11-error@+2 {{invalid flags for version 1.1}} +// v12-error@+1 {{invalid flags for version 1.2}} [RootSignature("DescriptorTable(CBV(b0, flags = DATA_STATIC | DATA_STATIC_WHILE_SET_AT_EXECUTE))")] void bad_descriptor_range_flags_5() {} -// v10-error@+2 {{invalid flags for version 1.0}} -// v11-error@+1 {{invalid flags for version 1.1}} +// v10-error@+3 {{invalid flags for version 1.0}} +// v11-error@+2 {{invalid flags for version 1.1}} +// v12-error@+1 {{invalid flags for version 1.2}} [RootSignature("DescriptorTable(CBV(b0, flags = DESCRIPTORS_VOLATILE | DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS))")] void bad_descriptor_range_flags_6() {} -// v10-error@+2 {{invalid flags for version 1.0}} -// v11-error@+1 {{invalid flags for version 1.1}} +// v10-error@+3 {{invalid flags for version 1.0}} +// v11-error@+2 {{invalid flags for version 1.1}} +// v12-error@+1 {{invalid flags for version 1.2}} [RootSignature("DescriptorTable(CBV(b0, flags = DESCRIPTORS_VOLATILE | DATA_STATIC))")] void bad_descriptor_range_flags_7() {} - diff --git a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp index 01f8d4f97b092..82f19686167da 100644 --- a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp +++ b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp @@ -226,6 +226,9 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { STATIC_BORDER_COLOR_OPAQUE_WHITE STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT STATIC_BORDER_COLOR_OPAQUE_WHITE_UINT + + UINT_BORDER_COLOR + NON_NORMALIZED_COORDINATES )cc"; hlsl::RootSignatureLexer Lexer(Source); diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp 
b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp index 9b9f5dd8a63bb..f7e9d2d32c3f4 100644 --- a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp +++ b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp @@ -263,7 +263,8 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseStaticSamplerTest) { filter = FILTER_MAXIMUM_MIN_POINT_MAG_LINEAR_MIP_POINT, maxLOD = 9000, addressU = TEXTURE_ADDRESS_MIRROR, comparisonFunc = COMPARISON_NOT_EQUAL, - borderColor = STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT + borderColor = STATIC_BORDER_COLOR_OPAQUE_BLACK_UINT, + flags = 0 ) )cc"; @@ -336,6 +337,37 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseStaticSamplerTest) { ASSERT_TRUE(Consumer->isSatisfied()); } +TEST_F(ParseHLSLRootSignatureTest, ValidStaticSamplerFlagsTest) { + const llvm::StringLiteral Source = R"cc( + StaticSampler(s0, flags = UINT_BORDER_COLOR | NON_NORMALIZED_COORDINATES) + )cc"; + + auto Ctx = createMinimalASTContext(); + StringLiteral *Signature = wrapSource(Ctx, Source); + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + + hlsl::RootSignatureParser Parser(RootSignatureVersion::V1_1, Signature, *PP); + + // Test no diagnostics produced + Consumer->setNoDiag(); + + ASSERT_FALSE(Parser.parse()); + + auto Elements = Parser.getElements(); + ASSERT_EQ(Elements.size(), 1u); + + RootElement Elem = Elements[0].getElement(); + ASSERT_TRUE(std::holds_alternative(Elem)); + auto ValidStaticSamplerFlags = + llvm::dxbc::StaticSamplerFlags::NonNormalizedCoordinates | + llvm::dxbc::StaticSamplerFlags::UintBorderColor; + ASSERT_EQ(std::get(Elem).Flags, ValidStaticSamplerFlags); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + TEST_F(ParseHLSLRootSignatureTest, ValidParseFloatsTest) { const llvm::StringLiteral Source = R"cc( StaticSampler(s0, mipLODBias = 0), diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index 08a7ddb6929f5..8944e73688eed 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -844,6 +844,7 @@ struct StaticSampler : public v1::StaticSampler { enum class RootSignatureVersion { V1_0 = 0x1, V1_1 = 0x2, + V1_2 = 0x3, }; } // namespace dxbc diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h index 37224d8a94527..edee6a7dec6fc 100644 --- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h @@ -56,7 +56,8 @@ struct RootDescriptor { return; } - assert(Version == llvm::dxbc::RootSignatureVersion::V1_1 && + assert((Version == llvm::dxbc::RootSignatureVersion::V1_1 || + Version == llvm::dxbc::RootSignatureVersion::V1_2) && "Specified an invalid root signature version"); switch (Type) { case dxil::ResourceClass::CBuffer: @@ -100,7 +101,8 @@ struct DescriptorTableClause { return; } - assert(Version == dxbc::RootSignatureVersion::V1_1 && + assert((Version == dxbc::RootSignatureVersion::V1_1 || + Version == dxbc::RootSignatureVersion::V1_2) && "Specified an invalid root signature version"); switch (Type) { case dxil::ResourceClass::CBuffer: diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index 92c62b83fadb0..2b33e560d74ac 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -113,6 +113,13 @@ static raw_ostream &operator<<(raw_ostream &OS, return OS; } +static raw_ostream &operator<<(raw_ostream &OS, + const 
llvm::dxbc::StaticSamplerFlags &Flags) {
+  printFlags(OS, Flags, dxbc::getStaticSamplerFlags());
+
+  return OS;
+}
+
 raw_ostream &operator<<(raw_ostream &OS, const dxbc::RootFlags &Flags) {
   OS << "RootFlags(";
   printFlags(OS, Flags, dxbc::getRootFlags());
@@ -172,7 +179,7 @@ raw_ostream &operator<<(raw_ostream &OS, const StaticSampler &Sampler) {
      << ", borderColor = " << Sampler.BorderColor
      << ", minLOD = " << Sampler.MinLOD << ", maxLOD = " << Sampler.MaxLOD
      << ", space = " << Sampler.Space << ", visibility = " << Sampler.Visibility
-     << ")";
+     << ", flags = " << Sampler.Flags << ")";
 
   return OS;
 }
diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
index 2c78d622f7f28..8a2b03d9ede8b 100644
--- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
+++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp
@@ -40,7 +40,7 @@ bool verifyRootDescriptorFlag(uint32_t Version, uint32_t FlagsVal) {
   if (Version == 1)
     return Flags == FlagT::DataVolatile;
 
-  assert(Version == 2 && "Provided invalid root signature version");
+  assert((Version <= 3) && "Provided invalid root signature version");
 
   // The data-specific flags are mutually exclusive.
   FlagT DataFlags = FlagT::DataVolatile | FlagT::DataStatic |
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 3c09ae4e5f2bc..5dff9bad12b52 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -154,7 +154,7 @@ DXContainerYAML::RootSignatureYamlDesc::create(
       if (Error E = readDescriptorRanges(
               Header, RootSigDesc, DTV))
         return std::move(E);
-    } else if (Version == 2) {
+    } else if (Version == 2 || Version == 3) {
       if (Error E = readDescriptorRanges(
              Header, RootSigDesc, DTV))
        return std::move(E);
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
index 1eb03f16527ec..451c376219c38 100644
--- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
+++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
@@ -266,7 +266,8 @@ TEST(HLSLRootSignatureTest, DefaultStaticSamplerDump) {
                          "minLOD = 0.000000e+00, "
                          "maxLOD = 3.402823e+38, "
                          "space = 0, "
-                         "visibility = All"
+                         "visibility = All, "
+                         "flags = None"
                          ")";
   EXPECT_EQ(Out, Expected);
 }
@@ -287,6 +288,7 @@ TEST(HLSLRootSignatureTest, DefinedStaticSamplerDump) {
   Sampler.MaxLOD = 32.0f;
   Sampler.Space = 7;
   Sampler.Visibility = llvm::dxbc::ShaderVisibility::Domain;
+  Sampler.Flags = llvm::dxbc::StaticSamplerFlags::NonNormalizedCoordinates;
 
   std::string Out;
   llvm::raw_string_ostream OS(Out);
@@ -305,7 +307,8 @@ TEST(HLSLRootSignatureTest, DefinedStaticSamplerDump) {
                          "minLOD = 1.000000e+00, "
                          "maxLOD = 3.200000e+01, "
                          "space = 7, "
-                         "visibility = Domain"
+                         "visibility = Domain, "
+                         "flags = NonNormalizedCoordinates"
                          ")";
   EXPECT_EQ(Out, Expected);
 }

From 69a53b8d54a6876dd322923a148d47749b76c5fc Mon Sep 17 00:00:00 2001
From: Michael Kruse
Date: Wed, 1 Oct 2025 23:27:25 +0200
Subject: [PATCH 432/878] [Flang] Fix perfect loop nest detection (#161554)

PR #160283 uses `Unwrap` to detect a `continue` statement, but it
applied it on the loop body itself, which sometimes finds a trailing
continue statement, but not always. Apply `Unwrap` on the last body
statement instead, where the `continue` is expected.
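For illustration, a minimal Fortran sketch of the affected pattern,
where each labeled DO closes on its own trailing CONTINUE (the routine
name and loop bounds are made up, not taken from this patch):

    subroutine sketch(n, m)
      integer :: i, j, n, m
      !$omp do collapse(2)
      do 10 i = 1, n
         do 20 j = 1, m
            print *, i, j
 20      continue
 10   continue
      !$omp end do
    end subroutine sketch

Each trailing CONTINUE only terminates its DO statement and is not part
of the loop body, so this nest must still count as perfectly nested.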
Fixes #161529
---
 flang/lib/Semantics/resolve-directives.cpp    | 13 ++++++++-----
 .../Lower/OpenMP/wsloop-collapse-continue.f90 | 19 +++++++++++++++++++
 flang/test/Semantics/OpenMP/do08.f90          |  1 -
 flang/test/Semantics/OpenMP/do13.f90          |  1 -
 4 files changed, 27 insertions(+), 7 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/wsloop-collapse-continue.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 25872be4af52e..b1eaaa859c30a 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2285,14 +2285,17 @@ void OmpAttributeVisitor::CheckPerfectNestAndRectangularLoop(
   }
 
   auto checkPerfectNest = [&, this]() {
-    auto blockSize = block.size();
-    if (blockSize <= 1)
+    if (block.empty())
       return;
+    auto last = block.end();
+    --last;
 
-    if (parser::Unwrap(x))
-      blockSize -= 1;
+    // A trailing CONTINUE is not considered part of the loop body
+    if (parser::Unwrap(*last))
+      --last;
 
-    if (blockSize <= 1)
+    // In a perfectly nested loop, the nested loop must be the only statement
+    if (last == block.begin())
       return;
 
     // Non-perfectly nested loop
diff --git a/flang/test/Lower/OpenMP/wsloop-collapse-continue.f90 b/flang/test/Lower/OpenMP/wsloop-collapse-continue.f90
new file mode 100644
index 0000000000000..fea7a8b335d63
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-collapse-continue.f90
@@ -0,0 +1,19 @@
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+program wsloop_collapse_continue
+  integer i, j
+
+! CHECK: omp.wsloop {{.*}} {
+! CHECK: omp.loop_nest ({{.*}}) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) collapse(2) {
+  !$omp do collapse(2)
+  do 50 i = 1, 42
+     do 51 j = 1, 84
+! CHECK: fir.call @_FortranAioOutputInteger32(
+        print *, i
+! CHECK: fir.call @_FortranAioOutputInteger32(
+        print *, j
+ 51  continue
+ 50 continue
+  !$omp end do
+
+end program wsloop_collapse_continue
diff --git a/flang/test/Semantics/OpenMP/do08.f90 b/flang/test/Semantics/OpenMP/do08.f90
index bb3c1d0cd3855..5143dff0dd315 100644
--- a/flang/test/Semantics/OpenMP/do08.f90
+++ b/flang/test/Semantics/OpenMP/do08.f90
@@ -61,7 +61,6 @@ program omp
 
   !$omp end do
 
-  !ERROR: Canonical loop nest must be perfectly nested.
   !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
   !$omp do collapse(3)
   do 60 i=2,200,2
diff --git a/flang/test/Semantics/OpenMP/do13.f90 b/flang/test/Semantics/OpenMP/do13.f90
index 8f7844f4136f9..6e9d1dddade4c 100644
--- a/flang/test/Semantics/OpenMP/do13.f90
+++ b/flang/test/Semantics/OpenMP/do13.f90
@@ -59,7 +59,6 @@ program omp
 
   !$omp end do
 
-  !ERROR: Canonical loop nest must be perfectly nested.
   !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
   !$omp do collapse(3)
   do 60 i=1,10

From 1a850279c5a6e3662f3a7b40a9ea097838c2aca0 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 1 Oct 2025 22:30:18 +0100
Subject: [PATCH 433/878] [LV] Re-compute cost of scalarized load users.

If there are direct memory op users of the newly scalarized load, their
cost may have changed because there's no scalarization overhead for the
operand. Update it.

This ensures consistent costs are assigned to scalarized memory
instructions that themselves have scalarized memory instructions as
operands.
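As a minimal IR sketch of the pattern in question (the function and
value names are illustrative, not taken from the tests below), consider
a scalarized load of a pointer that directly feeds the address of
another memory operation:

    define void @sketch(ptr %src, i64 %n) {
    entry:
      br label %loop
    loop:
      %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
      %gep = getelementptr ptr, ptr %src, i64 %iv
      %p = load ptr, ptr %gep             ; scalarized load of an address
      store double 0.000000e+00, ptr %p   ; direct memory-op user of that load
      %iv.next = add i64 %iv, 1
      %ec = icmp eq i64 %iv.next, %n
      br i1 %ec, label %exit, label %loop
    exit:
      ret void
    }

Once the load of %p is scalarized, the store already has a scalar
address available, so a store cost computed earlier (which assumed
extracting the address from a vector) is stale and must be re-computed.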
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  23 +-
 .../X86/replicating-load-store-costs.ll       | 301 +++++++++++++++---
 2 files changed, 285 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa5be21dc2b8a..e5d6c8118eb55 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5699,6 +5699,20 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         Worklist.push_back(InstOp);
     }
 
+    auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
+      // If there are direct memory op users of the newly scalarized load,
+      // their cost may have changed because there's no scalarization
+      // overhead for the operand. Update it.
+      for (User *U : LI->users()) {
+        if (!isa(U))
+          continue;
+        if (getWideningDecision(cast(U), VF) != CM_Scalarize)
+          continue;
+        setWideningDecision(
+            cast(U), VF, CM_Scalarize,
+            getMemInstScalarizationCost(cast(U), VF));
+      }
+    };
     for (auto *I : AddrDefs) {
       if (isa(I)) {
         // Setting the desired widening decision should ideally be handled in
@@ -5708,21 +5722,24 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
         InstWidening Decision = getWideningDecision(I, VF);
         if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
             (!isPredicatedInst(I) && !Legal->isUniformMemOp(*I, VF) &&
-             Decision == CM_Scalarize))
+             Decision == CM_Scalarize)) {
           // Scalarize a widened load of address or update the cost of a scalar
           // load of an address.
           setWideningDecision(
               I, VF, CM_Scalarize,
               (VF.getKnownMinValue() *
                getMemoryInstructionCost(I, ElementCount::getFixed(1))));
+          UpdateMemOpUserCost(cast(I));
+        } else if (const auto *Group = getInterleavedAccessGroup(I)) {
           // Scalarize an interleave group of address loads.
for (unsigned I = 0; I < Group->getFactor(); ++I) { - if (Instruction *Member = Group->getMember(I)) + if (Instruction *Member = Group->getMember(I)) { setWideningDecision( Member, VF, CM_Scalarize, (VF.getKnownMinValue() * getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + UpdateMemOpUserCost(cast(Member)); + } } } } else { diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll index d93932585460f..87848730c8f01 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll @@ -6,38 +6,184 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I64-LABEL: define void @test_store_initially_interleave( ; I64-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { -; I64-NEXT: [[ENTRY:.*]]: -; I64-NEXT: br label %[[LOOP:.*]] -; I64: [[LOOP]]: -; I64-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[LOOP]] ] -; I64-NEXT: [[CONV:%.*]] = uitofp i32 [[IV]] to double +; I64-NEXT: [[ITER_CHECK:.*:]] +; I64-NEXT: [[TMP4:%.*]] = add i32 [[N]], 1 +; I64-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP4]], 4 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; I64: [[VECTOR_SCEVCHECK]]: +; I64-NEXT: [[TMP1:%.*]] = icmp slt i32 [[N]], 0 +; I64-NEXT: br i1 [[TMP1]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; I64: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; I64-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP4]], 16 +; I64-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; I64: [[VECTOR_PH]]: +; I64-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP4]], 16 +; I64-NEXT: [[TMP2:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; I64-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 16, i32 [[N_MOD_VF]] +; I64-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP4]], [[TMP3]] +; I64-NEXT: br label %[[VECTOR_BODY:.*]] +; I64: [[VECTOR_BODY]]: +; I64-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I64-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; I64-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; I64-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; I64-NEXT: [[IV:%.*]] = add i32 [[INDEX]], 0 +; I64-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 +; I64-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 2 +; I64-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 3 +; I64-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 4 +; I64-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 5 +; I64-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 6 +; I64-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 7 +; I64-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 8 +; I64-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 9 +; I64-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 10 +; I64-NEXT: [[TMP15:%.*]] = add i32 [[INDEX]], 11 +; I64-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 12 +; I64-NEXT: [[TMP17:%.*]] = add i32 [[INDEX]], 13 +; I64-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], 14 +; I64-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], 15 +; I64-NEXT: [[TMP20:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I64-NEXT: [[TMP21:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I64-NEXT: [[TMP22:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x 
double> +; I64-NEXT: [[TMP23:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> ; I64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[IV]] +; I64-NEXT: [[TMP25:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] +; I64-NEXT: [[TMP26:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP6]] +; I64-NEXT: [[TMP27:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP7]] +; I64-NEXT: [[TMP28:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] +; I64-NEXT: [[TMP29:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] +; I64-NEXT: [[TMP30:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I64-NEXT: [[TMP31:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I64-NEXT: [[TMP32:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I64-NEXT: [[TMP33:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I64-NEXT: [[TMP34:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I64-NEXT: [[TMP35:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP15]] +; I64-NEXT: [[TMP36:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP16]] +; I64-NEXT: [[TMP37:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP17]] +; I64-NEXT: [[TMP38:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP18]] +; I64-NEXT: [[TMP39:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP19]] ; I64-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ADD_PTR_I]], align 4 +; I64-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP25]], align 4 +; I64-NEXT: [[TMP42:%.*]] = load ptr, ptr [[TMP26]], align 4 +; I64-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP27]], align 4 +; I64-NEXT: [[TMP44:%.*]] = load ptr, ptr [[TMP28]], align 4 +; I64-NEXT: [[TMP45:%.*]] = load ptr, ptr [[TMP29]], align 4 +; I64-NEXT: [[TMP46:%.*]] = load ptr, ptr [[TMP30]], align 4 +; I64-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP31]], align 4 +; I64-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP32]], align 4 +; I64-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP33]], align 4 +; I64-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP34]], align 4 +; I64-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP35]], align 4 +; I64-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP36]], align 4 +; I64-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP37]], align 4 +; I64-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP38]], align 4 +; I64-NEXT: [[TMP55:%.*]] = load ptr, ptr [[TMP39]], align 4 +; I64-NEXT: [[CONV:%.*]] = extractelement <4 x double> [[TMP20]], i32 0 ; I64-NEXT: store double [[CONV]], ptr [[TMP0]], align 4 -; I64-NEXT: [[INC]] = add i32 [[IV]], 1 -; I64-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], [[N]] -; I64-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] -; I64: [[EXIT]]: -; I64-NEXT: ret void +; I64-NEXT: [[TMP57:%.*]] = extractelement <4 x double> [[TMP20]], i32 1 +; I64-NEXT: store double [[TMP57]], ptr [[TMP41]], align 4 +; I64-NEXT: [[TMP58:%.*]] = extractelement <4 x double> [[TMP20]], i32 2 +; I64-NEXT: store double [[TMP58]], ptr [[TMP42]], align 4 +; I64-NEXT: [[TMP59:%.*]] = extractelement <4 x double> [[TMP20]], i32 3 +; I64-NEXT: store double [[TMP59]], ptr [[TMP43]], align 4 +; I64-NEXT: [[TMP60:%.*]] = extractelement <4 x double> [[TMP21]], i32 0 +; I64-NEXT: store double [[TMP60]], ptr [[TMP44]], align 4 +; I64-NEXT: [[TMP61:%.*]] = extractelement <4 x double> [[TMP21]], i32 1 +; I64-NEXT: store double [[TMP61]], ptr [[TMP45]], align 4 +; I64-NEXT: [[TMP62:%.*]] = 
extractelement <4 x double> [[TMP21]], i32 2 +; I64-NEXT: store double [[TMP62]], ptr [[TMP46]], align 4 +; I64-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP21]], i32 3 +; I64-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 +; I64-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP22]], i32 0 +; I64-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 +; I64-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP22]], i32 1 +; I64-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 +; I64-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP22]], i32 2 +; I64-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 +; I64-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP22]], i32 3 +; I64-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 +; I64-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP23]], i32 0 +; I64-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 +; I64-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP23]], i32 1 +; I64-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 +; I64-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP23]], i32 2 +; I64-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 +; I64-NEXT: [[TMP71:%.*]] = extractelement <4 x double> [[TMP23]], i32 3 +; I64-NEXT: store double [[TMP71]], ptr [[TMP55]], align 4 +; I64-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; I64-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; I64-NEXT: [[TMP72:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; I64-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I64: [[MIDDLE_BLOCK]]: +; I64-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; I64: [[VEC_EPILOG_ITER_CHECK]]: +; I64-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP3]], 4 +; I64-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; I64: [[VEC_EPILOG_PH]]: +; I64-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; I64-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP4]], 4 +; I64-NEXT: [[TMP73:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0 +; I64-NEXT: [[TMP74:%.*]] = select i1 [[TMP73]], i32 4, i32 [[N_MOD_VF2]] +; I64-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP4]], [[TMP74]] +; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; I64-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; I64-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; I64: [[VEC_EPILOG_VECTOR_BODY]]: +; I64-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I64-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I64-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 0 +; I64-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 1 +; I64-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 2 +; I64-NEXT: [[TMP78:%.*]] = add i32 [[INDEX4]], 3 +; I64-NEXT: [[TMP79:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I64-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I64-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I64-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 
[[TMP77]] +; I64-NEXT: [[TMP83:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP78]] +; I64-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 +; I64-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 +; I64-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 +; I64-NEXT: [[TMP87:%.*]] = load ptr, ptr [[TMP83]], align 4 +; I64-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP79]], i32 0 +; I64-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 +; I64-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP79]], i32 1 +; I64-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 +; I64-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP79]], i32 2 +; I64-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 +; I64-NEXT: [[TMP91:%.*]] = extractelement <4 x double> [[TMP79]], i32 3 +; I64-NEXT: store double [[TMP91]], ptr [[TMP87]], align 4 +; I64-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 +; I64-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) +; I64-NEXT: [[TMP92:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; I64-NEXT: br i1 [[TMP92]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; I64: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; I64-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; I64: [[VEC_EPILOG_SCALAR_PH]]: ; ; I32-LABEL: define void @test_store_initially_interleave( ; I32-SAME: i32 [[N:%.*]], ptr noalias [[SRC:%.*]]) #[[ATTR0:[0-9]+]] { ; I32-NEXT: [[ENTRY:.*:]] ; I32-NEXT: [[TMP0:%.*]] = add i32 [[N]], 1 -; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], 8 -; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], 4 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; I32: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; I32-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i32 [[TMP0]], 16 +; I32-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; I32: [[VECTOR_PH]]: -; I32-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 8 +; I32-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 ; I32-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 -; I32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 8, i32 [[N_MOD_VF]] +; I32-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 16, i32 [[N_MOD_VF]] ; I32-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP2]] ; I32-NEXT: br label %[[VECTOR_BODY:.*]] ; I32: [[VECTOR_BODY]]: ; I32-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; I32-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; I32-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], splat (i32 2) -; I32-NEXT: [[STEP_ADD_2:%.*]] = add <2 x i32> [[STEP_ADD]], splat (i32 2) -; I32-NEXT: [[STEP_ADD_3:%.*]] = add <2 x i32> [[STEP_ADD_2]], splat (i32 2) +; I32-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; I32-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; I32-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; I32-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) ; I32-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0 ; I32-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 1 ; I32-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 2 @@ -46,10 +192,18 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: 
[[TMP8:%.*]] = add i32 [[INDEX]], 5 ; I32-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], 6 ; I32-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 7 -; I32-NEXT: [[TMP11:%.*]] = uitofp <2 x i32> [[VEC_IND]] to <2 x double> -; I32-NEXT: [[TMP12:%.*]] = uitofp <2 x i32> [[STEP_ADD]] to <2 x double> -; I32-NEXT: [[TMP13:%.*]] = uitofp <2 x i32> [[STEP_ADD_2]] to <2 x double> -; I32-NEXT: [[TMP14:%.*]] = uitofp <2 x i32> [[STEP_ADD_3]] to <2 x double> +; I32-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 8 +; I32-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 9 +; I32-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 10 +; I32-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 11 +; I32-NEXT: [[TMP40:%.*]] = add i32 [[INDEX]], 12 +; I32-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 13 +; I32-NEXT: [[TMP42:%.*]] = add i32 [[INDEX]], 14 +; I32-NEXT: [[TMP43:%.*]] = add i32 [[INDEX]], 15 +; I32-NEXT: [[TMP44:%.*]] = uitofp <4 x i32> [[VEC_IND]] to <4 x double> +; I32-NEXT: [[TMP45:%.*]] = uitofp <4 x i32> [[STEP_ADD]] to <4 x double> +; I32-NEXT: [[TMP46:%.*]] = uitofp <4 x i32> [[STEP_ADD_2]] to <4 x double> +; I32-NEXT: [[TMP55:%.*]] = uitofp <4 x i32> [[STEP_ADD_3]] to <4 x double> ; I32-NEXT: [[TMP15:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP3]] ; I32-NEXT: [[TMP16:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP4]] ; I32-NEXT: [[TMP17:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP5]] @@ -58,6 +212,14 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP20:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP8]] ; I32-NEXT: [[TMP21:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP9]] ; I32-NEXT: [[TMP22:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP10]] +; I32-NEXT: [[TMP56:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP11]] +; I32-NEXT: [[TMP57:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP12]] +; I32-NEXT: [[TMP58:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP13]] +; I32-NEXT: [[TMP59:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP14]] +; I32-NEXT: [[TMP60:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP40]] +; I32-NEXT: [[TMP61:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP41]] +; I32-NEXT: [[TMP62:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP42]] +; I32-NEXT: [[TMP71:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP43]] ; I32-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP15]], align 4 ; I32-NEXT: [[TMP24:%.*]] = load ptr, ptr [[TMP16]], align 4 ; I32-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP17]], align 4 @@ -66,29 +228,96 @@ define void @test_store_initially_interleave(i32 %n, ptr noalias %src) #0 { ; I32-NEXT: [[TMP28:%.*]] = load ptr, ptr [[TMP20]], align 4 ; I32-NEXT: [[TMP29:%.*]] = load ptr, ptr [[TMP21]], align 4 ; I32-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP22]], align 4 -; I32-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 +; I32-NEXT: [[TMP47:%.*]] = load ptr, ptr [[TMP56]], align 4 +; I32-NEXT: [[TMP48:%.*]] = load ptr, ptr [[TMP57]], align 4 +; I32-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP58]], align 4 +; I32-NEXT: [[TMP50:%.*]] = load ptr, ptr [[TMP59]], align 4 +; I32-NEXT: [[TMP51:%.*]] = load ptr, ptr [[TMP60]], align 4 +; I32-NEXT: [[TMP52:%.*]] = load ptr, ptr [[TMP61]], align 4 +; I32-NEXT: [[TMP53:%.*]] = load ptr, ptr [[TMP62]], align 4 +; I32-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP71]], align 4 +; I32-NEXT: 
[[TMP31:%.*]] = extractelement <4 x double> [[TMP44]], i32 0 ; I32-NEXT: store double [[TMP31]], ptr [[TMP23]], align 4 -; I32-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 +; I32-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP44]], i32 1 ; I32-NEXT: store double [[TMP32]], ptr [[TMP24]], align 4 -; I32-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP12]], i32 0 +; I32-NEXT: [[TMP33:%.*]] = extractelement <4 x double> [[TMP44]], i32 2 ; I32-NEXT: store double [[TMP33]], ptr [[TMP25]], align 4 -; I32-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP12]], i32 1 +; I32-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP44]], i32 3 ; I32-NEXT: store double [[TMP34]], ptr [[TMP26]], align 4 -; I32-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP13]], i32 0 +; I32-NEXT: [[TMP35:%.*]] = extractelement <4 x double> [[TMP45]], i32 0 ; I32-NEXT: store double [[TMP35]], ptr [[TMP27]], align 4 -; I32-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[TMP13]], i32 1 +; I32-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP45]], i32 1 ; I32-NEXT: store double [[TMP36]], ptr [[TMP28]], align 4 -; I32-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP14]], i32 0 +; I32-NEXT: [[TMP37:%.*]] = extractelement <4 x double> [[TMP45]], i32 2 ; I32-NEXT: store double [[TMP37]], ptr [[TMP29]], align 4 -; I32-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[TMP14]], i32 1 +; I32-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP45]], i32 3 ; I32-NEXT: store double [[TMP38]], ptr [[TMP30]], align 4 -; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 -; I32-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD_3]], splat (i32 2) +; I32-NEXT: [[TMP63:%.*]] = extractelement <4 x double> [[TMP46]], i32 0 +; I32-NEXT: store double [[TMP63]], ptr [[TMP47]], align 4 +; I32-NEXT: [[TMP64:%.*]] = extractelement <4 x double> [[TMP46]], i32 1 +; I32-NEXT: store double [[TMP64]], ptr [[TMP48]], align 4 +; I32-NEXT: [[TMP65:%.*]] = extractelement <4 x double> [[TMP46]], i32 2 +; I32-NEXT: store double [[TMP65]], ptr [[TMP49]], align 4 +; I32-NEXT: [[TMP66:%.*]] = extractelement <4 x double> [[TMP46]], i32 3 +; I32-NEXT: store double [[TMP66]], ptr [[TMP50]], align 4 +; I32-NEXT: [[TMP67:%.*]] = extractelement <4 x double> [[TMP55]], i32 0 +; I32-NEXT: store double [[TMP67]], ptr [[TMP51]], align 4 +; I32-NEXT: [[TMP68:%.*]] = extractelement <4 x double> [[TMP55]], i32 1 +; I32-NEXT: store double [[TMP68]], ptr [[TMP52]], align 4 +; I32-NEXT: [[TMP69:%.*]] = extractelement <4 x double> [[TMP55]], i32 2 +; I32-NEXT: store double [[TMP69]], ptr [[TMP53]], align 4 +; I32-NEXT: [[TMP70:%.*]] = extractelement <4 x double> [[TMP55]], i32 3 +; I32-NEXT: store double [[TMP70]], ptr [[TMP54]], align 4 +; I32-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; I32-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; I32-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; I32-NEXT: br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; I32: [[MIDDLE_BLOCK]]: -; I32-NEXT: br label %[[SCALAR_PH]] -; I32: [[SCALAR_PH]]: +; I32-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; I32: [[VEC_EPILOG_ITER_CHECK]]: +; I32-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP2]], 4 +; I32-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; I32: [[VEC_EPILOG_PH]]: +; I32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], 
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; I32-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP0]], 4 +; I32-NEXT: [[TMP72:%.*]] = icmp eq i32 [[N_MOD_VF2]], 0 +; I32-NEXT: [[TMP73:%.*]] = select i1 [[TMP72]], i32 4, i32 [[N_MOD_VF2]] +; I32-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP0]], [[TMP73]] +; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 +; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; I32-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; I32-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; I32: [[VEC_EPILOG_VECTOR_BODY]]: +; I32-NEXT: [[INDEX4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I32-NEXT: [[VEC_IND5:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; I32-NEXT: [[TMP74:%.*]] = add i32 [[INDEX4]], 0 +; I32-NEXT: [[TMP75:%.*]] = add i32 [[INDEX4]], 1 +; I32-NEXT: [[TMP76:%.*]] = add i32 [[INDEX4]], 2 +; I32-NEXT: [[TMP77:%.*]] = add i32 [[INDEX4]], 3 +; I32-NEXT: [[TMP78:%.*]] = uitofp <4 x i32> [[VEC_IND5]] to <4 x double> +; I32-NEXT: [[TMP79:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP74]] +; I32-NEXT: [[TMP80:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP75]] +; I32-NEXT: [[TMP81:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP76]] +; I32-NEXT: [[TMP82:%.*]] = getelementptr nusw { ptr, ptr, ptr }, ptr null, i32 [[TMP77]] +; I32-NEXT: [[TMP83:%.*]] = load ptr, ptr [[TMP79]], align 4 +; I32-NEXT: [[TMP84:%.*]] = load ptr, ptr [[TMP80]], align 4 +; I32-NEXT: [[TMP85:%.*]] = load ptr, ptr [[TMP81]], align 4 +; I32-NEXT: [[TMP86:%.*]] = load ptr, ptr [[TMP82]], align 4 +; I32-NEXT: [[TMP87:%.*]] = extractelement <4 x double> [[TMP78]], i32 0 +; I32-NEXT: store double [[TMP87]], ptr [[TMP83]], align 4 +; I32-NEXT: [[TMP88:%.*]] = extractelement <4 x double> [[TMP78]], i32 1 +; I32-NEXT: store double [[TMP88]], ptr [[TMP84]], align 4 +; I32-NEXT: [[TMP89:%.*]] = extractelement <4 x double> [[TMP78]], i32 2 +; I32-NEXT: store double [[TMP89]], ptr [[TMP85]], align 4 +; I32-NEXT: [[TMP90:%.*]] = extractelement <4 x double> [[TMP78]], i32 3 +; I32-NEXT: store double [[TMP90]], ptr [[TMP86]], align 4 +; I32-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX4]], 4 +; I32-NEXT: [[VEC_IND_NEXT7]] = add <4 x i32> [[VEC_IND5]], splat (i32 4) +; I32-NEXT: [[TMP91:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; I32-NEXT: br i1 [[TMP91]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; I32: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; I32-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; I32: [[VEC_EPILOG_SCALAR_PH]]: ; entry: br label %loop @@ -149,7 +378,7 @@ define void @test_store_loaded_value(ptr noalias %src, ptr noalias %dst, i32 %n) ; I64-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 ; I64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; I64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; I64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; I64-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; I64: [[MIDDLE_BLOCK]]: ; I64-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] ; I64-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label 
%[[SCALAR_PH]] @@ -196,7 +425,7 @@ define void @test_store_loaded_value(ptr noalias %src, ptr noalias %dst, i32 %n) ; I32-NEXT: store double [[TMP11]], ptr [[TMP19]], align 8 ; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; I32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; I32-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; I32-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; I32: [[MIDDLE_BLOCK]]: ; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_EXT]], [[N_VEC]] ; I32-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[SCALAR_PH]] From 39410dff52d813ccfc7efc6fc0c6afd4583e14a6 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Wed, 1 Oct 2025 14:34:02 -0700 Subject: [PATCH 434/878] [clang] Invert condition refactored in #160935 (#161583) The PR #160935 incorrectly replaced `llvm::sys::fs::getUniqueID()` with `llvm::vfs::FileSystem::exists()` in a condition. That's incorrect, since the first function returns `std::error_code` that evaluates to `true` when there is an error (file doesn't exist), while the new code does the opposite. This PR fixes that issue by inverting the conditional. Co-authored-by: ronlieb --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 2 +- clang/test/OpenMP/amdgcn_save_temps.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 75bde3f72c4c2..8cda583313ca4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1542,7 +1542,7 @@ static llvm::TargetRegionEntryInfo getEntryInfoFromPresumedLoc( SourceManager &SM = CGM.getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(BeginLoc); - if (CGM.getFileSystem()->exists(PLoc.getFilename())) + if (!CGM.getFileSystem()->exists(PLoc.getFilename())) PLoc = SM.getPresumedLoc(BeginLoc, /*UseLineDirectives=*/false); return std::pair(PLoc.getFilename(), PLoc.getLine()); diff --git a/clang/test/OpenMP/amdgcn_save_temps.c b/clang/test/OpenMP/amdgcn_save_temps.c index ebf0d6031ee82..d838bb1166b6b 100644 --- a/clang/test/OpenMP/amdgcn_save_temps.c +++ b/clang/test/OpenMP/amdgcn_save_temps.c @@ -1,8 +1,6 @@ // REQUIRES: amdgpu-registered-target -// XFAIL: * - // RUN: %clang_cc1 -E -fopenmp -x c -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd %s -o %t-openmp-amdgcn-amd-amdhsa-gfx90a.i // RUN: %clang_cc1 -fopenmp -x c -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd -emit-llvm-bc %s -o %t-x86_64-unknown-unknown.bc // RUN: %clang_cc1 -fopenmp -x c -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -save-temps=cwd -emit-llvm -fopenmp-is-target-device -x cpp-output %t-openmp-amdgcn-amd-amdhsa-gfx90a.i -fopenmp-host-ir-file-path %t-x86_64-unknown-unknown.bc -o - | FileCheck %s From f122484b998d8dbfdaf2e6b9c222438c71e90d86 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Wed, 1 Oct 2025 14:35:17 -0700 Subject: [PATCH 435/878] [llvm][support] Move `make_absolute` from `sys::fs` to `sys::path` (#161459) The `llvm::sys::fs::make_absolute(const Twine &, SmallVectorImpl &)` functions doesn't perform any FS access - it only modifies the second parameter via path/string operations. This function should live in the `llvm::sys::path` namespace for consistency and for making it easier to spot function calls that perform IO. 
--- bolt/lib/Core/BinaryContext.cpp | 2 +- bolt/lib/Rewrite/DWARFRewriter.cpp | 2 +- .../lib/Tooling/ApplyReplacements.cpp | 2 +- clang-tools-extra/clang-move/Move.cpp | 2 +- clang-tools-extra/clangd/ConfigCompile.cpp | 2 +- .../clangd/SystemIncludeExtractor.cpp | 2 +- .../clangd/index/SymbolCollector.cpp | 2 +- clang-tools-extra/clangd/tool/ClangdMain.cpp | 2 +- .../include-cleaner/tool/IncludeCleaner.cpp | 2 +- clang/lib/Lex/HeaderSearch.cpp | 4 +- .../DependencyScannerImpl.cpp | 2 +- .../Frontend/CompilerInstanceTest.cpp | 2 +- llvm/include/llvm/Support/FileSystem.h | 12 --- llvm/include/llvm/Support/Path.h | 12 +++ llvm/lib/Support/Path.cpp | 100 +++++++++--------- llvm/lib/Support/VirtualFileSystem.cpp | 4 +- llvm/tools/llvm-config/llvm-config.cpp | 6 +- llvm/tools/llvm-dwp/llvm-dwp.cpp | 2 +- llvm/tools/llvm-opt-report/OptReport.cpp | 2 +- llvm/unittests/Support/Path.cpp | 4 +- 20 files changed, 84 insertions(+), 84 deletions(-) diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 98440cde7cebd..b7ded6b931a15 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1662,7 +1662,7 @@ void BinaryContext::preprocessDWODebugInfo() { "files.\n"; } // Prevent failures when DWOName is already an absolute path. - sys::fs::make_absolute(DWOCompDir, AbsolutePath); + sys::path::make_absolute(DWOCompDir, AbsolutePath); DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false, AbsolutePath).getDwarfUnit(); if (!DWOCU->isDWOUnit()) { diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 5c89a424caa7f..7366d2aca35ea 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -1853,7 +1853,7 @@ void DWARFRewriter::writeDWOFiles( else if (!sys::fs::exists(CompDir)) CompDir = "."; // Prevent failures when DWOName is already an absolute path. - sys::fs::make_absolute(CompDir, AbsolutePath); + sys::path::make_absolute(CompDir, AbsolutePath); std::error_code EC; std::unique_ptr TempOut = diff --git a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp index b895075e4f31c..0ac8f712e112f 100644 --- a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp +++ b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp @@ -142,7 +142,7 @@ groupReplacements(const TUReplacements &TUs, const TUDiagnostics &TUDs, // build directories, make them absolute immediately. 
SmallString<128> Path = R.getFilePath(); if (BuildDir) - llvm::sys::fs::make_absolute(*BuildDir, Path); + llvm::sys::path::make_absolute(*BuildDir, Path); else SM.getFileManager().makeAbsolutePath(Path); diff --git a/clang-tools-extra/clang-move/Move.cpp b/clang-tools-extra/clang-move/Move.cpp index 17f597170f9f6..519d359991cdb 100644 --- a/clang-tools-extra/clang-move/Move.cpp +++ b/clang-tools-extra/clang-move/Move.cpp @@ -75,7 +75,7 @@ std::string MakeAbsolutePath(StringRef CurrentDir, StringRef Path) { return ""; llvm::SmallString<128> InitialDirectory(CurrentDir); llvm::SmallString<128> AbsolutePath(Path); - llvm::sys::fs::make_absolute(InitialDirectory, AbsolutePath); + llvm::sys::path::make_absolute(InitialDirectory, AbsolutePath); return CleanPath(std::move(AbsolutePath)); } diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp index 962a48bcb7671..18e31809aa7c7 100644 --- a/clang-tools-extra/clangd/ConfigCompile.cpp +++ b/clang-tools-extra/clangd/ConfigCompile.cpp @@ -131,7 +131,7 @@ struct FragmentCompiler { return std::nullopt; } llvm::SmallString<256> AbsPath = llvm::StringRef(*Path); - llvm::sys::fs::make_absolute(FragmentDirectory, AbsPath); + llvm::sys::path::make_absolute(FragmentDirectory, AbsPath); llvm::sys::path::native(AbsPath, Style); return AbsPath.str().str(); } diff --git a/clang-tools-extra/clangd/SystemIncludeExtractor.cpp b/clang-tools-extra/clangd/SystemIncludeExtractor.cpp index 106de1b84c5c6..4a5cd3bb78b2f 100644 --- a/clang-tools-extra/clangd/SystemIncludeExtractor.cpp +++ b/clang-tools-extra/clangd/SystemIncludeExtractor.cpp @@ -106,7 +106,7 @@ struct DriverArgs { // relative or absolute). if (llvm::any_of(Driver, [](char C) { return llvm::sys::path::is_separator(C); })) { - llvm::sys::fs::make_absolute(Cmd.Directory, Driver); + llvm::sys::path::make_absolute(Cmd.Directory, Driver); } this->Driver = Driver.str().str(); for (size_t I = 0, E = Cmd.CommandLine.size(); I < E; ++I) { diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index 6bdb1080fb294..39c479b5f4d5b 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -325,7 +325,7 @@ class SymbolCollector::HeaderFileURICache { if (R.second) { llvm::SmallString<256> AbsPath = Path; if (!llvm::sys::path::is_absolute(AbsPath) && !FallbackDir.empty()) - llvm::sys::fs::make_absolute(FallbackDir, AbsPath); + llvm::sys::path::make_absolute(FallbackDir, AbsPath); assert(llvm::sys::path::is_absolute(AbsPath) && "If the VFS can't make paths absolute, a FallbackDir must be " "provided"); diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 4de2f213565e4..4a990f8f716ca 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -578,7 +578,7 @@ class TestScheme : public URIScheme { Body = Body.ltrim('/'); llvm::SmallString<16> Path(Body); path::native(Path); - fs::make_absolute(TestScheme::TestDir, Path); + path::make_absolute(TestScheme::TestDir, Path); return std::string(Path); } diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index 372ab5fa2706e..fefbfc3a9614d 100644 --- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -344,7 +344,7 @@ 
mapInputsToAbsPaths(clang::tooling::CompilationDatabase &CDB, } for (const auto &Cmd : Cmds) { llvm::SmallString<256> CDBPath(Cmd.Filename); - llvm::sys::fs::make_absolute(Cmd.Directory, CDBPath); + llvm::sys::path::make_absolute(Cmd.Directory, CDBPath); CDBToAbsPaths[std::string(CDBPath)] = std::string(AbsPath); } } diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index ae09f70ee7896..238c5e2f2d9a5 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -2077,7 +2077,7 @@ std::string HeaderSearch::suggestPathToFileForDiagnostics( llvm::SmallString<32> FilePath = File; if (!WorkingDir.empty() && !path::is_absolute(FilePath)) - fs::make_absolute(WorkingDir, FilePath); + path::make_absolute(WorkingDir, FilePath); // remove_dots switches to backslashes on windows as a side-effect! // We always want to suggest forward slashes for includes. // (not remove_dots(..., posix) as that misparses windows paths). @@ -2091,7 +2091,7 @@ std::string HeaderSearch::suggestPathToFileForDiagnostics( // `BestPrefixLength` accordingly. auto CheckDir = [&](llvm::SmallString<32> Dir) -> bool { if (!WorkingDir.empty() && !path::is_absolute(Dir)) - fs::make_absolute(WorkingDir, Dir); + path::make_absolute(WorkingDir, Dir); path::remove_dots(Dir, /*remove_dot_dot=*/true); for (auto NI = path::begin(File), NE = path::end(File), DI = path::begin(Dir), DE = path::end(Dir); diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp index d370bfd0dd10f..66cf2688e0a13 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp @@ -31,7 +31,7 @@ class DependencyConsumerForwarder : public DependencyFileGenerator { for (const auto &File : getDependencies()) { CanonPath = File; llvm::sys::path::remove_dots(CanonPath, /*remove_dot_dot=*/true); - llvm::sys::fs::make_absolute(WorkingDirectory, CanonPath); + llvm::sys::path::make_absolute(WorkingDirectory, CanonPath); C.handleFileDependency(CanonPath); } } diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp index 36cac5a5dd010..cd3fefa1ea994 100644 --- a/clang/unittests/Frontend/CompilerInstanceTest.cpp +++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp @@ -33,7 +33,7 @@ TEST(CompilerInstance, DefaultVFSOverlayFromInvocation) { SmallString<256> CurrentPath; sys::fs::current_path(CurrentPath); - sys::fs::make_absolute(CurrentPath, FileName); + sys::path::make_absolute(CurrentPath, FileName); // Mount the VFS file itself on the path 'virtual.file'. Makes this test // a bit shorter than creating a new dummy file just for this purpose. diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index c203779307840..cf2a8104ac813 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -266,18 +266,6 @@ class file_status : public basic_file_status { /// @name Physical Operators /// @{ -/// Make \a path an absolute path. -/// -/// Makes \a path absolute using the \a current_directory if it is not already. -/// An empty \a path will result in the \a current_directory. -/// -/// /absolute/path => /absolute/path -/// relative/../path => /relative/../path -/// -/// @param path A path that is modified to be an absolute path. 
-LLVM_ABI void make_absolute(const Twine &current_directory,
-                            SmallVectorImpl<char> &path);
-
 /// Make \a path an absolute path.
 ///
 /// Makes \a path absolute using the current directory if it is not already. An
diff --git a/llvm/include/llvm/Support/Path.h b/llvm/include/llvm/Support/Path.h
index 0cb517146c04b..a8e0f338ec203 100644
--- a/llvm/include/llvm/Support/Path.h
+++ b/llvm/include/llvm/Support/Path.h
@@ -566,6 +566,18 @@ LLVM_ABI bool is_absolute_gnu(const Twine &path, Style style = Style::native);
 /// @result True if the path is relative, false if it is not.
 LLVM_ABI bool is_relative(const Twine &path, Style style = Style::native);

+/// Make \a path an absolute path.
+///
+/// Makes \a path absolute using the \a current_directory if it is not already.
+/// An empty \a path will result in the \a current_directory.
+///
+/// /absolute/path => /absolute/path
+/// relative/../path => <current-directory>/relative/../path
+///
+/// @param path A path that is modified to be an absolute path.
+LLVM_ABI void make_absolute(const Twine &current_directory,
+                            SmallVectorImpl<char> &path);
+
 } // end namespace path
 } // end namespace sys
 } // end namespace llvm
diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index 761d29e960887..3e066665f4155 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -700,6 +700,55 @@ bool is_relative(const Twine &path, Style style) {
   return !is_absolute(path, style);
 }

+void make_absolute(const Twine &current_directory,
+                   SmallVectorImpl<char> &path) {
+  StringRef p(path.data(), path.size());
+
+  bool rootDirectory = has_root_directory(p);
+  bool rootName = has_root_name(p);
+
+  // Already absolute.
+  if ((rootName || is_style_posix(Style::native)) && rootDirectory)
+    return;
+
+  // All the following conditions will need the current directory.
+  SmallString<128> current_dir;
+  current_directory.toVector(current_dir);
+
+  // Relative path. Prepend the current directory.
+  if (!rootName && !rootDirectory) {
+    // Append path to the current directory.
+    append(current_dir, p);
+    // Set path to the result.
+    path.swap(current_dir);
+    return;
+  }
+
+  if (!rootName && rootDirectory) {
+    StringRef cdrn = root_name(current_dir);
+    SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
+    append(curDirRootName, p);
+    // Set path to the result.
+    path.swap(curDirRootName);
+    return;
+  }
+
+  if (rootName && !rootDirectory) {
+    StringRef pRootName = root_name(p);
+    StringRef bRootDirectory = root_directory(current_dir);
+    StringRef bRelativePath = relative_path(current_dir);
+    StringRef pRelativePath = relative_path(p);
+
+    SmallString<128> res;
+    append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
+    path.swap(res);
+    return;
+  }
+
+  llvm_unreachable("All rootName and rootDirectory combinations should have "
+                   "occurred above!");
+}
+
 StringRef remove_leading_dotslash(StringRef Path, Style style) {
   // Remove leading "./" (or ".//" or "././" etc.)
   while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1], style)) {
@@ -903,55 +952,6 @@ getPotentiallyUniqueTempFileName(const Twine &Prefix, StringRef Suffix,
   return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name);
 }

-void make_absolute(const Twine &current_directory,
-                   SmallVectorImpl<char> &path) {
-  StringRef p(path.data(), path.size());
-
-  bool rootDirectory = path::has_root_directory(p);
-  bool rootName = path::has_root_name(p);
-
-  // Already absolute.
-  if ((rootName || is_style_posix(Style::native)) && rootDirectory)
-    return;
-
-  // All of the following conditions will need the current directory.
-  SmallString<128> current_dir;
-  current_directory.toVector(current_dir);
-
-  // Relative path. Prepend the current directory.
-  if (!rootName && !rootDirectory) {
-    // Append path to the current directory.
-    path::append(current_dir, p);
-    // Set path to the result.
-    path.swap(current_dir);
-    return;
-  }
-
-  if (!rootName && rootDirectory) {
-    StringRef cdrn = path::root_name(current_dir);
-    SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
-    path::append(curDirRootName, p);
-    // Set path to the result.
-    path.swap(curDirRootName);
-    return;
-  }
-
-  if (rootName && !rootDirectory) {
-    StringRef pRootName = path::root_name(p);
-    StringRef bRootDirectory = path::root_directory(current_dir);
-    StringRef bRelativePath = path::relative_path(current_dir);
-    StringRef pRelativePath = path::relative_path(p);
-
-    SmallString<128> res;
-    path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
-    path.swap(res);
-    return;
-  }
-
-  llvm_unreachable("All rootName and rootDirectory combinations should have "
-                   "occurred above!");
-}
-
 std::error_code make_absolute(SmallVectorImpl<char> &path) {
   if (path::is_absolute(path))
     return {};
@@ -960,7 +960,7 @@ std::error_code make_absolute(SmallVectorImpl<char> &path) {
   if (std::error_code ec = current_path(current_dir))
     return ec;

-  make_absolute(current_dir, path);
+  path::make_absolute(current_dir, path);

   return {};
 }
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index 44d2ee7076fb2..c754b30d8de4a 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -133,7 +133,7 @@ std::error_code FileSystem::makeAbsolute(SmallVectorImpl<char> &Path) const {
   if (!WorkingDir)
     return WorkingDir.getError();

-  llvm::sys::fs::make_absolute(WorkingDir.get(), Path);
+  sys::path::make_absolute(WorkingDir.get(), Path);
   return {};
 }

@@ -300,7 +300,7 @@ class RealFileSystem : public FileSystem {
     if (!WD || !*WD)
       return Path;
     Path.toVector(Storage);
-    sys::fs::make_absolute(WD->get().Resolved, Storage);
+    sys::path::make_absolute(WD->get().Resolved, Storage);
     return Storage;
   }

diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp
index 49df8fdcb7f79..7f8c55ab00989 100644
--- a/llvm/tools/llvm-config/llvm-config.cpp
+++ b/llvm/tools/llvm-config/llvm-config.cpp
@@ -357,18 +357,18 @@ int main(int argc, char **argv) {
     ActivePrefix = CurrentExecPrefix;
     {
       SmallString<256> Path(LLVM_INSTALL_INCLUDEDIR);
-      sys::fs::make_absolute(ActivePrefix, Path);
+      sys::path::make_absolute(ActivePrefix, Path);
       ActiveIncludeDir = std::string(Path);
     }
     {
       SmallString<256> Path(LLVM_TOOLS_INSTALL_DIR);
-      sys::fs::make_absolute(ActivePrefix, Path);
+      sys::path::make_absolute(ActivePrefix, Path);
       ActiveBinDir = std::string(Path);
     }
     ActiveLibDir = ActivePrefix + "/lib" + LLVM_LIBDIR_SUFFIX;
     {
       SmallString<256> Path(LLVM_INSTALL_PACKAGE_DIR);
-      sys::fs::make_absolute(ActivePrefix, Path);
+      sys::path::make_absolute(ActivePrefix, Path);
       ActiveCMakeDir = std::string(Path);
     }
     ActiveIncludeOption = "-I" + ActiveIncludeDir;
diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp
index 61ba82d0634ac..31bad2d68982b 100644
--- a/llvm/tools/llvm-dwp/llvm-dwp.cpp
+++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp
@@ -94,7 +94,7 @@ getDWOFilenames(StringRef ExecFilename) {
         dwarf::toString(Die.find(dwarf::DW_AT_comp_dir), "");
     if (!DWOCompDir.empty()) {
       SmallString<16> DWOPath(DWOName);
-      sys::fs::make_absolute(DWOCompDir, DWOPath);
+      sys::path::make_absolute(DWOCompDir, DWOPath);
       if (!sys::fs::exists(DWOPath) && sys::fs::exists(DWOName))
         DWOPaths.push_back(std::move(DWOName));
       else
diff --git a/llvm/tools/llvm-opt-report/OptReport.cpp b/llvm/tools/llvm-opt-report/OptReport.cpp
index 68ed92c8bacea..e4b4fc287b8c1 100644
--- a/llvm/tools/llvm-opt-report/OptReport.cpp
+++ b/llvm/tools/llvm-opt-report/OptReport.cpp
@@ -274,7 +274,7 @@ static bool writeReport(LocationInfoTy &LocationInfo) {
   for (auto &FI : LocationInfo) {
     SmallString<128> FileName(FI.first);
     if (!InputRelDir.empty())
-      sys::fs::make_absolute(InputRelDir, FileName);
+      sys::path::make_absolute(InputRelDir, FileName);

     const auto &FileInfo = FI.second;

diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp
index 888729b9dd249..eb649defc0021 100644
--- a/llvm/unittests/Support/Path.cpp
+++ b/llvm/unittests/Support/Path.cpp
@@ -255,14 +255,14 @@ TEST(Support, Path) {

   {
     SmallString<32> Relative("foo.cpp");
-    sys::fs::make_absolute("/root", Relative);
+    path::make_absolute("/root", Relative);
     Relative[5] = '/'; // Fix up windows paths.
     ASSERT_EQ("/root/foo.cpp", Relative);
   }

   {
     SmallString<32> Relative("foo.cpp");
-    sys::fs::make_absolute("//root", Relative);
+    path::make_absolute("//root", Relative);
     Relative[6] = '/'; // Fix up windows paths.
     ASSERT_EQ("//root/foo.cpp", Relative);
   }

From 103d2cae80160ebe79a91e4b4239140e2cd52283 Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Wed, 1 Oct 2025 15:01:44 -0700
Subject: [PATCH 436/878] [alpha.webkit.RetainPtrCtorAdoptChecker] Allow
 leakRef in copy methods (#160986)

Allow leakRef() in the return statement of an Objective-C copy method and
other methods which return +1.
---
 .../WebKit/RetainPtrCtorAdoptChecker.cpp      |  4 +++
 .../Checkers/WebKit/objc-mock-types.h         | 36 ++++++++++++++++++-
 .../WebKit/retain-ptr-ctor-adopt-use.mm       | 12 +++++--
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
index e1f9a77f5a5ca..955b8d19a820c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
@@ -385,6 +385,10 @@ class RetainPtrCtorAdoptChecker
       if (RTC.isUnretained(RetValue->getType()))
         return;
     }
+    if (retainsRet && *retainsRet) {
+      CreateOrCopyFnCall.insert(RetValue);
+      return;
+    }
     if (auto *CE = dyn_cast<CallExpr>(RetValue)) {
       auto *Callee = CE->getDirectCallee();
       if (!Callee || !isCreateOrCopyFunction(Callee))
diff --git a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
index 39dee1746158b..dacb713130818 100644
--- a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h
@@ -17,6 +17,20 @@ template <typename T> typename remove_reference<T>::type&& move(T&& t);

 #endif

+namespace std {
+
+template <bool B, class T = void> struct enable_if {
+};
+
+template <class T> struct enable_if<true, T> {
+  using type = T;
+};
+
+template <bool B, class T = void>
+using enable_if_t = typename enable_if<B, T>::type;
+
+}
+
 @class NSString;
 @class NSArray;
 @class NSMutableArray;
@@ -100,6 +114,7 @@ id CFBridgingRelease(CFTypeRef X) {
 __attribute__((objc_root_class))
 @interface NSObject
 + (instancetype) alloc;
++ (instancetype) allocWithZone:(NSZone *)zone;
 + (Class) class;
 + (Class) superclass;
 - (instancetype) init;
@@ -232,6 +247,14 @@ template <typename T> struct RemovePointer {
   typedef T Type;
 };

+template <typename T> struct IsPointer {
+  static constexpr bool value = false;
+};
+
+template <typename T> struct IsPointer<T *> {
+  static constexpr bool value = true;
+};
+
 template <typename T> struct RetainPtr {
   using ValueType = typename RemovePointer<T>::Type;
   using PtrType = ValueType*;
@@ -285,12 +308,23 @@ template <typename T> struct RetainPtr {
   PtrType operator->() const { return t; }
   T &operator*() const { return *t; }
   RetainPtr &operator=(PtrType t);
-  PtrType leakRef()
+
+  template <typename U = PtrType>
+  std::enable_if_t<IsPointer<T>::value, U> leakRef() CF_RETURNS_RETAINED
+  {
+    PtrType s = t;
+    t = nullptr;
+    return s;
+  }
+
+  template <typename U = PtrType>
+  std::enable_if_t<!IsPointer<T>::value, U> leakRef() NS_RETURNS_RETAINED
   {
     PtrType s = t;
     t = nullptr;
     return s;
   }
+
   operator PtrType() const { return t; }
   operator bool() const { return t; }

diff --git a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
index 769901778cdf0..45705615f3196 100644
--- a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
+++ b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
@@ -104,6 +104,14 @@ - (void)setValue:value {
   _number = value;
 }

+- (id)copyWithZone:(NSZone *)zone {
+  auto copy = adoptNS([(SomeObj *)[SomeObj allocWithZone:zone] init]);
+  [copy setValue:_number];
+  [copy setNext:_next];
+  [copy setOther:_other];
+  return copy.leakRef();
+}
+
 @end;

 RetainPtr<CFTypeRef> cf_out_argument() {
@@ -151,7 +159,7 @@ CFTypeRef LeakWrapper() {
 extern Class (*getNSArrayClass)();
 NSArray *allocArrayInstance() NS_RETURNS_RETAINED {
-  return [[getNSArrayClass() alloc] init];
+  return adoptNS([[getNSArrayClass() alloc] init]).leakRef();
 }

 extern int (*GetObj)(CF_RETURNS_RETAINED CFTypeRef* objOut);
@@ -294,7 +302,7 @@ -(NSString *)leak_string {
 }

 -(NSString *)make_string {
-  return [[NSString alloc] initWithUTF8String:"hello"];
+  return adoptNS([[NSString alloc] initWithUTF8String:"hello"]).leakRef();
 }

 -(void)local_leak_string {

From e2f8bfc55079143e955c08da2b4d20ef3e97a332 Mon Sep 17 00:00:00 2001
From: Alexey Bader
Date: Wed, 1 Oct 2025 15:07:12 -0700
Subject: [PATCH 437/878] [clang-sycl-linker][NFC] Remove dead includes
 (#161564)

These includes are not used by ClangSYCLLinker.cpp directly. Explicitly
include FormatVariadic.h for the `formatv` declaration, which was
previously only pulled in transitively by the removed headers.
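A small illustration of the include-what-you-use point (the helper below is hypothetical; only `llvm::formatv` and its header come from this patch): any caller of `formatv` must now name FormatVariadic.h itself instead of inheriting it from an unrelated header.

  #include "llvm/Support/FormatVariadic.h" // declares llvm::formatv
  #include <string>

  // Hypothetical helper: fails to compile without the explicit include once
  // the transitively-including headers are removed.
  std::string makeTempName(unsigned Index) {
    return llvm::formatv("sycl-temp-{0}.bc", Index).str();
  }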
--- clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp index 8dd993fb04362..594c79a28047b 100644 --- a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp +++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp @@ -27,22 +27,16 @@ #include "llvm/LTO/LTO.h" #include "llvm/Linker/Linker.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Object/Archive.h" -#include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/Binary.h" -#include "llvm/Object/ELFObjectFile.h" #include "llvm/Object/IRObjectFile.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/Object/OffloadBinary.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" -#include "llvm/Remarks/HotnessThresholdParser.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InitLLVM.h" -#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" From 1e4d4bb584a1c35c5f7801c68b9dfccd6130caab Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Wed, 1 Oct 2025 15:14:38 -0700 Subject: [PATCH 438/878] [Clang][NFC] Refactor operator delete argument handling (#160554) This change moves the getUsualDeleteParams function into the FunctionDecl class so that it can be shared between LLVM IR and CIR codegen. --- clang/include/clang/AST/Decl.h | 3 ++ clang/include/clang/AST/ExprCXX.h | 8 ++++ clang/lib/AST/Decl.cpp | 47 ++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp | 64 ++----------------------- clang/lib/CodeGen/CGExprCXX.cpp | 56 +--------------------- 5 files changed, 65 insertions(+), 113 deletions(-) diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index d85d04d2a4d53..406d79ebd6641 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -80,6 +80,7 @@ class TypeAliasTemplateDecl; class UnresolvedSetImpl; class VarTemplateDecl; enum class ImplicitParamKind; +struct UsualDeleteParams; // Holds a constraint expression along with a pack expansion index, if // expanded. @@ -2646,6 +2647,8 @@ class FunctionDecl : public DeclaratorDecl, bool isTypeAwareOperatorNewOrDelete() const; void setIsTypeAwareOperatorNewOrDelete(bool IsTypeAwareOperator = true); + UsualDeleteParams getUsualDeleteParams() const; + /// Compute the language linkage. LanguageLinkage getLanguageLinkage() const; diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 5f16bac94d5e6..d78c7b6363b5d 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -2342,6 +2342,14 @@ struct ImplicitDeallocationParameters { SizedDeallocationMode PassSize; }; +/// The parameters to pass to a usual operator delete. +struct UsualDeleteParams { + TypeAwareAllocationMode TypeAwareDelete = TypeAwareAllocationMode::No; + bool DestroyingDelete = false; + bool Size = false; + AlignedAllocationMode Alignment = AlignedAllocationMode::No; +}; + /// Represents a new-expression for memory allocation and constructor /// calls, e.g: "new CXXNewExpr(foo)". 
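For orientation, the new struct maps onto the usual operator delete signatures roughly as follows (an illustrative summary inferred from the parameter-walking logic in the diffs below, not text from the patch):

  // void operator delete(void *);                  -> all fields at defaults
  // void operator delete(void *, std::size_t);     -> Size = true
  // void operator delete(void *, std::size_t,
  //                      std::align_val_t);        -> Size = true,
  //                                                   Alignment = AlignedAllocationMode::Yes
  // void operator delete(C *,
  //                      std::destroying_delete_t); -> DestroyingDelete = true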
class CXXNewExpr final diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index cd8e495e82c80..c7341552be365 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3552,6 +3552,53 @@ void FunctionDecl::setIsTypeAwareOperatorNewOrDelete(bool IsTypeAware) { getASTContext().setIsTypeAwareOperatorNewOrDelete(this, IsTypeAware); } +UsualDeleteParams FunctionDecl::getUsualDeleteParams() const { + UsualDeleteParams Params; + + // This function should only be called for operator delete declarations. + assert(getDeclName().isAnyOperatorDelete()); + if (!getDeclName().isAnyOperatorDelete()) + return Params; + + const FunctionProtoType *FPT = getType()->castAs(); + auto AI = FPT->param_type_begin(), AE = FPT->param_type_end(); + + if (isTypeAwareOperatorNewOrDelete()) { + Params.TypeAwareDelete = TypeAwareAllocationMode::Yes; + assert(AI != AE); + ++AI; + } + + // The first argument after the type-identity parameter (if any) is + // always a void* (or C* for a destroying operator delete for class + // type C). + ++AI; + + // The next parameter may be a std::destroying_delete_t. + if (isDestroyingOperatorDelete()) { + assert(!isTypeAwareAllocation(Params.TypeAwareDelete)); + Params.DestroyingDelete = true; + assert(AI != AE); + ++AI; + } + + // Figure out what other parameters we should be implicitly passing. + if (AI != AE && (*AI)->isIntegerType()) { + Params.Size = true; + ++AI; + } else + assert(!isTypeAwareAllocation(Params.TypeAwareDelete)); + + if (AI != AE && (*AI)->isAlignValT()) { + Params.Alignment = AlignedAllocationMode::Yes; + ++AI; + } else + assert(!isTypeAwareAllocation(Params.TypeAwareDelete)); + + assert(AI == AE && "unexpected usual deallocation function parameter"); + return Params; +} + LanguageLinkage FunctionDecl::getLanguageLinkage() const { return getDeclLanguageLinkage(*this); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 83208bf226882..7989ad2e30f17 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -210,60 +210,6 @@ RValue CIRGenFunction::emitCXXMemberOrOperatorCall( return emitCall(fnInfo, callee, returnValue, args, nullptr, loc); } -namespace { -/// The parameters to pass to a usual operator delete. -struct UsualDeleteParams { - TypeAwareAllocationMode typeAwareDelete = TypeAwareAllocationMode::No; - bool destroyingDelete = false; - bool size = false; - AlignedAllocationMode alignment = AlignedAllocationMode::No; -}; -} // namespace - -// FIXME(cir): this should be shared with LLVM codegen -static UsualDeleteParams getUsualDeleteParams(const FunctionDecl *fd) { - UsualDeleteParams params; - - const FunctionProtoType *fpt = fd->getType()->castAs(); - auto ai = fpt->param_type_begin(), ae = fpt->param_type_end(); - - if (fd->isTypeAwareOperatorNewOrDelete()) { - params.typeAwareDelete = TypeAwareAllocationMode::Yes; - assert(ai != ae); - ++ai; - } - - // The first argument after the type-identity parameter (if any) is - // always a void* (or C* for a destroying operator delete for class - // type C). - ++ai; - - // The next parameter may be a std::destroying_delete_t. - if (fd->isDestroyingOperatorDelete()) { - params.destroyingDelete = true; - assert(ai != ae); - ++ai; - } - - // Figure out what other parameters we should be implicitly passing. 
- if (ai != ae && (*ai)->isIntegerType()) { - params.size = true; - ++ai; - } else { - assert(!isTypeAwareAllocation(params.typeAwareDelete)); - } - - if (ai != ae && (*ai)->isAlignValT()) { - params.alignment = AlignedAllocationMode::Yes; - ++ai; - } else { - assert(!isTypeAwareAllocation(params.typeAwareDelete)); - } - - assert(ai == ae && "unexpected usual deallocation function parameter"); - return params; -} - static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, unsigned minElements, mlir::Value &numElements, @@ -616,11 +562,11 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD, const auto *deleteFTy = deleteFD->getType()->castAs(); CallArgList deleteArgs; - UsualDeleteParams params = getUsualDeleteParams(deleteFD); + UsualDeleteParams params = deleteFD->getUsualDeleteParams(); auto paramTypeIt = deleteFTy->param_type_begin(); // Pass std::type_identity tag if present - if (isTypeAwareAllocation(params.typeAwareDelete)) + if (isTypeAwareAllocation(params.TypeAwareDelete)) cgm.errorNYI(deleteFD->getSourceRange(), "emitDeleteCall: type aware delete"); @@ -631,12 +577,12 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD, deleteArgs.add(RValue::get(deletePtr), argTy); // Pass the std::destroying_delete tag if present. - if (params.destroyingDelete) + if (params.DestroyingDelete) cgm.errorNYI(deleteFD->getSourceRange(), "emitDeleteCall: destroying delete"); // Pass the size if the delete function has a size_t parameter. - if (params.size) { + if (params.Size) { QualType sizeType = *paramTypeIt++; CharUnits deleteTypeSize = getContext().getTypeSizeInChars(deleteTy); assert(mlir::isa(convertType(sizeType)) && @@ -648,7 +594,7 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD, } // Pass the alignment if the delete function has an align_val_t parameter. - if (isAlignedAllocation(params.alignment)) + if (isAlignedAllocation(params.Alignment)) cgm.errorNYI(deleteFD->getSourceRange(), "emitDeleteCall: aligned allocation"); diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index a092b718412be..c52526c89f171 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1376,58 +1376,6 @@ RValue CodeGenFunction::EmitBuiltinNewDeleteCall(const FunctionProtoType *Type, llvm_unreachable("predeclared global operator new/delete is missing"); } -namespace { -/// The parameters to pass to a usual operator delete. -struct UsualDeleteParams { - TypeAwareAllocationMode TypeAwareDelete = TypeAwareAllocationMode::No; - bool DestroyingDelete = false; - bool Size = false; - AlignedAllocationMode Alignment = AlignedAllocationMode::No; -}; -} - -static UsualDeleteParams getUsualDeleteParams(const FunctionDecl *FD) { - UsualDeleteParams Params; - - const FunctionProtoType *FPT = FD->getType()->castAs(); - auto AI = FPT->param_type_begin(), AE = FPT->param_type_end(); - - if (FD->isTypeAwareOperatorNewOrDelete()) { - Params.TypeAwareDelete = TypeAwareAllocationMode::Yes; - assert(AI != AE); - ++AI; - } - - // The first argument after the type-identity parameter (if any) is - // always a void* (or C* for a destroying operator delete for class - // type C). - ++AI; - - // The next parameter may be a std::destroying_delete_t. - if (FD->isDestroyingOperatorDelete()) { - assert(!isTypeAwareAllocation(Params.TypeAwareDelete)); - Params.DestroyingDelete = true; - assert(AI != AE); - ++AI; - } - - // Figure out what other parameters we should be implicitly passing. 
- if (AI != AE && (*AI)->isIntegerType()) { - Params.Size = true; - ++AI; - } else - assert(!isTypeAwareAllocation(Params.TypeAwareDelete)); - - if (AI != AE && (*AI)->isAlignValT()) { - Params.Alignment = AlignedAllocationMode::Yes; - ++AI; - } else - assert(!isTypeAwareAllocation(Params.TypeAwareDelete)); - - assert(AI == AE && "unexpected usual deallocation function parameter"); - return Params; -} - namespace { /// A cleanup to call the given 'operator delete' function upon abnormal /// exit from a new expression. Templated on a traits type that deals with @@ -1505,7 +1453,7 @@ namespace { } else { // For a non-placement new-expression, 'operator delete' can take a // size and/or an alignment if it has the right parameters. - Params = getUsualDeleteParams(OperatorDelete); + Params = OperatorDelete->getUsualDeleteParams(); } assert(!Params.DestroyingDelete && @@ -1838,7 +1786,7 @@ void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, const auto *DeleteFTy = DeleteFD->getType()->castAs(); CallArgList DeleteArgs; - auto Params = getUsualDeleteParams(DeleteFD); + auto Params = DeleteFD->getUsualDeleteParams(); auto ParamTypeIt = DeleteFTy->param_type_begin(); std::optional TagAlloca; From 780f69cd922d8925648e11e771e77f0b46190e5b Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 1 Oct 2025 15:25:34 -0700 Subject: [PATCH 439/878] [Clang][CMake] Add CSSPGO support to LLVM_BUILD_INSTRUMENTED (#79942) Build on Clang-BOLT infrastructure to collect sample profile for CSSPGO. Add CSSPGO.cmake and BOLT-CSSPGO.cmake to automate CSSPGO/+BOLT Clang builds. Note that `CLANG_PGO_TRAINING_DATA_SOURCE_DIR` is required as built-in training set is inadequate for collecting sampled profile. Hardware compatibility: CSSPGO requires synchronized (0-skid) call and branch stacks, which is only available with Intel PEBS (Sandy Bridge+), AMD Zen3 with BRS, Zen4 with LBRv2+LBR_PMC_FREEZE, and Zen5 with LBRv2. This patch adds support for Intel `br_inst_retired.near_taken:uppp` event. Test Plan: Added BOLT-CSSPGO.cmake with same use as BOLT-PGO.cmake, e.g. for bootstrapped ThinLTO+CSSPGO+BOLT, with CSSPGO profile collected from LLVM build, and BOLT profile collected from Hello World (instrumentation): ``` cmake -B clang-csspgo-bolt -S /path/to/llvm-project/llvm \ -DLLVM_ENABLE_LLD=ON -DBOOTSTRAP_LLVM_ENABLE_LLD=ON \ -DBOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_LLD=ON \ -DPGO_INSTRUMENT_LTO=Thin \ -DBOOTSTRAP_CLANG_PGO_TRAINING_DATA_SOURCE_DIR=/path/to/llvm-project/llvm \ -GNinja -C /path/to/llvm-project/clang/cmake/caches/BOLT-CSSPGO.cmake ninja stage2-clang-bolt ... warning: Sample PGO is estimated to optimize better with 19.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples. ... [2800/2801] Optimizing Clang with BOLT BOLT-INFO: 8189 out of 106942 functions in the binary (7.7%) have non-empty execution profile 13776393 : taken branches (-42.1%) ``` Performance testing with Clang: - Setup: Clang-BOLT testing harness https://github.com/aaupov/llvm-devmtg-2022/commit/9f2b46f67a1930a51c58a0e4894637a8c64c570e - CSSPGO training: building LLVM, - InstrPGO training: building Hello World, - BOLT training: building Hello World, instrumentation, - benchmark: building small LLVM tool (not), - 2S Intel SKX Xeon 6138 with 40C/80T and 256GB RAM, using 20C/40T for build, - Results, wall time, lower is better - Baseline (bootstrapped build): 10.36s, - InstrPGO + ThinLTO: 9.34s, - CSSPGO + ThinLTO: 8.85s. 
- BOLT results, for reference: - Baseline: 9.09s, - InstrPGO + ThinLTO: 9.09s, - CSSPGO + ThinLTO: 8.58s. --------- Co-authored-by: Matthias Braun --- clang/CMakeLists.txt | 15 ++++- clang/cmake/caches/BOLT-CSSPGO.cmake | 3 + clang/cmake/caches/BOLT-PGO.cmake | 3 +- clang/cmake/caches/CSSPGO.cmake | 2 + clang/utils/perf-training/CMakeLists.txt | 36 ++++++++++-- clang/utils/perf-training/perf-helper.py | 65 ++++++++++++++++++---- llvm/CMakeLists.txt | 3 + llvm/cmake/modules/HandleLLVMOptions.cmake | 30 +++++++++- 8 files changed, 136 insertions(+), 21 deletions(-) create mode 100644 clang/cmake/caches/BOLT-CSSPGO.cmake create mode 100644 clang/cmake/caches/CSSPGO.cmake diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 4eaa712899856..e4cb1a359620d 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -754,11 +754,22 @@ if (CLANG_ENABLE_BOOTSTRAP) if(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED) add_dependencies(clang-bootstrap-deps llvm-profdata) set(PGO_OPT -DLLVM_PROFDATA=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-profdata) + string(TOUPPER "${BOOTSTRAP_LLVM_BUILD_INSTRUMENTED}" BOOTSTRAP_LLVM_BUILD_INSTRUMENTED) + if (BOOTSTRAP_LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO") + add_dependencies(clang-bootstrap-deps llvm-profgen) + list(APPEND PGO_OPT -DLLVM_PROFGEN=${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-profgen) + endif() endif() if(LLVM_BUILD_INSTRUMENTED) - add_dependencies(clang-bootstrap-deps generate-profdata) - set(PGO_OPT -DLLVM_PROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.profdata) + string(TOUPPER "${LLVM_BUILD_INSTRUMENTED}" LLVM_BUILD_INSTRUMENTED) + if (LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO") + add_dependencies(clang-bootstrap-deps generate-sprofdata) + set(PGO_OPT -DLLVM_SPROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.sprofdata) + else() + add_dependencies(clang-bootstrap-deps generate-profdata) + set(PGO_OPT -DLLVM_PROFDATA_FILE=${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/clang.profdata) + endif() # Use the current tools for LTO instead of the instrumented ones list(APPEND _BOOTSTRAP_DEFAULT_PASSTHROUGH CMAKE_CXX_COMPILER diff --git a/clang/cmake/caches/BOLT-CSSPGO.cmake b/clang/cmake/caches/BOLT-CSSPGO.cmake new file mode 100644 index 0000000000000..b1c204ad57ac5 --- /dev/null +++ b/clang/cmake/caches/BOLT-CSSPGO.cmake @@ -0,0 +1,3 @@ +set(BOLT_PGO_CMAKE_CACHE "CSSPGO" CACHE STRING "") +set(BOOTSTRAP_CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "") +include(${CMAKE_CURRENT_LIST_DIR}/BOLT-PGO.cmake) diff --git a/clang/cmake/caches/BOLT-PGO.cmake b/clang/cmake/caches/BOLT-PGO.cmake index 1a04ca9a74e5e..cc9410fd0e95c 100644 --- a/clang/cmake/caches/BOLT-PGO.cmake +++ b/clang/cmake/caches/BOLT-PGO.cmake @@ -1,3 +1,4 @@ +set(BOLT_PGO_CMAKE_CACHE "PGO" CACHE STRING "") set(LLVM_ENABLE_PROJECTS "bolt;clang;lld" CACHE STRING "") set(CLANG_BOOTSTRAP_TARGETS @@ -14,4 +15,4 @@ set(BOOTSTRAP_CLANG_BOOTSTRAP_TARGETS set(PGO_BUILD_CONFIGURATION ${CMAKE_CURRENT_LIST_DIR}/BOLT.cmake CACHE STRING "") -include(${CMAKE_CURRENT_LIST_DIR}/PGO.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/${BOLT_PGO_CMAKE_CACHE}.cmake) diff --git a/clang/cmake/caches/CSSPGO.cmake b/clang/cmake/caches/CSSPGO.cmake new file mode 100644 index 0000000000000..59e08a64f8aad --- /dev/null +++ b/clang/cmake/caches/CSSPGO.cmake @@ -0,0 +1,2 @@ +set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED "CSSPGO" CACHE STRING "") +include(${CMAKE_CURRENT_LIST_DIR}/PGO.cmake) diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 
1d7bb788a15ed..2cd4c4c29c2bb 100644
--- a/clang/utils/perf-training/CMakeLists.txt
+++ b/clang/utils/perf-training/CMakeLists.txt
@@ -6,6 +6,10 @@ set(CLANG_PGO_TRAINING_DATA "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH
 set(CLANG_PGO_TRAINING_DATA_SOURCE_DIR OFF CACHE STRING "Path to source directory containing cmake project with source files to use for generating pgo data")
 set(CLANG_PGO_TRAINING_DEPS "" CACHE STRING "Extra dependencies needed to build the PGO training data.")

+add_custom_target(clear-perf-data
+  COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
+  COMMENT "Clearing old perf data")
+
 option(CLANG_PGO_TRAINING_USE_LLVM_BUILD "Use LLVM build for generating PGO data" ON)

 llvm_canonicalize_cmake_booleans(
@@ -21,7 +25,7 @@ if(LLVM_BUILD_INSTRUMENTED)
   add_lit_testsuite(generate-profraw "Generating clang PGO data" ${CMAKE_CURRENT_BINARY_DIR}/pgo-data/
     EXCLUDE_FROM_CHECK_ALL
-    DEPENDS clear-profraw
+    DEPENDS clear-profraw clang
   )

   add_custom_target(clear-profraw
@@ -55,6 +59,32 @@ if(LLVM_BUILD_INSTRUMENTED)
       USE_TOOLCHAIN EXLUDE_FROM_ALL NO_INSTALL DEPENDS generate-profraw)
     add_dependencies(generate-profdata generate-profraw-external)
   endif()
+
+  if(NOT LLVM_PROFGEN)
+    find_program(LLVM_PROFGEN llvm-profgen)
+  endif()
+
+  if(NOT LLVM_PROFGEN)
+    message(STATUS "To enable converting CSSPGO samples LLVM_PROFGEN has to point to llvm-profgen")
+  elseif(NOT CLANG_PGO_TRAINING_DATA_SOURCE_DIR)
+    message(STATUS "CLANG_PGO_TRAINING_DATA_SOURCE_DIR must be set to collect CSSPGO samples")
+  else()
+    set(PERF_HELPER "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py)
+    set(CLANG_SPROFDATA ${CMAKE_CURRENT_BINARY_DIR}/clang.sprofdata)
+    add_custom_command(
+      OUTPUT ${CLANG_SPROFDATA}
+      # Execute generate-profraw-external under perf
+      COMMAND ${PERF_HELPER} perf --csspgo -- ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} --target generate-profraw-external
+      # Convert perf profile into profraw
+      COMMAND ${PERF_HELPER} perf2prof ${LLVM_PROFGEN} $<TARGET_FILE:clang> ${CMAKE_CURRENT_BINARY_DIR}
+      # Merge profdata
+      COMMAND ${PERF_HELPER} merge --sample ${LLVM_PROFDATA} ${CLANG_SPROFDATA} ${CMAKE_CURRENT_BINARY_DIR}
+      DEPENDS clang ${CLANG_PGO_TRAINING_DEPS} clear-perf-data generate-profraw-external-clean
+      VERBATIM
+      USES_TERMINAL
+    )
+    add_custom_target(generate-sprofdata DEPENDS ${CLANG_SPROFDATA})
+  endif()
   endif()
 endif()

@@ -104,8 +134,4 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
     COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata
     COMMENT "Clearing old BOLT fdata")

-  add_custom_target(clear-perf-data
-    COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data
-    COMMENT "Clearing old perf data")
-
 endif()
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index ab4491d2a6b6d..1c7904ec62163 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -45,14 +45,22 @@ def clean(args):


 def merge(args):
-    if len(args) < 3:
-        print(
-            "Usage: %s merge <llvm-profdata> <output> <paths>\n" % __file__
-            + "\tMerges all profraw files from path into output."
-        )
-        return 1
-    cmd = [args[0], "merge", "-o", args[1]]
-    for path in args[2:]:
+    parser = argparse.ArgumentParser(
+        prog="perf-helper merge",
+        description="Merges all profraw files from path(s) into output",
+    )
+    parser.add_argument("profdata", help="Path to llvm-profdata tool")
+    parser.add_argument("output", help="Output filename")
+    parser.add_argument(
+        "paths", nargs="+", help="Folder(s) containing input profraw files"
+    )
+    parser.add_argument("--sample", action="store_true", help="Sample profile")
+    opts = parser.parse_args(args)
+
+    cmd = [opts.profdata, "merge", "-o", opts.output]
+    if opts.sample:
+        cmd += ["--sample"]
+    for path in opts.paths:
         cmd.extend(findFilesWithExtension(path, "profraw"))
     subprocess.check_call(cmd)
     return 0
@@ -73,25 +81,30 @@ def merge_fdata(args):

 def perf(args):
     parser = argparse.ArgumentParser(
-        prog="perf-helper perf", description="perf wrapper for BOLT profile collection"
+        prog="perf-helper perf",
+        description="perf wrapper for BOLT/CSSPGO profile collection",
     )
     parser.add_argument(
         "--lbr", action="store_true", help="Use perf with branch stacks"
     )
+    parser.add_argument("--csspgo", action="store_true", help="Enable CSSPGO flags")
     parser.add_argument("cmd", nargs=argparse.REMAINDER, help="")
     opts = parser.parse_args(args)
     cmd = opts.cmd[1:]

+    event = "br_inst_retired.near_taken:uppp" if opts.csspgo else "cycles:u"
     perf_args = [
         "perf",
         "record",
-        "--event=cycles:u",
+        f"--event={event}",
         "--freq=max",
         "--output=%d.perf.data" % os.getpid(),
     ]
-    if opts.lbr:
+    if opts.lbr or opts.csspgo:
         perf_args += ["--branch-filter=any,u"]
+    if opts.csspgo:
+        perf_args += ["-g", "--call-graph=fp"]
     perf_args.extend(cmd)

     start_time = time.time()
@@ -127,6 +140,30 @@ def perf2bolt(args):
     return 0


+def perf2prof(args):
+    parser = argparse.ArgumentParser(
+        prog="perf-helper perf2prof",
+        description="perf to CSSPGO prof conversion wrapper",
+    )
+    parser.add_argument("profgen", help="Path to llvm-profgen binary")
+    parser.add_argument("binary", help="Input binary")
+    parser.add_argument("paths", nargs="+", help="Path containing perf.data files")
+    opts = parser.parse_args(args)
+
+    profgen_args = [opts.profgen, f"--binary={opts.binary}"]
+    for path in opts.paths:
+        for filename in findFilesWithExtension(path, "perf.data"):
+            subprocess.run(
+                [
+                    *profgen_args,
+                    f"--perfdata={filename}",
+                    f"--output={filename}.profraw",
+                ],
+                check=True,
+            )
+    return 0
+
+
 def dtrace(args):
     parser = argparse.ArgumentParser(
         prog="perf-helper dtrace",
@@ -660,7 +697,10 @@ def bolt_optimize(args):
     process.check_returncode()

     if opts.method in ["PERF", "LBR"]:
-        perf2bolt([opts.bolt, opts.perf_training_binary_dir, opts.input])
+        args = [opts.bolt, opts.perf_training_binary_dir, opts.input]
+        if opts.method == "LBR":
+            args.append("--lbr")
+        perf2bolt(args)

     merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])

@@ -707,6 +747,7 @@ def bolt_optimize(args):
     "merge-fdata": merge_fdata,
     "perf": perf,
     "perf2bolt": perf2bolt,
+    "perf2prof": perf2prof,
 }
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index b98192968a3ab..c450ee5a3d72e 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1011,6 +1011,9 @@ set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ${LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_defa
 set(LLVM_PROFDATA_FILE "" CACHE FILEPATH
   "Profiling data file to use when compiling in order to improve runtime performance.")

+set(LLVM_SPROFDATA_FILE "" CACHE FILEPATH
+  "Sampling profiling data file to use when compiling in order to improve runtime performance.")
+
if(LLVM_INCLUDE_TESTS) # All LLVM Python files should be compatible down to this minimum version. set(LLVM_MINIMUM_PYTHON_VERSION 3.8) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 8eca29f8a03f5..d4195db6368d7 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -1184,7 +1184,7 @@ if(LLVM_ENABLE_EH AND NOT LLVM_ENABLE_RTTI) message(FATAL_ERROR "Exception handling requires RTTI. You must set LLVM_ENABLE_RTTI to ON") endif() -set(LLVM_BUILD_INSTRUMENTED OFF CACHE STRING "Build LLVM and tools with PGO instrumentation. May be specified as IR or Frontend") +set(LLVM_BUILD_INSTRUMENTED OFF CACHE STRING "Build LLVM and tools with PGO instrumentation. May be specified as IR, Frontend, CSIR, CSSPGO") set(LLVM_VP_COUNTERS_PER_SITE "1.5" CACHE STRING "Value profile counters to use per site for IR PGO with Clang") mark_as_advanced(LLVM_BUILD_INSTRUMENTED LLVM_VP_COUNTERS_PER_SITE) string(TOUPPER "${LLVM_BUILD_INSTRUMENTED}" uppercase_LLVM_BUILD_INSTRUMENTED) @@ -1217,6 +1217,19 @@ if (LLVM_BUILD_INSTRUMENTED) CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS) endif() + elseif(uppercase_LLVM_BUILD_INSTRUMENTED STREQUAL "CSSPGO") + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + append("-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -fno-optimize-sibling-calls -fpseudo-probe-for-profiling -fdebug-info-for-profiling" + CMAKE_CXX_FLAGS + CMAKE_C_FLAGS) + if(NOT LINKER_IS_LLD_LINK) + append("-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -fno-optimize-sibling-calls -fpseudo-probe-for-profiling -fdebug-info-for-profiling" + CMAKE_EXE_LINKER_FLAGS + CMAKE_SHARED_LINKER_FLAGS) + endif() + else() + message(FATAL_ERROR "LLVM_BUILD_INSTRUMENTED=CSSPGO can only be specified when compiling with clang") + endif() else() append("-fprofile-instr-generate=\"${LLVM_PROFILE_FILE_PATTERN}\"" CMAKE_CXX_FLAGS @@ -1269,6 +1282,21 @@ elseif(LLVM_PROFDATA_FILE) message(WARNING "LLVM_PROFDATA_FILE specified, but ${LLVM_PROFDATA_FILE} not found") endif() +if(LLVM_SPROFDATA_FILE AND EXISTS ${LLVM_SPROFDATA_FILE}) + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" ) + append("-fpseudo-probe-for-profiling -fprofile-sample-use=\"${LLVM_SPROFDATA_FILE}\"" + CMAKE_CXX_FLAGS + CMAKE_C_FLAGS) + if(NOT LINKER_IS_LLD_LINK) + append("-fpseudo-probe-for-profiling -fprofile-sample-use=\"${LLVM_SPROFDATA_FILE}\"" + CMAKE_EXE_LINKER_FLAGS + CMAKE_SHARED_LINKER_FLAGS) + endif() + else() + message(FATAL_ERROR "LLVM_SPROFDATA_FILE can only be specified when compiling with clang") + endif() +endif() + option(LLVM_BUILD_INSTRUMENTED_COVERAGE "Build LLVM and tools with Code Coverage instrumentation" Off) option(LLVM_INDIVIDUAL_TEST_COVERAGE "Emit individual coverage file for each test case." OFF) mark_as_advanced(LLVM_BUILD_INSTRUMENTED_COVERAGE) From bdd98a01478ddcb99b05d9d2eb20bf4985a21683 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 1 Oct 2025 15:31:50 -0700 Subject: [PATCH 440/878] [AMDGPU] Add documentation files for GFX12. (#157151) This patch adds documentation files for GFX12. 
--- llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst | 2002 +++++++++++++++++++++ llvm/docs/AMDGPU/gfx12_addr.rst | 15 + llvm/docs/AMDGPU/gfx12_attr.rst | 28 + llvm/docs/AMDGPU/gfx12_clause.rst | 7 + llvm/docs/AMDGPU/gfx12_data0_56f215.rst | 17 + llvm/docs/AMDGPU/gfx12_data0_6802ce.rst | 17 + llvm/docs/AMDGPU/gfx12_data0_e016a1.rst | 17 + llvm/docs/AMDGPU/gfx12_data0_fd235e.rst | 17 + llvm/docs/AMDGPU/gfx12_data1_6802ce.rst | 17 + llvm/docs/AMDGPU/gfx12_data1_731030.rst | 17 + llvm/docs/AMDGPU/gfx12_data1_e016a1.rst | 17 + llvm/docs/AMDGPU/gfx12_data1_fd235e.rst | 17 + llvm/docs/AMDGPU/gfx12_delay.rst | 74 + llvm/docs/AMDGPU/gfx12_hwreg.rst | 76 + llvm/docs/AMDGPU/gfx12_imm16.rst | 7 + llvm/docs/AMDGPU/gfx12_ioffset.rst | 15 + llvm/docs/AMDGPU/gfx12_label.rst | 29 + llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst | 15 + llvm/docs/AMDGPU/gfx12_literal_81e671.rst | 15 + llvm/docs/AMDGPU/gfx12_m.rst | 13 + llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst | 17 + llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst | 17 + llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst | 15 + llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst | 15 + llvm/docs/AMDGPU/gfx12_samp.rst | 15 + llvm/docs/AMDGPU/gfx12_sbase_453b95.rst | 17 + llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_354189.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_836716.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst | 17 + llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst | 17 + llvm/docs/AMDGPU/gfx12_sdst_006c40.rst | 17 + llvm/docs/AMDGPU/gfx12_sdst_20064d.rst | 15 + llvm/docs/AMDGPU/gfx12_sdst_354189.rst | 17 + llvm/docs/AMDGPU/gfx12_sdst_836716.rst | 17 + llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst | 17 + llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst | 17 + llvm/docs/AMDGPU/gfx12_sendmsg.rst | 48 + llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst | 30 + llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_218bea.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_39b593.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_730a13.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_81e671.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_c98889.rst | 15 + llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst | 17 + llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst | 15 + llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst | 17 + llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst | 15 + llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst | 20 + llvm/docs/AMDGPU/gfx12_src0_5727cf.rst | 17 + llvm/docs/AMDGPU/gfx12_src0_5cae62.rst | 17 + llvm/docs/AMDGPU/gfx12_src0_6802ce.rst | 17 + llvm/docs/AMDGPU/gfx12_src0_85aab6.rst | 17 + llvm/docs/AMDGPU/gfx12_src0_c4593f.rst | 17 + llvm/docs/AMDGPU/gfx12_src0_e016a1.rst | 17 + llvm/docs/AMDGPU/gfx12_src0_fd235e.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_5727cf.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_5cae62.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_6802ce.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_731030.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_977794.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_c4593f.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_e016a1.rst | 17 + llvm/docs/AMDGPU/gfx12_src1_fd235e.rst | 17 + llvm/docs/AMDGPU/gfx12_src2_2797bc.rst | 17 + llvm/docs/AMDGPU/gfx12_src2_5727cf.rst | 17 + llvm/docs/AMDGPU/gfx12_src2_5cae62.rst | 17 + llvm/docs/AMDGPU/gfx12_src2_6802ce.rst | 17 + llvm/docs/AMDGPU/gfx12_src2_7b936a.rst | 17 + llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst | 17 + 
 llvm/docs/AMDGPU/gfx12_src2_c4593f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_src2_e016a1.rst | 17 +
 llvm/docs/AMDGPU/gfx12_srcx0.rst | 17 +
 llvm/docs/AMDGPU/gfx12_srcy0.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst | 17 +
 llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_tgt.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst | 15 +
 llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst | 15 +
 llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst | 15 +
 llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst | 15 +
 llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst | 15 +
 llvm/docs/AMDGPU/gfx12_vcc.rst | 16 +
 llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdata_69a144.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdata_89680f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_006c40.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_227281.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_69a144.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_836716.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_89680f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdstx.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vdsty.rst | 17 +
 llvm/docs/AMDGPU/gfx12_version.rst | 7 +
 llvm/docs/AMDGPU/gfx12_vsrc0.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc2.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc3.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrcx1.rst | 17 +
 llvm/docs/AMDGPU/gfx12_vsrcy1.rst | 17 +
 llvm/docs/AMDGPU/gfx12_waitcnt.rst | 55 +
 llvm/docs/AMDGPUModifierSyntax.rst | 109 ++
 llvm/docs/AMDGPUOperandSyntax.rst | 11 +
 llvm/docs/AMDGPUUsage.rst | 2 +
 131 files changed, 4426 insertions(+)
 create mode 100644 llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_addr.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_attr.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_clause.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data0_56f215.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data0_6802ce.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data0_e016a1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data0_fd235e.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data1_6802ce.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data1_731030.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data1_e016a1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_data1_fd235e.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_delay.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_hwreg.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_imm16.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ioffset.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_label.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_literal_81e671.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_m.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_samp.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sbase_453b95.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_354189.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_836716.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdst_006c40.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdst_20064d.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdst_354189.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdst_836716.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sendmsg.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_218bea.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_39b593.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_730a13.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_81e671.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_c98889.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src0_5727cf.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src0_5cae62.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src0_6802ce.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src0_85aab6.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src0_c4593f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src0_e016a1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src0_fd235e.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_5727cf.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_5cae62.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_6802ce.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_731030.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_977794.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_c4593f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_e016a1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src1_fd235e.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_2797bc.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_5727cf.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_5cae62.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_6802ce.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_7b936a.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_c4593f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_src2_e016a1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_srcx0.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_srcy0.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_tgt.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vcc.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdata_69a144.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdata_89680f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_006c40.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_227281.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_69a144.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_836716.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_89680f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdstx.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vdsty.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_version.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc0.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc2.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc3.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrcx1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_vsrcy1.rst
 create mode 100644 llvm/docs/AMDGPU/gfx12_waitcnt.rst

diff --git a/llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst b/llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst
new file mode 100644
index 0000000000000..7259ee8731300
--- /dev/null
+++ b/llvm/docs/AMDGPU/AMDGPUAsmGFX12.rst
@@ -0,0 +1,2002 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+====================================================================================
+Syntax of GFX12 Instructions
+====================================================================================
+
+.. contents::
+   :local:
+
+Introduction
+============
+
+This document describes the syntax of GFX12 instructions.
+
+Notation
+========
+
+Notation used in this document is explained :ref:`here`.
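As a quick orientation for the operand and modifier tables that follow, here is a small sketch of what concrete GFX12 assembly matching a few of the listed templates might look like. The specific registers, offsets, and modifier values are illustrative assumptions chosen for this sketch, not values taken from the generated tables::

    ; Illustrative only: all registers, offsets, and modifier values are arbitrary.
    s_load_b32      s0, s[2:3], 0x10            ; SMEM:    SDATA, SBASE, SOFFSET (immediate)
    s_add_co_u32    s4, s5, s6                  ; SOP2:    SDST, SSRC0, SSRC1
    s_cmp_eq_u32    s4, 7                       ; SOPC:    SSRC0, SSRC1 (inline constant)
    buffer_load_b32 v1, v0, s[8:11], s3 offen   ; VBUFFER: VDATA, VADDR, RSRC, SOFFSET + modifier
    ds_store_b32    v2, v3 offset:16            ; VDS:     ADDR, DATA0 + offset modifier
    global_load_b32 v4, v[0:1], off             ; VGLOBAL: VDST, VADDR (no SADDR)

Each mnemonic above corresponds to one row of the matching section below, and modifiers such as ``offen`` and ``offset:`` are documented on the linked operand pages.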
+
+Overview
+========
+
+An overview of generic syntax and other features of AMDGPU instructions may be found :ref:`in this document`.
+
+Instructions
+============
+
+
+SMEM
+----
+
+.. parsed-literal::
+
+    **INSTRUCTION**               **DST**       **SRC0**      **SRC1**       **SRC2**      **SRC3**    **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    s_atc_probe                   :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_atc_probe_buffer            :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_b128            :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_b256            :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_b32             :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_b512            :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_b64             :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_b96             :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_i16             :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_i8              :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_u16             :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_load_u8              :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_nop                                                                                        :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_buffer_prefetch_data        :ref:`sbase`, :ref:`ioffset`, :ref:`soffset`, :ref:`sdata`            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_dcache_inv                                                                                        :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_b128                   :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_b256                   :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_b32                    :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_b512                   :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_b64                    :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_b96                    :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_i16                    :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_i8                     :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_u16                    :ref:`sdata`, :ref:`sbase`, :ref:`soffset`                            :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv`
+    s_load_u8
:ref:`sdata`, :ref:`sbase`, :ref:`soffset` :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv` + s_prefetch_data :ref:`sbase`, :ref:`ioffset`, :ref:`soffset`, :ref:`sdata` :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv` + s_prefetch_data_pc_rel :ref:`ioffset`, :ref:`soffset`, :ref:`sdata` :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv` + s_prefetch_inst :ref:`sbase`, :ref:`ioffset`, :ref:`soffset`, :ref:`sdata` :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv` + s_prefetch_inst_pc_rel :ref:`ioffset`, :ref:`soffset`, :ref:`sdata` :ref:`offset24s` :ref:`th` :ref:`scope` :ref:`nv` + +SOP1 +---- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_abs_i32 :ref:`sdst`, :ref:`ssrc0` + s_alloc_vgpr :ref:`ssrc0` + s_and_not0_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_and_not0_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_and_not0_wrexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_and_not0_wrexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_and_not1_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_and_not1_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_and_not1_wrexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_and_not1_wrexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_and_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_and_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_barrier_init :ref:`ssrc0` + s_barrier_join :ref:`ssrc0` + s_barrier_signal :ref:`ssrc0` + s_barrier_signal_isfirst :ref:`ssrc0` + s_bcnt0_i32_b32 :ref:`sdst`, :ref:`ssrc0` + s_bcnt0_i32_b64 :ref:`sdst`, :ref:`ssrc0` + s_bcnt1_i32_b32 :ref:`sdst`, :ref:`ssrc0` + s_bcnt1_i32_b64 :ref:`sdst`, :ref:`ssrc0` + s_bitreplicate_b64_b32 :ref:`sdst`, :ref:`ssrc0` + s_bitset0_b32 :ref:`sdst`, :ref:`ssrc0` + s_bitset0_b64 :ref:`sdst`, :ref:`ssrc0` + s_bitset1_b32 :ref:`sdst`, :ref:`ssrc0` + s_bitset1_b64 :ref:`sdst`, :ref:`ssrc0` + s_brev_b32 :ref:`sdst`, :ref:`ssrc0` + s_brev_b64 :ref:`sdst`, :ref:`ssrc0` + s_ceil_f16 :ref:`sdst`, :ref:`ssrc0` + s_ceil_f32 :ref:`sdst`, :ref:`ssrc0` + s_cls_i32 :ref:`sdst`, :ref:`ssrc0` + s_cls_i32_i64 :ref:`sdst`, :ref:`ssrc0` + s_clz_i32_u32 :ref:`sdst`, :ref:`ssrc0` + s_clz_i32_u64 :ref:`sdst`, :ref:`ssrc0` + s_cmov_b32 :ref:`sdst`, :ref:`ssrc0` + s_cmov_b64 :ref:`sdst`, :ref:`ssrc0` + s_ctz_i32_b32 :ref:`sdst`, :ref:`ssrc0` + s_ctz_i32_b64 :ref:`sdst`, :ref:`ssrc0` + s_cvt_f16_f32 :ref:`sdst`, :ref:`ssrc0` + s_cvt_f32_f16 :ref:`sdst`, :ref:`ssrc0` + s_cvt_f32_i32 :ref:`sdst`, :ref:`ssrc0` + s_cvt_f32_u32 :ref:`sdst`, :ref:`ssrc0` + s_cvt_hi_f32_f16 :ref:`sdst`, :ref:`ssrc0` + s_cvt_i32_f32 :ref:`sdst`, :ref:`ssrc0` + s_cvt_u32_f32 :ref:`sdst`, :ref:`ssrc0` + s_floor_f16 :ref:`sdst`, :ref:`ssrc0` + s_floor_f32 :ref:`sdst`, :ref:`ssrc0` + s_get_barrier_state :ref:`sdst`, :ref:`ssrc0` + s_get_lock_state :ref:`sdst`, :ref:`ssrc0` + s_getpc_b64 :ref:`sdst` + s_mov_b32 :ref:`sdst`, :ref:`ssrc0` + s_mov_b64 :ref:`sdst`, :ref:`ssrc0` + s_mov_fed_b32 :ref:`sdst`, :ref:`ssrc0` + s_mov_from_global_b32 :ref:`sdst`, :ref:`ssrc0` + s_mov_from_global_b64 :ref:`sdst`, :ref:`ssrc0` + s_mov_regrd_b32 :ref:`sdst`, :ref:`ssrc0` + s_mov_to_global_b32 :ref:`sdst`, :ref:`ssrc0` + s_mov_to_global_b64 :ref:`sdst`, :ref:`ssrc0` + s_movreld_b32 :ref:`sdst`, :ref:`ssrc0` + s_movreld_b64 :ref:`sdst`, :ref:`ssrc0` + s_movrels_b32 
:ref:`sdst`, :ref:`ssrc0` + s_movrels_b64 :ref:`sdst`, :ref:`ssrc0` + s_movrelsd_2_b32 :ref:`sdst`, :ref:`ssrc0` + s_nand_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_nand_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_nor_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_nor_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_not_b32 :ref:`sdst`, :ref:`ssrc0` + s_not_b64 :ref:`sdst`, :ref:`ssrc0` + s_or_not0_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_or_not0_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_or_not1_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_or_not1_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_or_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_or_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_quadmask_b32 :ref:`sdst`, :ref:`ssrc0` + s_quadmask_b64 :ref:`sdst`, :ref:`ssrc0` + s_rfe_b64 :ref:`ssrc0` + s_rndne_f16 :ref:`sdst`, :ref:`ssrc0` + s_rndne_f32 :ref:`sdst`, :ref:`ssrc0` + s_sendmsg_rtn_b32 :ref:`sdst`, :ref:`ssrc0` + s_sendmsg_rtn_b64 :ref:`sdst`, :ref:`ssrc0` + s_setpc_b64 :ref:`ssrc0` + s_sext_i32_i16 :ref:`sdst`, :ref:`ssrc0` + s_sext_i32_i8 :ref:`sdst`, :ref:`ssrc0` + s_sleep_var :ref:`ssrc0` + s_swap_to_global_b32 :ref:`sdst`, :ref:`ssrc0` + s_swappc_b64 :ref:`sdst`, :ref:`ssrc0` + s_trunc_f16 :ref:`sdst`, :ref:`ssrc0` + s_trunc_f32 :ref:`sdst`, :ref:`ssrc0` + s_try_lock :ref:`ssrc0` + s_unlock :ref:`ssrc0` + s_wakeup_barrier :ref:`ssrc0` + s_wqm_b32 :ref:`sdst`, :ref:`ssrc0` + s_wqm_b64 :ref:`sdst`, :ref:`ssrc0` + s_xnor_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_xnor_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + s_xor_saveexec_b32 :ref:`sdst`, :ref:`ssrc0` + s_xor_saveexec_b64 :ref:`sdst`, :ref:`ssrc0` + +SOP2 +---- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_absdiff_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_co_ci_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_co_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_co_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_add_nc_u64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_not1_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_and_not1_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_ashr_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_ashr_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfe_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfe_i64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfe_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfe_u64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfm_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_bfm_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_cselect_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_cvt_pk_rtz_f16_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_fmaak_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1`, :ref:`literal` + s_fmac_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_fmac_f32 :ref:`sdst`, 
:ref:`ssrc0`, :ref:`ssrc1` + s_fmamk_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`literal`, :ref:`ssrc1` + s_lshl1_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl2_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl3_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl4_add_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshl_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshr_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_lshr_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_max_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_max_num_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_max_num_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_max_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_maximum_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_maximum_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_num_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_num_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_min_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_minimum_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_minimum_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_hi_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_hi_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_mul_u64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nand_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_nor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_not1_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_or_not1_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_pack_hh_b32_b16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_pack_hl_b32_b16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_pack_lh_b32_b16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_pack_ll_b32_b16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_co_ci_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_co_i32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_co_u32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_f16 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_f32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_sub_nc_u64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xnor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b32 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + s_xor_b64 :ref:`sdst`, :ref:`ssrc0`, :ref:`ssrc1` + +SOPC +---- + +.. 
parsed-literal:: + + **INSTRUCTION** **SRC0** **SRC1** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_bitcmp0_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp0_b64 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp1_b32 :ref:`ssrc0`, :ref:`ssrc1` + s_bitcmp1_b64 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_eq_u64 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ge_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_gt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_le_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lg_u64 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_i32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_lt_u32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_neq_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_neq_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nge_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nge_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ngt_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_ngt_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nle_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nle_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nlg_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nlg_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nlt_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_nlt_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_o_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_o_f32 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_u_f16 :ref:`ssrc0`, :ref:`ssrc1` + s_cmp_u_f32 :ref:`ssrc0`, :ref:`ssrc1` + +SOPK +---- + +.. 
parsed-literal:: + + **INSTRUCTION** **DST** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_addk_co_i32 :ref:`sdst`, :ref:`simm16` + s_call_b64 :ref:`sdst`, :ref:`simm16` + s_cmovk_i32 :ref:`sdst`, :ref:`simm16` + s_cmpk_eq_i32 :ref:`sdst`, :ref:`simm16` + s_cmpk_eq_u32 :ref:`sdst`, :ref:`simm16` + s_cmpk_ge_i32 :ref:`sdst`, :ref:`simm16` + s_cmpk_ge_u32 :ref:`sdst`, :ref:`simm16` + s_cmpk_gt_i32 :ref:`sdst`, :ref:`simm16` + s_cmpk_gt_u32 :ref:`sdst`, :ref:`simm16` + s_cmpk_le_i32 :ref:`sdst`, :ref:`simm16` + s_cmpk_le_u32 :ref:`sdst`, :ref:`simm16` + s_cmpk_lg_i32 :ref:`sdst`, :ref:`simm16` + s_cmpk_lg_u32 :ref:`sdst`, :ref:`simm16` + s_cmpk_lt_i32 :ref:`sdst`, :ref:`simm16` + s_cmpk_lt_u32 :ref:`sdst`, :ref:`simm16` + s_getreg_b32 :ref:`sdst`, :ref:`simm16` + s_getreg_regrd_b32 :ref:`sdst`, :ref:`simm16` + s_movk_i32 :ref:`sdst`, :ref:`simm16` + s_mulk_i32 :ref:`sdst`, :ref:`simm16` + s_setreg_b32 :ref:`simm16`, :ref:`sdst` + s_setreg_imm32_b32 :ref:`simm16`, :ref:`literal` + s_subvector_loop_begin :ref:`sdst`, :ref:`simm16` + s_subvector_loop_end :ref:`sdst`, :ref:`simm16` + s_version :ref:`simm16` + +SOPP +---- + +.. parsed-literal:: + + **INSTRUCTION** **SRC** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + s_barrier + s_barrier_leave + s_barrier_wait :ref:`simm16` + s_branch :ref:`simm16` + s_cbranch_cdbgsys :ref:`simm16` + s_cbranch_cdbgsys_and_user :ref:`simm16` + s_cbranch_cdbgsys_or_user :ref:`simm16` + s_cbranch_cdbguser :ref:`simm16` + s_cbranch_execnz :ref:`simm16` + s_cbranch_execz :ref:`simm16` + s_cbranch_scc0 :ref:`simm16` + s_cbranch_scc1 :ref:`simm16` + s_cbranch_vccnz :ref:`simm16` + s_cbranch_vccz :ref:`simm16` + s_clause :ref:`simm16` + s_code_end + s_decperflevel :ref:`simm16` + s_delay_alu :ref:`simm16` + s_denorm_mode :ref:`simm16` + s_endpgm + s_endpgm_ordered_ps_done + s_endpgm_saved + s_icache_inv + s_incperflevel :ref:`simm16` + s_nop :ref:`simm16` + s_round_mode :ref:`simm16` + s_sendmsg :ref:`simm16` + s_sendmsghalt :ref:`simm16` + s_set_inst_prefetch_distance :ref:`simm16` + s_sethalt :ref:`simm16` + s_setkill :ref:`simm16` + s_setprio :ref:`simm16` + s_singleuse_vdst :ref:`simm16` + s_sleep :ref:`simm16` + s_trap :ref:`simm16` + s_ttracedata + s_ttracedata_imm :ref:`simm16` + s_wait_alu :ref:`simm16` + s_wait_bvhcnt :ref:`simm16` + s_wait_dscnt :ref:`simm16` + s_wait_event :ref:`simm16` + s_wait_expcnt :ref:`simm16` + s_wait_idle + s_wait_kmcnt :ref:`simm16` + s_wait_loadcnt :ref:`simm16` + s_wait_loadcnt_dscnt :ref:`simm16` + s_wait_samplecnt :ref:`simm16` + s_wait_storecnt :ref:`simm16` + s_wait_storecnt_dscnt :ref:`simm16` + s_waitcnt :ref:`simm16` + s_wakeup + +VBUFFER +------- + +.. 
parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + buffer_atomic_add_f32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_add_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_add_u64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_and_b32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_and_b64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_cmpswap_b32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_cmpswap_b64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_cond_sub_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_dec_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_dec_u64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_inc_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_inc_u64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_max_i32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_max_i64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_max_num_f32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_max_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_max_u64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` 
:ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_min_i32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_min_i64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_min_num_f32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_min_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_min_u64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_or_b32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_or_b64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_pk_add_bf16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_pk_add_f16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_sub_clamp_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_sub_u32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_sub_u64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_swap_b32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_swap_b64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_xor_b32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_atomic_xor_b64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_gl0_inv :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_gl1_inv :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_b128 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_b32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_b64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_b96 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + 
buffer_load_block :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_b16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_hi_b16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_hi_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_hi_i8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_hi_u8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_i8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_d16_u8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_i16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_i8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_lds_b32 :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_lds_format_x :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_lds_i16 :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_lds_i8 :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + 
buffer_load_lds_u16 :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_lds_u8 :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_u16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_load_u8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_nop :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_b128 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_b16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_b32 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_b64 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_b8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_b96 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_block :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_d16_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_d16_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_d16_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_d16_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_d16_hi_b16 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_d16_hi_b8 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_d16_hi_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + buffer_store_format_xyzw :ref:`vdata`, :ref:`vaddr`, 
:ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_d16_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_d16_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_d16_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_d16_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_load_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_d16_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_d16_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_d16_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_d16_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_format_x :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_format_xy :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_format_xyz :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + tbuffer_store_format_xyzw :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`, :ref:`soffset` :ref:`offset` :ref:`idxen` :ref:`offen` :ref:`tfe` :ref:`th` :ref:`scope` :ref:`nv` + +VDS +--- + +.. 
parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + ds_add_f32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_add_f64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_add_rtn_f32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_add_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_add_rtn_u64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_add_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_add_u64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_and_b32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_and_b64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_and_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_and_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_append :ref:`vdst` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_fi_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_fi_from_global_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_fi_to_global_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_fi_to_simd_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_from_global_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_to_global_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bpermute_to_simd_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bvh_stack_push4_pop1_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bvh_stack_push8_pop1_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_bvh_stack_push8_pop2_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_cmpstore_b32 :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_cmpstore_b64 :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_cmpstore_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_cmpstore_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` 
:ref:`offset` :ref:`offset0` :ref:`offset1` + ds_cond_sub_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_cond_sub_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_condxchg32_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_consume :ref:`vdst` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_dec_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_dec_rtn_u64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_dec_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_dec_u64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_inc_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_inc_rtn_u64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_inc_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_inc_u64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_2addr_b32 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_2addr_b64 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_2addr_stride64_b32 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_2addr_stride64_b64 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_addtid_b32 :ref:`vdst` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_b128 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_b32 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_b64 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_b96 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_i16 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_i8 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_i8_d16 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_i8_d16_hi :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_u16 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_u16_d16 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_u16_d16_hi :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_u8 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_u8_d16 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_load_u8_d16_hi :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_i32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_i64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_num_f32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_num_f64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_num_rtn_f32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_num_rtn_f64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_rtn_i32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_rtn_i64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_rtn_u32 
:ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_rtn_u64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_max_u64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_i32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_i64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_num_f32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_num_f64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_num_rtn_f32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_num_rtn_f64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_rtn_i32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_rtn_i64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_rtn_u64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_min_u64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_mskor_b32 :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_mskor_b64 :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_mskor_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_mskor_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_nop :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_or_b32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_or_b64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_or_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_or_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_permute_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_permute_from_global_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_permute_to_global_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_permute_to_simd_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_pk_add_bf16 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_pk_add_f16 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_pk_add_rtn_bf16 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_pk_add_rtn_f16 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_rsub_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_rsub_rtn_u64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_rsub_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_rsub_u64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_2addr_b32 :ref:`addr`, :ref:`data0`, :ref:`data1` 
:ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_2addr_b64 :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_2addr_stride64_b32 :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_2addr_stride64_b64 :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_addtid_b32 :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b128 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b16 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b16_d16_hi :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b8 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b8_d16_hi :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_store_b96 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_storexchg_2addr_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_storexchg_2addr_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_storexchg_2addr_stride64_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_storexchg_2addr_stride64_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_storexchg_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_storexchg_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_sub_clamp_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_sub_clamp_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_sub_rtn_u32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_sub_rtn_u64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_sub_u32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_sub_u64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_swizzle_b32 :ref:`vdst`, :ref:`addr` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_wrap_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0`, :ref:`data1` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_xor_b32 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_xor_b64 :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_xor_rtn_b32 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + ds_xor_rtn_b64 :ref:`vdst`, :ref:`addr`, :ref:`data0` :ref:`offset` :ref:`offset0` :ref:`offset1` + +VDSDIR +------ + +.. 
parsed-literal::
+
+    **INSTRUCTION**    **DST**                  **SRC**    **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    ds_direct_load     :ref:`vdst`                         :ref:`wait_va_vdst` :ref:`wait_vdst` :ref:`wait_vm_vsrc`
+    ds_param_load      :ref:`vdst`, :ref:`attr`            :ref:`wait_va_vdst` :ref:`wait_vdst` :ref:`wait_vm_vsrc`
+
+VERIF
+-----
+
+.. parsed-literal::
+
+    **INSTRUCTION**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    fake_s_delay_alu
+    fake_s_nop
+    fake_s_wait_alu
+    fake_s_wait_bvhcnt
+    fake_s_wait_dscnt
+    fake_s_wait_expcnt
+    fake_s_wait_kmcnt
+    fake_s_wait_loadcnt
+    fake_s_wait_samplecnt
+    fake_s_wait_storecnt
+    fake_s_waitcnt
+    fake_v_nop
+    ill_0
+    ill_1
+    ill_beef
+    metadata
+    verif_s_adjdelay_alu
+
+VEXPORT
+-------
+
+.. parsed-literal::
+
+    **INSTRUCTION**    **DST**     **SRC0**      **SRC1**      **SRC2**      **SRC3**      **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    export             :ref:`tgt`, :ref:`vsrc0`, :ref:`vsrc1`, :ref:`vsrc2`, :ref:`vsrc3`  :ref:`done` :ref:`row_en`
+
+VFLAT
+-----
+
+..
parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + flat_atomic_add_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_add_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_add_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_and_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_and_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_cmpswap_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_cmpswap_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_cond_sub_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_dec_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_dec_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_inc_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_inc_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_max_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_max_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_max_num_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_max_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_max_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_min_i32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_min_i64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_min_num_f32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_min_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_min_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_or_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_or_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_pk_add_bf16 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_pk_add_f16 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_sub_clamp_u32 :ref:`vdst`, 
:ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_sub_u32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_sub_u64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_swap_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_swap_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_xor_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_atomic_xor_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_b128 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_b32 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_b64 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_b96 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_d16_b16 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_d16_hi_b16 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_d16_hi_i8 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_d16_hi_u8 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_d16_i8 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_d16_u8 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_i16 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_i8 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_u16 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_load_u8 :ref:`vdst`, :ref:`vaddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_b128 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_b16 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_b32 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_b64 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_b8 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_b96 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_d16_hi_b16 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + flat_store_d16_hi_b8 :ref:`vaddr`, :ref:`vsrc` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + +VGLOBAL +------- + +.. 
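+
+As a reading aid only: the VEXPORT row above lists the mnemonic, then its
+operands, then optional modifiers. A minimal illustrative line, following the
+row layout shown above (the export target and register numbers are arbitrary
+placeholders, not taken from the generated tables), might look like::
+
+    export pos0, v1, v2, v3, v4 done
+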
+VFLAT
+-----
+
+.. parsed-literal::
+
+    **INSTRUCTION**               **DST**        **SRC0**        **SRC1**       **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    flat_atomic_add_f32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_add_u32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_add_u64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_and_b32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_and_b64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_cmpswap_b32       :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_cmpswap_b64       :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_cond_sub_u32      :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_dec_u32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_dec_u64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_inc_u32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_inc_u64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_max_i32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_max_i64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_max_num_f32       :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_max_u32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_max_u64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_min_i32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_min_i64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_min_num_f32       :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_min_u32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_min_u64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_or_b32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_or_b64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_pk_add_bf16       :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_pk_add_f16        :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_sub_clamp_u32     :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_sub_u32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_sub_u64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_swap_b32          :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_swap_b64          :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_xor_b32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_atomic_xor_b64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_b128                :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_b32                 :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_b64                 :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_b96                 :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_d16_b16             :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_d16_hi_b16          :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_d16_hi_i8           :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_d16_hi_u8           :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_d16_i8              :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_d16_u8              :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_i16                 :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_i8                  :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_u16                 :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_load_u8                  :ref:`vdst`, :ref:`vaddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_b128               :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_b16                :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_b32                :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_b64                :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_b8                 :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_b96                :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_d16_hi_b16         :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    flat_store_d16_hi_b8          :ref:`vaddr`, :ref:`vsrc`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+
+VGLOBAL
+-------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                  **DST**        **SRC0**        **SRC1**        **SRC2**       **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    global_atomic_add_f32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_add_u32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_add_u64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_and_b32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_and_b64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_cmpswap_b32        :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_cmpswap_b64        :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_cond_sub_u32       :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_dec_u32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_dec_u64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_inc_u32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_inc_u64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_max_i32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_max_i64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_max_num_f32        :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_max_u32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_max_u64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_min_i32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_min_i64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_min_num_f32        :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_min_u32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_min_u64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_or_b32             :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_or_b64             :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_ordered_add_b64    :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_pk_add_bf16        :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_pk_add_f16         :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_sub_clamp_u32      :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_sub_u32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_sub_u64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_swap_b32           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_swap_b64           :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_xor_b32            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_atomic_xor_b64            :ref:`vdst`, :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`    :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_inv                                                                               :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_addtid_b32           :ref:`vdst`, :ref:`saddr`                               :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_b128                 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_b32                  :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_b64                  :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_b96                  :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_block                :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_d16_b16              :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_d16_hi_b16           :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_d16_hi_i8            :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_d16_hi_u8            :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_d16_i8               :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_d16_u8               :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_i16                  :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_i8                   :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_lds_addtid_b32       :ref:`saddr`                                            :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_lds_b32              :ref:`vaddr`, :ref:`saddr`                              :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_lds_i16              :ref:`vaddr`, :ref:`saddr`                              :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_lds_i8               :ref:`vaddr`, :ref:`saddr`                              :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_lds_u16              :ref:`vaddr`, :ref:`saddr`                              :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_lds_u8               :ref:`vaddr`, :ref:`saddr`                              :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_tr_b128              :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_tr_b64               :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_u16                  :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_load_u8                   :ref:`vdst`, :ref:`vaddr`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_addtid_b32          :ref:`vsrc`, :ref:`saddr`                               :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_b128                :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_b16                 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_b32                 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_b64                 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_b8                  :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_b96                 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_block               :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_d16_hi_b16          :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_store_d16_hi_b8           :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr`                 :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_wb                                                                                :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
+    global_wbinv                                                                             :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv`
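+
+For orientation (register numbers and the offset value are arbitrary
+placeholders), a FLAT load and a GLOBAL store following the operand layouts
+above might be written as shown below, where ``off`` stands in for an unused
+saddr operand::
+
+    flat_load_b32 v1, v[2:3] offset:16
+    global_store_b32 v[0:1], v2, off
+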
+VIMAGE
+------
+
+.. parsed-literal::
+
+    **INSTRUCTION**                 **DST**         **SRC0**        **SRC1**       **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    image_atomic_add_flt            :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_add_uint           :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_and                :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_cmpswap            :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_dec_uint           :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_inc_uint           :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_max_flt            :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_max_int            :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_max_uint           :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_min_flt            :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_min_int            :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_min_uint           :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_or                 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_pk_add_bf16        :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_pk_add_f16         :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_sub_uint           :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_swap               :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_atomic_xor                :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_bvh64_intersect_ray       :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_bvh8_intersect_ray        :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_bvh_dual_intersect_ray    :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_bvh_intersect_ray         :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_get_resinfo               :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_load                      :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_load_mip                  :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_load_mip_pck              :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_load_mip_pck_sgn          :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_load_pck                  :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_load_pck_sgn              :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_rsvd_atomic_umax_8        :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_rsvd_atomic_umin_8        :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_store                     :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_store_mip                 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_store_mip_pck             :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+    image_store_pck                 :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc`    :ref:`dmask` :ref:`tfe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+
+VINTERP
+-------
+
+.. parsed-literal::
+
+    **INSTRUCTION**             **DST**        **SRC0**       **SRC1**       **SRC2**       **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_interp_p10_f16_f32        :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`    :ref:`clamp` :ref:`wait_exp`
+    v_interp_p10_f32            :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`    :ref:`clamp` :ref:`wait_exp`
+    v_interp_p10_rtz_f16_f32    :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`    :ref:`clamp` :ref:`wait_exp`
+    v_interp_p2_f16_f32         :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`    :ref:`clamp` :ref:`wait_exp`
+    v_interp_p2_f32             :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`    :ref:`clamp` :ref:`wait_exp`
+    v_interp_p2_rtz_f16_f32     :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2`    :ref:`clamp` :ref:`wait_exp`
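+
+As an illustration of the VINTERP layout above (register numbers and the
+wait_exp value are arbitrary placeholders)::
+
+    v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7
+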
+VOP1
+----
+
+.. parsed-literal::
+
+    **INSTRUCTION**           **DST**        **SRC**              **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_bfrev_b32               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_ceil_f16                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_ceil_f32                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_ceil_f64                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cls_i32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_clz_i32_u32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cos_f16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cos_f32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_ctz_i32_b32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f16_f32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f16_i16             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f16_u16             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_bf8             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_f16             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_f64             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_fp8             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_i32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_u32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_ubyte0          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_ubyte1          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_ubyte2          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f32_ubyte3          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f64_f32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f64_i32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_f64_u32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_floor_i32_f32       :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_i16_f16             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_i32_f32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_i32_f64             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_i32_i16             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_nearest_i32_f32     :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_norm_i16_f16        :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_norm_u16_f16        :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_off_f32_i4          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_pk_f32_bf8          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_pk_f32_fp8          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_u16_f16             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_u32_f32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_u32_f64             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_cvt_u32_u16             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_exp_f16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_exp_f32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_floor_f16               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_floor_f32               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_floor_f64               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_fract_f16               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_fract_f32               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_fract_f64               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_frexp_exp_i16_f16       :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_frexp_exp_i32_f32       :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_frexp_exp_i32_f64       :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_frexp_mant_f16          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_frexp_mant_f32          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_frexp_mant_f64          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_log_f16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_log_f32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_mov_b16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_mov_b32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_mov_fed_b32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_mov_from_global_b32     :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_mov_to_global_b32       :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_movreld_b32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_movrels_b32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_movrelsd_2_b32          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_movrelsd_b32            :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_nop                                                          :ref:`omod` :ref:`clamp`
+    v_not_b16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_not_b32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_permlane64_b32          :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_pipeflush                                                    :ref:`omod` :ref:`clamp`
+    v_rcp_f16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rcp_f32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rcp_f64                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rcp_iflag_f32           :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_readfirstlane_b32       :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rndne_f16               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rndne_f32               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rndne_f64               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rsq_f16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rsq_f32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_rsq_f64                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_sat_pk_u8_i16           :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_sin_f16                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_sin_f32                 :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_sqrt_f16                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_sqrt_f32                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_sqrt_f64                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_swap_b16                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_swap_b32                :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_swaprel_b32             :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_trunc_f16               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_trunc_f32               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_trunc_f64               :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_writelane_regwr_b32     :ref:`vdst`, :ref:`src0`::ref:`m`    :ref:`omod` :ref:`clamp`
+
+VOP2
+----
+
+.. parsed-literal::
+
+    **INSTRUCTION**         **DST0**       **DST1**       **SRC0**       **SRC1**       **SRC2**       **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add_co_ci_u32         :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_add_f16               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_add_f32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_add_f64               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_add_nc_u32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_add_nc_u64            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_and_b32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_ashrrev_i32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_cndmask_b32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc`::ref:`m`                 :ref:`omod` :ref:`clamp`
+    v_cvt_pk_rtz_f16_f32    :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_fmaak_f16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`literal`::ref:`m`             :ref:`omod` :ref:`clamp`
+    v_fmaak_f32             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`literal`::ref:`m`             :ref:`omod` :ref:`clamp`
+    v_fmaak_f64             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`literal`::ref:`m`             :ref:`omod` :ref:`clamp`
+    v_fmac_f16              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_fmac_f32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_fmac_f64              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_fmamk_f16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`literal`::ref:`m`, :ref:`vsrc1`::ref:`m`             :ref:`omod` :ref:`clamp`
+    v_fmamk_f32             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`literal`::ref:`m`, :ref:`vsrc1`::ref:`m`             :ref:`omod` :ref:`clamp`
+    v_fmamk_f64             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`literal`::ref:`m`, :ref:`vsrc1`::ref:`m`             :ref:`omod` :ref:`clamp`
+    v_illegal               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_ldexp_f16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_lshlrev_b32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_lshlrev_b64           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_lshrrev_b32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_max_i32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_max_num_f16           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_max_num_f32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_max_num_f64           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_max_u32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_min_i32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_min_num_f16           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_min_num_f32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_min_num_f64           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_min_u32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_dx9_zero_f32      :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_f16               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_f32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_f64               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_hi_i32_i24        :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_hi_u32_u24        :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_i32_i24           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_u32_u24           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_mul_u64               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_or_b32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_pk_fmac_f16           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_sub_co_ci_u32         :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_sub_f16               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_sub_f32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_sub_nc_u32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_sub_nc_u64            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_subrev_co_ci_u32      :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`, :ref:`vcc`::ref:`m`    :ref:`omod` :ref:`clamp`
+    v_subrev_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_subrev_f32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_subrev_nc_u32         :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_xnor_b32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
+    v_xor_b32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m`                                      :ref:`omod` :ref:`clamp`
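+
+For example (register numbers and the inline constant are arbitrary
+placeholders), a plain VOP2 add and an fmaak row with its trailing literal
+operand might be written as::
+
+    v_add_f32 v0, v1, v2
+    v_fmaak_f32 v0, v1, v2, 0x42f60000
+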
+VOP3
+----
+
+.. parsed-literal::
+
+    **INSTRUCTION**           **DST0**       **DST1**       **SRC0**       **SRC1**       **SRC2**       **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_add3_u32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_add_co_u32              :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                           :ref:`omod` :ref:`clamp`
+    v_add_lshl_u32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_add_nc_i16              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_add_nc_i32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_add_nc_u16              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_alignbit_b32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_alignbyte_b32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_and_b16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_and_or_b32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_ashrrev_i16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_ashrrev_i64             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_bcnt_u32_b32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_bfe_i32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_bfe_u32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_bfi_b32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_bfm_b32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cndmask_b16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_cubeid_f32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_cubema_f32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_cubesc_f32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_cubetc_f32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_cvt_pk_bf8_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_fp8_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_i16_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_i16_i32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_norm_i16_f16     :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_norm_i16_f32     :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_norm_u16_f16     :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_norm_u16_f32     :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_u16_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_u16_u32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_pk_u8_f32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_cvt_sr_bf8_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_cvt_sr_fp8_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_div_fixup_f16           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_div_fixup_f32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_div_fixup_f64           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_div_fmas_f32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_div_fmas_f64            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_div_scale_f32           :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`     :ref:`omod` :ref:`clamp`
+    v_div_scale_f64           :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`     :ref:`omod` :ref:`clamp`
+    v_dot2_bf16_bf16          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_dot2_f16_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_fma_dx9_zero_f32        :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_fma_f16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_fma_f32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_fma_f64                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_ldexp_f32               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_ldexp_f64               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_lerp_u8                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_lshl_add_u32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_lshl_add_u64            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_lshl_or_b32             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_lshlrev_b16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_lshrrev_b16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_lshrrev_b64             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_mad_co_i64_i32          :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`     :ref:`omod` :ref:`clamp`
+    v_mad_co_u64_u32          :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`     :ref:`omod` :ref:`clamp`
+    v_mad_i16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mad_i32_i16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mad_i32_i24             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mad_u16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mad_u32_u16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mad_u32_u24             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_max3_i16                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_max3_i32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_max3_num_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_max3_num_f32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_max3_u16                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_max3_u32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_max_i16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_max_u16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_maximum3_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_maximum3_f32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_maximum_f16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_maximum_f32             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_maximum_f64             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_maximumminimum_f16      :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_maximumminimum_f32      :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_maxmin_i32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_maxmin_num_f16          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_maxmin_num_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_maxmin_u32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mbcnt_hi_u32_b32        :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_mbcnt_lo_u32_b32        :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_med3_i16                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_med3_i32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_med3_num_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_med3_num_f32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_med3_u16                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_med3_u32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_min3_i16                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_min3_i32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_min3_num_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_min3_num_f32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_min3_u16                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_min3_u32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_min_i16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_min_u16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_minimum3_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_minimum3_f32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_minimum_f16             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_minimum_f32             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_minimum_f64             :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_minimummaximum_f16      :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_minimummaximum_f32      :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_minmax_i32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_minmax_num_f16          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_minmax_num_f32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_minmax_u32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mqsad_pk_u16_u8         :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mqsad_u32_u8            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_msad_u8                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_mul_hi_i32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_mul_hi_u32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_mul_lo_u16              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_mul_lo_u32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_mullit_f32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_or3_b32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_or_b16                  :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_pack_b32_f16            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_perm_b32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_permlane16_b32          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_permlane16_var_b32      :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_permlanex16_b32         :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_permlanex16_var_b32     :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_qsad_pk_u16_u8          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_readlane_b32            :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_readlane_regrd_b32      :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_s_exp_f16               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_exp_f32               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_log_f16               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_log_f32               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_rcp_f16               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_rcp_f32               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_rsq_f16               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_rsq_f32               :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_sqrt_f16              :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_s_sqrt_f32              :ref:`vdst`, :ref:`src0`::ref:`m`                                                              :ref:`omod` :ref:`clamp`
+    v_sad_hi_u8               :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_sad_u16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_sad_u32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_sad_u8                  :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_sub_co_u32              :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                           :ref:`omod` :ref:`clamp`
+    v_sub_nc_i16              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_sub_nc_i32              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_sub_nc_u16              :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_subrev_co_u32           :ref:`vdst`, :ref:`sdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                           :ref:`omod` :ref:`clamp`
+    v_trig_preop_f64          :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_writelane_b32           :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
+    v_xad_u32                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_xor3_b32                :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`, :ref:`src2`::ref:`m`                  :ref:`omod` :ref:`clamp`
+    v_xor_b16                 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`src1`::ref:`m`                                        :ref:`omod` :ref:`clamp`
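+
+As a sketch of the VOP3 source modifiers and the clamp column above
+(register choices are arbitrary placeholders), negation and absolute-value
+modifiers might appear as::
+
+    v_fma_f32 v5, -v1, |v2|, v3 clamp
+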
parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_dot2_f32_bf16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot2_f32_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot4_f32_bf8_bf8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot4_f32_bf8_fp8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot4_f32_fp8_bf8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot4_f32_fp8_fp8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot4_i32_iu8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot4_u32_u8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot8_i32_iu4 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_dot8_u32_u4 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_fma_mix_f32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_fma_mixhi_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_fma_mixlo_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_pk_add_bf16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_add_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_add_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_add_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_ashrrev_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_fma_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_pk_fma_f32 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_pk_lshlrev_b16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_lshrrev_b16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_mad_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_pk_mad_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_pk_max_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_max_num_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_max_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_maximum_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_min_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_min_num_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_min_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_minimum_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_mul_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_mul_lo_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_sub_i16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_pk_sub_u16 :ref:`vdst`, :ref:`src0`, :ref:`src1` + v_swmmac_bf16_16x16x32_bf16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_f16_16x16x32_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_f32_16x16x32_bf16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_f32_16x16x32_bf8_bf8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_f32_16x16x32_bf8_fp8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_f32_16x16x32_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_f32_16x16x32_fp8_bf8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_f32_16x16x32_fp8_fp8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_i32_16x16x32_iu4 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_i32_16x16x32_iu8 :ref:`vdst`, 
:ref:`src0`, :ref:`src1`, :ref:`src2` + v_swmmac_i32_16x16x64_iu4 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_bf16_16x16x16_bf16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_f16_16x16x16_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_f32_16x16x16_bf16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_f32_16x16x16_bf8_bf8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_f32_16x16x16_bf8_fp8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_f32_16x16x16_f16 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_f32_16x16x16_fp8_bf8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_f32_16x16x16_fp8_fp8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_i32_16x16x16_iu4 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_i32_16x16x16_iu8 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + v_wmma_i32_16x16x32_iu4 :ref:`vdst`, :ref:`src0`, :ref:`src1`, :ref:`src2` + +VOPC +---- + +.. parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + v_cmp_class_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_class_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_class_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_i16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_i32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_i64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_u16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_u32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_eq_u64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_f_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_f_f32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_f_f64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_f_i32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_f_i64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_f_u32 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_f_u64 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` :ref:`clamp` + v_cmp_ge_f16 :ref:`vdst`, :ref:`src0`::ref:`m`, :ref:`vsrc1`::ref:`m` :ref:`omod` 
+
+VOPC
+----
+
+.. parsed-literal::
+
+    **INSTRUCTION**           **DST**       **SRC0**             **SRC1**              **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_cmp_<cond>_<type>       :ref:`vdst`,  :ref:`src0`::ref:`m`,  :ref:`vsrc1`::ref:`m`   :ref:`omod` :ref:`clamp`
+    v_cmpx_<cond>_<type>      :ref:`vdst`,  :ref:`src0`::ref:`m`,  :ref:`vsrc1`::ref:`m`   :ref:`omod` :ref:`clamp`
+
+    Every v_cmp and v_cmpx opcode takes the same operands. <cond>_<type>,
+    identical for both families, is one of:
+
+    class:  f16, f32, f64
+    eq:     f16, f32, f64, i16, i32, i64, u16, u32, u64
+    f:      f16, f32, f64, i32, i64, u32, u64
+    ge:     f16, f32, f64, i16, i32, i64, u16, u32, u64
+    gt:     f16, f32, f64, i16, i32, i64, u16, u32, u64
+    le:     f16, f32, f64, i16, i32, i64, u16, u32, u64
+    lg:     f16, f32, f64
+    lt:     f16, f32, f64, i16, i32, i64, u16, u32, u64
+    ne:     i16, i32, i64, u16, u32, u64
+    neq:    f16, f32, f64
+    nge:    f16, f32, f64
+    ngt:    f16, f32, f64
+    nle:    f16, f32, f64
+    nlg:    f16, f32, f64
+    nlt:    f16, f32, f64
+    o:      f16, f32, f64
+    t:      f16, f32, f64, i32, i64, u32, u64
+    u:      f16, f32, f64
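+
+For example, a float compare writing its per-lane result to the condition
+mask (vcc_lo assumes a wave32 configuration; registers are arbitrary)::
+
+    v_cmp_lt_f32 vcc_lo, v0, v1        // vcc_lo bit set where v0 < v1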
+
+VOPD
+----
+
+.. parsed-literal::
+
+    **INSTRUCTION**          **DST0**        **DST1**        **SRC0**        **SRC1**         **SRC2**       **SRC3**         **SRC4**  **SRC5**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_dual_<opx>_x_<opy>     :ref:`vdstx`,   :ref:`vdsty`,   :ref:`srcx0`,   :ref:`vsrcx1`,   :ref:`srcy0`,  :ref:`vsrcy1`
+
+    where <opx> is one of:
+      add_f32, cndmask_b32, dot2acc_f32_bf16, dot2acc_f32_f16, fmaak_f32,
+      fmac_f32, fmamk_f32, max_num_f32, min_num_f32, mov_b32,
+      mul_dx9_zero_f32, mul_f32, sub_f32, subrev_f32
+
+    and <opy> is one of:
+      add_f32, add_nc_u32, and_b32, cndmask_b32, dot2acc_f32_bf16,
+      dot2acc_f32_f16, fmaak_f32, fmac_f32, fmamk_f32, lshlrev_b32,
+      max_num_f32, min_num_f32, mov_b32, mul_dx9_zero_f32, mul_f32,
+      sub_f32, subrev_f32
+
+    Operand variations:
+      * :ref:`vsrcx1` is omitted when <opx> is mov_b32, and :ref:`vsrcy1`
+        is omitted when <opy> is mov_b32.
+      * :ref:`vcc` is appended when either half is cndmask_b32.
+      * :ref:`literal` is appended when either half is fmaak_f32 or
+        fmamk_f32.
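+
+In assembly the two halves of a VOPD pair are written joined by ``::``, one
+opcode from the <opx> list and one from the <opy> list; the fused names above
+describe the encoded combination. An illustrative pair (registers arbitrary)::
+
+    v_dual_mul_f32 v0, v2, v4 :: v_dual_add_f32 v1, v3, v5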
+
+VOPDX
+-----
+
+.. parsed-literal::
+
+    **INSTRUCTION**            **DST**        **SRC0**       **SRC1**        **SRC2**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_dual_add_f32             :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_cndmask_b32         :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`,  :ref:`vcc`
+    v_dual_dot2acc_f32_bf16    :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_dot2acc_f32_f16     :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_fmaak_f32           :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`,  :ref:`literal`
+    v_dual_fmac_f32            :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_fmamk_f32           :ref:`vdstx`,  :ref:`srcx0`,  :ref:`literal`, :ref:`vsrcx1`
+    v_dual_max_num_f32         :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_min_num_f32         :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_mov_b32             :ref:`vdstx`,  :ref:`srcx0`
+    v_dual_mul_dx9_zero_f32    :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_mul_f32             :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_sub_f32             :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
+    v_dual_subrev_f32          :ref:`vdstx`,  :ref:`srcx0`,  :ref:`vsrcx1`
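+
+VOPDX opcodes form the X (first) half of a dual-issue pair. Note that
+v_dual_fmamk_f32 places :ref:`literal` before :ref:`vsrcx1`. A sketch
+(registers and constant arbitrary)::
+
+    v_dual_fmamk_f32 v0, v1, 0x40490fdb, v2 :: v_dual_and_b32 v3, v4, v5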
+
+VOPDY
+-----
+
+.. parsed-literal::
+
+    **INSTRUCTION**        **DST**        **SRC0**       **SRC1**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    v_dual_add_nc_u32      :ref:`vdsty`,  :ref:`srcy0`,  :ref:`vsrcy1`
+    v_dual_and_b32         :ref:`vdsty`,  :ref:`srcy0`,  :ref:`vsrcy1`
+    v_dual_lshlrev_b32     :ref:`vdsty`,  :ref:`srcy0`,  :ref:`vsrcy1`
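+
+These three opcodes appear only in the Y (second) position of the VOPD table
+above, e.g. (registers arbitrary)::
+
+    v_dual_mov_b32 v0, v2 :: v_dual_lshlrev_b32 v1, 2, v3    // v1 = v3 << 2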
+
+VSAMPLE
+-------
+
+.. parsed-literal::
+
+    **INSTRUCTION**    **DST**        **SRC0**       **SRC1**      **SRC2**      **MODIFIERS**
+    \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|
+    image_<op>         :ref:`vdata`,  :ref:`vaddr`,  :ref:`rsrc`,  :ref:`samp`   :ref:`dmask` :ref:`tfe` :ref:`unorm` :ref:`lwe` :ref:`dim` :ref:`r128` :ref:`a16` :ref:`d16` :ref:`th` :ref:`scope` :ref:`nv`
+
+    where <op> is one of:
+      gather4, gather4_b, gather4_b_cl, gather4_c, gather4_c_b,
+      gather4_c_b_cl, gather4_c_cl, gather4_c_l, gather4_c_lz,
+      gather4_c_lz_o, gather4_cl, gather4_l, gather4_lz, gather4_lz_o,
+      gather4_o, gather4h, get_lod, msaa_load, sample, sample_b,
+      sample_b_cl, sample_b_cl_o, sample_b_o, sample_c, sample_c_b,
+      sample_c_b_cl, sample_c_b_cl_o, sample_c_b_o, sample_c_cl,
+      sample_c_cl_o, sample_c_d, sample_c_d_cl, sample_c_d_cl_g16,
+      sample_c_d_cl_o, sample_c_d_cl_o_g16, sample_c_d_g16, sample_c_d_o,
+      sample_c_d_o_g16, sample_c_l, sample_c_l_o, sample_c_lz,
+      sample_c_lz_o, sample_c_o, sample_cl, sample_cl_o, sample_d,
+      sample_d_cl, sample_d_cl_g16, sample_d_cl_o, sample_d_cl_o_g16,
+      sample_d_g16, sample_d_o, sample_d_o_g16, sample_l, sample_l_o,
+      sample_lz, sample_lz_o, sample_o
+
+    image_msaa_load omits the :ref:`samp` operand and takes only
+    :ref:`vdata`, :ref:`vaddr`, :ref:`rsrc` plus the same modifiers.
parsed-literal:: + + **INSTRUCTION** **DST** **SRC0** **SRC1** **SRC2** **MODIFIERS** + \ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---|\ |---| + scratch_load_b128 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_b32 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_b64 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_b96 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_block :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_d16_b16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_d16_hi_b16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_d16_hi_i8 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_d16_hi_u8 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_d16_i8 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_d16_u8 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_i16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_i8 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_lds_b32 :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_lds_i16 :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_lds_i8 :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_lds_u16 :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_lds_u8 :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_u16 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_load_u8 :ref:`vdst`, :ref:`vaddr`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_b128 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_b16 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_b32 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_b64 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_b8 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_b96 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_block :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` 
:ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_d16_hi_b16 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + scratch_store_d16_hi_b8 :ref:`vaddr`, :ref:`vsrc`, :ref:`saddr` :ref:`offset` :ref:`th` :ref:`scope` :ref:`nv` + +.. |---| unicode:: U+02014 .. em dash + +.. toctree:: + :hidden: + + gfx12_addr + gfx12_attr + gfx12_data0_56f215 + gfx12_data0_6802ce + gfx12_data0_e016a1 + gfx12_data0_fd235e + gfx12_data1_6802ce + gfx12_data1_731030 + gfx12_data1_e016a1 + gfx12_data1_fd235e + gfx12_ioffset + gfx12_literal_1f74c7 + gfx12_literal_81e671 + gfx12_m + gfx12_rsrc_5fe6d8 + gfx12_rsrc_c9f929 + gfx12_saddr_cdc95c + gfx12_saddr_d42b64 + gfx12_samp + gfx12_sbase_453b95 + gfx12_sbase_47adb7 + gfx12_sdata_0974a4 + gfx12_sdata_354189 + gfx12_sdata_4585b8 + gfx12_sdata_5c7b50 + gfx12_sdata_6c003b + gfx12_sdata_836716 + gfx12_sdata_d725ab + gfx12_sdata_dd9dd8 + gfx12_sdst_006c40 + gfx12_sdst_20064d + gfx12_sdst_354189 + gfx12_sdst_836716 + gfx12_sdst_ced58d + gfx12_sdst_e701cc + gfx12_simm16_15ccdd + gfx12_simm16_218bea + gfx12_simm16_39b593 + gfx12_simm16_3d2a4f + gfx12_simm16_730a13 + gfx12_simm16_7ed651 + gfx12_simm16_81e671 + gfx12_simm16_c98889 + gfx12_simm16_cc1716 + gfx12_simm16_ee8b30 + gfx12_soffset_8ec073 + gfx12_soffset_c5b88c + gfx12_soffset_ec005a + gfx12_src0_5727cf + gfx12_src0_5cae62 + gfx12_src0_6802ce + gfx12_src0_85aab6 + gfx12_src0_c4593f + gfx12_src0_e016a1 + gfx12_src0_fd235e + gfx12_src1_5727cf + gfx12_src1_5cae62 + gfx12_src1_6802ce + gfx12_src1_731030 + gfx12_src1_977794 + gfx12_src1_c4593f + gfx12_src1_e016a1 + gfx12_src1_fd235e + gfx12_src2_2797bc + gfx12_src2_5727cf + gfx12_src2_5cae62 + gfx12_src2_6802ce + gfx12_src2_7b936a + gfx12_src2_96fbd3 + gfx12_src2_c4593f + gfx12_src2_e016a1 + gfx12_srcx0 + gfx12_srcy0 + gfx12_ssrc0_007f9c + gfx12_ssrc0_1a9ca5 + gfx12_ssrc0_245536 + gfx12_ssrc0_2797bc + gfx12_ssrc0_bbb4c6 + gfx12_ssrc0_c4593f + gfx12_ssrc1_bbb4c6 + gfx12_ssrc1_c4593f + gfx12_tgt + gfx12_vaddr_a972b9 + gfx12_vaddr_c12f43 + gfx12_vaddr_c8b8d4 + gfx12_vaddr_d82160 + gfx12_vaddr_f2b449 + gfx12_vcc + gfx12_vdata_2eda77 + gfx12_vdata_48e42f + gfx12_vdata_69a144 + gfx12_vdata_89680f + gfx12_vdata_aac3e8 + gfx12_vdata_bdb32f + gfx12_vdst_006c40 + gfx12_vdst_227281 + gfx12_vdst_2eda77 + gfx12_vdst_47d3bc + gfx12_vdst_48e42f + gfx12_vdst_69a144 + gfx12_vdst_7de8e7 + gfx12_vdst_836716 + gfx12_vdst_89680f + gfx12_vdst_bdb32f + gfx12_vdstx + gfx12_vdsty + gfx12_vsrc0 + gfx12_vsrc1_6802ce + gfx12_vsrc1_fd235e + gfx12_vsrc2 + gfx12_vsrc3 + gfx12_vsrc_56f215 + gfx12_vsrc_6802ce + gfx12_vsrc_89fd7b + gfx12_vsrc_e016a1 + gfx12_vsrc_fd235e + gfx12_vsrcx1 + gfx12_vsrcy1 + gfx12_clause + gfx12_delay + gfx12_hwreg + gfx12_imm16 + gfx12_label + gfx12_sendmsg + gfx12_sendmsg_rtn + gfx12_version + gfx12_waitcnt diff --git a/llvm/docs/AMDGPU/gfx12_addr.rst b/llvm/docs/AMDGPU/gfx12_addr.rst new file mode 100644 index 0000000000000..d2fc0e0cb2f4b --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_addr.rst @@ -0,0 +1,15 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_addr: + +addr +==== + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_attr.rst b/llvm/docs/AMDGPU/gfx12_attr.rst new file mode 100644 index 0000000000000..a6c5c275b349f --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_attr.rst @@ -0,0 +1,28 @@ +.. 
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_attr:
+
+attr
+====
+
+Interpolation attribute and channel:
+
+    ============== ===================================
+    Syntax         Description
+    ============== ===================================
+    attr{0..32}.x  Attribute 0..32 with *x* channel.
+    attr{0..32}.y  Attribute 0..32 with *y* channel.
+    attr{0..32}.z  Attribute 0..32 with *z* channel.
+    attr{0..32}.w  Attribute 0..32 with *w* channel.
+    ============== ===================================
+
+Examples:
+
+.. parsed-literal::
+
+    ds_param_load v1, attr0.x
diff --git a/llvm/docs/AMDGPU/gfx12_clause.rst b/llvm/docs/AMDGPU/gfx12_clause.rst
new file mode 100644
index 0000000000000..88feb3b1d9974
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_clause.rst
@@ -0,0 +1,7 @@
+.. _amdgpu_synid_clause:
+
+clause
+======
+
+Description of a clause following this instruction.
+
diff --git a/llvm/docs/AMDGPU/gfx12_data0_56f215.rst b/llvm/docs/AMDGPU/gfx12_data0_56f215.rst
new file mode 100644
index 0000000000000..d8dde0013ed64
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_56f215.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data0_56f215:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_data0_6802ce.rst b/llvm/docs/AMDGPU/gfx12_data0_6802ce.rst
new file mode 100644
index 0000000000000..02fe36f489229
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_6802ce.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data0_6802ce:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_data0_e016a1.rst b/llvm/docs/AMDGPU/gfx12_data0_e016a1.rst
new file mode 100644
index 0000000000000..914715bf30ea9
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_e016a1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data0_e016a1:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_data0_fd235e.rst b/llvm/docs/AMDGPU/gfx12_data0_fd235e.rst
new file mode 100644
index 0000000000000..7617c61a94be3
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data0_fd235e.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data0_fd235e:
+
+data0
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_6802ce.rst b/llvm/docs/AMDGPU/gfx12_data1_6802ce.rst
new file mode 100644
index 0000000000000..318db2daaeec3
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_6802ce.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data1_6802ce:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_731030.rst b/llvm/docs/AMDGPU/gfx12_data1_731030.rst
new file mode 100644
index 0000000000000..1a6eda65328ae
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_731030.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data1_731030:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_e016a1.rst b/llvm/docs/AMDGPU/gfx12_data1_e016a1.rst
new file mode 100644
index 0000000000000..dee4148c3d6d1
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_e016a1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data1_e016a1:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_data1_fd235e.rst b/llvm/docs/AMDGPU/gfx12_data1_fd235e.rst
new file mode 100644
index 0000000000000..c8d4a88857d1f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_data1_fd235e.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_data1_fd235e:
+
+data1
+=====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_delay.rst b/llvm/docs/AMDGPU/gfx12_delay.rst
new file mode 100644
index 0000000000000..600ece7fccfc5
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_delay.rst
@@ -0,0 +1,74 @@
+.. _amdgpu_synid_delay:
+
+delay
+=====
+
+A delay between dependent SALU/VALU instructions.
+This operand may specify a delay for two instructions:
+the one immediately after the current *s_delay_alu* instruction
+and a second instruction indicated by *SKIP*.
+
+The bits of this operand have the following meaning:
+
+    ===== ========================================================== ============
+    Bits  Description                                                Value Range
+    ===== ========================================================== ============
+    3:0   ID0: indicates a delay for the first instruction.          0..11
+    6:4   SKIP: indicates the position of the second instruction.    0..5
+    10:7  ID1: indicates a delay for the second instruction.         0..11
+    ===== ========================================================== ============
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number` or an :ref:`absolute_expression`. The value must be in the range 0..0xFFFF.
+* A combination of *instid0*, *instskip*, *instid1* values described below.
+
+    ======================== =========================== ===============
+    Syntax                   Description                 Default Value
+    ======================== =========================== ===============
+    instid0(<*ID name*>)     A symbolic *ID0* value.     instid0(NO_DEP)
+    instskip(<*SKIP name*>)  A symbolic *SKIP* value.    instskip(SAME)
+    instid1(<*ID name*>)     A symbolic *ID1* value.     instid1(NO_DEP)
+    ======================== =========================== ===============
+
+These values may be specified in any order.
+When more than one value is specified, the values must be separated by a '|'.
+
+Valid *ID names* are defined below.
+
+    =================== ===================================================================
+    Name                Description
+    =================== ===================================================================
+    NO_DEP              No dependency on any prior instruction. This is the default value.
+    VALU_DEP_1          Dependency on a previous VALU instruction, 1 opcode back.
+    VALU_DEP_2          Dependency on a previous VALU instruction, 2 opcodes back.
+    VALU_DEP_3          Dependency on a previous VALU instruction, 3 opcodes back.
+    VALU_DEP_4          Dependency on a previous VALU instruction, 4 opcodes back.
+    TRANS32_DEP_1       Dependency on a previous TRANS32 instruction, 1 opcode back.
+    TRANS32_DEP_2       Dependency on a previous TRANS32 instruction, 2 opcodes back.
+    TRANS32_DEP_3       Dependency on a previous TRANS32 instruction, 3 opcodes back.
+    FMA_ACCUM_CYCLE_1   Single cycle penalty for FMA accumulation.
+    SALU_CYCLE_1        1 cycle penalty for a prior SALU instruction.
+    SALU_CYCLE_2        2 cycle penalty for a prior SALU instruction.
+    SALU_CYCLE_3        3 cycle penalty for a prior SALU instruction.
+    =================== ===================================================================
+
+Legal *SKIP names* are described in the following table.
+
+    ======== ============================================================================
+    Name     Description
+    ======== ============================================================================
+    SAME     Apply second dependency to the same instruction. This is the default value.
+    NEXT     Apply second dependency to the next instruction.
+    SKIP_1   Skip 1 instruction then apply dependency.
+    SKIP_2   Skip 2 instructions then apply dependency.
+    SKIP_3   Skip 3 instructions then apply dependency.
+    SKIP_4   Skip 4 instructions then apply dependency.
+    ======== ============================================================================
+
+Examples:
+
+.. parsed-literal::
+
+    s_delay_alu instid0(VALU_DEP_1)
+    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
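The packing is mechanical, so a small illustration may help. The sketch below is not part of the generated documentation: the helper name is invented, and the numeric values of the symbolic *ID*/*SKIP* names are assumptions about the assembler's encoding; only the bit layout comes from the table above.

.. code-block:: c

    /* Hypothetical helper: packs an s_delay_alu operand per the bit layout
     * above (ID0 in bits [3:0], SKIP in [6:4], ID1 in [10:7]). */
    #include <assert.h>
    #include <stdint.h>

    static uint16_t encode_delay(unsigned id0, unsigned skip, unsigned id1) {
        assert(id0 <= 11 && skip <= 5 && id1 <= 11); /* value ranges from the table */
        return (uint16_t)(id0 | (skip << 4) | (id1 << 7));
    }

If, say, VALU_DEP_1 encodes as 1 and NEXT as 1 (an assumption; the tables above define names, not numbers), ``encode_delay(1, 1, 1)`` would produce the immediate for the second example.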
diff --git a/llvm/docs/AMDGPU/gfx12_hwreg.rst b/llvm/docs/AMDGPU/gfx12_hwreg.rst
new file mode 100644
index 0000000000000..d99cb20df24ae
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_hwreg.rst
@@ -0,0 +1,76 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_hwreg:
+
+hwreg
+=====
+
+Bits of a hardware register being accessed.
+
+The bits of this operand have the following meaning:
+
+    ======= ===================== ============
+    Bits    Description           Value Range
+    ======= ===================== ============
+    5:0     Register *id*.        0..63
+    10:6    First bit *offset*.   0..31
+    15:11   *Size* in bits.       1..32
+    ======= ===================== ============
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number` or an :ref:`absolute_expression`. The value must be in the range 0..0xFFFF.
+* An *hwreg* value described below.
+
+    ==================================== ============================================================================
+    Hwreg Value Syntax                   Description
+    ==================================== ============================================================================
+    hwreg({0..63})                       All bits of a register indicated by its *id*.
+    hwreg(<*name*>)                      All bits of a register indicated by its *name*.
+    hwreg({0..63}, {0..31}, {1..32})     Register bits indicated by register *id*, first bit *offset* and *size*.
+    hwreg(<*name*>, {0..31}, {1..32})    Register bits indicated by register *name*, first bit *offset* and *size*.
+    ==================================== ============================================================================
+
+Numeric values may be specified as positive :ref:`integer numbers`
+or :ref:`absolute expressions`.
+
+Defined register *names* include:
+
+    =================== ==========================================
+    Name                Description
+    =================== ==========================================
+    HW_REG_MODE         Shader writeable mode bits.
+    HW_REG_STATUS       Shader read-only status.
+    HW_REG_TRAPSTS      Trap status.
+    HW_REG_HW_ID1       Id of wave, simd, compute unit, etc.
+    HW_REG_HW_ID2       Id of queue, pipeline, etc.
+    HW_REG_GPR_ALLOC    Per-wave SGPR and VGPR allocation.
+    HW_REG_LDS_ALLOC    Per-wave LDS allocation.
+    HW_REG_IB_STS       Counters of outstanding instructions.
+    HW_REG_SH_MEM_BASES Memory aperture.
+    HW_REG_FLAT_SCR_LO  flat_scratch_lo register.
+    HW_REG_FLAT_SCR_HI  flat_scratch_hi register.
+    =================== ==========================================
+
+Examples:
+
+.. parsed-literal::
+
+    reg = 1
+    offset = 2
+    size = 4
+    hwreg_enc = reg | (offset << 6) | ((size - 1) << 11)
+
+    s_getreg_b32 s2, 0x1881
+    s_getreg_b32 s2, hwreg_enc                 // the same as above
+    s_getreg_b32 s2, hwreg(1, 2, 4)            // the same as above
+    s_getreg_b32 s2, hwreg(reg, offset, size)  // the same as above
+
+    s_getreg_b32 s2, hwreg(15)
+    s_getreg_b32 s2, hwreg(51, 1, 31)
+    s_getreg_b32 s2, hwreg(HW_REG_LDS_ALLOC, 0, 1)
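The asm example above already computes ``hwreg_enc`` inline; the same packing written as a C helper may be clearer. This is a sketch (the function name is illustrative), but the shifts mirror the bits table and the ``hwreg_enc`` formula above:

.. code-block:: c

    #include <stdint.h>

    /* id in bits [5:0], first-bit offset in [10:6], (size - 1) in [15:11],
     * exactly as in the bits table above. */
    static uint16_t encode_hwreg(unsigned id, unsigned offset, unsigned size) {
        return (uint16_t)((id & 0x3F) | ((offset & 0x1F) << 6) |
                          (((size - 1) & 0x1F) << 11));
    }

``encode_hwreg(1, 2, 4)`` yields ``0x1881``, matching the ``s_getreg_b32 s2, 0x1881`` example.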
diff --git a/llvm/docs/AMDGPU/gfx12_imm16.rst b/llvm/docs/AMDGPU/gfx12_imm16.rst
new file mode 100644
index 0000000000000..44e6d5856a558
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_imm16.rst
@@ -0,0 +1,7 @@
+.. _amdgpu_synid_imm16:
+
+imm16
+======
+
+An :ref:`integer_number` or an :ref:`absolute_expression`. The value must be in the range -32768..65535.
+
diff --git a/llvm/docs/AMDGPU/gfx12_ioffset.rst b/llvm/docs/AMDGPU/gfx12_ioffset.rst
new file mode 100644
index 0000000000000..0901b774f8144
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_ioffset.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_ioffset:
+
+ioffset
+=======
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_label.rst b/llvm/docs/AMDGPU/gfx12_label.rst
new file mode 100644
index 0000000000000..bdd6e1cb1ee8d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_label.rst
@@ -0,0 +1,29 @@
+.. _amdgpu_synid_label:
+
+label
+=====
+
+A branch target which is a 16-bit signed integer treated as a PC-relative dword offset.
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number` or an :ref:`absolute_expression`. The value must be in the range -32768..65535.
+* A :ref:`symbol` (for example, a label) representing a relocatable address in the same compilation unit where it is referred from. The value is handled as a 16-bit PC-relative dword offset to be resolved by a linker.
+
+Examples:
+
+.. parsed-literal::
+
+    offset = 30
+    label_1:
+    label_2 = . + 4
+
+    s_branch 32
+    s_branch offset + 2
+    s_branch label_1
+    s_branch label_2
+    s_branch label_3
+    s_branch label_4
+
+    label_3 = label_2 + 4
+    label_4:
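Since the label is a PC-relative *dword* offset, a resolver has to convert a byte distance into dwords. A minimal sketch follows, assuming (as is usual for *s_branch*-style instructions, though not stated above) that the offset is measured from the instruction following the 4-byte branch; the helper name is invented:

.. code-block:: c

    #include <stdint.h>

    /* Hypothetical helper: byte addresses in, signed dword offset out. */
    static int16_t branch_offset(uint64_t branch_pc, uint64_t target_pc) {
        int64_t delta = ((int64_t)target_pc - (int64_t)(branch_pc + 4)) / 4;
        return (int16_t)delta; /* must fit in 16 signed bits */
    }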
diff --git a/llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst b/llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst
new file mode 100644
index 0000000000000..7442c5d5c89dc
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_literal_1f74c7.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_literal_1f74c7:
+
+literal
+=======
+
+*Size:* 2 dwords.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_literal_81e671.rst b/llvm/docs/AMDGPU/gfx12_literal_81e671.rst
new file mode 100644
index 0000000000000..ab1b05601ff68
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_literal_81e671.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_literal_81e671:
+
+literal
+=======
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_m.rst b/llvm/docs/AMDGPU/gfx12_m.rst
new file mode 100644
index 0000000000000..7cfee90bae2ce
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_m.rst
@@ -0,0 +1,13 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_m:
+
+m
+=
+
+This operand may be used with floating point operand modifiers :ref:`abs` and :ref:`neg`.
diff --git a/llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst b/llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst
new file mode 100644
index 0000000000000..d1a475f205329
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_rsrc_5fe6d8.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_rsrc_5fe6d8:
+
+rsrc
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst b/llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst
new file mode 100644
index 0000000000000..180ae068d2ceb
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_rsrc_c9f929.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_rsrc_c9f929:
+
+rsrc
+====
+
+Instruction input.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst b/llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst
new file mode 100644
index 0000000000000..4b3511fc76671
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_saddr_cdc95c.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_saddr_cdc95c:
+
+saddr
+=====
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst b/llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst
new file mode 100644
index 0000000000000..d3de11dfaade8
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_saddr_d42b64.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_saddr_d42b64:
+
+saddr
+=====
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_samp.rst b/llvm/docs/AMDGPU/gfx12_samp.rst
new file mode 100644
index 0000000000000..2bb15e58db5fe
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_samp.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_samp:
+
+samp
+====
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sbase_453b95.rst b/llvm/docs/AMDGPU/gfx12_sbase_453b95.rst
new file mode 100644
index 0000000000000..54c2deed4e5f1
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sbase_453b95.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sbase_453b95:
+
+sbase
+=====
+
+A 128-bit buffer resource constant for scalar memory operations which provides a base address, a size and a stride.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst b/llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst
new file mode 100644
index 0000000000000..2308b3d44585c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sbase_47adb7.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sbase_47adb7:
+
+sbase
+=====
+
+A 64-bit base address for scalar memory operations.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst b/llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst
new file mode 100644
index 0000000000000..d498f8c705210
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_0974a4.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_0974a4:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_354189.rst b/llvm/docs/AMDGPU/gfx12_sdata_354189.rst
new file mode 100644
index 0000000000000..c50665474c461
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_354189.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_354189:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst b/llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst
new file mode 100644
index 0000000000000..42f66f33e6ad4
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_4585b8.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_4585b8:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst b/llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst
new file mode 100644
index 0000000000000..028461a4a07da
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_5c7b50.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_5c7b50:
+
+sdata
+=====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst b/llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst
new file mode 100644
index 0000000000000..87e19a95f2b8b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_6c003b.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_6c003b:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 16 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_836716.rst b/llvm/docs/AMDGPU/gfx12_sdata_836716.rst
new file mode 100644
index 0000000000000..be1bce94e7062
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_836716.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_836716:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst b/llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst
new file mode 100644
index 0000000000000..c882df8dad6c1
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_d725ab.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_d725ab:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`simm8`
diff --git a/llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst b/llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst
new file mode 100644
index 0000000000000..64658894fee95
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdata_dd9dd8.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdata_dd9dd8:
+
+sdata
+=====
+
+Instruction output.
+
+*Size:* 3 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_006c40.rst b/llvm/docs/AMDGPU/gfx12_sdst_006c40.rst
new file mode 100644
index 0000000000000..f269b05c65edc
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_006c40.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_006c40:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`vcc`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_20064d.rst b/llvm/docs/AMDGPU/gfx12_sdst_20064d.rst
new file mode 100644
index 0000000000000..83c11a2e03eae
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_20064d.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_20064d:
+
+sdst
+====
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_354189.rst b/llvm/docs/AMDGPU/gfx12_sdst_354189.rst
new file mode 100644
index 0000000000000..8433406a20591
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_354189.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_354189:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_836716.rst b/llvm/docs/AMDGPU/gfx12_sdst_836716.rst
new file mode 100644
index 0000000000000..abce5696f6716
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_836716.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_836716:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst b/llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst
new file mode 100644
index 0000000000000..e0072d90a4cfd
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_ced58d.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_ced58d:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst b/llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst
new file mode 100644
index 0000000000000..33e8c376af67f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sdst_e701cc.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_sdst_e701cc:
+
+sdst
+====
+
+Instruction output.
+
+*Size:* 1 dword if wavefront size is 32, otherwise 2 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_sendmsg.rst b/llvm/docs/AMDGPU/gfx12_sendmsg.rst
new file mode 100644
index 0000000000000..cb51be04555fe
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sendmsg.rst
@@ -0,0 +1,48 @@
+.. _amdgpu_synid_sendmsg:
+
+sendmsg
+=======
+
+An 8-bit value in simm16[7:0] encodes the message type.
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number` or an :ref:`absolute_expression`. The value must be in the range 0..0xFFFF.
+* A *sendmsg* value described below.
+
+
+    ==================================== ====================================================
+    Sendmsg Value Syntax                 Description
+    ==================================== ====================================================
+    sendmsg(<*type*>)                    A message identified by its *type*.
+    ==================================== ====================================================
+
+*Type* may be specified using message *name* or message *id*.
+
+Numeric values may be specified as positive :ref:`integer numbers`
+or :ref:`absolute expressions`.
+
+
+Only the following message types are valid.
+
+    ====================== ===========
+    Message type           simm16[7:0]
+    ====================== ===========
+    Reserved               0
+    MSG_INTERRUPT          1
+    MSG_HS_TESSFACTOR      2
+    MSG_DEALLOC_VGPRS      3
+    MSG_GS_ALLOC_REQ       9
+    ====================== ===========
+
+Examples:
+
+.. parsed-literal::
+
+    // numeric message code
+    msg = 0x1
+    s_sendmsg 0x3
+    s_sendmsg msg + 2
+
+    // sendmsg with strict arguments validation
+    s_sendmsg sendmsg(MSG_INTERRUPT)
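Because the message type simply occupies simm16[7:0], assembling the immediate by hand is straightforward. A sketch, where the enum values mirror the table above and the helper name is invented:

.. code-block:: c

    enum msg_type {               /* values from the table above */
        MSG_INTERRUPT     = 1,
        MSG_HS_TESSFACTOR = 2,
        MSG_DEALLOC_VGPRS = 3,
        MSG_GS_ALLOC_REQ  = 9
    };

    static unsigned encode_sendmsg(enum msg_type type) {
        return (unsigned)type & 0xFF; /* message type in simm16[7:0] */
    }

With this, ``encode_sendmsg(MSG_DEALLOC_VGPRS)`` equals the literal ``0x3`` used in the numeric example.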
diff --git a/llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst b/llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst
new file mode 100644
index 0000000000000..ebb591dc101e7
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_sendmsg_rtn.rst
@@ -0,0 +1,30 @@
+.. _amdgpu_synid_sendmsg_rtn:
+
+sendmsg_rtn
+===========
+
+An 8-bit value in the instruction that encodes the message type.
+
+This operand may be specified as one of the following:
+
+  * An :ref:`integer_number` or an :ref:`absolute_expression`. The value must be in the range 0..0xFFFF.
+  * A *sendmsg* value described below.
+
+    ==================================== ====================================================
+    Sendmsg Value Syntax                 Description
+    ==================================== ====================================================
+    sendmsg(MSG_RTN_GET_DOORBELL)        Get doorbell ID.
+    sendmsg(MSG_RTN_GET_DDID)            Get Draw/Dispatch ID.
+    sendmsg(MSG_RTN_GET_TMA)             Get TMA value.
+    sendmsg(MSG_RTN_GET_TBA)             Get TBA value.
+    sendmsg(MSG_RTN_GET_REALTIME)        Get REALTIME value.
+    sendmsg(MSG_RTN_SAVE_WAVE)           Report that this wave is ready to be context-saved.
+    ==================================== ====================================================
+
+Examples:
+
+.. parsed-literal::
+
+    s_sendmsg_rtn_b32 s0, 132
+    s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_REALTIME)
+
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst b/llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst
new file mode 100644
index 0000000000000..0cb123393a309
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_15ccdd.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_15ccdd:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`version`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_218bea.rst b/llvm/docs/AMDGPU/gfx12_simm16_218bea.rst
new file mode 100644
index 0000000000000..e08605e0bfbfb
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_218bea.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_218bea:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`waitcnt`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_39b593.rst b/llvm/docs/AMDGPU/gfx12_simm16_39b593.rst
new file mode 100644
index 0000000000000..babb4b689a519
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_39b593.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_39b593:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`imm16`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst b/llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst
new file mode 100644
index 0000000000000..cc8dbc6742803
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_3d2a4f.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_3d2a4f:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`label`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_730a13.rst b/llvm/docs/AMDGPU/gfx12_simm16_730a13.rst
new file mode 100644
index 0000000000000..93596db9287be
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_730a13.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_730a13:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`clause`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst b/llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst
new file mode 100644
index 0000000000000..fc63930c30334
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_7ed651.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_7ed651:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`hwreg`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_81e671.rst b/llvm/docs/AMDGPU/gfx12_simm16_81e671.rst
new file mode 100644
index 0000000000000..16dcf397b48cf
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_81e671.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_81e671:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:*
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_c98889.rst b/llvm/docs/AMDGPU/gfx12_simm16_c98889.rst
new file mode 100644
index 0000000000000..03e007af73690
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_c98889.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_c98889:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`delay`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst b/llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst
new file mode 100644
index 0000000000000..e53f8125c3398
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_cc1716.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_cc1716:
+
+simm16
+======
+
+Instruction output.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`hwreg`
diff --git a/llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst b/llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst
new file mode 100644
index 0000000000000..9bdac9b6056e7
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_simm16_ee8b30.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_simm16_ee8b30:
+
+simm16
+======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`sendmsg`
diff --git a/llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst b/llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst
new file mode 100644
index 0000000000000..44de0304b46cf
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_soffset_8ec073.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_soffset_8ec073:
+
+soffset
+=======
+
+An unsigned 20-bit offset added to the base address to get the memory address.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst b/llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst
new file mode 100644
index 0000000000000..d115150d11d71
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_soffset_c5b88c.rst
@@ -0,0 +1,15 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_soffset_c5b88c:
+
+soffset
+=======
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst b/llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst
new file mode 100644
index 0000000000000..bd571b6499603
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_soffset_ec005a.rst
@@ -0,0 +1,20 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_soffset_ec005a:
+
+soffset
+=======
+
+An offset added to the base address to get the memory address.
+
+* If offset is specified as a register, it supplies an unsigned byte offset.
+* If offset is specified as a 21-bit immediate, it supplies a signed byte offset.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`vcc_hi`, :ref:`vcc_lo`
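The signed/unsigned split above matters when widening the immediate. Below is a sketch of the sign extension a decoder would apply to the 21-bit immediate form (the helper name is illustrative; the 21-bit width and signedness come from the bullets above):

.. code-block:: c

    #include <stdint.h>

    static int64_t soffset_imm_to_bytes(uint32_t imm) {
        uint32_t field = imm & 0x1FFFFF;   /* keep the low 21 bits       */
        if (field & 0x100000)              /* bit 20 is the sign bit     */
            field |= ~0x1FFFFFu;           /* extend into the upper bits */
        return (int64_t)(int32_t)field;
    }

A register soffset, by contrast, is taken as an unsigned byte offset, so no extension applies.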
diff --git a/llvm/docs/AMDGPU/gfx12_src0_5727cf.rst b/llvm/docs/AMDGPU/gfx12_src0_5727cf.rst
new file mode 100644
index 0000000000000..15fde5c33daab
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_5727cf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src0_5727cf:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_5cae62.rst b/llvm/docs/AMDGPU/gfx12_src0_5cae62.rst
new file mode 100644
index 0000000000000..fa02f046b1804
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_5cae62.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src0_5cae62:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`scc`, :ref:`fconst`, :ref:`literal`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_6802ce.rst b/llvm/docs/AMDGPU/gfx12_src0_6802ce.rst
new file mode 100644
index 0000000000000..e17a719c8b02c
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_6802ce.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src0_6802ce:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_85aab6.rst b/llvm/docs/AMDGPU/gfx12_src0_85aab6.rst
new file mode 100644
index 0000000000000..effa6f69c6acb
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_85aab6.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src0_85aab6:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`literal`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_c4593f.rst b/llvm/docs/AMDGPU/gfx12_src0_c4593f.rst
new file mode 100644
index 0000000000000..bbe6191f49944
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_c4593f.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src0_c4593f:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_e016a1.rst b/llvm/docs/AMDGPU/gfx12_src0_e016a1.rst
new file mode 100644
index 0000000000000..c2d23d737610d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_e016a1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src0_e016a1:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src0_fd235e.rst b/llvm/docs/AMDGPU/gfx12_src0_fd235e.rst
new file mode 100644
index 0000000000000..dc048af280704
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src0_fd235e.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src0_fd235e:
+
+src0
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_5727cf.rst b/llvm/docs/AMDGPU/gfx12_src1_5727cf.rst
new file mode 100644
index 0000000000000..d1d08370eab76
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_5727cf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_5727cf:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_5cae62.rst b/llvm/docs/AMDGPU/gfx12_src1_5cae62.rst
new file mode 100644
index 0000000000000..3ad591ce779a7
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_5cae62.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_5cae62:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`scc`, :ref:`fconst`, :ref:`literal`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_6802ce.rst b/llvm/docs/AMDGPU/gfx12_src1_6802ce.rst
new file mode 100644
index 0000000000000..84ff631fc275d
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_6802ce.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_6802ce:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_731030.rst b/llvm/docs/AMDGPU/gfx12_src1_731030.rst
new file mode 100644
index 0000000000000..8c67699145a1f
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_731030.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_731030:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 8 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_977794.rst b/llvm/docs/AMDGPU/gfx12_src1_977794.rst
new file mode 100644
index 0000000000000..765134002d94b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_977794.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_977794:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_c4593f.rst b/llvm/docs/AMDGPU/gfx12_src1_c4593f.rst
new file mode 100644
index 0000000000000..aba4da84faee5
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_c4593f.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_c4593f:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_e016a1.rst b/llvm/docs/AMDGPU/gfx12_src1_e016a1.rst
new file mode 100644
index 0000000000000..438585390ec88
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_e016a1.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_e016a1:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src1_fd235e.rst b/llvm/docs/AMDGPU/gfx12_src1_fd235e.rst
new file mode 100644
index 0000000000000..5863e93170ef6
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src1_fd235e.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src1_fd235e:
+
+src1
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_2797bc.rst b/llvm/docs/AMDGPU/gfx12_src2_2797bc.rst
new file mode 100644
index 0000000000000..b393e2ac36b96
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_2797bc.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src2_2797bc:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_5727cf.rst b/llvm/docs/AMDGPU/gfx12_src2_5727cf.rst
new file mode 100644
index 0000000000000..9ffaa079f6b91
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_5727cf.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src2_5727cf:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_5cae62.rst b/llvm/docs/AMDGPU/gfx12_src2_5cae62.rst
new file mode 100644
index 0000000000000..46d65cb3bad5b
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_5cae62.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src2_5cae62:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 2 dwords.
+
+*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`scc`, :ref:`fconst`, :ref:`literal`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_6802ce.rst b/llvm/docs/AMDGPU/gfx12_src2_6802ce.rst
new file mode 100644
index 0000000000000..0ad2ede9df4ac
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_6802ce.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src2_6802ce:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 1 dword.
+
+*Operands:* :ref:`v`
diff --git a/llvm/docs/AMDGPU/gfx12_src2_7b936a.rst b/llvm/docs/AMDGPU/gfx12_src2_7b936a.rst
new file mode 100644
index 0000000000000..9f1ea3c3ab944
--- /dev/null
+++ b/llvm/docs/AMDGPU/gfx12_src2_7b936a.rst
@@ -0,0 +1,17 @@
+..
+    **************************************************
+    *                                                *
+    *   Automatically generated file, do not edit!   *
+    *                                                *
+    **************************************************
+
+.. _amdgpu_synid_gfx12_src2_7b936a:
+
+src2
+====
+
+Instruction input.
+
+*Size:* 4 dwords.
+ +*Operands:* :ref:`v`, :ref:`fconst` diff --git a/llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst b/llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst new file mode 100644 index 0000000000000..884d089d544c2 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_src2_96fbd3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_src2_96fbd3: + +src2 +==== + +Instruction input. + +*Size:* 8 dwords. + +*Operands:* :ref:`v`, :ref:`fconst` diff --git a/llvm/docs/AMDGPU/gfx12_src2_c4593f.rst b/llvm/docs/AMDGPU/gfx12_src2_c4593f.rst new file mode 100644 index 0000000000000..849230b5a56a2 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_src2_c4593f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_src2_c4593f: + +src2 +==== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo` diff --git a/llvm/docs/AMDGPU/gfx12_src2_e016a1.rst b/llvm/docs/AMDGPU/gfx12_src2_e016a1.rst new file mode 100644 index 0000000000000..266c4eaedf72d --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_src2_e016a1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_src2_e016a1: + +src2 +==== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_srcx0.rst b/llvm/docs/AMDGPU/gfx12_srcx0.rst new file mode 100644 index 0000000000000..57b05a18c3100 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_srcx0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_srcx0: + +srcx0 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo` diff --git a/llvm/docs/AMDGPU/gfx12_srcy0.rst b/llvm/docs/AMDGPU/gfx12_srcy0.rst new file mode 100644 index 0000000000000..350b7428668ba --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_srcy0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_srcy0: + +srcy0 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v`, :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst new file mode 100644 index 0000000000000..c3f33e4f78fdd --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc0_007f9c.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc0_007f9c: + +ssrc0 +===== + +Instruction input. + +*Size:* 1 dword. 
+ +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`vcc_hi`, :ref:`vcc_lo` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst new file mode 100644 index 0000000000000..5aa3f2d3585ac --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc0_1a9ca5.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc0_1a9ca5: + +ssrc0 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`m0` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst new file mode 100644 index 0000000000000..36925daf7a86c --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc0_245536.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc0_245536: + +ssrc0 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`sendmsg_rtn` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst new file mode 100644 index 0000000000000..4eae7050ea714 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc0_2797bc.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc0_2797bc: + +ssrc0 +===== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst new file mode 100644 index 0000000000000..a29f83d36d48f --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc0_bbb4c6.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc0_bbb4c6: + +ssrc0 +===== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`scc`, :ref:`fconst`, :ref:`literal` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst b/llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst new file mode 100644 index 0000000000000..33ca4d608d7df --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc0_c4593f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc0_c4593f: + +ssrc0 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst b/llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst new file mode 100644 index 0000000000000..1f3ea343f3a09 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc1_bbb4c6.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc1_bbb4c6: + +ssrc1 +===== + +Instruction input. + +*Size:* 2 dwords. 
+ +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`scc`, :ref:`fconst`, :ref:`literal` diff --git a/llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst b/llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst new file mode 100644 index 0000000000000..f81d0f203f07b --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_ssrc1_c4593f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_ssrc1_c4593f: + +ssrc1 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`m0`, :ref:`scc`, :ref:`fconst`, :ref:`literal`, :ref:`exec_hi`, :ref:`exec_lo`, :ref:`vcc_hi`, :ref:`vcc_lo` diff --git a/llvm/docs/AMDGPU/gfx12_tgt.rst b/llvm/docs/AMDGPU/gfx12_tgt.rst new file mode 100644 index 0000000000000..83a25aa466bfb --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_tgt.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_tgt: + +tgt +=== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst b/llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst new file mode 100644 index 0000000000000..223b50d6ef205 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vaddr_a972b9.rst @@ -0,0 +1,15 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vaddr_a972b9: + +vaddr +===== + +*Size:* 11 dwords. + +*Operands:* diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst b/llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst new file mode 100644 index 0000000000000..5a93efec9f86a --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vaddr_c12f43.rst @@ -0,0 +1,15 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vaddr_c12f43: + +vaddr +===== + +*Size:* 12 dwords. + +*Operands:* diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst b/llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst new file mode 100644 index 0000000000000..1998e1ddc9504 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vaddr_c8b8d4.rst @@ -0,0 +1,15 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vaddr_c8b8d4: + +vaddr +===== + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst b/llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst new file mode 100644 index 0000000000000..92d09a2399a2f --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vaddr_d82160.rst @@ -0,0 +1,15 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vaddr_d82160: + +vaddr +===== + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst b/llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst new file mode 100644 index 0000000000000..10d7e0ad1fce4 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vaddr_f2b449.rst @@ -0,0 +1,15 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! 
* + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vaddr_f2b449: + +vaddr +===== + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vcc.rst b/llvm/docs/AMDGPU/gfx12_vcc.rst new file mode 100644 index 0000000000000..e8509ff50a32f --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vcc.rst @@ -0,0 +1,16 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vcc: + +vcc +=== + +Vector condition code. This operand depends on wavefront size: + +* Should be :ref:`vcc_lo` if wavefront size is 32. +* Should be :ref:`vcc` if wavefront size is 64. diff --git a/llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst b/llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst new file mode 100644 index 0000000000000..839ec86ce2634 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdata_2eda77.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdata_2eda77: + +vdata +===== + +Instruction output. + +*Size:* 32 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst b/llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst new file mode 100644 index 0000000000000..d2ab49a951684 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdata_48e42f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdata_48e42f: + +vdata +===== + +Instruction output. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdata_69a144.rst b/llvm/docs/AMDGPU/gfx12_vdata_69a144.rst new file mode 100644 index 0000000000000..22ac087b51e8e --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdata_69a144.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdata_69a144: + +vdata +===== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdata_89680f.rst b/llvm/docs/AMDGPU/gfx12_vdata_89680f.rst new file mode 100644 index 0000000000000..5f4f4782e410d --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdata_89680f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdata_89680f: + +vdata +===== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst b/llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst new file mode 100644 index 0000000000000..2e285ef86eebc --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdata_aac3e8.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdata_aac3e8: + +vdata +===== + +Instruction output. + +*Size:* 10 dwords. 
+ +*Operands:* diff --git a/llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst b/llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst new file mode 100644 index 0000000000000..109c7672541a5 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdata_bdb32f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdata_bdb32f: + +vdata +===== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_006c40.rst b/llvm/docs/AMDGPU/gfx12_vdst_006c40.rst new file mode 100644 index 0000000000000..dc3ac95500037 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_006c40.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_006c40: + +vdst +==== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`vcc` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_227281.rst b/llvm/docs/AMDGPU/gfx12_vdst_227281.rst new file mode 100644 index 0000000000000..13fd9513245dd --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_227281.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_227281: + +vdst +==== + +Instruction output. + +*Size:* 4 dwords if wavefront size is 64, otherwise 8 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst b/llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst new file mode 100644 index 0000000000000..9372e484cf5d9 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_2eda77.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_2eda77: + +vdst +==== + +Instruction output. + +*Size:* 32 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst b/llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst new file mode 100644 index 0000000000000..056fe3f197417 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_47d3bc.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_47d3bc: + +vdst +==== + +Instruction output. + +*Size:* 8 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst b/llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst new file mode 100644 index 0000000000000..84ab35b36b7b3 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_48e42f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_48e42f: + +vdst +==== + +Instruction output. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_69a144.rst b/llvm/docs/AMDGPU/gfx12_vdst_69a144.rst new file mode 100644 index 0000000000000..70873ff9502b8 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_69a144.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! 
* + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_69a144: + +vdst +==== + +Instruction output. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst b/llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst new file mode 100644 index 0000000000000..7248ea9449236 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_7de8e7.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_7de8e7: + +vdst +==== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`exec` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_836716.rst b/llvm/docs/AMDGPU/gfx12_vdst_836716.rst new file mode 100644 index 0000000000000..1cd43ee9f620a --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_836716.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_836716: + +vdst +==== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`s`, :ref:`ttmp`, :ref:`null`, :ref:`vcc_hi`, :ref:`vcc_lo` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_89680f.rst b/llvm/docs/AMDGPU/gfx12_vdst_89680f.rst new file mode 100644 index 0000000000000..b4f055cc1574d --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_89680f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_89680f: + +vdst +==== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst b/llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst new file mode 100644 index 0000000000000..e2a4a47987b7c --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdst_bdb32f.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdst_bdb32f: + +vdst +==== + +Instruction output. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdstx.rst b/llvm/docs/AMDGPU/gfx12_vdstx.rst new file mode 100644 index 0000000000000..4b95d4d0d84ba --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdstx.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdstx: + +vdstx +===== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vdsty.rst b/llvm/docs/AMDGPU/gfx12_vdsty.rst new file mode 100644 index 0000000000000..cf0b4641308be --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vdsty.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vdsty: + +vdsty +===== + +Instruction output. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_version.rst b/llvm/docs/AMDGPU/gfx12_version.rst new file mode 100644 index 0000000000000..4e490ca4954a9 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_version.rst @@ -0,0 +1,7 @@ +.. 
_amdgpu_synid_version: + +version +======= + +Microcode version header. + diff --git a/llvm/docs/AMDGPU/gfx12_vsrc0.rst b/llvm/docs/AMDGPU/gfx12_vsrc0.rst new file mode 100644 index 0000000000000..fb381690c3692 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc0.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc0: + +vsrc0 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst b/llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst new file mode 100644 index 0000000000000..449054574be9b --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc1_6802ce.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc1_6802ce: + +vsrc1 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst b/llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst new file mode 100644 index 0000000000000..d6567c2fd9cef --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc1_fd235e.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc1_fd235e: + +vsrc1 +===== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc2.rst b/llvm/docs/AMDGPU/gfx12_vsrc2.rst new file mode 100644 index 0000000000000..fe20832437431 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc2.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc2: + +vsrc2 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc3.rst b/llvm/docs/AMDGPU/gfx12_vsrc3.rst new file mode 100644 index 0000000000000..18df9e4418f0e --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc3.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc3: + +vsrc3 +===== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst b/llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst new file mode 100644 index 0000000000000..166da38acb079 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc_56f215.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc_56f215: + +vsrc +==== + +Instruction input. + +*Size:* 3 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst b/llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst new file mode 100644 index 0000000000000..e879c2bad2038 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc_6802ce.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. 
_amdgpu_synid_gfx12_vsrc_6802ce: + +vsrc +==== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst b/llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst new file mode 100644 index 0000000000000..c521e7261c59e --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc_89fd7b.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc_89fd7b: + +vsrc +==== + +Instruction input. + +*Size:* 32 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst b/llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst new file mode 100644 index 0000000000000..84eb2eda944b7 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc_e016a1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc_e016a1: + +vsrc +==== + +Instruction input. + +*Size:* 4 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst b/llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst new file mode 100644 index 0000000000000..640a235730f93 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrc_fd235e.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrc_fd235e: + +vsrc +==== + +Instruction input. + +*Size:* 2 dwords. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrcx1.rst b/llvm/docs/AMDGPU/gfx12_vsrcx1.rst new file mode 100644 index 0000000000000..9dab58c459b2e --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrcx1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrcx1: + +vsrcx1 +====== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_vsrcy1.rst b/llvm/docs/AMDGPU/gfx12_vsrcy1.rst new file mode 100644 index 0000000000000..496b2d66d2ff5 --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_vsrcy1.rst @@ -0,0 +1,17 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_gfx12_vsrcy1: + +vsrcy1 +====== + +Instruction input. + +*Size:* 1 dword. + +*Operands:* :ref:`v` diff --git a/llvm/docs/AMDGPU/gfx12_waitcnt.rst b/llvm/docs/AMDGPU/gfx12_waitcnt.rst new file mode 100644 index 0000000000000..454122212f64e --- /dev/null +++ b/llvm/docs/AMDGPU/gfx12_waitcnt.rst @@ -0,0 +1,55 @@ +.. + ************************************************** + * * + * Automatically generated file, do not edit! * + * * + ************************************************** + +.. _amdgpu_synid_waitcnt: + +waitcnt +======= + +Counts of outstanding instructions to wait for. + +The bits of this operand have the following meaning: + + ===== ================================================ ============ + Bits Description Value Range + ===== ================================================ ============ + 2:0 EXP_CNT: export and LDSDIR count. 0..7 + 3:3 Unused \- + 9:4 LGKM_CNT: LDS, GDS, Constant and Message count. 0..63 + 15:10 VM_CNT: vector memory operations count. 
0..63
+  ===== ================================================ ============
+
+This operand may be specified as one of the following:
+
+* An :ref:`integer_number` or an :ref:`absolute_expression`. The value must be in the range 0..0xFFFF.
+* A combination of *vmcnt*, *expcnt*, *lgkmcnt* and other values described below.
+
+    ====================== ======================================================================
+    Syntax                 Description
+    ====================== ======================================================================
+    vmcnt(<*N*>)           A VM_CNT value. *N* must not exceed the largest VM_CNT value.
+    expcnt(<*N*>)          An EXP_CNT value. *N* must not exceed the largest EXP_CNT value.
+    lgkmcnt(<*N*>)         An LGKM_CNT value. *N* must not exceed the largest LGKM_CNT value.
+    vmcnt_sat(<*N*>)       A VM_CNT value computed as min(*N*, the largest VM_CNT value).
+    expcnt_sat(<*N*>)      An EXP_CNT value computed as min(*N*, the largest EXP_CNT value).
+    lgkmcnt_sat(<*N*>)     An LGKM_CNT value computed as min(*N*, the largest LGKM_CNT value).
+    ====================== ======================================================================
+
+These values may be specified in any order. Spaces, ampersands and commas may be used as optional separators.
+
+*N* is either an
+:ref:`integer number` or an
+:ref:`absolute expression`.
+
+Examples:
+
+.. parsed-literal::
+
+    s_waitcnt vmcnt(1)
+    s_waitcnt expcnt(2) lgkmcnt(3)
+    s_waitcnt vmcnt(1), expcnt(2), lgkmcnt(3)
+    s_waitcnt vmcnt(1) & lgkmcnt_sat(100) & expcnt(2)
diff --git a/llvm/docs/AMDGPUModifierSyntax.rst b/llvm/docs/AMDGPUModifierSyntax.rst
index 334bdafefbbe2..8a60663b7303c 100644
--- a/llvm/docs/AMDGPUModifierSyntax.rst
+++ b/llvm/docs/AMDGPUModifierSyntax.rst
@@ -1078,6 +1078,73 @@ Examples:
   offset:0xfffff
   offset:-x
+.. _amdgpu_synid_smem_offset24s:
+
+offset24s
+~~~~~~~~~
+
+Specifies a signed 24-bit offset, in bytes. The default value is 0.
+
+  ============================= ====================================================================
+  Syntax                        Description
+  ============================= ====================================================================
+  offset:{-0x1000000..0xFFFFF}  Specifies an offset as an
+                                :ref:`integer number`
+                                or an :ref:`absolute expression`.
+  ============================= ====================================================================
+
+Examples:
+
+.. parsed-literal::
+
+  offset:-1
+  offset:0xfffff
+  offset:-x
+
+.. _amdgpu_synid_th:
+
+th
+~~
+
+Specifies the temporal hint of a memory operation.
+
+  =============================== =========================================================
+  Syntax                          Description
+  =============================== =========================================================
+  TH_{LOAD|STORE}_RT              Regular
+  TH_{LOAD|STORE}_NT              Non-temporal
+  TH_{LOAD|STORE}_HT              High-temporal
+  TH_{LOAD|STORE}_LU              Last use. Not available in SYS scope.
+  TH_{LOAD|STORE}_WB              Regular (CU, SE); High-temporal with write-back (MALL)
+  TH_{LOAD|STORE}_NT_RT           Non-temporal (CU, SE); Regular (MALL)
+  TH_{LOAD|STORE}_RT_NT           Regular (CU, SE); Non-temporal (MALL)
+  TH_{LOAD|STORE}_NT_HT           Non-temporal (CU, SE); High-temporal (MALL)
+  TH_{LOAD|STORE}_NT_WB           Non-temporal (CU, SE); High-temporal with write-back (MALL)
+  TH_{LOAD|STORE}_BYPASS          Available for SYS scope only.
+  TH_ATOMIC_RT                    Regular
+  TH_ATOMIC_RT_RETURN             Regular. For atomic instructions that return values.
+  TH_ATOMIC_NT                    Non-temporal
+  TH_ATOMIC_NT_RETURN             Non-temporal. For atomic instructions that return values.
+  TH_ATOMIC_CASCADE_RT            Cascading atomic; Regular.
+  TH_ATOMIC_CASCADE_NT            Cascading atomic; Non-temporal.
+  =============================== =========================================================
+
+.. _amdgpu_synid_scope:
+
+scope
+~~~~~
+
+Specifies the scope of a memory operation.
+
+  =============================== =========================================================
+  Syntax                          Description
+  =============================== =========================================================
+  SCOPE_CU                        Coherency within a Compute Unit.
+  SCOPE_SE                        Coherency within a Shader Engine.
+  SCOPE_DEV                       Coherency within a single device.
+  SCOPE_SYS                       Coherency across the full system.
+  =============================== =========================================================
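+
+A usage sketch follows; the exact instruction names and operand lists below
+are illustrative assumptions rather than excerpts from an instruction
+listing. Both modifiers are appended after the memory operands:
+
+.. parsed-literal::
+
+  global_load_b32 v1, v[2:3], off th:TH_LOAD_NT scope:SCOPE_SYS
+  global_store_b32 v[2:3], v1, off th:TH_STORE_RT_NT scope:SCOPE_DEV
+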
 VINTRP/VINTERP/LDSDIR Modifiers
 -------------------------------
@@ -1117,6 +1184,27 @@ The default value is zero. This is a safe value, but it may be suboptimal.
                    issuing this instruction.
  ================ ======================================================
+.. _amdgpu_synid_wait_va_vdst:
+
+wait_va_vdst
+~~~~~~~~~~~~
+
+Manually specify a wait on the VA_VDST counter before issuing this instruction. VA_VDST must be less
+than or equal to this value before the instruction is issued. If set to 15, no wait is performed.
+
+If unspecified, the default is zero. This is a safe value, but it may be suboptimal.
+
+This modifier is shorthand for handling the WAR hazard in which a VALU instruction reads a VGPR that is
+then written by a parameter load. Since there is no VA_VSRC counter, VA_VDST is used as a proxy to
+detect when the VALU instruction has completed.
+
+Examples:
+
+.. parsed-literal::
+
+  v_mov_b32 v1, v0
+  ds_param_load v0, . . . wait_va_vdst:0
+
 .. _amdgpu_synid_wait_vdst:

 wait_vdst
@@ -1135,6 +1223,27 @@ The default value is zero. This is a safe value, but it may be suboptimal.
                      issuing this instruction.
  ================== ======================================================
+.. _amdgpu_synid_wait_vm_vsrc:
+
+wait_vm_vsrc
+~~~~~~~~~~~~
+
+Manually specify a wait on the VM_VSRC counter before issuing this instruction. VM_VSRC must be less
+than or equal to this value before the instruction is issued. If set to 1, no wait is performed.
+
+If unspecified, the default is zero. This is a safe value, but it may be suboptimal.
+
+This modifier is shorthand for handling the WAR hazard in which a VMEM instruction reads a VGPR that is
+then written by a parameter load.
+
+Examples:
+
+.. parsed-literal::
+
+  buffer_load_b32 v1, v0, s0, 0
+  ds_param_load v0, . . . wait_vm_vsrc:0
+
+
 DPP8 Modifiers
 --------------
diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst
index e8a76322fe76a..722290fb72e16 100644
--- a/llvm/docs/AMDGPUOperandSyntax.rst
+++ b/llvm/docs/AMDGPUOperandSyntax.rst
@@ -479,6 +479,7 @@ High and low 32 bits of *xnack mask* may be accessed as separate registers:
 .. _amdgpu_synid_vcc:
 .. _amdgpu_synid_vcc_lo:
+.. _amdgpu_synid_vcc_hi:
 vcc
 ---
@@ -523,6 +524,8 @@ including register indexing and bounds checking.
 =========== ===================================================
 .. _amdgpu_synid_exec:
+.. _amdgpu_synid_exec_lo:
+.. _amdgpu_synid_exec_hi:
 exec
 ----
@@ -752,6 +755,14 @@ or an :ref:`absolute expression`.
 The value must be in the range -0x100000..0x0FFFFF.
+.. _amdgpu_synid_simm8:
+
+simm8
+-----
+
+An 8-bit :ref:`integer number`
+or an :ref:`absolute expression`.
+
 .. _amdgpu_synid_off:

 off
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 74b7604fda56d..a4d110fbbb38b 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -22,6 +22,7 @@ User Guide for AMDGPU Backend
    AMDGPU/AMDGPUAsmGFX1013
    AMDGPU/AMDGPUAsmGFX1030
    AMDGPU/AMDGPUAsmGFX11
+   AMDGPU/AMDGPUAsmGFX12
    AMDGPUModifierSyntax
    AMDGPUOperandSyntax
    AMDGPUInstructionSyntax
@@ -19908,6 +19909,7 @@ in this description.
                :doc:`gfx1102`
                :doc:`gfx1103`
+ RDNA 4        :doc:`GFX12`                 :doc:`gfx1200`
 ============= ============================================= =======================================
 For more information about instructions, their semantics and supported

From 11a4b2d950baa4ddb31505b71a1736fa1f3242b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?=
Date: Wed, 1 Oct 2025 15:32:07 -0700
Subject: [PATCH 441/878] Cleanup the LLVM exported symbols namespace (#161240)

There's a pattern throughout LLVM of cl::opts being exported. That in
itself is probably a bit unfortunate, but what's especially bad about it
is that a lot of those symbols are in the global namespace. Move them
into the llvm namespace.

While doing this, I noticed some other variables in the global namespace
and moved them as well.
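As a minimal sketch of the pattern (the option and symbol names here are
illustrative, not taken from this diff):

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/Compiler.h" // for LLVM_ABI

    // Before: the exported option lives in the global namespace, so the
    // exported symbol is ::SomeTunable.
    LLVM_ABI extern llvm::cl::opt<bool> SomeTunable;  // header declaration
    llvm::cl::opt<bool> SomeTunable("some-tunable");  // definition in a .cpp

    // After: declaration and definition are both wrapped in namespace llvm,
    // so the exported symbol becomes llvm::SomeTunable.
    namespace llvm {
    LLVM_ABI extern cl::opt<bool> SomeTunable;        // header declaration
    cl::opt<bool> SomeTunable("some-tunable");        // definition in a .cpp
    } // end namespace llvm

Options with no users outside their own file are instead marked static
and stay file-local.
---
 llvm/include/llvm/Target/TargetMachine.h | 4 ++--
 llvm/lib/Analysis/CtxProfAnalysis.cpp | 4 +++-
 llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp | 4 ++++
 llvm/lib/Analysis/MemoryProfileInfo.cpp | 4 ++++
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 2 +-
 llvm/lib/Analysis/ProfileSummaryInfo.cpp | 4 ++++
 llvm/lib/CGData/CodeGenData.cpp | 3 +++
 llvm/lib/CGData/CodeGenDataReader.cpp | 4 ++--
 llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 2 +-
 llvm/lib/CodeGen/MachineRegionInfo.cpp | 3 ++-
 llvm/lib/CodeGen/RegAllocScore.cpp | 4 ++++
 llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp | 2 +-
 llvm/lib/IR/Instruction.cpp | 4 ++++
 llvm/lib/IR/Value.cpp | 2 +-
 llvm/lib/LTO/LTO.cpp | 3 ++-
 llvm/lib/Passes/PassBuilderPipelines.cpp | 3 ++-
 llvm/lib/ProfileData/MemProfCommon.cpp | 4 ++++
 llvm/lib/Target/TargetMachine.cpp | 2 +-
 llvm/lib/Transforms/IPO/FunctionImport.cpp | 4 ++--
 llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 11 ++++++++---
 .../Transforms/IPO/MemProfContextDisambiguation.cpp | 3 ++-
 llvm/lib/Transforms/IPO/SampleProfile.cpp | 3 ++-
 llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp | 4 ++++
 llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 4 ++++
 .../Transforms/InstCombine/InstructionCombining.cpp | 10 +++++++---
 .../Instrumentation/IndirectCallPromotion.cpp | 4 ++--
 .../Transforms/Instrumentation/PGOInstrumentation.cpp | 2 --
 .../Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 4 ++++
 .../Instrumentation/ValueProfilePlugins.inc | 2 ++
 llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp | 4 ++++
 llvm/lib/Transforms/Scalar/LICM.cpp | 8 ++++++--
 llvm/lib/Transforms/Utils/FunctionImportUtils.cpp | 4 ++++
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 6 +++++-
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +-
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp | 2 ++
 llvm/unittests/Analysis/MemoryProfileInfoTest.cpp | 2 ++
 llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp | 5 +++--
 llvm/unittests/CodeGen/RegAllocScoreTest.cpp | 3 +++
 llvm/unittests/ProfileData/MemProfTest.cpp | 11 ++++++-----
 39 files changed, 118 insertions(+), 38 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index bf4e490554723..d0fd483a8ddaa 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++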
b/llvm/include/llvm/Target/TargetMachine.h @@ -29,10 +29,10 @@ #include #include -LLVM_ABI extern llvm::cl::opt NoKernelInfoEndLTO; - namespace llvm { +LLVM_ABI extern llvm::cl::opt NoKernelInfoEndLTO; + class AAManager; using ModulePassManager = PassManager; diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index a363bce0570e7..c4abec02e765a 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -30,6 +30,9 @@ #define DEBUG_TYPE "ctx_prof" using namespace llvm; + +namespace llvm { + cl::opt UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden, cl::desc("Use the specified contextual profile file")); @@ -50,7 +53,6 @@ static cl::opt ForceIsInSpecializedModule( const char *AssignGUIDPass::GUIDMetadataName = "guid"; -namespace llvm { class ProfileAnnotatorImpl final { friend class ProfileAnnotator; class BBInfo; diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp index 7b93474e4dc7b..25e7a97065b27 100644 --- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp +++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp @@ -22,6 +22,8 @@ using namespace llvm; #define DEBUG_TYPE "pgo-icall-prom-analysis" +namespace llvm { + // The percent threshold for the direct-call target (this call site vs the // remaining call count) for it to be considered as the promotion target. static cl::opt ICPRemainingPercentThreshold( @@ -54,6 +56,8 @@ cl::opt MaxNumVTableAnnotations( "icp-max-num-vtables", cl::init(6), cl::Hidden, cl::desc("Max number of vtables annotated for a vtable load instruction.")); +} // end namespace llvm + bool ICallPromotionAnalysis::isPromotionProfitable(uint64_t Count, uint64_t TotalCount, uint64_t RemainingCount) { diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index b5ca6b13108fe..11602d29c1313 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -22,6 +22,8 @@ using namespace llvm::memprof; #define DEBUG_TYPE "memory-profile-info" +namespace llvm { + cl::opt MemProfReportHintedSizes( "memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations")); @@ -52,6 +54,8 @@ cl::opt MinPercentMaxColdSize( "memprof-min-percent-max-cold-size", cl::init(100), cl::Hidden, cl::desc("Min percent of max cold bytes for critical cold context")); +} // end namespace llvm + bool llvm::memprof::metadataIncludesAllContextSizeInfo() { return MemProfReportHintedSizes || MinClonedColdBytePercent < 100; } diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index a317ac471a231..a60a4bb1194e2 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -67,7 +67,6 @@ using namespace llvm::memprof; namespace llvm { FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold = FunctionSummary::FSHT_None; -} // namespace llvm static cl::opt FSEC( "force-summary-edges-cold", cl::Hidden, cl::location(ForceSummaryEdgesCold), @@ -91,6 +90,7 @@ LLVM_ABI extern cl::opt ScalePartialSampleProfileWorkingSetSize; extern cl::opt MaxNumVTableAnnotations; extern cl::opt MemProfReportHintedSizes; +} // namespace llvm // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. 
Invoked either on an diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp index f1c3155f2f141..44d7a175cc7fe 100644 --- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp +++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp @@ -24,6 +24,8 @@ #include using namespace llvm; +namespace llvm { + static cl::opt PartialProfile( "partial-profile", cl::Hidden, cl::init(false), cl::desc("Specify the current profile is used as a partial profile.")); @@ -44,6 +46,8 @@ static cl::opt PartialSampleProfileWorkingSetSizeScaleFactor( "and the factor to scale the working set size to use the same " "shared thresholds as PGO.")); +} // end namespace llvm + // The profile summary metadata may be attached either by the frontend or by // any backend passes (IR level instrumentation, for example). This method // checks if the Summary is null and if so checks if the summary metadata is now diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp index b4f08c3d13b0d..7900dc7653c03 100644 --- a/llvm/lib/CGData/CodeGenData.cpp +++ b/llvm/lib/CGData/CodeGenData.cpp @@ -31,11 +31,14 @@ static cl::opt static cl::opt CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden, cl::desc("File path to where .cgdata file is read")); + +namespace llvm { cl::opt CodeGenDataThinLTOTwoRounds( "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden, cl::desc("Enable two-round ThinLTO code generation. The first round " "emits codegen data, while the second round uses the emitted " "codegen data for further optimizations.")); +} // end namespace llvm static std::string getCGDataErrString(cgdata_error Err, const std::string &ErrMsg = "") { diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index 3fd8cfe1a8762..b1cd939db9a4f 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -26,14 +26,14 @@ static cl::opt IndexedCodeGenDataReadFunctionMapNames( "disabled to save memory and time for final consumption of the " "indexed CodeGenData in production.")); +namespace llvm { + cl::opt IndexedCodeGenDataLazyLoading( "indexed-codegen-data-lazy-loading", cl::init(false), cl::Hidden, cl::desc( "Lazily load indexed CodeGenData. Enable to save memory and time " "for final consumption of the indexed CodeGenData in production.")); -namespace llvm { - static Expected> setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) { auto BufferOrErr = Filename.str() == "-" ? 
MemoryBuffer::getSTDIN() diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 477e5c1559b26..c2d474fdde696 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -34,7 +34,7 @@ cl::opt llvm::DisableGISelLegalityCheck( cl::desc("Don't verify that MIR is fully legal between GlobalISel passes"), cl::Hidden); -cl::opt VerboseVerifyLegalizerInfo( +static cl::opt VerboseVerifyLegalizerInfo( "verbose-gisel-verify-legalizer-info", cl::desc("Print more information to dbgs about GlobalISel legalizer rules " "being verified"), diff --git a/llvm/lib/CodeGen/MachineRegionInfo.cpp b/llvm/lib/CodeGen/MachineRegionInfo.cpp index f8268b8894ca3..366755af08e49 100644 --- a/llvm/lib/CodeGen/MachineRegionInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegionInfo.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/RegionInfoImpl.h" #include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/Passes.h" #include "llvm/Config/llvm-config.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -127,7 +128,7 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const { #endif char MachineRegionInfoPass::ID = 0; -char &MachineRegionInfoPassID = MachineRegionInfoPass::ID; +char &llvm::MachineRegionInfoPassID = MachineRegionInfoPass::ID; INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE, "Detect single entry single exit regions", true, true) diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp index 9c9cc1f1f0b7b..280946bb0d0b3 100644 --- a/llvm/lib/CodeGen/RegAllocScore.cpp +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -23,6 +23,8 @@ #include "llvm/Support/CommandLine.h" using namespace llvm; + +namespace llvm { LLVM_ABI cl::opt CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden); LLVM_ABI cl::opt LoadWeight("regalloc-load-weight", cl::init(4.0), @@ -33,6 +35,8 @@ LLVM_ABI cl::opt CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2), cl::Hidden); LLVM_ABI cl::opt ExpensiveRematWeight("regalloc-expensive-remat-weight", cl::init(1.0), cl::Hidden); +} // end namespace llvm + #define DEBUG_TYPE "regalloc-score" RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) { diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp index c1017d8a3c22f..d973a47f68732 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVReader.cpp @@ -148,7 +148,7 @@ std::error_code LVSplitContext::open(std::string ContextName, return std::error_code(); } -LVReader *CurrentReader = nullptr; +static LVReader *CurrentReader = nullptr; LVReader &LVReader::getInstance() { if (CurrentReader) return *CurrentReader; diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index a8bb34f69c629..33ca46ca1c2c6 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -30,6 +30,8 @@ #include "llvm/Support/Compiler.h" using namespace llvm; +namespace llvm { + // FIXME: Flag used for an ablation performance test, Issue #147390. Placing it // here because referencing IR should be feasible from anywhere. Will be // removed after the ablation test. @@ -38,6 +40,8 @@ cl::opt ProfcheckDisableMetadataFixes( cl::desc( "Disable metadata propagation fixes discovered through Issue #147390")); +} // end namespace llvm + InsertPosition::InsertPosition(Instruction *InsertBefore) : InsertAt(InsertBefore ? 
InsertBefore->getIterator() : InstListType::iterator()) {} diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index e5e062d1cf4e2..a3476092253e7 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -36,7 +36,7 @@ using namespace llvm; -cl::opt UseDerefAtPointSemantics( +static cl::opt UseDerefAtPointSemantics( "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), cl::desc("Deref attributes and metadata infer facts at definition only")); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 7b252627d73f9..e6544f3bafff4 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -75,9 +75,10 @@ static cl::opt DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden, cl::desc("Dump the SCCs in the ThinLTO index's callgraph")); +namespace llvm { extern cl::opt CodeGenDataThinLTOTwoRounds; - extern cl::opt ForceImportAll; +} // end namespace llvm namespace llvm { /// Enable global value internalization in LTO. diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 256cf9d4cd1ce..373b3c3ee56a9 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -150,6 +150,8 @@ using namespace llvm; +namespace llvm { + static cl::opt UseInlineAdvisor( "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), @@ -305,7 +307,6 @@ static cl::opt InstrumentColdFuncOnlyPath( extern cl::opt UseCtxProfile; extern cl::opt PGOInstrumentColdFunctionOnly; -namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; } // namespace llvm diff --git a/llvm/lib/ProfileData/MemProfCommon.cpp b/llvm/lib/ProfileData/MemProfCommon.cpp index a13a291a4cd27..cfd2efddce27b 100644 --- a/llvm/lib/ProfileData/MemProfCommon.cpp +++ b/llvm/lib/ProfileData/MemProfCommon.cpp @@ -20,6 +20,8 @@ using namespace llvm; using namespace llvm::memprof; +namespace llvm { + // Upper bound on lifetime access density (accesses per byte per lifetime sec) // for marking an allocation cold. 
LLVM_ABI cl::opt MemProfLifetimeAccessDensityColdThreshold( @@ -48,6 +50,8 @@ LLVM_ABI cl::opt cl::desc("Enable use of hot hints (only supported for " "unambigously hot allocations)")); +} // end namespace llvm + AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity, uint64_t AllocCount, uint64_t TotalLifetime) { diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index ad7e503cb1552..cf8569194d778 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -27,7 +27,7 @@ #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; -cl::opt NoKernelInfoEndLTO( +cl::opt llvm::NoKernelInfoEndLTO( "no-kernel-info-end-lto", cl::desc("remove the kernel-info pass at the end of the full LTO pipeline"), cl::init(false), cl::Hidden); diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 83aa7de5400f5..28ee4449421bd 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -72,6 +72,7 @@ STATISTIC(NumImportedModules, "Number of modules imported from"); STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index"); STATISTIC(NumLiveSymbols, "Number of live symbols in index"); +namespace llvm { cl::opt ForceImportAll("force-import-all", cl::init(false), cl::Hidden, cl::desc("Import functions with noinline attribute")); @@ -185,9 +186,8 @@ static cl::opt CtxprofMoveRootsToOwnModule( extern cl::list MoveSymbolGUID; -namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; -} +} // end namespace llvm // Load lazily a module from \p FileName in \p Context. static std::unique_ptr loadFile(const std::string &FileName, diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index 4f5373846f43a..150a2dc5d48e2 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -28,10 +28,13 @@ using namespace llvm; STATISTIC(NumSpecsCreated, "Number of specializations created"); +namespace llvm { + static cl::opt ForceSpecialization( - "force-specialization", cl::init(false), cl::Hidden, cl::desc( - "Force function specialization for every call site with a constant " - "argument")); + "force-specialization", cl::init(false), cl::Hidden, + cl::desc( + "Force function specialization for every call site with a constant " + "argument")); static cl::opt MaxClones( "funcspec-max-clones", cl::init(3), cl::Hidden, cl::desc( @@ -91,6 +94,8 @@ static cl::opt SpecializeLiteralConstant( extern cl::opt ProfcheckDisableMetadataFixes; +} // end namespace llvm + bool InstCostVisitor::canEliminateSuccessor(BasicBlock *BB, BasicBlock *Succ) const { unsigned I = 0; diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 15f4d76300bff..c4f1b680a53ec 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -214,11 +214,12 @@ static cl::opt MemProfRequireDefinitionForPromotion( "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, cl::desc( "Require target function definition when promoting indirect calls")); -} // namespace llvm extern cl::opt MemProfReportHintedSizes; extern cl::opt MinClonedColdBytePercent; +} // namespace llvm + namespace { /// CRTP base for graphs built from either IR or ThinLTO summary index. 
/// diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 99b8b88ebedbb..e39e311dd795f 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -116,6 +116,8 @@ STATISTIC( NumCSInlinedHitGrowthLimit, "Number of functions with FDO inline stopped due to growth size limit"); +namespace llvm { + // Command line option to specify the file to read samples from. This is // mainly used for debugging. static cl::opt SampleProfileFile( @@ -198,7 +200,6 @@ static cl::opt DisableSampleLoaderInlining( "pass, and merge (or scale) profiles (as configured by " "--sample-profile-merge-inlinee).")); -namespace llvm { cl::opt SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden, cl::desc("Sort profiled recursion by edge weights.")); diff --git a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp index 093a39eb4b5d7..70b8614826826 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileMatcher.cpp @@ -23,6 +23,8 @@ using namespace sampleprof; #define DEBUG_TYPE "sample-profile-matcher" +namespace llvm { + static cl::opt FuncProfileSimilarityThreshold( "func-profile-similarity-threshold", cl::Hidden, cl::init(80), cl::desc("Consider a profile matches a function if the similarity of their " @@ -55,6 +57,8 @@ static cl::opt SalvageStaleProfileMaxCallsites( cl::desc("The maximum number of callsites in a function, above which stale " "profile matching will be skipped.")); +} // end namespace llvm + void SampleProfileMatcher::findIRAnchors(const Function &F, AnchorMap &IRAnchors) const { // For inlined code, recover the original callsite and callee by finding the diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 09bffa7bf5846..ac41fdd988605 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -120,6 +120,8 @@ STATISTIC(NumVirtConstProp1Bit, "Number of 1 bit virtual constant propagations"); STATISTIC(NumVirtConstProp, "Number of virtual constant propagations"); +namespace llvm { + static cl::opt ClSummaryAction( "wholeprogramdevirt-summary-action", cl::desc("What to do with the summary when running this pass"), @@ -175,6 +177,8 @@ static cl::list extern cl::opt ProfcheckDisableMetadataFixes; +} // end namespace llvm + /// With Clang, a pure virtual class's deleting destructor is emitted as a /// `llvm.trap` intrinsic followed by an unreachable IR instruction. 
In the /// context of whole program devirtualization, the deleting destructor of a pure diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 5d2d79e420931..917004c4702b6 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -132,9 +132,11 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); -static cl::opt -EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), - cl::init(true)); +namespace llvm { + +static cl::opt EnableCodeSinking("instcombine-code-sinking", + cl::desc("Enable code sinking"), + cl::init(true)); static cl::opt MaxSinkNumUsers( "instcombine-max-sink-users", cl::init(32), @@ -156,6 +158,8 @@ extern cl::opt ProfcheckDisableMetadataFixes; static cl::opt ShouldLowerDbgDeclare("instcombine-lower-dbg-declare", cl::Hidden, cl::init(true)); +} // end namespace llvm + std::optional InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) { // Handle target specific intrinsics diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 0249f210f4754..cf87e354aef56 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -55,11 +55,11 @@ using namespace llvm; STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions."); STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites."); +namespace llvm { extern cl::opt MaxNumVTableAnnotations; -namespace llvm { extern cl::opt EnableVTableProfileUse; -} +} // namespace llvm // Command line option to disable indirect-call promotion with the default as // false. This is for debug purpose. diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index d9e850e7a2bf3..120c4f65a7292 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -222,7 +222,6 @@ cl::opt NoPGOWarnMismatchComdatWeak( cl::desc("The option is used to turn on/off " "warnings about hash mismatch for comdat " "or weak functions.")); -} // namespace llvm // Command line option to enable/disable select instruction instrumentation. static cl::opt @@ -347,7 +346,6 @@ cl::list CtxPGOSkipCallsiteInstrument( extern cl::opt MaxNumVTableAnnotations; -namespace llvm { // Command line option to turn on CFG dot dump after profile annotation. // Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts extern cl::opt PGOViewCounts; diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 343bec37018c5..a5f417a02a99a 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -54,6 +54,8 @@ using namespace llvm; STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized."); STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated."); +namespace llvm { + // The minimum call count to optimize memory intrinsic calls. 
static cl::opt MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::init(1000), @@ -93,6 +95,8 @@ static cl::opt MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128), cl::desc("Optimize the memop size <= this value")); +} // end namespace llvm + namespace { static const char *getMIName(const MemIntrinsic *MI) { diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc index a3d4e5367b9ab..0534fdd0c756c 100644 --- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc +++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -21,7 +21,9 @@ using namespace llvm; using CandidateInfo = ValueProfileCollector::CandidateInfo; +namespace llvm { extern cl::opt MemOPOptMemcmpBcmp; +} // end namespace llvm ///--------------------------- MemIntrinsicPlugin ------------------------------ class MemIntrinsicPlugin : public InstVisitor { diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp index 2025fbbf05973..36f9bb451e23d 100644 --- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp +++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp @@ -26,6 +26,8 @@ using namespace llvm; +namespace llvm { + static cl::opt JumpTableSizeThreshold("jump-table-to-switch-size-threshold", cl::Hidden, cl::desc("Only split jump tables with size less or " @@ -43,6 +45,8 @@ static cl::opt FunctionSizeThreshold( extern cl::opt ProfcheckDisableMetadataFixes; +} // end namespace llvm + #define DEBUG_TYPE "jump-table-to-switch" namespace { diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index bab1f2a90a8fd..9655173de4422 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -116,6 +116,8 @@ STATISTIC(NumIntAssociationsHoisted, STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions " "reassociated and hoisted out of the loop"); +namespace llvm { + /// Memory promotion is enabled by default. static cl::opt DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false), @@ -154,7 +156,7 @@ static cl::opt IntAssociationUpperLimit( // which may not be precise, since optimizeUses is capped. The result is // correct, but we may not get as "far up" as possible to get which access is // clobbering the one queried. -cl::opt llvm::SetLicmMssaOptCap( +cl::opt SetLicmMssaOptCap( "licm-mssa-optimization-cap", cl::init(100), cl::Hidden, cl::desc("Enable imprecision in LICM in pathological cases, in exchange " "for faster compile. Caps the MemorySSA clobbering calls.")); @@ -162,7 +164,7 @@ cl::opt llvm::SetLicmMssaOptCap( // Experimentally, memory promotion carries less importance than sinking and // hoisting. Limit when we do promotion when using MemorySSA, in order to save // compile time. -cl::opt llvm::SetLicmMssaNoAccForPromotionCap( +cl::opt SetLicmMssaNoAccForPromotionCap( "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden, cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no " "effect. 
When MSSA in LICM is enabled, then this is the maximum " @@ -171,6 +173,8 @@ cl::opt llvm::SetLicmMssaNoAccForPromotionCap( extern cl::opt ProfcheckDisableMetadataFixes; +} // end namespace llvm + static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedOrFoldableInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 1a9e16be6989e..d31154fcf085d 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -17,6 +17,8 @@ using namespace llvm; +namespace llvm { + /// Uses the "source_filename" instead of a Module hash ID for the suffix of /// promoted locals during LTO. NOTE: This requires that the source filename /// has a unique name / path to avoid name collisions. @@ -35,6 +37,8 @@ cl::list MoveSymbolGUID( "used with the name of contextual profiling roots."), cl::Hidden); +} // end namespace llvm + FunctionImportGlobalProcessing::FunctionImportGlobalProcessing( Module &M, const ModuleSummaryIndex &Index, SetVector *GlobalsToImport, bool ClearDSOLocalOnDeclarations) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 4d1f768e2177a..8bba634521e3e 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -95,7 +95,9 @@ using namespace PatternMatch; #define DEBUG_TYPE "simplifycfg" -cl::opt llvm::RequireAndPreserveDomTree( +namespace llvm { + +cl::opt RequireAndPreserveDomTree( "simplifycfg-require-and-preserve-domtree", cl::Hidden, cl::desc( @@ -205,6 +207,8 @@ static cl::opt MaxJumpThreadingLiveBlocks( extern cl::opt ProfcheckDisableMetadataFixes; +} // end namespace llvm + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a73b083cff7fd..acdb37996a443 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -40,7 +40,7 @@ using namespace llvm; using namespace VPlanPatternMatch; -cl::opt EnableWideActiveLaneMask( +static cl::opt EnableWideActiveLaneMask( "enable-wide-lane-mask", cl::init(false), cl::Hidden, cl::desc("Enable use of wide get active lane mask instructions")); diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp index 047557e5a7fae..ea89c4d1dd25f 100644 --- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp +++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp @@ -83,7 +83,9 @@ static CGDataAction Action; static std::optional OutputFormat; static std::vector InputFilenames; +namespace llvm { extern cl::opt IndexedCodeGenDataLazyLoading; +} // end namespace llvm static void exitWithError(Twine Message, StringRef Whence = "", StringRef Hint = "") { diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp index 8c4fd8bb449d5..d1c0f643f5cd7 100644 --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -24,7 +24,9 @@ using namespace llvm; using namespace llvm::memprof; +namespace llvm { LLVM_ABI extern cl::opt MemProfKeepAllNotColdContexts; +} // end namespace llvm namespace { diff --git 
a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp index 45dc50ec0839b..c8752c78d1c35 100644 --- a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp +++ b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp @@ -25,9 +25,10 @@ #include "llvm/Support/raw_ostream.h" #include "gtest/gtest.h" -LLVM_ABI extern llvm::cl::opt ScalePartialSampleProfileWorkingSetSize; - namespace llvm { + +LLVM_ABI extern cl::opt ScalePartialSampleProfileWorkingSetSize; + namespace { class ProfileSummaryInfoTest : public testing::Test { diff --git a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp index 86bfc7a81d1be..432dc9348fbd3 100644 --- a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp +++ b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp @@ -31,11 +31,14 @@ #include "gtest/gtest.h" using namespace llvm; + +namespace llvm { LLVM_ABI extern cl::opt CopyWeight; LLVM_ABI extern cl::opt LoadWeight; LLVM_ABI extern cl::opt StoreWeight; LLVM_ABI extern cl::opt CheapRematWeight; LLVM_ABI extern cl::opt ExpensiveRematWeight; +} // namespace llvm namespace { // Include helper functions to ease the manipulation of MachineFunctions. diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index abe36bc759658..6ea951eee920b 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -26,13 +26,14 @@ #include -LLVM_ABI extern llvm::cl::opt MemProfLifetimeAccessDensityColdThreshold; -LLVM_ABI extern llvm::cl::opt MemProfAveLifetimeColdThreshold; -LLVM_ABI extern llvm::cl::opt +namespace llvm { + +LLVM_ABI extern cl::opt MemProfLifetimeAccessDensityColdThreshold; +LLVM_ABI extern cl::opt MemProfAveLifetimeColdThreshold; +LLVM_ABI extern cl::opt MemProfMinAveLifetimeAccessDensityHotThreshold; -LLVM_ABI extern llvm::cl::opt MemProfUseHotHints; +LLVM_ABI extern cl::opt MemProfUseHotHints; -namespace llvm { namespace memprof { namespace { From b181c22c54bd8c3f6d8a3071661572c5782a7a30 Mon Sep 17 00:00:00 2001 From: Georgiy Samoylov Date: Thu, 2 Oct 2025 01:40:42 +0300 Subject: [PATCH 442/878] [lldb][RISCV] Fixed TestSymbolFileJSON for RISC-V (#161497) This test failed during testing on the RISC-V target because we couldn't strip the main label from the binary. main is dynamically linked when the -fPIC flag is enabled. The RISC-V ABI requires that executables support loading at arbitrary addresses to enable shared libraries and secure loading (ASLR). In PIC mode, function addresses cannot be hardcoded in the code. Instead, code is generated to load addresses from the GOT/PLT tables, which are initialized by the dynamic loader. The reference to main thus ends up in .dynsym and is dynamically bound. We cannot strip main or any other dynamically linked functions because these functions are referenced indirectly via dynamic linking tables (.plt and .got). Removing these symbols would break the dynamic linking mechanism needed to resolve function addresses at runtime, causing the executable to fail to correctly call them. 
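As an illustrative sketch (hypothetical file and variable names, not part of this patch), the mechanism described above looks like this in a PIE build, where a default-visibility function such as main is preemptible and its address is materialized through the GOT:

    /* sketch.c -- illustrative only; compare -fPIC/-pie vs. -no-pie */
    int main(int argc, char **argv);
    /* Under PIC, this address is resolved by the dynamic loader through
       the GOT, so "main" must remain in .dynsym; stripping it would break
       the relocation. With -no-pie (the fix below), the address is a
       link-time constant and the symbol can be stripped safely. */
    int (*entry)(int, char **) = main;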
--- lldb/test/API/functionalities/json/symbol-file/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/functionalities/json/symbol-file/Makefile b/lldb/test/API/functionalities/json/symbol-file/Makefile index 13bc164582eee..5d05d95fc8428 100644 --- a/lldb/test/API/functionalities/json/symbol-file/Makefile +++ b/lldb/test/API/functionalities/json/symbol-file/Makefile @@ -1,4 +1,5 @@ C_SOURCES := main.c +CFLAGS_EXTRAS := -no-pie all: stripped.out From 9ba1121e3c200bd7935ced3f33d161a0f488609b Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 1 Oct 2025 16:07:26 -0700 Subject: [PATCH 443/878] [RISCV] Remove break after return in RISCVVEmitter.cpp. NFC (#161599) --- clang/utils/TableGen/RISCVVEmitter.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index f73b0aecc3b6d..74f29ac43ac40 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -133,28 +133,20 @@ static BasicType ParseBasicType(char c) { switch (c) { case 'c': return BasicType::Int8; - break; case 's': return BasicType::Int16; - break; case 'i': return BasicType::Int32; - break; case 'l': return BasicType::Int64; - break; case 'x': return BasicType::Float16; - break; case 'f': return BasicType::Float32; - break; case 'd': return BasicType::Float64; - break; case 'y': return BasicType::BFloat16; - break; default: return BasicType::Unknown; } From aeffd3645aae854d0dabc8ed45168fb696e6ee39 Mon Sep 17 00:00:00 2001 From: Bruno Cardoso Lopes Date: Wed, 1 Oct 2025 16:10:53 -0700 Subject: [PATCH 444/878] [Docs] Add CIR related meetings to GettingInvolved page (#157181) Co-authored-by: Andy Kaylor --- llvm/docs/GettingInvolved.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index f7e1374ec2aef..4b4b09ad87aba 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -213,6 +213,16 @@ what to add to your calendar invite. - `ics `__ `gcal `__ - `Minutes/docs `__ + * - MLIR C/C++ Frontend Working Group + - Monthly, usually 1st Monday of the month + - `ics `__ + `gcal `__ + - `Minutes/docs `__ + * - ClangIR Upstreaming Coordination Meeting + - Every 2 weeks on Mondays + - `ics `__ + `gcal `__ + - For event owners, our Discord bot also supports sending automated announcements From 52b185075919a3d8ec9dc6b7d7da365e28532735 Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Wed, 1 Oct 2025 16:39:22 -0700 Subject: [PATCH 445/878] [IR2Vec] Add support for Cmp predicates in vocabulary and embeddings (#156952) Comparison predicates (equal, not equal, greater than, etc.) provide important semantic information about program behavior. Previously, IR2Vec only captured that a comparison was happening but not what kind of comparison it was. This PR extends the IR2Vec vocabulary to include comparison predicates (ICmp and FCmp) as part of the embedding space. Following are the changes: 1. Expand the vocabulary slot layout to include predicate entries after opcodes, types, and operands 2. Add methods to handle predicate embedding lookups and conversions 3. Update the embedder implementations to include predicate information when processing CmpInst instructions 4. 
Update test files to include the new predicate entries in the vocabulary (Tracking issues: #141817, #141833) --- llvm/include/llvm/Analysis/IR2Vec.h | 55 ++++++++++++-- llvm/lib/Analysis/IR2Vec.cpp | 76 ++++++++++++++++--- .../IR2Vec/Inputs/dummy_2D_vocab.json | 28 ++++++- .../Inputs/dummy_3D_nonzero_arg_vocab.json | 28 ++++++- .../Inputs/dummy_3D_nonzero_opc_vocab.json | 29 ++++++- .../Inputs/reference_default_vocab_print.txt | 26 +++++++ .../Inputs/reference_wtd1_vocab_print.txt | 26 +++++++ .../Inputs/reference_wtd2_vocab_print.txt | 26 +++++++ llvm/test/Analysis/IR2Vec/if-else.ll | 2 +- llvm/test/Analysis/IR2Vec/unreachable.ll | 2 +- llvm/test/tools/llvm-ir2vec/entities.ll | 28 ++++++- llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 2 +- llvm/unittests/Analysis/IR2VecTest.cpp | 47 +++++++++++- 13 files changed, 351 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 3671c1c71ac0b..f3f9de460218b 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -36,6 +36,7 @@ #define LLVM_ANALYSIS_IR2VEC_H #include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" @@ -162,15 +163,34 @@ using BBEmbeddingsMap = DenseMap; /// embeddings. class Vocabulary { friend class llvm::IR2VecVocabAnalysis; + + // Vocabulary Slot Layout: + // +----------------+------------------------------------------------------+ + // | Entity Type | Index Range | + // +----------------+------------------------------------------------------+ + // | Opcodes | [0 .. (MaxOpcodes-1)] | + // | Canonical Types| [MaxOpcodes .. (MaxOpcodes+MaxCanonicalTypeIDs-1)] | + // | Operands | [(MaxOpcodes+MaxCanonicalTypeIDs) .. NumCanEntries] | + // +----------------+------------------------------------------------------+ + // Note: MaxOpcodes is the number of unique opcodes supported by LLVM IR. + // MaxCanonicalTypeIDs is the number of canonicalized type IDs. + // "Similar" LLVM Types are grouped/canonicalized together. E.g., all + // float variants (FloatTy, DoubleTy, HalfTy, etc.) map to + // CanonicalTypeID::FloatTy. This helps reduce the vocabulary size + // and improves learning. Operands include Comparison predicates + // (ICmp/FCmp) along with other operand types. This can be extended to + // include other specializations in future. using VocabVector = std::vector; VocabVector Vocab; -public: - // Slot layout: - // [0 .. MaxOpcodes-1] => Instruction opcodes - // [MaxOpcodes .. MaxOpcodes+MaxCanonicalTypeIDs-1] => Canonicalized types - // [MaxOpcodes+MaxCanonicalTypeIDs .. NumCanonicalEntries-1] => Operand kinds + static constexpr unsigned NumICmpPredicates = + static_cast(CmpInst::LAST_ICMP_PREDICATE) - + static_cast(CmpInst::FIRST_ICMP_PREDICATE) + 1; + static constexpr unsigned NumFCmpPredicates = + static_cast(CmpInst::LAST_FCMP_PREDICATE) - + static_cast(CmpInst::FIRST_FCMP_PREDICATE) + 1; +public: /// Canonical type IDs supported by IR2Vec Vocabulary enum class CanonicalTypeID : unsigned { FloatTy, @@ -207,13 +227,18 @@ class Vocabulary { static_cast(CanonicalTypeID::MaxCanonicalType); static constexpr unsigned MaxOperandKinds = static_cast(OperandKind::MaxOperandKind); + // CmpInst::Predicate has gaps. We want the vocabulary to be dense without + // empty slots. 
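+  // For example: CmpInst::Predicate numbers FCMP_FALSE..FCMP_TRUE as 0..15
+  // and ICMP_EQ..ICMP_SLE as 32..41, with unused values in between. The
+  // dense layout maps the 16 FCmp predicates to local indices 0..15 and the
+  // 10 ICmp predicates to 16..25, for 26 predicate slots in total.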
+ static constexpr unsigned MaxPredicateKinds = + NumICmpPredicates + NumFCmpPredicates; Vocabulary() = default; LLVM_ABI Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)) {} LLVM_ABI bool isValid() const { return Vocab.size() == NumCanonicalEntries; }; LLVM_ABI unsigned getDimension() const; - /// Total number of entries (opcodes + canonicalized types + operand kinds) + /// Total number of entries (opcodes + canonicalized types + operand kinds + + /// predicates) static constexpr size_t getCanonicalSize() { return NumCanonicalEntries; } /// Function to get vocabulary key for a given Opcode @@ -228,16 +253,21 @@ class Vocabulary { /// Function to classify an operand into OperandKind LLVM_ABI static OperandKind getOperandKind(const Value *Op); + /// Function to get vocabulary key for a given predicate + LLVM_ABI static StringRef getVocabKeyForPredicate(CmpInst::Predicate P); + /// Functions to return the slot index or position of a given Opcode, TypeID, /// or OperandKind in the vocabulary. LLVM_ABI static unsigned getSlotIndex(unsigned Opcode); LLVM_ABI static unsigned getSlotIndex(Type::TypeID TypeID); LLVM_ABI static unsigned getSlotIndex(const Value &Op); + LLVM_ABI static unsigned getSlotIndex(CmpInst::Predicate P); /// Accessors to get the embedding for a given entity. LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const; LLVM_ABI const ir2vec::Embedding &operator[](Type::TypeID TypeId) const; LLVM_ABI const ir2vec::Embedding &operator[](const Value &Arg) const; + LLVM_ABI const ir2vec::Embedding &operator[](CmpInst::Predicate P) const; /// Const Iterator type aliases using const_iterator = VocabVector::const_iterator; @@ -274,7 +304,13 @@ class Vocabulary { private: constexpr static unsigned NumCanonicalEntries = - MaxOpcodes + MaxCanonicalTypeIDs + MaxOperandKinds; + MaxOpcodes + MaxCanonicalTypeIDs + MaxOperandKinds + MaxPredicateKinds; + + // Base offsets for slot layout to simplify index computation + constexpr static unsigned OperandBaseOffset = + MaxOpcodes + MaxCanonicalTypeIDs; + constexpr static unsigned PredicateBaseOffset = + OperandBaseOffset + MaxOperandKinds; /// String mappings for CanonicalTypeID values static constexpr StringLiteral CanonicalTypeNames[] = { @@ -326,6 +362,11 @@ class Vocabulary { /// Function to convert TypeID to CanonicalTypeID LLVM_ABI static CanonicalTypeID getCanonicalTypeID(Type::TypeID TypeID); + + /// Function to get the predicate enum value for a given index. Index is + /// relative to the predicates section of the vocabulary. E.g., Index 0 + /// corresponds to the first predicate. 
+ LLVM_ABI static CmpInst::Predicate getPredicate(unsigned Index); }; /// Embedder provides the interface to generate embeddings (vector diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 99afc0601d523..f51f0898cb37e 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -216,6 +216,8 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { ArgEmb += Vocab[*Op]; auto InstVector = Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + if (const auto *IC = dyn_cast(&I)) + InstVector += Vocab[IC->getPredicate()]; InstVecMap[&I] = InstVector; BBVector += InstVector; } @@ -250,6 +252,9 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { // embeddings auto InstVector = Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb; + // Add compare predicate embedding as an additional operand if applicable + if (const auto *IC = dyn_cast(&I)) + InstVector += Vocab[IC->getPredicate()]; InstVecMap[&I] = InstVector; BBVector += InstVector; } @@ -278,7 +283,17 @@ unsigned Vocabulary::getSlotIndex(Type::TypeID TypeID) { unsigned Vocabulary::getSlotIndex(const Value &Op) { unsigned Index = static_cast(getOperandKind(&Op)); assert(Index < MaxOperandKinds && "Invalid OperandKind"); - return MaxOpcodes + MaxCanonicalTypeIDs + Index; + return OperandBaseOffset + Index; +} + +unsigned Vocabulary::getSlotIndex(CmpInst::Predicate P) { + unsigned PU = static_cast(P); + unsigned FirstFC = static_cast(CmpInst::FIRST_FCMP_PREDICATE); + unsigned FirstIC = static_cast(CmpInst::FIRST_ICMP_PREDICATE); + + unsigned PredIdx = + (PU >= FirstIC) ? (NumFCmpPredicates + (PU - FirstIC)) : (PU - FirstFC); + return PredicateBaseOffset + PredIdx; } const Embedding &Vocabulary::operator[](unsigned Opcode) const { @@ -293,6 +308,10 @@ const ir2vec::Embedding &Vocabulary::operator[](const Value &Arg) const { return Vocab[getSlotIndex(Arg)]; } +const ir2vec::Embedding &Vocabulary::operator[](CmpInst::Predicate P) const { + return Vocab[getSlotIndex(P)]; +} + StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) { assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); #define HANDLE_INST(NUM, OPCODE, CLASS) \ @@ -338,18 +357,41 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { return OperandKind::VariableID; } +CmpInst::Predicate Vocabulary::getPredicate(unsigned Index) { + assert(Index < MaxPredicateKinds && "Invalid predicate index"); + unsigned PredEnumVal = + (Index < NumFCmpPredicates) + ? 
(static_cast(CmpInst::FIRST_FCMP_PREDICATE) + Index) + : (static_cast(CmpInst::FIRST_ICMP_PREDICATE) + + (Index - NumFCmpPredicates)); + return static_cast(PredEnumVal); +} + +StringRef Vocabulary::getVocabKeyForPredicate(CmpInst::Predicate Pred) { + static SmallString<16> PredNameBuffer; + if (Pred < CmpInst::FIRST_ICMP_PREDICATE) + PredNameBuffer = "FCMP_"; + else + PredNameBuffer = "ICMP_"; + PredNameBuffer += CmpInst::getPredicateName(Pred); + return PredNameBuffer; +} + StringRef Vocabulary::getStringKey(unsigned Pos) { assert(Pos < NumCanonicalEntries && "Position out of bounds in vocabulary"); // Opcode if (Pos < MaxOpcodes) return getVocabKeyForOpcode(Pos + 1); // Type - if (Pos < MaxOpcodes + MaxCanonicalTypeIDs) + if (Pos < OperandBaseOffset) return getVocabKeyForCanonicalTypeID( static_cast(Pos - MaxOpcodes)); // Operand - return getVocabKeyForOperandKind( - static_cast(Pos - MaxOpcodes - MaxCanonicalTypeIDs)); + if (Pos < PredicateBaseOffset) + return getVocabKeyForOperandKind( + static_cast(Pos - OperandBaseOffset)); + // Predicates + return getVocabKeyForPredicate(getPredicate(Pos - PredicateBaseOffset)); } // For now, assume vocabulary is stable unless explicitly invalidated. @@ -363,11 +405,9 @@ Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) { VocabVector DummyVocab; DummyVocab.reserve(NumCanonicalEntries); float DummyVal = 0.1f; - // Create a dummy vocabulary with entries for all opcodes, types, and - // operands - for ([[maybe_unused]] unsigned _ : - seq(0u, Vocabulary::MaxOpcodes + Vocabulary::MaxCanonicalTypeIDs + - Vocabulary::MaxOperandKinds)) { + // Create a dummy vocabulary with entries for all opcodes, types, operands + // and predicates + for ([[maybe_unused]] unsigned _ : seq(0u, Vocabulary::NumCanonicalEntries)) { DummyVocab.push_back(Embedding(Dim, DummyVal)); DummyVal += 0.1f; } @@ -510,6 +550,24 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() { } Vocab.insert(Vocab.end(), NumericArgEmbeddings.begin(), NumericArgEmbeddings.end()); + + // Handle Predicates: part of Operands section. We look up predicate keys + // in ArgVocab. 
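+  // For instance, CmpInst::ICMP_EQ yields the key "ICMP_eq", matching the
+  // predicate entries added to the JSON vocabularies below.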
+ std::vector NumericPredEmbeddings(Vocabulary::MaxPredicateKinds, + Embedding(Dim, 0)); + NumericPredEmbeddings.reserve(Vocabulary::MaxPredicateKinds); + for (unsigned PK : seq(0u, Vocabulary::MaxPredicateKinds)) { + StringRef VocabKey = + Vocabulary::getVocabKeyForPredicate(Vocabulary::getPredicate(PK)); + auto It = ArgVocab.find(VocabKey.str()); + if (It != ArgVocab.end()) { + NumericPredEmbeddings[PK] = It->second; + continue; + } + handleMissingEntity(VocabKey.str()); + } + Vocab.insert(Vocab.end(), NumericPredEmbeddings.begin(), + NumericPredEmbeddings.end()); } IR2VecVocabAnalysis::IR2VecVocabAnalysis(const VocabVector &Vocab) diff --git a/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json b/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json index 07fde84c1541b..ae36ff54686c5 100644 --- a/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json +++ b/llvm/test/Analysis/IR2Vec/Inputs/dummy_2D_vocab.json @@ -87,6 +87,32 @@ "Function": [1, 2], "Pointer": [3, 4], "Constant": [5, 6], - "Variable": [7, 8] + "Variable": [7, 8], + "FCMP_false": [9, 10], + "FCMP_oeq": [11, 12], + "FCMP_ogt": [13, 14], + "FCMP_oge": [15, 16], + "FCMP_olt": [17, 18], + "FCMP_ole": [19, 20], + "FCMP_one": [21, 22], + "FCMP_ord": [23, 24], + "FCMP_uno": [25, 26], + "FCMP_ueq": [27, 28], + "FCMP_ugt": [29, 30], + "FCMP_uge": [31, 32], + "FCMP_ult": [33, 34], + "FCMP_ule": [35, 36], + "FCMP_une": [37, 38], + "FCMP_true": [39, 40], + "ICMP_eq": [41, 42], + "ICMP_ne": [43, 44], + "ICMP_ugt": [45, 46], + "ICMP_uge": [47, 48], + "ICMP_ult": [49, 50], + "ICMP_ule": [51, 52], + "ICMP_sgt": [53, 54], + "ICMP_sge": [55, 56], + "ICMP_slt": [57, 58], + "ICMP_sle": [59, 60] } } diff --git a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json index 932b3a217b70c..9003dc73954aa 100644 --- a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json +++ b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_arg_vocab.json @@ -86,6 +86,32 @@ "Function": [1, 2, 3], "Pointer": [4, 5, 6], "Constant": [7, 8, 9], - "Variable": [10, 11, 12] + "Variable": [10, 11, 12], + "FCMP_false": [13, 14, 15], + "FCMP_oeq": [16, 17, 18], + "FCMP_ogt": [19, 20, 21], + "FCMP_oge": [22, 23, 24], + "FCMP_olt": [25, 26, 27], + "FCMP_ole": [28, 29, 30], + "FCMP_one": [31, 32, 33], + "FCMP_ord": [34, 35, 36], + "FCMP_uno": [37, 38, 39], + "FCMP_ueq": [40, 41, 42], + "FCMP_ugt": [43, 44, 45], + "FCMP_uge": [46, 47, 48], + "FCMP_ult": [49, 50, 51], + "FCMP_ule": [52, 53, 54], + "FCMP_une": [55, 56, 57], + "FCMP_true": [58, 59, 60], + "ICMP_eq": [61, 62, 63], + "ICMP_ne": [64, 65, 66], + "ICMP_ugt": [67, 68, 69], + "ICMP_uge": [70, 71, 72], + "ICMP_ult": [73, 74, 75], + "ICMP_ule": [76, 77, 78], + "ICMP_sgt": [79, 80, 81], + "ICMP_sge": [82, 83, 84], + "ICMP_slt": [85, 86, 87], + "ICMP_sle": [88, 89, 90] } } diff --git a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json index 19f3efee9f6a1..7ef85490b27df 100644 --- a/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json +++ b/llvm/test/Analysis/IR2Vec/Inputs/dummy_3D_nonzero_opc_vocab.json @@ -47,6 +47,7 @@ "FPTrunc": [133, 134, 135], "FPExt": [136, 137, 138], "PtrToInt": [139, 140, 141], + "PtrToAddr": [202, 203, 204], "IntToPtr": [142, 143, 144], "BitCast": [145, 146, 147], "AddrSpaceCast": [148, 149, 150], @@ -86,6 +87,32 @@ "Function": [0, 0, 0], "Pointer": [0, 0, 0], "Constant": [0, 0, 0], - "Variable": [0, 0, 0] + "Variable": 
[0, 0, 0], + "FCMP_false": [0, 0, 0], + "FCMP_oeq": [0, 0, 0], + "FCMP_ogt": [0, 0, 0], + "FCMP_oge": [0, 0, 0], + "FCMP_olt": [0, 0, 0], + "FCMP_ole": [0, 0, 0], + "FCMP_one": [0, 0, 0], + "FCMP_ord": [0, 0, 0], + "FCMP_uno": [0, 0, 0], + "FCMP_ueq": [0, 0, 0], + "FCMP_ugt": [0, 0, 0], + "FCMP_uge": [0, 0, 0], + "FCMP_ult": [0, 0, 0], + "FCMP_ule": [0, 0, 0], + "FCMP_une": [0, 0, 0], + "FCMP_true": [0, 0, 0], + "ICMP_eq": [0, 0, 0], + "ICMP_ne": [0, 0, 0], + "ICMP_ugt": [0, 0, 0], + "ICMP_uge": [0, 0, 0], + "ICMP_ult": [0, 0, 0], + "ICMP_ule": [0, 0, 0], + "ICMP_sgt": [1, 1, 1], + "ICMP_sge": [0, 0, 0], + "ICMP_slt": [0, 0, 0], + "ICMP_sle": [0, 0, 0] } } diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt index df7769c9c6a65..d62b0dd157b0b 100644 --- a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt +++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt @@ -82,3 +82,29 @@ Key: Function: [ 0.20 0.40 ] Key: Pointer: [ 0.60 0.80 ] Key: Constant: [ 1.00 1.20 ] Key: Variable: [ 1.40 1.60 ] +Key: FCMP_false: [ 1.80 2.00 ] +Key: FCMP_oeq: [ 2.20 2.40 ] +Key: FCMP_ogt: [ 2.60 2.80 ] +Key: FCMP_oge: [ 3.00 3.20 ] +Key: FCMP_olt: [ 3.40 3.60 ] +Key: FCMP_ole: [ 3.80 4.00 ] +Key: FCMP_one: [ 4.20 4.40 ] +Key: FCMP_ord: [ 4.60 4.80 ] +Key: FCMP_uno: [ 5.00 5.20 ] +Key: FCMP_ueq: [ 5.40 5.60 ] +Key: FCMP_ugt: [ 5.80 6.00 ] +Key: FCMP_uge: [ 6.20 6.40 ] +Key: FCMP_ult: [ 6.60 6.80 ] +Key: FCMP_ule: [ 7.00 7.20 ] +Key: FCMP_une: [ 7.40 7.60 ] +Key: FCMP_true: [ 7.80 8.00 ] +Key: ICMP_eq: [ 8.20 8.40 ] +Key: ICMP_ne: [ 8.60 8.80 ] +Key: ICMP_ugt: [ 9.00 9.20 ] +Key: ICMP_uge: [ 9.40 9.60 ] +Key: ICMP_ult: [ 9.80 10.00 ] +Key: ICMP_ule: [ 10.20 10.40 ] +Key: ICMP_sgt: [ 10.60 10.80 ] +Key: ICMP_sge: [ 11.00 11.20 ] +Key: ICMP_slt: [ 11.40 11.60 ] +Key: ICMP_sle: [ 11.80 12.00 ] diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt index f3ce809fd2fd2..e443adb17ac78 100644 --- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt +++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt @@ -82,3 +82,29 @@ Key: Function: [ 0.50 1.00 ] Key: Pointer: [ 1.50 2.00 ] Key: Constant: [ 2.50 3.00 ] Key: Variable: [ 3.50 4.00 ] +Key: FCMP_false: [ 4.50 5.00 ] +Key: FCMP_oeq: [ 5.50 6.00 ] +Key: FCMP_ogt: [ 6.50 7.00 ] +Key: FCMP_oge: [ 7.50 8.00 ] +Key: FCMP_olt: [ 8.50 9.00 ] +Key: FCMP_ole: [ 9.50 10.00 ] +Key: FCMP_one: [ 10.50 11.00 ] +Key: FCMP_ord: [ 11.50 12.00 ] +Key: FCMP_uno: [ 12.50 13.00 ] +Key: FCMP_ueq: [ 13.50 14.00 ] +Key: FCMP_ugt: [ 14.50 15.00 ] +Key: FCMP_uge: [ 15.50 16.00 ] +Key: FCMP_ult: [ 16.50 17.00 ] +Key: FCMP_ule: [ 17.50 18.00 ] +Key: FCMP_une: [ 18.50 19.00 ] +Key: FCMP_true: [ 19.50 20.00 ] +Key: ICMP_eq: [ 20.50 21.00 ] +Key: ICMP_ne: [ 21.50 22.00 ] +Key: ICMP_ugt: [ 22.50 23.00 ] +Key: ICMP_uge: [ 23.50 24.00 ] +Key: ICMP_ult: [ 24.50 25.00 ] +Key: ICMP_ule: [ 25.50 26.00 ] +Key: ICMP_sgt: [ 26.50 27.00 ] +Key: ICMP_sge: [ 27.50 28.00 ] +Key: ICMP_slt: [ 28.50 29.00 ] +Key: ICMP_sle: [ 29.50 30.00 ] diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt index 72b25b9bd3d9c..7fb6043552f7b 100644 --- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt +++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt @@ -82,3 +82,29 @@ 
Key: Function: [ 0.00 0.00 ] Key: Pointer: [ 0.00 0.00 ] Key: Constant: [ 0.00 0.00 ] Key: Variable: [ 0.00 0.00 ] +Key: FCMP_false: [ 0.00 0.00 ] +Key: FCMP_oeq: [ 0.00 0.00 ] +Key: FCMP_ogt: [ 0.00 0.00 ] +Key: FCMP_oge: [ 0.00 0.00 ] +Key: FCMP_olt: [ 0.00 0.00 ] +Key: FCMP_ole: [ 0.00 0.00 ] +Key: FCMP_one: [ 0.00 0.00 ] +Key: FCMP_ord: [ 0.00 0.00 ] +Key: FCMP_uno: [ 0.00 0.00 ] +Key: FCMP_ueq: [ 0.00 0.00 ] +Key: FCMP_ugt: [ 0.00 0.00 ] +Key: FCMP_uge: [ 0.00 0.00 ] +Key: FCMP_ult: [ 0.00 0.00 ] +Key: FCMP_ule: [ 0.00 0.00 ] +Key: FCMP_une: [ 0.00 0.00 ] +Key: FCMP_true: [ 0.00 0.00 ] +Key: ICMP_eq: [ 0.00 0.00 ] +Key: ICMP_ne: [ 0.00 0.00 ] +Key: ICMP_ugt: [ 0.00 0.00 ] +Key: ICMP_uge: [ 0.00 0.00 ] +Key: ICMP_ult: [ 0.00 0.00 ] +Key: ICMP_ule: [ 0.00 0.00 ] +Key: ICMP_sgt: [ 0.00 0.00 ] +Key: ICMP_sge: [ 0.00 0.00 ] +Key: ICMP_slt: [ 0.00 0.00 ] +Key: ICMP_sle: [ 0.00 0.00 ] diff --git a/llvm/test/Analysis/IR2Vec/if-else.ll b/llvm/test/Analysis/IR2Vec/if-else.ll index fe532479086d3..804c1ca5cb6f6 100644 --- a/llvm/test/Analysis/IR2Vec/if-else.ll +++ b/llvm/test/Analysis/IR2Vec/if-else.ll @@ -29,7 +29,7 @@ return: ; preds = %if.else, %if.then ; CHECK: Basic block vectors: ; CHECK-NEXT: Basic block: entry: -; CHECK-NEXT: [ 816.00 825.00 834.00 ] +; CHECK-NEXT: [ 816.20 825.20 834.20 ] ; CHECK-NEXT: Basic block: if.then: ; CHECK-NEXT: [ 195.00 198.00 201.00 ] ; CHECK-NEXT: Basic block: if.else: diff --git a/llvm/test/Analysis/IR2Vec/unreachable.ll b/llvm/test/Analysis/IR2Vec/unreachable.ll index b0e3e49978018..9be0ee1c2de7a 100644 --- a/llvm/test/Analysis/IR2Vec/unreachable.ll +++ b/llvm/test/Analysis/IR2Vec/unreachable.ll @@ -33,7 +33,7 @@ return: ; preds = %if.else, %if.then ; CHECK: Basic block vectors: ; CHECK-NEXT: Basic block: entry: -; CHECK-NEXT: [ 816.00 825.00 834.00 ] +; CHECK-NEXT: [ 816.20 825.20 834.20 ] ; CHECK-NEXT: Basic block: if.then: ; CHECK-NEXT: [ 195.00 198.00 201.00 ] ; CHECK-NEXT: Basic block: if.else: diff --git a/llvm/test/tools/llvm-ir2vec/entities.ll b/llvm/test/tools/llvm-ir2vec/entities.ll index 4b51adf30bf74..8dbce57302f6f 100644 --- a/llvm/test/tools/llvm-ir2vec/entities.ll +++ b/llvm/test/tools/llvm-ir2vec/entities.ll @@ -1,6 +1,6 @@ ; RUN: llvm-ir2vec entities | FileCheck %s -CHECK: 84 +CHECK: 110 CHECK-NEXT: Ret 0 CHECK-NEXT: Br 1 CHECK-NEXT: Switch 2 @@ -85,3 +85,29 @@ CHECK-NEXT: Function 80 CHECK-NEXT: Pointer 81 CHECK-NEXT: Constant 82 CHECK-NEXT: Variable 83 +CHECK-NEXT: FCMP_false 84 +CHECK-NEXT: FCMP_oeq 85 +CHECK-NEXT: FCMP_ogt 86 +CHECK-NEXT: FCMP_oge 87 +CHECK-NEXT: FCMP_olt 88 +CHECK-NEXT: FCMP_ole 89 +CHECK-NEXT: FCMP_one 90 +CHECK-NEXT: FCMP_ord 91 +CHECK-NEXT: FCMP_uno 92 +CHECK-NEXT: FCMP_ueq 93 +CHECK-NEXT: FCMP_ugt 94 +CHECK-NEXT: FCMP_uge 95 +CHECK-NEXT: FCMP_ult 96 +CHECK-NEXT: FCMP_ule 97 +CHECK-NEXT: FCMP_une 98 +CHECK-NEXT: FCMP_true 99 +CHECK-NEXT: ICMP_eq 100 +CHECK-NEXT: ICMP_ne 101 +CHECK-NEXT: ICMP_ugt 102 +CHECK-NEXT: ICMP_uge 103 +CHECK-NEXT: ICMP_ult 104 +CHECK-NEXT: ICMP_ule 105 +CHECK-NEXT: ICMP_sgt 106 +CHECK-NEXT: ICMP_sge 107 +CHECK-NEXT: ICMP_slt 108 +CHECK-NEXT: ICMP_sle 109 diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp index aabebf0cc90a9..1c656b8fcf4e7 100644 --- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp +++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp @@ -184,7 +184,7 @@ class IR2VecTool { // Add "Arg" relationships unsigned ArgIndex = 0; for (const Use &U : I.operands()) { - unsigned OperandID = Vocabulary::getSlotIndex(*U); + unsigned OperandID = 
Vocabulary::getSlotIndex(*U.get()); unsigned RelationID = ArgRelation + ArgIndex; OS << Opcode << '\t' << OperandID << '\t' << RelationID << '\n'; diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp index 9f2f6a3496ce0..9bc48e45eab5e 100644 --- a/llvm/unittests/Analysis/IR2VecTest.cpp +++ b/llvm/unittests/Analysis/IR2VecTest.cpp @@ -435,6 +435,7 @@ static constexpr unsigned MaxOpcodes = Vocabulary::MaxOpcodes; static constexpr unsigned MaxTypeIDs = Vocabulary::MaxTypeIDs; static constexpr unsigned MaxCanonicalTypeIDs = Vocabulary::MaxCanonicalTypeIDs; static constexpr unsigned MaxOperands = Vocabulary::MaxOperandKinds; +static constexpr unsigned MaxPredicateKinds = Vocabulary::MaxPredicateKinds; // Mapping between LLVM Type::TypeID tokens and Vocabulary::CanonicalTypeID // names and their canonical string keys. @@ -460,7 +461,8 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) { EXPECT_EQ(Emb.size(), Dim); // Should have the correct total number of embeddings - EXPECT_EQ(VocabVecSize, MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands); + EXPECT_EQ(VocabVecSize, MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands + + MaxPredicateKinds); auto ExpectedVocab = VocabVec; @@ -527,6 +529,26 @@ TEST(IR2VecVocabularyTest, SlotIdxMapping) { EXPECT_EQ(Vocabulary::getSlotIndex(*Arg), EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::VariableID)); #undef EXPECTED_VOCAB_OPERAND_SLOT + + // Test getSlotIndex for predicates +#define EXPECTED_VOCAB_PREDICATE_SLOT(X) \ + MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands + static_cast(X) + for (unsigned P = CmpInst::FIRST_FCMP_PREDICATE; + P <= CmpInst::LAST_FCMP_PREDICATE; ++P) { + CmpInst::Predicate Pred = static_cast(P); + unsigned ExpectedIdx = + EXPECTED_VOCAB_PREDICATE_SLOT((P - CmpInst::FIRST_FCMP_PREDICATE)); + EXPECT_EQ(Vocabulary::getSlotIndex(Pred), ExpectedIdx); + } + auto ICMP_Start = CmpInst::LAST_FCMP_PREDICATE + 1; + for (unsigned P = CmpInst::FIRST_ICMP_PREDICATE; + P <= CmpInst::LAST_ICMP_PREDICATE; ++P) { + CmpInst::Predicate Pred = static_cast(P); + unsigned ExpectedIdx = EXPECTED_VOCAB_PREDICATE_SLOT( + ICMP_Start + P - CmpInst::FIRST_ICMP_PREDICATE); + EXPECT_EQ(Vocabulary::getSlotIndex(Pred), ExpectedIdx); + } +#undef EXPECTED_VOCAB_PREDICATE_SLOT } #if GTEST_HAS_DEATH_TEST @@ -569,6 +591,7 @@ TEST(IR2VecVocabularyTest, StringKeyGeneration) { #undef EXPECT_CANONICAL_TYPE_NAME + // Verify OperandKind -> string mapping #define HANDLE_OPERAND_KINDS(X) \ X(FunctionID, "Function") \ X(PointerID, "Pointer") \ @@ -592,6 +615,28 @@ TEST(IR2VecVocabularyTest, StringKeyGeneration) { Vocabulary::getStringKey(MaxOpcodes + MaxCanonicalTypeIDs + 1); EXPECT_EQ(FuncArgKey, "Function"); EXPECT_EQ(PtrArgKey, "Pointer"); + +// Verify PredicateKind -> string mapping +#define EXPECT_PREDICATE_KIND(PredNum, PredPos, PredKind) \ + do { \ + std::string PredStr = \ + std::string(PredKind) + "_" + \ + CmpInst::getPredicateName(static_cast(PredNum)) \ + .str(); \ + unsigned Pos = MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands + PredPos; \ + EXPECT_EQ(Vocabulary::getStringKey(Pos), PredStr); \ + } while (0) + + for (unsigned P = CmpInst::FIRST_FCMP_PREDICATE; + P <= CmpInst::LAST_FCMP_PREDICATE; ++P) + EXPECT_PREDICATE_KIND(P, P - CmpInst::FIRST_FCMP_PREDICATE, "FCMP"); + + auto ICMP_Pos = CmpInst::LAST_FCMP_PREDICATE + 1; + for (unsigned P = CmpInst::FIRST_ICMP_PREDICATE; + P <= CmpInst::LAST_ICMP_PREDICATE; ++P) + EXPECT_PREDICATE_KIND(P, ICMP_Pos++, "ICMP"); + +#undef EXPECT_PREDICATE_KIND } TEST(IR2VecVocabularyTest, 
VocabularyDimensions) { From 4fccaaef700b64db06b7438802ccc58c3bb5b970 Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Wed, 1 Oct 2025 16:40:00 -0700 Subject: [PATCH 446/878] [flang][runtime] fix intrinsics case of extends_type_of (#161466) Fixes https://github.com/llvm/llvm-project/issues/155459 by making sure the cases are considered in the right order. Previously, intrinsic types were overriding the pointer cases, which have higher precedence in the specification. Also passes the following [tests](https://github.com/llvm/llvm-test-suite/pull/287). --- flang-rt/lib/runtime/derived-api.cpp | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/flang-rt/lib/runtime/derived-api.cpp b/flang-rt/lib/runtime/derived-api.cpp index bb08e0397fe9c..fe6868292f019 100644 --- a/flang-rt/lib/runtime/derived-api.cpp +++ b/flang-rt/lib/runtime/derived-api.cpp @@ -118,14 +118,26 @@ bool RTDEF(SameTypeAs)(const Descriptor &a, const Descriptor &b) { } bool RTDEF(ExtendsTypeOf)(const Descriptor &a, const Descriptor &mold) { + // The wording of the standard indicates null or unallocated checks take + // precedence over the extension checks which take precedence over any + // compiler specific behavior. + // F'23 16.9.86 p 5 + // If MOLD is unlimited polymorphic and is either a disassociated pointer or + // unallocated allocatable variable, the result is true; auto aType{a.raw().type}; auto moldType{mold.raw().type}; if ((aType != CFI_type_struct && aType != CFI_type_other) || (moldType != CFI_type_struct && moldType != CFI_type_other)) { - // If either type is intrinsic, they must match. - return aType == moldType; - } else if (const typeInfo::DerivedType * - derivedTypeMold{GetDerivedType(mold)}) { + if (!mold.IsAllocated()) { + return true; + } else if (!a.IsAllocated()) { + return false; + } else { + // If either type is intrinsic and not a pointer or allocatable + // then they must match. + return aType == moldType; + } + } else if (const auto *derivedTypeMold{GetDerivedType(mold)}) { // If A is unlimited polymorphic and is either a disassociated pointer or // unallocated allocatable, the result is false. // Otherwise if the dynamic type of A or MOLD is extensible, the result is From 4aaf6d1b8cc655f37a1a4ab8f7a7988dbef4eeae Mon Sep 17 00:00:00 2001 From: Wu Yingcong Date: Thu, 2 Oct 2025 07:42:43 +0800 Subject: [PATCH 447/878] [libunwind][test] Add check for objcopy to improve test compatibility (#161112) Previously, we only used `objcopy`, which is not available for some build configurations. With this patch, we not only try to use `objcopy`, but also try to use `llvm-objcopy` if available. This is a follow-up of https://github.com/llvm/llvm-project/pull/156383.
--- libunwind/test/configs/cmake-bridge.cfg.in | 11 +++++++++++ libunwind/test/eh_frame_fde_pc_range.pass.cpp | 7 +++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/libunwind/test/configs/cmake-bridge.cfg.in b/libunwind/test/configs/cmake-bridge.cfg.in index b804c210f0bbc..e40497bfa9976 100644 --- a/libunwind/test/configs/cmake-bridge.cfg.in +++ b/libunwind/test/configs/cmake-bridge.cfg.in @@ -14,6 +14,7 @@ import os, site site.addsitedir(os.path.join('@LIBUNWIND_LIBCXX_PATH@', 'utils')) import libcxx.test.format +from lit.util import which # Basic configuration of the test suite config.name = os.path.basename('@LIBUNWIND_TEST_CONFIG@') @@ -33,3 +34,13 @@ config.substitutions.append(('%{install-prefix}', '@LIBUNWIND_TESTING_INSTALL_PR config.substitutions.append(('%{include}', '@LIBUNWIND_TESTING_INSTALL_PREFIX@/include')) config.substitutions.append(('%{lib}', '@LIBUNWIND_TESTING_INSTALL_PREFIX@/@LIBUNWIND_INSTALL_LIBRARY_DIR@')) config.substitutions.append(('%{benchmark_flags}', '')) + +# Check for objcopy tools +objcopy_path = which('llvm-objcopy', '@LLVM_BUILD_BINARY_DIR@/bin') +if not objcopy_path: + objcopy_path = which('llvm-objcopy') +if not objcopy_path: + objcopy_path = which('objcopy') +if objcopy_path: + config.substitutions.append(('%{objcopy}', objcopy_path)) + config.available_features.add('objcopy-available') diff --git a/libunwind/test/eh_frame_fde_pc_range.pass.cpp b/libunwind/test/eh_frame_fde_pc_range.pass.cpp index 39c8e8066264d..852612bd9a6e4 100644 --- a/libunwind/test/eh_frame_fde_pc_range.pass.cpp +++ b/libunwind/test/eh_frame_fde_pc_range.pass.cpp @@ -14,16 +14,15 @@ // clang-format off // REQUIRES: target={{x86_64-.+-linux-gnu}} -// aarch64,arm have a cross toolchain build(llvm-clang-win-x-aarch64, etc) -// where objdump is not available. +// REQUIRES: objcopy-available // TODO: Figure out why this fails with Memory Sanitizer. // XFAIL: msan // RUN: %{build} -// RUN: objcopy --dump-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe +// RUN: %{objcopy} --dump-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe // RUN: echo -ne '\xFF' | dd of=%t_ehf_hdr.bin bs=1 seek=2 count=2 conv=notrunc status=none -// RUN: objcopy --update-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe +// RUN: %{objcopy} --update-section .eh_frame_hdr=%t_ehf_hdr.bin %t.exe // RUN: %{exec} %t.exe // clang-format on From ed1d9548b5c08142dab82bcfdd9875177d8223a5 Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Wed, 1 Oct 2025 17:13:13 -0700 Subject: [PATCH 448/878] [IR2Vec] Refactor vocabulary to use section-based storage (#158376) Refactored IR2Vec vocabulary and introduced IR (semantics) agnostic `VocabStorage` - `Vocabulary` *has-a* `VocabStorage` - `Vocabulary` deals with LLVM IR specific entities. This would help in efficient reuse of parts of the logic for MIR. - Storage uses a section-based approach instead of a flat vector, improving organization and access patterns. 
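A minimal usage sketch of the section-based layout (illustrative only; names follow the VocabStorage API introduced below):

    // Requires llvm/Analysis/IR2Vec.h, <cassert>, <utility>, <vector>.
    // Two sections of 2-dimensional embeddings, addressed as
    // Storage[sectionId][localIndex].
    std::vector<std::vector<ir2vec::Embedding>> Sections;
    Sections.push_back({ir2vec::Embedding(2, 0.1)}); // section 0, e.g. opcodes
    Sections.push_back({ir2vec::Embedding(2, 0.2)}); // section 1, e.g. types
    ir2vec::VocabStorage Storage(std::move(Sections));
    assert(Storage.getNumSections() == 2 && Storage.size() == 2);
    const ir2vec::Embedding &E = Storage[0][0]; // first entry of section 0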
--- llvm/include/llvm/Analysis/IR2Vec.h | 218 ++++++++--- llvm/lib/Analysis/IR2Vec.cpp | 256 +++++++------ llvm/lib/Analysis/InlineAdvisor.cpp | 2 +- llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 6 +- .../FunctionPropertiesAnalysisTest.cpp | 13 +- llvm/unittests/Analysis/IR2VecTest.cpp | 347 ++++++++++++++++-- 6 files changed, 648 insertions(+), 194 deletions(-) diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index f3f9de460218b..b7c301580a8a4 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -45,6 +45,7 @@ #include "llvm/Support/JSON.h" #include #include +#include namespace llvm { @@ -144,6 +145,73 @@ struct Embedding { using InstEmbeddingsMap = DenseMap; using BBEmbeddingsMap = DenseMap; +/// Generic storage class for section-based vocabularies. +/// VocabStorage provides a generic foundation for storing and accessing +/// embeddings organized into sections. +class VocabStorage { +private: + /// Section-based storage + std::vector> Sections; + + const size_t TotalSize; + const unsigned Dimension; + +public: + /// Default constructor creates empty storage (invalid state) + VocabStorage() : Sections(), TotalSize(0), Dimension(0) {} + + /// Create a VocabStorage with pre-organized section data + VocabStorage(std::vector> &&SectionData); + + VocabStorage(VocabStorage &&) = default; + VocabStorage &operator=(VocabStorage &&) = delete; + + VocabStorage(const VocabStorage &) = delete; + VocabStorage &operator=(const VocabStorage &) = delete; + + /// Get total number of entries across all sections + size_t size() const { return TotalSize; } + + /// Get number of sections + unsigned getNumSections() const { + return static_cast(Sections.size()); + } + + /// Section-based access: Storage[sectionId][localIndex] + const std::vector &operator[](unsigned SectionId) const { + assert(SectionId < Sections.size() && "Invalid section ID"); + return Sections[SectionId]; + } + + /// Get vocabulary dimension + unsigned getDimension() const { return Dimension; } + + /// Check if vocabulary is valid (has data) + bool isValid() const { return TotalSize > 0; } + + /// Iterator support for section-based access + class const_iterator { + const VocabStorage *Storage; + unsigned SectionId = 0; + size_t LocalIndex = 0; + + public: + const_iterator(const VocabStorage *Storage, unsigned SectionId, + size_t LocalIndex) + : Storage(Storage), SectionId(SectionId), LocalIndex(LocalIndex) {} + + LLVM_ABI const Embedding &operator*() const; + LLVM_ABI const_iterator &operator++(); + LLVM_ABI bool operator==(const const_iterator &Other) const; + LLVM_ABI bool operator!=(const const_iterator &Other) const; + }; + + const_iterator begin() const { return const_iterator(this, 0, 0); } + const_iterator end() const { + return const_iterator(this, getNumSections(), 0); + } +}; + /// Class for storing and accessing the IR2Vec vocabulary. /// The Vocabulary class manages seed embeddings for LLVM IR entities. The /// seed embeddings are the initial learned representations of the entities @@ -164,7 +232,7 @@ using BBEmbeddingsMap = DenseMap; class Vocabulary { friend class llvm::IR2VecVocabAnalysis; - // Vocabulary Slot Layout: + // Vocabulary Layout: // +----------------+------------------------------------------------------+ // | Entity Type | Index Range | // +----------------+------------------------------------------------------+ @@ -180,8 +248,16 @@ class Vocabulary { // and improves learning. 
Operands include Comparison predicates // (ICmp/FCmp) along with other operand types. This can be extended to // include other specializations in future. - using VocabVector = std::vector; - VocabVector Vocab; + enum class Section : unsigned { + Opcodes = 0, + CanonicalTypes = 1, + Operands = 2, + Predicates = 3, + MaxSections + }; + + // Use section-based storage for better organization and efficiency + VocabStorage Storage; static constexpr unsigned NumICmpPredicates = static_cast(CmpInst::LAST_ICMP_PREDICATE) - @@ -233,10 +309,23 @@ class Vocabulary { NumICmpPredicates + NumFCmpPredicates; Vocabulary() = default; - LLVM_ABI Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)) {} + LLVM_ABI Vocabulary(VocabStorage &&Storage) : Storage(std::move(Storage)) {} + + Vocabulary(const Vocabulary &) = delete; + Vocabulary &operator=(const Vocabulary &) = delete; + + Vocabulary(Vocabulary &&) = default; + Vocabulary &operator=(Vocabulary &&Other) = delete; + + LLVM_ABI bool isValid() const { + return Storage.size() == NumCanonicalEntries; + } + + LLVM_ABI unsigned getDimension() const { + assert(isValid() && "IR2Vec Vocabulary is invalid"); + return Storage.getDimension(); + } - LLVM_ABI bool isValid() const { return Vocab.size() == NumCanonicalEntries; }; - LLVM_ABI unsigned getDimension() const; /// Total number of entries (opcodes + canonicalized types + operand kinds + /// predicates) static constexpr size_t getCanonicalSize() { return NumCanonicalEntries; } @@ -245,10 +334,16 @@ class Vocabulary { LLVM_ABI static StringRef getVocabKeyForOpcode(unsigned Opcode); /// Function to get vocabulary key for a given TypeID - LLVM_ABI static StringRef getVocabKeyForTypeID(Type::TypeID TypeID); + LLVM_ABI static StringRef getVocabKeyForTypeID(Type::TypeID TypeID) { + return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID)); + } /// Function to get vocabulary key for a given OperandKind - LLVM_ABI static StringRef getVocabKeyForOperandKind(OperandKind Kind); + LLVM_ABI static StringRef getVocabKeyForOperandKind(OperandKind Kind) { + unsigned Index = static_cast(Kind); + assert(Index < MaxOperandKinds && "Invalid OperandKind"); + return OperandKindNames[Index]; + } /// Function to classify an operand into OperandKind LLVM_ABI static OperandKind getOperandKind(const Value *Op); @@ -256,40 +351,66 @@ class Vocabulary { /// Function to get vocabulary key for a given predicate LLVM_ABI static StringRef getVocabKeyForPredicate(CmpInst::Predicate P); - /// Functions to return the slot index or position of a given Opcode, TypeID, - /// or OperandKind in the vocabulary. 
- LLVM_ABI static unsigned getSlotIndex(unsigned Opcode); - LLVM_ABI static unsigned getSlotIndex(Type::TypeID TypeID); - LLVM_ABI static unsigned getSlotIndex(const Value &Op); - LLVM_ABI static unsigned getSlotIndex(CmpInst::Predicate P); + /// Functions to return flat index + LLVM_ABI static unsigned getIndex(unsigned Opcode) { + assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); + return Opcode - 1; // Convert to zero-based index + } + + LLVM_ABI static unsigned getIndex(Type::TypeID TypeID) { + assert(static_cast(TypeID) < MaxTypeIDs && "Invalid type ID"); + return MaxOpcodes + static_cast(getCanonicalTypeID(TypeID)); + } + + LLVM_ABI static unsigned getIndex(const Value &Op) { + unsigned Index = static_cast(getOperandKind(&Op)); + assert(Index < MaxOperandKinds && "Invalid OperandKind"); + return OperandBaseOffset + Index; + } + + LLVM_ABI static unsigned getIndex(CmpInst::Predicate P) { + return PredicateBaseOffset + getPredicateLocalIndex(P); + } /// Accessors to get the embedding for a given entity. - LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const; - LLVM_ABI const ir2vec::Embedding &operator[](Type::TypeID TypeId) const; - LLVM_ABI const ir2vec::Embedding &operator[](const Value &Arg) const; - LLVM_ABI const ir2vec::Embedding &operator[](CmpInst::Predicate P) const; + LLVM_ABI const ir2vec::Embedding &operator[](unsigned Opcode) const { + assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); + return Storage[static_cast(Section::Opcodes)][Opcode - 1]; + } + + LLVM_ABI const ir2vec::Embedding &operator[](Type::TypeID TypeID) const { + assert(static_cast(TypeID) < MaxTypeIDs && "Invalid type ID"); + unsigned LocalIndex = static_cast(getCanonicalTypeID(TypeID)); + return Storage[static_cast(Section::CanonicalTypes)][LocalIndex]; + } + + LLVM_ABI const ir2vec::Embedding &operator[](const Value &Arg) const { + unsigned LocalIndex = static_cast(getOperandKind(&Arg)); + assert(LocalIndex < MaxOperandKinds && "Invalid OperandKind"); + return Storage[static_cast(Section::Operands)][LocalIndex]; + } + + LLVM_ABI const ir2vec::Embedding &operator[](CmpInst::Predicate P) const { + unsigned LocalIndex = getPredicateLocalIndex(P); + return Storage[static_cast(Section::Predicates)][LocalIndex]; + } /// Const Iterator type aliases - using const_iterator = VocabVector::const_iterator; + using const_iterator = VocabStorage::const_iterator; + const_iterator begin() const { assert(isValid() && "IR2Vec Vocabulary is invalid"); - return Vocab.begin(); + return Storage.begin(); } - const_iterator cbegin() const { - assert(isValid() && "IR2Vec Vocabulary is invalid"); - return Vocab.cbegin(); - } + const_iterator cbegin() const { return begin(); } const_iterator end() const { assert(isValid() && "IR2Vec Vocabulary is invalid"); - return Vocab.end(); + return Storage.end(); } - const_iterator cend() const { - assert(isValid() && "IR2Vec Vocabulary is invalid"); - return Vocab.cend(); - } + const_iterator cend() const { return end(); } /// Returns the string key for a given index position in the vocabulary. /// This is useful for debugging or printing the vocabulary. Do not use this @@ -297,7 +418,7 @@ class Vocabulary { LLVM_ABI static StringRef getStringKey(unsigned Pos); /// Create a dummy vocabulary for testing purposes. 
- LLVM_ABI static VocabVector createDummyVocabForTest(unsigned Dim = 1); + LLVM_ABI static VocabStorage createDummyVocabForTest(unsigned Dim = 1); LLVM_ABI bool invalidate(Module &M, const PreservedAnalyses &PA, ModuleAnalysisManager::Invalidator &Inv) const; @@ -306,12 +427,16 @@ class Vocabulary { constexpr static unsigned NumCanonicalEntries = MaxOpcodes + MaxCanonicalTypeIDs + MaxOperandKinds + MaxPredicateKinds; - // Base offsets for slot layout to simplify index computation + // Base offsets for flat index computation constexpr static unsigned OperandBaseOffset = MaxOpcodes + MaxCanonicalTypeIDs; constexpr static unsigned PredicateBaseOffset = OperandBaseOffset + MaxOperandKinds; + /// Functions for predicate index calculations + static unsigned getPredicateLocalIndex(CmpInst::Predicate P); + static CmpInst::Predicate getPredicateFromLocalIndex(unsigned LocalIndex); + /// String mappings for CanonicalTypeID values static constexpr StringLiteral CanonicalTypeNames[] = { "FloatTy", "VoidTy", "LabelTy", "MetadataTy", @@ -358,15 +483,26 @@ class Vocabulary { /// Function to get vocabulary key for canonical type by enum LLVM_ABI static StringRef - getVocabKeyForCanonicalTypeID(CanonicalTypeID CType); + getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) { + unsigned Index = static_cast(CType); + assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID"); + return CanonicalTypeNames[Index]; + } /// Function to convert TypeID to CanonicalTypeID - LLVM_ABI static CanonicalTypeID getCanonicalTypeID(Type::TypeID TypeID); + LLVM_ABI static CanonicalTypeID getCanonicalTypeID(Type::TypeID TypeID) { + unsigned Index = static_cast(TypeID); + assert(Index < MaxTypeIDs && "Invalid TypeID"); + return TypeIDMapping[Index]; + } /// Function to get the predicate enum value for a given index. Index is /// relative to the predicates section of the vocabulary. E.g., Index 0 /// corresponds to the first predicate. - LLVM_ABI static CmpInst::Predicate getPredicate(unsigned Index); + LLVM_ABI static CmpInst::Predicate getPredicate(unsigned Index) { + assert(Index < MaxPredicateKinds && "Invalid predicate index"); + return getPredicateFromLocalIndex(Index); + } }; /// Embedder provides the interface to generate embeddings (vector @@ -459,22 +595,22 @@ class LLVM_ABI FlowAwareEmbedder : public Embedder { /// mapping between an entity of the IR (like opcode, type, argument, etc.) and /// its corresponding embedding. 
class IR2VecVocabAnalysis : public AnalysisInfoMixin { - using VocabVector = std::vector; using VocabMap = std::map; - VocabMap OpcVocab, TypeVocab, ArgVocab; - VocabVector Vocab; + std::optional Vocab; - Error readVocabulary(); + Error readVocabulary(VocabMap &OpcVocab, VocabMap &TypeVocab, + VocabMap &ArgVocab); Error parseVocabSection(StringRef Key, const json::Value &ParsedVocabValue, VocabMap &TargetVocab, unsigned &Dim); - void generateNumMappedVocab(); + void generateVocabStorage(VocabMap &OpcVocab, VocabMap &TypeVocab, + VocabMap &ArgVocab); void emitError(Error Err, LLVMContext &Ctx); public: LLVM_ABI static AnalysisKey Key; IR2VecVocabAnalysis() = default; - LLVM_ABI explicit IR2VecVocabAnalysis(const VocabVector &Vocab); - LLVM_ABI explicit IR2VecVocabAnalysis(VocabVector &&Vocab); + LLVM_ABI explicit IR2VecVocabAnalysis(ir2vec::VocabStorage &&Vocab) + : Vocab(std::move(Vocab)) {} using Result = ir2vec::Vocabulary; LLVM_ABI Result run(Module &M, ModuleAnalysisManager &MAM); }; diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index f51f0898cb37e..271f004b0a787 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Module.h" @@ -262,55 +263,75 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const { } // ==----------------------------------------------------------------------===// -// Vocabulary +// VocabStorage //===----------------------------------------------------------------------===// -unsigned Vocabulary::getDimension() const { - assert(isValid() && "IR2Vec Vocabulary is invalid"); - return Vocab[0].size(); -} - -unsigned Vocabulary::getSlotIndex(unsigned Opcode) { - assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); - return Opcode - 1; // Convert to zero-based index -} - -unsigned Vocabulary::getSlotIndex(Type::TypeID TypeID) { - assert(static_cast(TypeID) < MaxTypeIDs && "Invalid type ID"); - return MaxOpcodes + static_cast(getCanonicalTypeID(TypeID)); -} - -unsigned Vocabulary::getSlotIndex(const Value &Op) { - unsigned Index = static_cast(getOperandKind(&Op)); - assert(Index < MaxOperandKinds && "Invalid OperandKind"); - return OperandBaseOffset + Index; -} - -unsigned Vocabulary::getSlotIndex(CmpInst::Predicate P) { - unsigned PU = static_cast(P); - unsigned FirstFC = static_cast(CmpInst::FIRST_FCMP_PREDICATE); - unsigned FirstIC = static_cast(CmpInst::FIRST_ICMP_PREDICATE); - - unsigned PredIdx = - (PU >= FirstIC) ? 
-      (PU >= FirstIC) ? (NumFCmpPredicates + (PU - FirstIC)) : (PU - FirstFC);
-  return PredicateBaseOffset + PredIdx;
-}
-
-const Embedding &Vocabulary::operator[](unsigned Opcode) const {
-  return Vocab[getSlotIndex(Opcode)];
+VocabStorage::VocabStorage(std::vector<std::vector<Embedding>> &&SectionData)
+    : Sections(std::move(SectionData)), TotalSize([&] {
+        assert(!Sections.empty() && "Vocabulary has no sections");
+        // Compute total size across all sections
+        size_t Size = 0;
+        for (const auto &Section : Sections) {
+          assert(!Section.empty() && "Vocabulary section is empty");
+          Size += Section.size();
+        }
+        return Size;
+      }()),
+      Dimension([&] {
+        // Get dimension from the first embedding in the first section - all
+        // embeddings must have the same dimension
+        assert(!Sections.empty() && "Vocabulary has no sections");
+        assert(!Sections[0].empty() && "First section of vocabulary is empty");
+        unsigned ExpectedDim = static_cast<unsigned>(Sections[0][0].size());
+
+        // Verify that all embeddings across all sections have the same
+        // dimension
+        auto allSameDim = [ExpectedDim](const std::vector<Embedding> &Section) {
+          return std::all_of(Section.begin(), Section.end(),
+                             [ExpectedDim](const Embedding &Emb) {
+                               return Emb.size() == ExpectedDim;
+                             });
+        };
+        assert(std::all_of(Sections.begin(), Sections.end(), allSameDim) &&
+               "All embeddings must have the same dimension");
+
+        return ExpectedDim;
+      }()) {}
+
+const Embedding &VocabStorage::const_iterator::operator*() const {
+  assert(SectionId < Storage->Sections.size() && "Invalid section ID");
+  assert(LocalIndex < Storage->Sections[SectionId].size() &&
+         "Local index out of range");
+  return Storage->Sections[SectionId][LocalIndex];
+}
+
+VocabStorage::const_iterator &VocabStorage::const_iterator::operator++() {
+  ++LocalIndex;
+  // Check if we need to move to the next section
+  if (SectionId < Storage->getNumSections() &&
+      LocalIndex >= Storage->Sections[SectionId].size()) {
+    assert(LocalIndex == Storage->Sections[SectionId].size() &&
+           "Local index should be at the end of the current section");
+    LocalIndex = 0;
+    ++SectionId;
+  }
+  return *this;
 }
 
-const Embedding &Vocabulary::operator[](Type::TypeID TypeID) const {
-  return Vocab[getSlotIndex(TypeID)];
+bool VocabStorage::const_iterator::operator==(
+    const const_iterator &Other) const {
+  return Storage == Other.Storage && SectionId == Other.SectionId &&
+         LocalIndex == Other.LocalIndex;
 }
 
-const ir2vec::Embedding &Vocabulary::operator[](const Value &Arg) const {
-  return Vocab[getSlotIndex(Arg)];
+bool VocabStorage::const_iterator::operator!=(
+    const const_iterator &Other) const {
+  return !(*this == Other);
 }
 
-const ir2vec::Embedding &Vocabulary::operator[](CmpInst::Predicate P) const {
-  return Vocab[getSlotIndex(P)];
-}
+// ==----------------------------------------------------------------------===//
+// Vocabulary
+//===----------------------------------------------------------------------===//
 
 StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
   assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode");
@@ -323,29 +344,6 @@ StringRef Vocabulary::getVocabKeyForOpcode(unsigned Opcode) {
   return "UnknownOpcode";
 }
 
-StringRef Vocabulary::getVocabKeyForCanonicalTypeID(CanonicalTypeID CType) {
-  unsigned Index = static_cast<unsigned>(CType);
-  assert(Index < MaxCanonicalTypeIDs && "Invalid CanonicalTypeID");
-  return CanonicalTypeNames[Index];
-}
-
-Vocabulary::CanonicalTypeID
-Vocabulary::getCanonicalTypeID(Type::TypeID TypeID) {
-  unsigned Index = static_cast<unsigned>(TypeID);
-  assert(Index < MaxTypeIDs && "Invalid TypeID");
-  return TypeIDMapping[Index];
-}
-
-StringRef Vocabulary::getVocabKeyForTypeID(Type::TypeID TypeID) {
-  return getVocabKeyForCanonicalTypeID(getCanonicalTypeID(TypeID));
-}
-
-StringRef Vocabulary::getVocabKeyForOperandKind(Vocabulary::OperandKind Kind) {
-  unsigned Index = static_cast<unsigned>(Kind);
-  assert(Index < MaxOperandKinds && "Invalid OperandKind");
-  return OperandKindNames[Index];
-}
-
 // Helper function to classify an operand into OperandKind
 Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
   if (isa<Function>(Op))
@@ -357,14 +355,23 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) {
   return OperandKind::VariableID;
 }
 
-CmpInst::Predicate Vocabulary::getPredicate(unsigned Index) {
-  assert(Index < MaxPredicateKinds && "Invalid predicate index");
-  unsigned PredEnumVal =
-      (Index < NumFCmpPredicates)
-          ? (static_cast<unsigned>(CmpInst::FIRST_FCMP_PREDICATE) + Index)
-          : (static_cast<unsigned>(CmpInst::FIRST_ICMP_PREDICATE) +
-             (Index - NumFCmpPredicates));
-  return static_cast<CmpInst::Predicate>(PredEnumVal);
+unsigned Vocabulary::getPredicateLocalIndex(CmpInst::Predicate P) {
+  if (P >= CmpInst::FIRST_FCMP_PREDICATE && P <= CmpInst::LAST_FCMP_PREDICATE)
+    return P - CmpInst::FIRST_FCMP_PREDICATE;
+  else
+    return P - CmpInst::FIRST_ICMP_PREDICATE +
+           (CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1);
+}
+
+CmpInst::Predicate Vocabulary::getPredicateFromLocalIndex(unsigned LocalIndex) {
+  unsigned fcmpRange =
+      CmpInst::LAST_FCMP_PREDICATE - CmpInst::FIRST_FCMP_PREDICATE + 1;
+  if (LocalIndex < fcmpRange)
+    return static_cast<CmpInst::Predicate>(CmpInst::FIRST_FCMP_PREDICATE +
+                                           LocalIndex);
+  else
+    return static_cast<CmpInst::Predicate>(CmpInst::FIRST_ICMP_PREDICATE +
+                                           LocalIndex - fcmpRange);
 }
 
 StringRef Vocabulary::getVocabKeyForPredicate(CmpInst::Predicate Pred) {
@@ -401,17 +408,51 @@ bool Vocabulary::invalidate(Module &M, const PreservedAnalyses &PA,
   return !(PAC.preservedWhenStateless());
 }
 
-Vocabulary::VocabVector Vocabulary::createDummyVocabForTest(unsigned Dim) {
-  VocabVector DummyVocab;
-  DummyVocab.reserve(NumCanonicalEntries);
+VocabStorage Vocabulary::createDummyVocabForTest(unsigned Dim) {
   float DummyVal = 0.1f;
-  // Create a dummy vocabulary with entries for all opcodes, types, operands
-  // and predicates
-  for ([[maybe_unused]] unsigned _ : seq(0u, Vocabulary::NumCanonicalEntries)) {
-    DummyVocab.push_back(Embedding(Dim, DummyVal));
+
+  // Create sections for opcodes, types, operands, and predicates
+  // Order must match Vocabulary::Section enum
+  std::vector<std::vector<Embedding>> Sections;
+  Sections.reserve(4);
+
+  // Opcodes section
+  std::vector<Embedding> OpcodeSec;
+  OpcodeSec.reserve(MaxOpcodes);
+  for (unsigned I = 0; I < MaxOpcodes; ++I) {
+    OpcodeSec.emplace_back(Dim, DummyVal);
     DummyVal += 0.1f;
   }
-  return DummyVocab;
+  Sections.push_back(std::move(OpcodeSec));
+
+  // Types section
+  std::vector<Embedding> TypeSec;
+  TypeSec.reserve(MaxCanonicalTypeIDs);
+  for (unsigned I = 0; I < MaxCanonicalTypeIDs; ++I) {
+    TypeSec.emplace_back(Dim, DummyVal);
+    DummyVal += 0.1f;
+  }
+  Sections.push_back(std::move(TypeSec));
+
+  // Operands section
+  std::vector<Embedding> OperandSec;
+  OperandSec.reserve(MaxOperandKinds);
+  for (unsigned I = 0; I < MaxOperandKinds; ++I) {
+    OperandSec.emplace_back(Dim, DummyVal);
+    DummyVal += 0.1f;
+  }
+  Sections.push_back(std::move(OperandSec));
+
+  // Predicates section
+  std::vector<Embedding> PredicateSec;
+  PredicateSec.reserve(MaxPredicateKinds);
+  for (unsigned I = 0; I < MaxPredicateKinds; ++I) {
+    PredicateSec.emplace_back(Dim, DummyVal);
+    DummyVal += 0.1f;
+  }
+  Sections.push_back(std::move(PredicateSec));
+
+  return VocabStorage(std::move(Sections));
 }
 
 // ==----------------------------------------------------------------------===//
@@ -457,7 +498,9 @@ Error IR2VecVocabAnalysis::parseVocabSection(
 
 // FIXME: Make this optional. We can avoid file reads
 // by auto-generating a default vocabulary during the build time.
-Error IR2VecVocabAnalysis::readVocabulary() {
+Error IR2VecVocabAnalysis::readVocabulary(VocabMap &OpcVocab,
+                                          VocabMap &TypeVocab,
+                                          VocabMap &ArgVocab) {
   auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /*IsText=*/true);
   if (!BufOrError)
     return createFileError(VocabFile, BufOrError.getError());
@@ -488,7 +531,9 @@ Error IR2VecVocabAnalysis::readVocabulary() {
   return Error::success();
 }
 
-void IR2VecVocabAnalysis::generateNumMappedVocab() {
+void IR2VecVocabAnalysis::generateVocabStorage(VocabMap &OpcVocab,
+                                               VocabMap &TypeVocab,
+                                               VocabMap &ArgVocab) {
 
   // Helper for handling missing entities in the vocabulary.
   // Currently, we use a zero vector. In the future, we will throw an error to
@@ -506,7 +551,6 @@ void IR2VecVocabAnalysis::generateNumMappedVocab() {
   // Handle Opcodes
   std::vector<Embedding> NumericOpcodeEmbeddings(Vocabulary::MaxOpcodes,
                                                  Embedding(Dim));
-  NumericOpcodeEmbeddings.reserve(Vocabulary::MaxOpcodes);
   for (unsigned Opcode : seq(0u, Vocabulary::MaxOpcodes)) {
     StringRef VocabKey = Vocabulary::getVocabKeyForOpcode(Opcode + 1);
     auto It = OpcVocab.find(VocabKey.str());
@@ -515,13 +559,10 @@
     else
       handleMissingEntity(VocabKey.str());
   }
-  Vocab.insert(Vocab.end(), NumericOpcodeEmbeddings.begin(),
-               NumericOpcodeEmbeddings.end());
 
   // Handle Types - only canonical types are present in vocabulary
   std::vector<Embedding> NumericTypeEmbeddings(Vocabulary::MaxCanonicalTypeIDs,
                                                Embedding(Dim));
-  NumericTypeEmbeddings.reserve(Vocabulary::MaxCanonicalTypeIDs);
   for (unsigned CTypeID : seq(0u, Vocabulary::MaxCanonicalTypeIDs)) {
     StringRef VocabKey = Vocabulary::getVocabKeyForCanonicalTypeID(
         static_cast<Vocabulary::CanonicalTypeID>(CTypeID));
@@ -531,13 +572,10 @@
     }
     handleMissingEntity(VocabKey.str());
   }
-  Vocab.insert(Vocab.end(), NumericTypeEmbeddings.begin(),
-               NumericTypeEmbeddings.end());
 
   // Handle Arguments/Operands
   std::vector<Embedding> NumericArgEmbeddings(Vocabulary::MaxOperandKinds,
                                               Embedding(Dim));
-  NumericArgEmbeddings.reserve(Vocabulary::MaxOperandKinds);
   for (unsigned OpKind : seq(0u, Vocabulary::MaxOperandKinds)) {
     Vocabulary::OperandKind Kind = static_cast<Vocabulary::OperandKind>(OpKind);
     StringRef VocabKey = Vocabulary::getVocabKeyForOperandKind(Kind);
@@ -548,14 +586,11 @@
     }
     handleMissingEntity(VocabKey.str());
   }
-  Vocab.insert(Vocab.end(), NumericArgEmbeddings.begin(),
-               NumericArgEmbeddings.end());
 
   // Handle Predicates: part of Operands section. We look up predicate keys
   // in ArgVocab.
   std::vector<Embedding> NumericPredEmbeddings(Vocabulary::MaxPredicateKinds,
                                                Embedding(Dim, 0));
-  NumericPredEmbeddings.reserve(Vocabulary::MaxPredicateKinds);
   for (unsigned PK : seq(0u, Vocabulary::MaxPredicateKinds)) {
     StringRef VocabKey =
         Vocabulary::getVocabKeyForPredicate(Vocabulary::getPredicate(PK));
@@ -566,15 +601,22 @@
     }
     handleMissingEntity(VocabKey.str());
   }
-  Vocab.insert(Vocab.end(), NumericPredEmbeddings.begin(),
-               NumericPredEmbeddings.end());
-}
 
-IR2VecVocabAnalysis::IR2VecVocabAnalysis(const VocabVector &Vocab)
-    : Vocab(Vocab) {}
+  // Create section-based storage instead of flat vocabulary
+  // Order must match Vocabulary::Section enum
+  std::vector<std::vector<Embedding>> Sections(4);
+  Sections[static_cast<unsigned>(Vocabulary::Section::Opcodes)] =
+      std::move(NumericOpcodeEmbeddings); // Section::Opcodes
+  Sections[static_cast<unsigned>(Vocabulary::Section::CanonicalTypes)] =
+      std::move(NumericTypeEmbeddings); // Section::CanonicalTypes
+  Sections[static_cast<unsigned>(Vocabulary::Section::Operands)] =
+      std::move(NumericArgEmbeddings); // Section::Operands
+  Sections[static_cast<unsigned>(Vocabulary::Section::Predicates)] =
+      std::move(NumericPredEmbeddings); // Section::Predicates
 
-IR2VecVocabAnalysis::IR2VecVocabAnalysis(VocabVector &&Vocab)
-    : Vocab(std::move(Vocab)) {}
+  // Create VocabStorage from organized sections
+  Vocab.emplace(std::move(Sections));
+}
 
 void IR2VecVocabAnalysis::emitError(Error Err, LLVMContext &Ctx) {
   handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
@@ -586,8 +628,8 @@ IR2VecVocabAnalysis::Result
 IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
   auto Ctx = &M.getContext();
   // If vocabulary is already populated by the constructor, use it.
-  if (!Vocab.empty())
-    return Vocabulary(std::move(Vocab));
+  if (Vocab.has_value())
+    return Vocabulary(std::move(Vocab.value()));
 
   // Otherwise, try to read from the vocabulary file.
   if (VocabFile.empty()) {
@@ -596,7 +638,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
         "set it using --ir2vec-vocab-path");
     return Vocabulary(); // Return invalid result
   }
-  if (auto Err = readVocabulary()) {
+
+  VocabMap OpcVocab, TypeVocab, ArgVocab;
+  if (auto Err = readVocabulary(OpcVocab, TypeVocab, ArgVocab)) {
     emitError(std::move(Err), *Ctx);
     return Vocabulary();
   }
@@ -611,9 +655,9 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
   scaleVocabSection(ArgVocab, ArgWeight);
 
   // Generate the numeric lookup vocabulary
-  generateNumMappedVocab();
+  generateVocabStorage(OpcVocab, TypeVocab, ArgVocab);
 
-  return Vocabulary(std::move(Vocab));
+  return Vocabulary(std::move(Vocab.value()));
 }
 
 // ==----------------------------------------------------------------------===//
@@ -622,7 +666,7 @@ IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
 
 PreservedAnalyses IR2VecPrinterPass::run(Module &M,
                                          ModuleAnalysisManager &MAM) {
-  auto Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
+  auto &Vocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
   assert(Vocabulary.isValid() && "IR2Vec Vocabulary is invalid");
 
   for (Function &F : M) {
@@ -664,7 +708,7 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
 
 PreservedAnalyses IR2VecVocabPrinterPass::run(Module &M,
                                               ModuleAnalysisManager &MAM) {
-  auto IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
+  auto &IR2VecVocabulary = MAM.getResult<IR2VecVocabAnalysis>(M);
   assert(IR2VecVocabulary.isValid() && "IR2Vec Vocabulary is invalid");
 
   // Print each entry
diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
index 28b14c2562df1..0fa804f2959e8 100644
--- a/llvm/lib/Analysis/InlineAdvisor.cpp
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -217,7 +217,7 @@ AnalysisKey PluginInlineAdvisorAnalysis::Key;
 bool InlineAdvisorAnalysis::initializeIR2VecVocabIfRequested(
     Module &M, ModuleAnalysisManager &MAM) {
   if (!IR2VecVocabFile.empty()) {
-    auto IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M);
+    auto &IR2VecVocabResult = MAM.getResult<IR2VecVocabAnalysis>(M);
     if (!IR2VecVocabResult.isValid()) {
       M.getContext().emitError("Failed to load IR2Vec vocabulary");
       return false;
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 1c656b8fcf4e7..434449c7c5117 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -162,8 +162,8 @@ class IR2VecTool {
 
     for (const BasicBlock &BB : F) {
       for (const auto &I : BB.instructionsWithoutDebug()) {
-        unsigned Opcode = Vocabulary::getSlotIndex(I.getOpcode());
-        unsigned TypeID = Vocabulary::getSlotIndex(I.getType()->getTypeID());
+        unsigned Opcode = Vocabulary::getIndex(I.getOpcode());
+        unsigned TypeID = Vocabulary::getIndex(I.getType()->getTypeID());
 
         // Add "Next" relationship with previous instruction
         if (HasPrevOpcode) {
@@ -184,7 +184,7 @@ class IR2VecTool {
         // Add "Arg" relationships
         unsigned ArgIndex = 0;
         for (const Use &U : I.operands()) {
-          unsigned OperandID = Vocabulary::getSlotIndex(*U.get());
+          unsigned OperandID = Vocabulary::getIndex(*U.get());
           unsigned RelationID = ArgRelation + ArgIndex;
           OS << Opcode << '\t' << OperandID << '\t' << RelationID << '\n';
 
diff --git a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
index dc6059dcf6827..b6e8567ee514d 100644
--- a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp
@@ -43,8 +43,11 @@ class FunctionPropertiesAnalysisTest : public testing::Test {
 public:
   FunctionPropertiesAnalysisTest() {
     auto VocabVector = ir2vec::Vocabulary::createDummyVocabForTest(1);
-    MAM.registerPass([&] { return IR2VecVocabAnalysis(VocabVector); });
-    IR2VecVocab = ir2vec::Vocabulary(std::move(VocabVector));
+    MAM.registerPass([VocabVector = std::move(VocabVector)]() mutable {
+      return IR2VecVocabAnalysis(std::move(VocabVector));
+    });
+    IR2VecVocab =
+        new ir2vec::Vocabulary(ir2vec::Vocabulary::createDummyVocabForTest(1));
     MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
     FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); });
     FAM.registerPass([&] { return DominatorTreeAnalysis(); });
@@ -66,7 +69,7 @@ class FunctionPropertiesAnalysisTest : public testing::Test {
   std::unique_ptr<LoopInfo> LI;
   FunctionAnalysisManager FAM;
   ModuleAnalysisManager MAM;
-  ir2vec::Vocabulary IR2VecVocab;
+  ir2vec::Vocabulary *IR2VecVocab;
 
   void TearDown() override {
     // Restore original IR2Vec weights
   }
 
   FunctionPropertiesInfo buildFPI(Function &F) {
     // FunctionPropertiesInfo assumes IR2VecVocabAnalysis has been run to
     // use IR2Vec.
-    auto VocabResult = MAM.getResult<IR2VecVocabAnalysis>(*F.getParent());
+    auto &VocabResult = MAM.getResult<IR2VecVocabAnalysis>(*F.getParent());
     (void)VocabResult;
     return FunctionPropertiesInfo::getFunctionPropertiesInfo(F, FAM);
   }
@@ -106,7 +109,7 @@ class FunctionPropertiesAnalysisTest : public testing::Test {
   }
 
   std::unique_ptr<ir2vec::Embedder> createEmbedder(const Function &F) {
-    auto Emb = ir2vec::Embedder::create(IR2VecKind::Symbolic, F, IR2VecVocab);
+    auto Emb = ir2vec::Embedder::create(IR2VecKind::Symbolic, F, *IR2VecVocab);
     EXPECT_TRUE(static_cast<bool>(Emb));
     return Emb;
   }
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 9bc48e45eab5e..743628fffac76 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -295,7 +295,7 @@ TEST(IR2VecTest, ZeroDimensionEmbedding) {
 
 // Fixture for IR2Vec tests requiring IR setup.
 class IR2VecTestFixture : public ::testing::Test {
 protected:
-  Vocabulary V;
+  Vocabulary *V;
   LLVMContext Ctx;
   std::unique_ptr<Module> M;
   Function *F = nullptr;
@@ -304,7 +304,7 @@ class IR2VecTestFixture : public ::testing::Test {
   Instruction *RetInst = nullptr;
 
   void SetUp() override {
-    V = Vocabulary(Vocabulary::createDummyVocabForTest(2));
+    V = new Vocabulary(Vocabulary::createDummyVocabForTest(2));
 
     // Setup IR
     M = std::make_unique<Module>("TestM", Ctx);
@@ -322,7 +322,7 @@
 };
 
 TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
-  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &InstMap = Emb->getInstVecMap();
@@ -341,7 +341,7 @@ TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
 }
 
 TEST_F(IR2VecTestFixture, GetInstVecMap_FlowAware) {
-  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &InstMap = Emb->getInstVecMap();
@@ -358,7 +358,7 @@ TEST_F(IR2VecTestFixture, GetInstVecMap_FlowAware) {
 }
 
 TEST_F(IR2VecTestFixture, GetBBVecMap_Symbolic) {
-  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &BBMap = Emb->getBBVecMap();
@@ -373,7 +373,7 @@ TEST_F(IR2VecTestFixture, GetBBVecMap_Symbolic) {
 }
 
 TEST_F(IR2VecTestFixture, GetBBVecMap_FlowAware) {
-  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &BBMap = Emb->getBBVecMap();
@@ -388,7 +388,7 @@ TEST_F(IR2VecTestFixture, GetBBVecMap_FlowAware) {
 }
 
 TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
-  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &BBVec = Emb->getBBVector(*BB);
@@ -398,7 +398,7 @@ TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
 }
 
 TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
-  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &BBVec = Emb->getBBVector(*BB);
@@ -408,7 +408,7 @@ TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
 }
 
 TEST_F(IR2VecTestFixture, GetFunctionVector_Symbolic) {
-  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &FuncVec = Emb->getFunctionVector();
@@ -420,7 +420,7 @@ TEST_F(IR2VecTestFixture, GetFunctionVector_Symbolic) {
 }
 
 TEST_F(IR2VecTestFixture, GetFunctionVector_FlowAware) {
-  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, V);
+  auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
   ASSERT_TRUE(static_cast<bool>(Emb));
 
   const auto &FuncVec = Emb->getFunctionVector();
@@ -464,7 +464,10 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) {
   EXPECT_EQ(VocabVecSize,
             MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands + MaxPredicateKinds);
 
-  auto ExpectedVocab = VocabVec;
+  // Collect embeddings for later comparison before moving VocabVec
+  std::vector<Embedding> ExpectedVocab;
+  for (const auto &Emb : VocabVec)
+    ExpectedVocab.push_back(Emb);
 
   IR2VecVocabAnalysis VocabAnalysis(std::move(VocabVec));
   LLVMContext TestCtx;
@@ -482,17 +485,17 @@
 }
 TEST(IR2VecVocabularyTest, SlotIdxMapping) {
-  // Test getSlotIndex for Opcodes
+  // Test getIndex for Opcodes
 #define EXPECT_OPCODE_SLOT(NUM, OPCODE, CLASS) \
-  EXPECT_EQ(Vocabulary::getSlotIndex(NUM), static_cast<unsigned>(NUM - 1));
+  EXPECT_EQ(Vocabulary::getIndex(NUM), static_cast<unsigned>(NUM - 1));
 #define HANDLE_INST(NUM, OPCODE, CLASS) EXPECT_OPCODE_SLOT(NUM, OPCODE, CLASS)
 #include "llvm/IR/Instruction.def"
 #undef HANDLE_INST
 #undef EXPECT_OPCODE_SLOT
 
-  // Test getSlotIndex for Types
+  // Test getIndex for Types
 #define EXPECT_TYPE_SLOT(TypeIDTok, CanonEnum, CanonStr) \
-  EXPECT_EQ(Vocabulary::getSlotIndex(Type::TypeIDTok), \
+  EXPECT_EQ(Vocabulary::getIndex(Type::TypeIDTok), \
             MaxOpcodes + static_cast<unsigned>( \
                              Vocabulary::CanonicalTypeID::CanonEnum));
 
@@ -500,7 +503,7 @@
 
 #undef EXPECT_TYPE_SLOT
 
-  // Test getSlotIndex for Value operands
+  // Test getIndex for Value operands
   LLVMContext Ctx;
   Module M("TestM", Ctx);
   FunctionType *FTy =
       FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false);
   Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
 
 #define EXPECTED_VOCAB_OPERAND_SLOT(X) \
   MaxOpcodes + MaxCanonicalTypeIDs + static_cast<unsigned>(X)
   // Test Function operand
-  EXPECT_EQ(Vocabulary::getSlotIndex(*F),
+  EXPECT_EQ(Vocabulary::getIndex(*F),
             EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::FunctionID));
 
   // Test Constant operand
   Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
-  EXPECT_EQ(Vocabulary::getSlotIndex(*C),
+  EXPECT_EQ(Vocabulary::getIndex(*C),
             EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::ConstantID));
 
   // Test Pointer operand
   BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
   AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB);
-  EXPECT_EQ(Vocabulary::getSlotIndex(*PtrVal),
+  EXPECT_EQ(Vocabulary::getIndex(*PtrVal),
             EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::PointerID));
 
   // Test Variable operand (function argument)
   Argument *Arg = F->getArg(0);
-  EXPECT_EQ(Vocabulary::getSlotIndex(*Arg),
+  EXPECT_EQ(Vocabulary::getIndex(*Arg),
             EXPECTED_VOCAB_OPERAND_SLOT(Vocabulary::OperandKind::VariableID));
 #undef EXPECTED_VOCAB_OPERAND_SLOT
 
-  // Test getSlotIndex for predicates
+  // Test getIndex for predicates
 #define EXPECTED_VOCAB_PREDICATE_SLOT(X) \
   MaxOpcodes + MaxCanonicalTypeIDs + MaxOperands + static_cast<unsigned>(X)
   for (unsigned P = CmpInst::FIRST_FCMP_PREDICATE;
@@ -538,7 +541,7 @@ TEST(IR2VecVocabularyTest, SlotIdxMapping) {
     CmpInst::Predicate Pred = static_cast<CmpInst::Predicate>(P);
     unsigned ExpectedIdx =
         EXPECTED_VOCAB_PREDICATE_SLOT((P - CmpInst::FIRST_FCMP_PREDICATE));
-    EXPECT_EQ(Vocabulary::getSlotIndex(Pred), ExpectedIdx);
+    EXPECT_EQ(Vocabulary::getIndex(Pred), ExpectedIdx);
   }
   auto ICMP_Start = CmpInst::LAST_FCMP_PREDICATE + 1;
   for (unsigned P = CmpInst::FIRST_ICMP_PREDICATE;
@@ -546,7 +549,7 @@ TEST(IR2VecVocabularyTest, SlotIdxMapping) {
     CmpInst::Predicate Pred = static_cast<CmpInst::Predicate>(P);
     unsigned ExpectedIdx = EXPECTED_VOCAB_PREDICATE_SLOT(
         ICMP_Start + P - CmpInst::FIRST_ICMP_PREDICATE);
-    EXPECT_EQ(Vocabulary::getSlotIndex(Pred), ExpectedIdx);
+    EXPECT_EQ(Vocabulary::getIndex(Pred), ExpectedIdx);
   }
 #undef EXPECTED_VOCAB_PREDICATE_SLOT
 }
@@ -555,15 +558,14 @@ TEST(IR2VecVocabularyTest, SlotIdxMapping) {
 #ifndef NDEBUG
 TEST(IR2VecVocabularyTest, NumericIDMapInvalidInputs) {
   // Test invalid opcode IDs
-  EXPECT_DEATH(Vocabulary::getSlotIndex(0u), "Invalid opcode");
-  EXPECT_DEATH(Vocabulary::getSlotIndex(MaxOpcodes + 1), "Invalid opcode");
+  EXPECT_DEATH(Vocabulary::getIndex(0u), "Invalid opcode");
+  EXPECT_DEATH(Vocabulary::getIndex(MaxOpcodes + 1), "Invalid opcode");
 
   // Test invalid type IDs
-  EXPECT_DEATH(Vocabulary::getSlotIndex(static_cast<Type::TypeID>(MaxTypeIDs)),
+  EXPECT_DEATH(Vocabulary::getIndex(static_cast<Type::TypeID>(MaxTypeIDs)),
+               "Invalid type ID");
+  EXPECT_DEATH(Vocabulary::getIndex(static_cast<Type::TypeID>(MaxTypeIDs + 10)),
                "Invalid type ID");
-  EXPECT_DEATH(
-      Vocabulary::getSlotIndex(static_cast<Type::TypeID>(MaxTypeIDs + 10)),
-      "Invalid type ID");
 }
 #endif // NDEBUG
 #endif // GTEST_HAS_DEATH_TEST
@@ -573,7 +575,7 @@ TEST(IR2VecVocabularyTest, StringKeyGeneration) {
   EXPECT_EQ(Vocabulary::getStringKey(12), "Add");
 
 #define EXPECT_OPCODE(NUM, OPCODE, CLASS) \
-  EXPECT_EQ(Vocabulary::getStringKey(Vocabulary::getSlotIndex(NUM)), \
+  EXPECT_EQ(Vocabulary::getStringKey(Vocabulary::getIndex(NUM)), \
             Vocabulary::getVocabKeyForOpcode(NUM));
 #define HANDLE_INST(NUM, OPCODE, CLASS) EXPECT_OPCODE(NUM, OPCODE, CLASS)
 #include "llvm/IR/Instruction.def"
@@ -672,10 +674,12 @@ TEST(IR2VecVocabularyTest, InvalidAccess) {
 #endif // GTEST_HAS_DEATH_TEST
 
 TEST(IR2VecVocabularyTest, TypeIDStringKeyMapping) {
+  Vocabulary V = Vocabulary(Vocabulary::createDummyVocabForTest());
 #define EXPECT_TYPE_TO_CANONICAL(TypeIDTok, CanonEnum, CanonStr) \
-  EXPECT_EQ( \
-      Vocabulary::getStringKey(Vocabulary::getSlotIndex(Type::TypeIDTok)), \
-      CanonStr);
+  do { \
+    unsigned FlatIdx = V.getIndex(Type::TypeIDTok); \
+    EXPECT_EQ(Vocabulary::getStringKey(FlatIdx), CanonStr); \
+  } while (0);
 
   IR2VEC_HANDLE_TYPE_BIMAP(EXPECT_TYPE_TO_CANONICAL)
 
@@ -683,14 +687,20 @@ TEST(IR2VecVocabularyTest, TypeIDStringKeyMapping) {
 }
 
 TEST(IR2VecVocabularyTest, InvalidVocabularyConstruction) {
-  std::vector<Embedding> InvalidVocab;
-  InvalidVocab.push_back(Embedding(2, 1.0));
-  InvalidVocab.push_back(Embedding(2, 2.0));
-
-  Vocabulary V(std::move(InvalidVocab));
+  // Test 1: Create invalid VocabStorage with insufficient sections
+  std::vector<std::vector<Embedding>> InvalidSectionData;
+  // Only add one section with 2 embeddings, but the vocabulary needs 4 sections
+  std::vector<Embedding> Section1;
+  Section1.push_back(Embedding(2, 1.0));
+  Section1.push_back(Embedding(2, 2.0));
+  InvalidSectionData.push_back(std::move(Section1));
+
+  VocabStorage InvalidStorage(std::move(InvalidSectionData));
+  Vocabulary V(std::move(InvalidStorage));
   EXPECT_FALSE(V.isValid());
 
   {
+    // Test 2: Default-constructed vocabulary should be invalid
     Vocabulary InvalidResult;
     EXPECT_FALSE(InvalidResult.isValid());
 #if GTEST_HAS_DEATH_TEST
@@ -701,4 +711,265 @@ TEST(IR2VecVocabularyTest, InvalidVocabularyConstruction) {
   }
 }
 
+TEST(VocabStorageTest, DefaultConstructor) {
+  VocabStorage storage;
+
+  EXPECT_EQ(storage.size(), 0u);
+  EXPECT_EQ(storage.getNumSections(), 0u);
+  EXPECT_EQ(storage.getDimension(), 0u);
+  EXPECT_FALSE(storage.isValid());
+
+  // Test iterators on empty storage
+  EXPECT_EQ(storage.begin(), storage.end());
+}
+
+TEST(VocabStorageTest, BasicConstruction) {
+  // Create test data with 3 sections
+  std::vector<std::vector<Embedding>> sectionData;
+
+  // Section 0: 2 embeddings of dimension 3
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{1.0, 2.0, 3.0});
+  section0.emplace_back(std::vector<double>{4.0, 5.0, 6.0});
+  sectionData.push_back(std::move(section0));
+
+  // Section 1: 1 embedding of dimension 3
+  std::vector<Embedding> section1;
+  section1.emplace_back(std::vector<double>{7.0, 8.0, 9.0});
+  sectionData.push_back(std::move(section1));
+
+  // Section 2: 3 embeddings of dimension 3
+  std::vector<Embedding> section2;
+  section2.emplace_back(std::vector<double>{10.0, 11.0, 12.0});
+  section2.emplace_back(std::vector<double>{13.0, 14.0, 15.0});
+  section2.emplace_back(std::vector<double>{16.0, 17.0, 18.0});
+  sectionData.push_back(std::move(section2));
+
+  VocabStorage storage(std::move(sectionData));
+
+  EXPECT_EQ(storage.size(), 6u); // Total: 2 + 1 + 3 = 6
+  EXPECT_EQ(storage.getNumSections(), 3u);
+  EXPECT_EQ(storage.getDimension(), 3u);
+  EXPECT_TRUE(storage.isValid());
+}
+
+TEST(VocabStorageTest, SectionAccess) {
+  // Create test data
+  std::vector<std::vector<Embedding>> sectionData;
+
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{1.0, 2.0});
+  section0.emplace_back(std::vector<double>{3.0, 4.0});
+  sectionData.push_back(std::move(section0));
+
+  std::vector<Embedding> section1;
+  section1.emplace_back(std::vector<double>{5.0, 6.0});
+  sectionData.push_back(std::move(section1));
+
+  VocabStorage storage(std::move(sectionData));
+
+  // Test section access
+  EXPECT_EQ(storage[0].size(), 2u);
+  EXPECT_EQ(storage[1].size(), 1u);
+
+  // Test embedding values
+  EXPECT_THAT(storage[0][0].getData(), ElementsAre(1.0, 2.0));
+  EXPECT_THAT(storage[0][1].getData(), ElementsAre(3.0, 4.0));
+  EXPECT_THAT(storage[1][0].getData(), ElementsAre(5.0, 6.0));
+}
+
+#if GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+TEST(VocabStorageTest, InvalidSectionAccess) {
+  std::vector<std::vector<Embedding>> sectionData;
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{1.0, 2.0});
+  sectionData.push_back(std::move(section0));
+
+  VocabStorage storage(std::move(sectionData));
+
+  EXPECT_DEATH(storage[1], "Invalid section ID");
+  EXPECT_DEATH(storage[10], "Invalid section ID");
+}
+
+TEST(VocabStorageTest, EmptySection) {
+  std::vector<std::vector<Embedding>> sectionData;
+  std::vector<Embedding> emptySection; // Empty section
+  sectionData.push_back(std::move(emptySection));
+
+  std::vector<Embedding> validSection;
+  validSection.emplace_back(std::vector<double>{1.0});
+  sectionData.push_back(std::move(validSection));
+
+  EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+               "Vocabulary section is empty");
+}
+
+TEST(VocabStorageTest, EmptyMiddleSection) {
+  std::vector<std::vector<Embedding>> sectionData;
+
+  // Valid first section
+  std::vector<Embedding> validSection1;
+  validSection1.emplace_back(std::vector<double>{1.0});
+  sectionData.push_back(std::move(validSection1));
+
+  // Empty middle section
+  std::vector<Embedding> emptySection;
+  sectionData.push_back(std::move(emptySection));
+
+  // Valid last section
+  std::vector<Embedding> validSection2;
+  validSection2.emplace_back(std::vector<double>{2.0});
+  sectionData.push_back(std::move(validSection2));
+
+  EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+               "Vocabulary section is empty");
+}
+
+TEST(VocabStorageTest, NoSections) {
+  std::vector<std::vector<Embedding>> sectionData; // No sections
+
+  EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+               "Vocabulary has no sections");
+}
+
+TEST(VocabStorageTest, MismatchedDimensionsAcrossSections) {
+  std::vector<std::vector<Embedding>> sectionData;
+
+  // Section 0: embeddings with dimension 2
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{1.0, 2.0});
+  section0.emplace_back(std::vector<double>{3.0, 4.0});
+  sectionData.push_back(std::move(section0));
+
+  // Section 1: embedding with dimension 3 (mismatch!)
+  std::vector<Embedding> section1;
+  section1.emplace_back(std::vector<double>{5.0, 6.0, 7.0});
+  sectionData.push_back(std::move(section1));
+
+  EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+               "All embeddings must have the same dimension");
+}
+
+TEST(VocabStorageTest, MismatchedDimensionsWithinSection) {
+  std::vector<std::vector<Embedding>> sectionData;
+
+  // Section 0: first embedding with dimension 2, second with dimension 3
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{1.0, 2.0});
+  section0.emplace_back(std::vector<double>{3.0, 4.0, 5.0}); // Mismatch!
+  sectionData.push_back(std::move(section0));
+
+  EXPECT_DEATH(VocabStorage(std::move(sectionData)),
+               "All embeddings must have the same dimension");
+}
+#endif // NDEBUG
+#endif // GTEST_HAS_DEATH_TEST
+
+TEST(VocabStorageTest, IteratorBasics) {
+  std::vector<std::vector<Embedding>> sectionData;
+
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{1.0, 2.0});
+  section0.emplace_back(std::vector<double>{3.0, 4.0});
+  sectionData.push_back(std::move(section0));
+
+  std::vector<Embedding> section1;
+  section1.emplace_back(std::vector<double>{5.0, 6.0});
+  sectionData.push_back(std::move(section1));
+
+  VocabStorage storage(std::move(sectionData));
+
+  // Test iterator basics
+  auto it = storage.begin();
+  auto end = storage.end();
+
+  EXPECT_NE(it, end);
+
+  // Check first embedding
+  EXPECT_THAT((*it).getData(), ElementsAre(1.0, 2.0));
+
+  // Advance to second embedding
+  ++it;
+  EXPECT_NE(it, end);
+  EXPECT_THAT((*it).getData(), ElementsAre(3.0, 4.0));
+
+  // Advance to third embedding (in section 1)
+  ++it;
+  EXPECT_NE(it, end);
+  EXPECT_THAT((*it).getData(), ElementsAre(5.0, 6.0));
+
+  // Advance past the end
+  ++it;
+  EXPECT_EQ(it, end);
+}
+
+TEST(VocabStorageTest, IteratorTraversal) {
+  std::vector<std::vector<Embedding>> sectionData;
+
+  // Section 0: 2 embeddings
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{10.0});
+  section0.emplace_back(std::vector<double>{20.0});
+  sectionData.push_back(std::move(section0));
+
+  // Section 1: 1 embedding
+  std::vector<Embedding> section1;
+  section1.emplace_back(std::vector<double>{25.0});
+  sectionData.push_back(std::move(section1));
+
+  // Section 2: 3 embeddings
+  std::vector<Embedding> section2;
+  section2.emplace_back(std::vector<double>{30.0});
+  section2.emplace_back(std::vector<double>{40.0});
+  section2.emplace_back(std::vector<double>{50.0});
+  sectionData.push_back(std::move(section2));
+
+  VocabStorage storage(std::move(sectionData));
+
+  // Collect all values using iterator
+  std::vector<double> values;
+  for (const auto &emb : storage) {
+    EXPECT_EQ(emb.size(), 1u);
+    values.push_back(emb[0]);
+  }
+
+  // Should get all embeddings from all sections
+  EXPECT_THAT(values, ElementsAre(10.0, 20.0, 25.0, 30.0, 40.0, 50.0));
+}
+
+TEST(VocabStorageTest, IteratorComparison) {
+  std::vector<std::vector<Embedding>> sectionData;
+  std::vector<Embedding> section0;
+  section0.emplace_back(std::vector<double>{1.0});
+  section0.emplace_back(std::vector<double>{2.0});
+  sectionData.push_back(std::move(section0));
+
+  VocabStorage storage(std::move(sectionData));
+
+  auto it1 = storage.begin();
+  auto it2 = storage.begin();
+  auto end = storage.end();
+
+  // Test equality
+  EXPECT_EQ(it1, it2);
+  EXPECT_NE(it1, end);
+
+  // Advance one iterator
+  ++it1;
+  EXPECT_NE(it1, it2);
+  EXPECT_NE(it1, end);
+
+  // Advance second iterator to match
+  ++it2;
+  EXPECT_EQ(it1, it2);
+
+  // Advance both to end
+  ++it1;
+  ++it2;
+  EXPECT_EQ(it1, end);
+  EXPECT_EQ(it2, end);
+  EXPECT_EQ(it1, it2);
+}
+
 } // end anonymous namespace

From 5f0f4972c46707d46145f713c20a442bef8379d8 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov
Date: Wed, 1 Oct 2025 17:26:22 -0700
Subject: [PATCH 449/878] [libc] Unify and extend no_sanitize attributes for
 strlen. (#161316)

Fast strlen implementations (naive wide-reads, SIMD-based, and
x86_64/aarch64-optimized versions) all may perform
technically-out-of-bound reads, which leads to reports under ASan,
HWASan (on ARM machines), and also TSan (which also has the capability
to detect heap out-of-bound reads). So, we need to explicitly disable
instrumentation in all three cases.
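To illustrate the class of access being exempted, here is a minimal
word-at-a-time strlen sketch (illustrative only, not the actual libc
implementation; the only name it borrows from this patch is the
LIBC_NO_SANITIZE_OOB_ACCESS macro defined in attributes.h below):

  #include <stddef.h>
  #include <stdint.h>

  #define ONES  0x0101010101010101ULL
  #define HIGHS 0x8080808080808080ULL

  /* The word load that contains the terminating NUL may touch bytes
     past the end of the string's allocation (while staying inside one
     aligned word, so it cannot fault) -- exactly the access that ASan,
     HWASan, and TSan would flag. */
  LIBC_NO_SANITIZE_OOB_ACCESS
  static size_t wide_strlen(const char *s) {
    const char *p = s;
    for (; (uintptr_t)p & 7; ++p) /* align to an 8-byte boundary */
      if (*p == '\0')
        return (size_t)(p - s);
    const uint64_t *w = (const uint64_t *)p;
    while (((*w - ONES) & ~*w & HIGHS) == 0) /* no zero byte in word */
      ++w;
    for (p = (const char *)w; *p; ++p)
      ;
    return (size_t)(p - s);
  }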
Tragically, Clang didn't support `[[gnu::no_sanitize]]` syntax until
recently, and since we're supporting both GCC and Clang, we have to
revert to `__attribute__` syntax.
---
 libc/src/__support/macros/attributes.h               | 10 ++++++++++
 libc/src/string/CMakeLists.txt                       |  1 +
 libc/src/string/memory_utils/aarch64/inline_strlen.h |  2 +-
 libc/src/string/memory_utils/generic/inline_strlen.h |  3 +--
 libc/src/string/memory_utils/x86_64/inline_strlen.h  |  4 ++--
 libc/src/string/string_utils.h                       |  3 ++-
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel    |  1 +
 7 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/libc/src/__support/macros/attributes.h b/libc/src/__support/macros/attributes.h
index 145aa3b65057c..d5ff028634940 100644
--- a/libc/src/__support/macros/attributes.h
+++ b/libc/src/__support/macros/attributes.h
@@ -81,4 +81,14 @@ LIBC_THREAD_MODE_EXTERNAL.
 #define LIBC_HAS_VECTOR_TYPE 0
 #endif
 
+#if __has_attribute(no_sanitize)
+// Disable regular and hardware-supported ASan for functions that may
+// intentionally make out-of-bounds access. Disable TSan as well, as it detects
+// out-of-bounds accesses to heap memory.
+#define LIBC_NO_SANITIZE_OOB_ACCESS \
+  __attribute__((no_sanitize("address", "hwaddress", "thread")))
+#else
+#define LIBC_NO_SANITIZE_OOB_ACCESS
+#endif
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_ATTRIBUTES_H
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index b8cdb2a7d3538..83c956429be24 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -22,6 +22,7 @@ add_header_library(
     libc.src.__support.CPP.type_traits
     libc.src.__support.CPP.simd
     libc.src.__support.common
+    libc.src.__support.macros.attributes
     libc.src.string.memory_utils.inline_memcpy
     ${string_config_options}
 )
diff --git a/libc/src/string/memory_utils/aarch64/inline_strlen.h b/libc/src/string/memory_utils/aarch64/inline_strlen.h
index 36fd1aa636b54..87f5ccdd56e23 100644
--- a/libc/src/string/memory_utils/aarch64/inline_strlen.h
+++ b/libc/src/string/memory_utils/aarch64/inline_strlen.h
@@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace neon {
 
-[[gnu::no_sanitize_address]] [[maybe_unused]] LIBC_INLINE static size_t
+[[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
 string_length(const char *src) {
   using Vector __attribute__((may_alias)) = uint8x8_t;
 
diff --git a/libc/src/string/memory_utils/generic/inline_strlen.h b/libc/src/string/memory_utils/generic/inline_strlen.h
index d7435afb03719..69700e801bcea 100644
--- a/libc/src/string/memory_utils/generic/inline_strlen.h
+++ b/libc/src/string/memory_utils/generic/inline_strlen.h
@@ -24,8 +24,7 @@ LIBC_INLINE constexpr cpp::simd_mask<char> shift_mask(cpp::simd_mask<char> m,
   return cpp::bit_cast<cpp::simd_mask<char>>(r);
 }
 
-[[clang::no_sanitize("address")]] LIBC_INLINE size_t
-string_length(const char *src) {
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) {
   constexpr cpp::simd<char> null_byte = cpp::splat('\0');
 
   size_t alignment = alignof(cpp::simd<char>);
diff --git a/libc/src/string/memory_utils/x86_64/inline_strlen.h b/libc/src/string/memory_utils/x86_64/inline_strlen.h
index 739f8c1aaddbc..9e10d58363393 100644
--- a/libc/src/string/memory_utils/x86_64/inline_strlen.h
+++ b/libc/src/string/memory_utils/x86_64/inline_strlen.h
@@ -18,12 +18,12 @@ namespace LIBC_NAMESPACE_DECL {
 namespace string_length_internal {
 // Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero.
 template <typename Vector, typename Mask>
-[[gnu::no_sanitize_address]] LIBC_INLINE static Mask
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static Mask
 compare_and_mask(const Vector *block_ptr);
 
 template <typename Vector, typename Mask,
           Mask (*compare_and_mask)(const Vector *)>
-[[gnu::no_sanitize_address]] LIBC_INLINE static size_t
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
 string_length_vector(const char *src) {
   uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
 
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 9d636d02f4756..7feef56fb3676 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -19,6 +19,7 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/CPP/bitset.h"
 #include "src/__support/CPP/type_traits.h" // cpp::is_same_v
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 #include "src/string/memory_utils/inline_memcpy.h"
@@ -119,7 +120,7 @@ template <typename T> LIBC_INLINE size_t string_length(const T *src) {
 }
 
 template <typename T>
-[[gnu::no_sanitize_address]] LIBC_INLINE void *
+LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE void *
 find_first_character_wide_read(const unsigned char *src, unsigned char ch,
                                size_t n) {
   const unsigned char *char_ptr = src;
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index e57d9dea036dd..026664bd019f9 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -5336,6 +5336,7 @@ libc_support_library(
         ":__support_common",
         ":__support_cpp_bitset",
         ":__support_cpp_type_traits",
+        ":__support_macros_attributes",
         ":__support_macros_optimization",
         ":hdr_limits_macros",
         ":llvm_libc_types_size_t",

From bdea159093ae2e4482dedb32bca5e03a212fc44f Mon Sep 17 00:00:00 2001
From: David Salinas
Date: Wed, 1 Oct 2025 20:47:47 -0400
Subject: [PATCH 450/878] Revert "Revert "Fix memory leak in Offloading API"
 (#161465)" (#161573)

This reverts commit d392563433316e310edacf35a40fb2f9aa477acc.
---
 llvm/lib/Object/OffloadBundle.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Object/OffloadBundle.cpp b/llvm/lib/Object/OffloadBundle.cpp
index 0dd378e65fd81..329dcbf2d939a 100644
--- a/llvm/lib/Object/OffloadBundle.cpp
+++ b/llvm/lib/Object/OffloadBundle.cpp
@@ -120,14 +120,15 @@ OffloadBundleFatBin::create(MemoryBufferRef Buf, uint64_t SectionOffset,
   if (identify_magic(Buf.getBuffer()) != file_magic::offload_bundle)
     return errorCodeToError(object_error::parse_failed);
 
-  OffloadBundleFatBin *TheBundle = new OffloadBundleFatBin(Buf, FileName);
+  std::unique_ptr<OffloadBundleFatBin> TheBundle(
+      new OffloadBundleFatBin(Buf, FileName));
 
   // Read the Bundle Entries
   Error Err = TheBundle->readEntries(Buf.getBuffer(), SectionOffset);
   if (Err)
     return Err;
 
-  return std::unique_ptr<OffloadBundleFatBin>(TheBundle);
+  return std::move(TheBundle);
 }
 
 Error OffloadBundleFatBin::extractBundle(const ObjectFile &Source) {

From 640644d68adbd2b5eaf8cd715237631057658059 Mon Sep 17 00:00:00 2001
From: Gang Chen
Date: Wed, 1 Oct 2025 17:52:15 -0700
Subject: [PATCH 451/878] [AMDGPU] Move LowerBufferFatPointers after
 LoadStoreVectorizer and remove the fixme (#161531)

Move LowerBufferFatPointers pass after CodegenPrepare and
LoadStoreVectorizer pass, and remove the fixme about that.
--- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 38 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 1311 +++++++---------- .../buffer-fat-pointer-atomicrmw-fmax.ll | 1207 +++++++-------- .../buffer-fat-pointer-atomicrmw-fmin.ll | 1207 +++++++-------- .../AMDGPU/buffer-fat-pointers-memcpy.ll | 326 ++-- llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 4 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 60 +- .../AMDGPU/resource-usage-dead-function.ll | 4 - 8 files changed, 1825 insertions(+), 2332 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 92a587b5771b6..280fbe20667c6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1384,6 +1384,11 @@ void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments) addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(createLoadStoreVectorizerPass()); + if (TM->getTargetTriple().isAMDGCN()) { // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). @@ -1392,15 +1397,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. - // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(createAMDGPULowerBufferFatPointersPass()); addPass(createAMDGPULowerIntrinsicsLegacyPass()); // In accordance with the above FIXME, manually force all the @@ -1408,11 +1404,6 @@ void AMDGPUPassConfig::addCodeGenPrepare() { addPass(new DummyCGSCCPass()); } - TargetPassConfig::addCodeGenPrepare(); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(createLoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can // cause unexpected behavior for subsequent passes. Placing it // here seems better that these blocks would get cleaned up by @@ -2125,6 +2116,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { if (EnableLowerKernelArguments) addPass(AMDGPULowerKernelArgumentsPass(TM)); + Base::addCodeGenPrepare(addPass); + + if (isPassEnabled(EnableLoadStoreVectorizer)) + addPass(LoadStoreVectorizerPass()); + // This lowering has been placed after codegenprepare to take advantage of // address mode matching (which is why it isn't put with the LDS lowerings). // It could be placed anywhere before uniformity annotations (an analysis @@ -2132,25 +2128,11 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const { // but has been put before switch lowering and CFG flattening so that those // passes can run on the more optimized control flow this pass creates in // many cases. - // - // FIXME: This should ideally be put after the LoadStoreVectorizer. 
- // However, due to some annoying facts about ResourceUsageAnalysis, - // (especially as exercised in the resource-usage-dead-function test), - // we need all the function passes codegenprepare all the way through - // said resource usage analysis to run on the call graph produced - // before codegenprepare runs (because codegenprepare will knock some - // nodes out of the graph, which leads to function-level passes not - // being run on them, which causes crashes in the resource usage analysis). addPass(AMDGPULowerBufferFatPointersPass(TM)); addPass.requireCGSCCOrder(); addPass(AMDGPULowerIntrinsicsPass(TM)); - Base::addCodeGenPrepare(addPass); - - if (isPassEnabled(EnableLoadStoreVectorizer)) - addPass(LoadStoreVectorizerPass()); - // LowerSwitch pass may introduce unreachable blocks that can cause unexpected // behavior for subsequent passes. Placing it here seems better that these // blocks would get cleaned up by UnreachableBlockElim inserted next in the diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index c3b14e8829042..323bffe9947c8 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -57,8 +57,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start @@ -69,7 +68,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -96,9 +95,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -106,7 +104,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -123,9 +121,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: 
v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -133,7 +130,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -150,9 +147,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v3, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s20 ; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -160,7 +156,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -245,8 +241,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start @@ -256,7 +251,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -292,16 +287,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -318,16 +312,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v1, s20
 ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
 ; GFX7-NEXT: v_mov_b32_e32 v5, v2
 ; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -468,7 +461,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
 ; GFX10-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
@@ -481,7 +473,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
 ; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1
@@ -507,7 +498,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB2_4
@@ -556,7 +547,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
 ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -569,7 +559,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_nop 0
 ; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX908-NEXT: ; %bb.2:
@@ -594,7 +583,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB2_4
 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
@@ -614,7 +603,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
 ; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -627,7 +615,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_nop 0
 ; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX8-NEXT: ; %bb.2:
@@ -652,7 +639,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB2_4
 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
@@ -672,7 +659,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
 ; GFX7: ; %bb.0:
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
 ; GFX7-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -684,7 +670,6 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX7-NEXT: ; %bb.2:
@@ -709,7 +694,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp
 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB2_4
 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
@@ -830,8 +815,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
@@ -842,7 +826,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -860,16 +844,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -886,9 +869,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -896,7 +878,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -913,9 +895,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -923,7 +904,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -940,9 +921,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -950,7 +930,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_g
 ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1035,8 +1015,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
@@ -1046,7 +1025,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX10-NEXT: v_mov_b32_e32 v5, v2
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -1064,15 +1043,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v1, s20
 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s6
 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_add_f32_e32 v2, v3, v0
 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
@@ -1089,16 +1066,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v1, s20
 ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: v_add_f32_e32 v1, v2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v5, v2
 ; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -1115,16 +1091,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v1, s20
 ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
 ; GFX8-NEXT: v_mov_b32_e32 v5, v2
 ; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -1141,16 +1116,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v1, s20
 ; GFX7-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
 ; GFX7-NEXT: v_mov_b32_e32 v5, v2
 ; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
@@ -1223,9 +1197,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -1237,7 +1209,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -1255,8 +1227,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
@@ -1267,7 +1238,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -1285,16 +1256,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1311,9 +1281,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1321,7 +1290,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1338,9 +1307,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1348,7 +1316,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1365,9 +1333,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1375,7 +1342,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
 ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1448,9 +1415,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -1462,7 +1427,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -1480,8 +1445,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
@@ -1492,7 +1456,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -1510,16 +1474,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1536,9 +1499,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1546,7 +1508,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1563,9 +1525,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1573,7 +1534,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1590,9 +1551,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1600,7 +1560,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1673,9 +1633,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -1687,7 +1645,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -1705,8 +1663,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
@@ -1717,7 +1674,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -1735,16 +1692,15 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1761,9 +1717,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1771,7 +1726,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1788,9 +1743,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1798,7 +1752,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1815,9 +1769,8 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -1825,7 +1778,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1883,9 +1836,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
 ; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
@@ -1897,7 +1848,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -1925,9 +1876,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
@@ -1939,7 +1888,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
 ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -1958,10 +1907,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -1973,7 +1921,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; GFX10-NEXT: v_mov_b32_e32 v2, v9
 ; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -2001,9 +1949,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2014,7 +1961,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v1, v8
 ; GFX908-NEXT: v_mov_b32_e32 v2, v9
 ; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2032,9 +1979,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2045,7 +1991,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
 ; GFX8-NEXT: v_mov_b32_e32 v2, v9
 ; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2063,9 +2009,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: v_mov_b32_e32 v5, v1
 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2076,7 +2021,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX7-NEXT: v_mov_b32_e32 v1, v8
 ; GFX7-NEXT: v_mov_b32_e32 v2, v9
 ; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2133,9 +2078,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v2, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
 ; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -2146,7 +2089,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
@@ -2174,9 +2117,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v2, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -2187,7 +2128,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -2205,8 +2146,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v2, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
@@ -2218,7 +2158,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_mov_b32_e32 v8, v3
 ; GFX10-NEXT: v_mov_b32_e32 v7, v2
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -2246,9 +2186,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v2, s20
 ; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2257,7 +2196,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v9, v4
 ; GFX908-NEXT: v_mov_b32_e32 v8, v3
 ; GFX908-NEXT: v_mov_b32_e32 v7, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
@@ -2275,9 +2214,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v2, s20
 ; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2286,7 +2224,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v9, v4
 ; GFX8-NEXT: v_mov_b32_e32 v8, v3
 ; GFX8-NEXT: v_mov_b32_e32 v7, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
@@ -2304,9 +2242,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2315,7 +2252,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_
 ; GFX7-NEXT: v_mov_b32_e32 v9, v4
 ; GFX7-NEXT: v_mov_b32_e32 v8, v3
 ; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
@@ -2373,10 +2310,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
 ; GFX12-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9
 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10
 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7
 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8
@@ -2390,7 +2326,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT: ; implicit-def: $vgpr4
 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX12-NEXT: ; %bb.2:
@@ -2420,7 +2355,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_cbranch_execnz .LBB10_4
 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2474,22 +2409,21 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
 ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
 ; GFX11-NEXT: s_mov_b32 s1, 0
 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
 ; GFX11-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9
 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10
 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7
 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT: ; implicit-def: $vgpr4
 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX11-NEXT: ; %bb.2:
@@ -2518,7 +2452,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_cbranch_execnz .LBB10_4
 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2543,7 +2477,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX10-NEXT: v_mov_b32_e32 v7, v2
 ; GFX10-NEXT: v_mov_b32_e32 v10, v1
 ; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
 ; GFX10-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
@@ -2556,7 +2489,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
 ; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX10-NEXT: ; implicit-def: $vgpr4
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2584,7 +2516,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB10_4
@@ -2640,7 +2572,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX908-NEXT: v_mov_b32_e32 v7, v2
 ; GFX908-NEXT: v_mov_b32_e32 v10, v1
 ; GFX908-NEXT: v_mov_b32_e32 v9, v0
-; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
 ; GFX908-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -2653,7 +2584,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_nop 0
 ; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT: ; implicit-def: $vgpr4
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX908-NEXT: ; %bb.2:
@@ -2680,7 +2610,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB10_4
 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2704,7 +2634,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX8-NEXT: v_mov_b32_e32 v7, v2
 ; GFX8-NEXT: v_mov_b32_e32 v10, v1
 ; GFX8-NEXT: v_mov_b32_e32 v9, v0
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
 ; GFX8-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -2717,7 +2646,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_nop 0
 ; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT: ; implicit-def: $vgpr4
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX8-NEXT: ; %bb.2:
@@ -2744,7 +2672,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB10_4
 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2768,7 +2696,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX7-NEXT: v_mov_b32_e32 v7, v2
 ; GFX7-NEXT: v_mov_b32_e32 v10, v1
 ; GFX7-NEXT: v_mov_b32_e32 v9, v0
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4
 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
 ; GFX7-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_readfirstlane_b32 s8, v9
@@ -2780,7 +2707,6 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX7-NEXT: ; implicit-def: $vgpr4
 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB10_1
 ; GFX7-NEXT: ; %bb.2:
@@ -2807,7 +2733,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg
 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_execnz .LBB10_4
 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
@@ -2903,9 +2829,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
 ; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -2917,7 +2841,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2945,9 +2869,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -2959,7 +2881,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
 ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -2978,10 +2900,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2993,7 +2914,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; GFX10-NEXT: v_mov_b32_e32 v2, v9
 ; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -3012,9 +2933,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1
 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
+; GFX90A-NEXT: v_mov_b32_e32 v6, s20
 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -3022,7 +2942,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -3040,9 +2960,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -3053,7 +2972,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX908-NEXT: v_mov_b32_e32 v1, v8
 ; GFX908-NEXT: v_mov_b32_e32 v2, v9
 ; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3071,9 +2990,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -3084,7 +3002,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
 ; GFX8-NEXT: v_mov_b32_e32 v2, v9
 ; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3102,9 +3020,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: v_mov_b32_e32 v5, v1
 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3115,7 +3032,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot
 ; GFX7-NEXT: v_mov_b32_e32 v1, v8
 ; GFX7-NEXT: v_mov_b32_e32 v2, v9
 ; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3173,9 +3090,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
 ; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
@@ -3187,7 +3102,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -3215,9 +3130,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
@@ -3229,7 +3142,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
 ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -3248,10 +3161,9 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -3263,7 +3175,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; GFX10-NEXT: v_mov_b32_e32 v2, v9
 ; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -3291,9 +3203,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -3304,7 +3215,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v1, v8
 ; GFX908-NEXT: v_mov_b32_e32 v2, v9
 ; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3322,9 +3233,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -3335,7 +3245,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
 ; GFX8-NEXT: v_mov_b32_e32 v2, v9
 ; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -3353,9 +3263,8 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: v_mov_b32_e32 v5, v1
 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3366,7 +3275,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_
 ; GFX7-NEXT: v_mov_b32_e32 v1, v8
 ; GFX7-NEXT: v_mov_b32_e32 v2, v9
 ; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -7028,9 +6937,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4
+; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
@@ -7042,7 +6949,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -7060,8 +6967,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7072,7 +6978,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
 ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -7099,9 +7005,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no
 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:
s_waitcnt vmcnt(0) @@ -7109,7 +7014,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -7126,9 +7031,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7138,7 +7042,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -7156,7 +7060,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7164,7 +7067,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -7181,7 +7084,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -7277,9 +7180,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start @@ -7290,7 +7191,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; 
GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -7308,8 +7209,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start @@ -7319,7 +7219,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7355,9 +7255,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7366,7 +7265,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -7385,7 +7284,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7393,7 +7291,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -7410,7 +7308,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -7543,7 +7441,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7558,7 +7455,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024 -; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-NEXT: ; %bb.2: @@ -7587,7 +7483,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v4, s[4:7], 0 offen offset:1024 glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7609,7 +7505,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -7622,7 +7517,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 @@ -7648,7 +7542,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 @@ -7697,7 +7591,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7710,7 +7603,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: @@ -7735,7 +7627,7 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7755,7 +7647,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7768,7 +7659,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: @@ -7778,9 +7668,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_add_f16_e32 v6, v8, v5 -; GFX8-NEXT: v_or_b32_e32 v7, v6, v4 +; GFX8-NEXT: v_add_f16_sdwa v6, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_add_f16_e32 v7, v8, v5 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v6 ; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: s_mov_b64 s[12:13], exec ; GFX8-NEXT: v_mov_b32_e32 v7, v8 @@ -7795,7 +7685,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 @@ -7815,7 +7705,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7826,39 +7715,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 
exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v10 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v5 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v10 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7870,23 +7758,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -8003,9 +7891,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 @@ -8017,7 +7903,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -8035,8 +7921,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start @@ -8047,7 +7932,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8065,16 +7950,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8091,9 +7975,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8101,7 +7984,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8118,9 +8001,8 @@ define <2 x half> 
@buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8130,7 +8012,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8148,7 +8030,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8156,7 +8037,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8173,7 +8054,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -8269,9 +8150,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start @@ -8282,7 +8161,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -8300,8 +8179,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: 
buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start @@ -8311,7 +8189,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8329,15 +8207,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8354,16 +8230,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_pk_add_f16 v1, v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8380,9 +8255,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8391,7 +8265,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8410,7 +8284,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8418,7 +8291,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8435,7 +8308,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace( ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -8530,9 +8403,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, s16 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 @@ -8544,7 +8415,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -8562,8 +8433,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start @@ -8574,7 +8444,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8592,16 +8462,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: v_mov_b32_e32 v2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; 
GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v5, v0 ; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8618,9 +8487,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_mov_b32_e32 v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8628,7 +8496,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8645,9 +8513,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8657,7 +8524,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -8675,7 +8542,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8683,7 +8549,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -8700,7 +8566,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap 
v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -8796,9 +8662,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start @@ -8809,7 +8673,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -8827,8 +8691,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start @@ -8838,7 +8701,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX10-NEXT: v_mov_b32_e32 v5, v2 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v4, v1 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8856,15 +8719,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 ; GFX90A-NEXT: buffer_load_dword v3, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s6 ; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 @@ -8881,16 +8742,15 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 ; GFX908-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; 
GFX908-NEXT: v_pk_add_f16 v1, v2, v0 ; GFX908-NEXT: v_mov_b32_e32 v5, v2 ; GFX908-NEXT: v_mov_b32_e32 v4, v1 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8907,9 +8767,8 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v2, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8918,7 +8777,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v4, v1 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 @@ -8937,7 +8796,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8945,7 +8803,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 @@ -8962,7 +8820,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6 @@ -9054,13 +8912,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, s16 ; GFX942-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9082,7 +8939,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX942-NEXT: v_mov_b64_e32 
v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -9097,12 +8954,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB26_1: ; %atomicrmw.start @@ -9131,7 +8987,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9149,10 +9005,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -9183,7 +9038,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv @@ -9202,9 +9057,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v4, s20 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, 
s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -9230,7 +9084,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -9248,13 +9102,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 ; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9275,7 +9128,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -9292,13 +9145,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 ; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9320,7 +9172,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -9337,11 +9189,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -9366,7 +9217,7 
@@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -9382,7 +9233,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -9391,7 +9241,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -9406,7 +9256,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -9488,13 +9338,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, s16 ; GFX942-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9515,7 +9364,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -9531,11 +9380,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: 
v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB27_1: ; %atomicrmw.start @@ -9561,7 +9408,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -9580,11 +9427,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start @@ -9610,7 +9455,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv @@ -9629,12 +9474,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v4, s20 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -9656,7 +9500,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -9674,13 +9518,12 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 ; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9700,7 +9543,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -9717,13 +9560,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 ; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9744,7 +9586,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -9761,11 +9603,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -9789,7 +9630,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -9806,7 +9647,6 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -9815,7 +9655,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 @@ -9830,7 +9670,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -9930,7 +9770,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -9942,40 +9781,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX942-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX942-NEXT: s_movk_i32 s10, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX942-NEXT: s_mov_b32 s11, 0x7060302 ; GFX942-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX942-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX942-NEXT: v_add_f32_e32 v6, v6, v10 +; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX942-NEXT: v_bfe_u32 v6, v5, 
16, 1 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX942-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX942-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX942-NEXT: v_add3_u32 v8, v8, v7, s10 +; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX942-NEXT: v_perm_b32 v8, v7, v6, s11 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -9988,27 +9826,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB28_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB28_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -10022,8 +9859,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-TRUE16-NEXT: ; %bb.2: @@ -10036,28 +9872,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, v5, v8 :: v_dual_add_f32 v4, v4, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, v6, v8 :: v_dual_add_f32 v5, v5, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -10071,14 +9907,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB28_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -10088,13 +9924,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -10108,8 +9943,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -10122,28 +9956,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_add_f32 v6, v6, v9 :: v_dual_add_f32 v5, v5, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -10157,14 +9991,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB28_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -10174,13 +10008,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 @@ -10192,8 +10025,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1 @@ -10205,25 +10037,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_add_f32_e32 v5, v5, v8 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX10-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; 
GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 ; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -10235,15 +10067,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB28_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -10252,13 +10084,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB28_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -10270,38 +10101,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; 
GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v10 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_add_f32_e32 v7, v7, v5 +; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX90A-NEXT: v_perm_b32 v8, v7, v6, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -10313,27 +10143,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB28_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -10345,8 +10174,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_1 ; GFX908-NEXT: ; %bb.2: @@ -10360,24 +10188,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB28_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX908-NEXT: 
v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX908-NEXT: v_add_f32_e32 v5, v5, v8 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX908-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -10389,27 +10217,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB28_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB28_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -10421,8 +10248,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_1 ; GFX8-NEXT: ; %bb.2: @@ -10434,27 +10260,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; 
Child Loop BB28_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v8 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v9 +; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -10466,27 +10292,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB28_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB28_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -10497,36 +10322,35 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 
s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB28_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 ; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB28_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v9 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v8, v5, 16 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -10538,23 +10362,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB28_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB28_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -10658,13 +10482,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, s16 ; GFX942-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -10686,7 +10509,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -10701,12 +10524,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB29_1: ; %atomicrmw.start @@ -10735,7 +10557,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -10753,10 +10575,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: 
s_set_inst_prefetch_distance 0x1 @@ -10787,7 +10608,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv @@ -10806,9 +10627,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v4, s20 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -10834,7 +10654,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -10852,13 +10672,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 ; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -10879,7 +10698,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -10896,13 +10715,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 ; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -10924,7 +10742,7 @@ define <2 x 
bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -10941,11 +10759,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -10970,7 +10787,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -10986,7 +10803,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -10995,7 +10811,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -11010,7 +10826,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -11092,13 +10908,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, s16 ; GFX942-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -11119,7 
+10934,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -11135,11 +10950,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB30_1: ; %atomicrmw.start @@ -11165,7 +10978,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -11184,11 +10997,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start @@ -11214,7 +11025,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv @@ -11233,12 +11044,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v4, s20 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -11260,7 +11070,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -11278,13 +11088,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 ; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -11304,7 +11113,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -11321,13 +11130,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 ; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -11348,7 +11156,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -11365,11 +11173,10 @@ define void 
@buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v1, s20
 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
 ; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -11393,7 +11200,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
 ; GFX8-NEXT: v_mov_b32_e32 v6, v1
 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -11410,7 +11217,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -11419,7 +11225,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -11434,7 +11240,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
 ; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -11517,13 +11323,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX942-NEXT: v_mov_b32_e32 v1, v0
 ; GFX942-NEXT: v_mov_b32_e32 v0, s16
 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[6:7], 0
 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
 ; GFX942-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -11545,7 +11350,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
 ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9
 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -11560,12 +11365,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-TRUE16-NEXT: .p2align 6
 ; GFX11-TRUE16-NEXT: .LBB31_1: ; %atomicrmw.start
@@ -11594,7 +11398,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl1_inv
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -11612,10 +11416,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -11646,7 +11449,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl1_inv
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -11665,9 +11468,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
 ; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -11693,7 +11495,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
 ; GFX10-NEXT: v_mov_b32_e32 v0, v5
 ; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -11711,13 +11513,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
 ; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -11738,7 +11539,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
 ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -11755,13 +11556,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
 ; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -11783,7 +11583,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
 ; GFX908-NEXT: v_mov_b32_e32 v0, v5
 ; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -11800,11 +11600,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
 ; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -11829,7 +11628,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
 ; GFX8-NEXT: v_mov_b32_e32 v0, v5
 ; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -11845,7 +11644,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -11854,7 +11652,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
 ; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
@@ -11869,7 +11667,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu
 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
 ; GFX7-NEXT: v_mov_b32_e32 v6, v1
 ; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -11951,13 +11749,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: v_mov_b32_e32 v1, s16
 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[6:7], 0
 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
 ; GFX942-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -11978,7 +11775,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
 ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
 ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -11994,11 +11791,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-TRUE16-NEXT: .p2align 6
 ; GFX11-TRUE16-NEXT: .LBB32_1: ; %atomicrmw.start
@@ -12024,7 +11819,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl1_inv
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -12043,11 +11838,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-FAKE16-NEXT: .p2align 6
 ; GFX11-FAKE16-NEXT: .LBB32_1: ; %atomicrmw.start
@@ -12073,7 +11866,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl1_inv
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -12092,12 +11885,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
 ; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12119,7 +11911,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -12137,13 +11929,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v1, s20
 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
 ; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -12163,7 +11954,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
 ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -12180,13 +11971,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v1, s20
 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
 ; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -12207,7 +11997,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT: v_mov_b32_e32 v6, v1
 ; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12224,11 +12014,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v1, s20
 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
 ; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12252,7 +12041,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
 ; GFX8-NEXT: v_mov_b32_e32 v6, v1
 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12269,7 +12058,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -12278,7 +12066,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -12293,7 +12081,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_re
 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
 ; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -12375,13 +12163,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: v_mov_b32_e32 v1, s16
 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[6:7], 0
 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
 ; GFX942-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -12402,7 +12189,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
 ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
 ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -12418,11 +12205,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16: ; %bb.0:
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-TRUE16-NEXT: .p2align 6
 ; GFX11-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start
@@ -12448,7 +12233,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT: buffer_gl1_inv
 ; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -12467,11 +12252,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16: ; %bb.0:
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
 ; GFX11-FAKE16-NEXT: .p2align 6
 ; GFX11-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start
@@ -12497,7 +12280,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-FAKE16-NEXT: buffer_gl1_inv
 ; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -12516,12 +12299,11 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
 ; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -12543,7 +12325,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -12561,13 +12343,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v1, s20
 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
 ; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -12587,7 +12368,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
 ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -12604,13 +12385,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v1, s20
 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
 ; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -12631,7 +12411,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
 ; GFX908-NEXT: v_mov_b32_e32 v6, v1
 ; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12648,11 +12428,10 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v1, s20
 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
 ; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12676,7 +12455,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
 ; GFX8-NEXT: v_mov_b32_e32 v6, v1
 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -12693,7 +12472,6 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -12702,7 +12480,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -12717,7 +12495,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fi
 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
 ; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -12825,8 +12603,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
@@ -12837,7 +12614,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -12855,9 +12632,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX90A-NEXT: v_mov_b32_e32 v2, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -12865,7 +12641,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
 ; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_invl2
 ; GFX90A-NEXT: buffer_wbinvl1
@@ -12883,9 +12659,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -12893,7 +12668,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -12910,9 +12685,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -12920,7 +12694,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -12937,9 +12711,8 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -12947,7 +12720,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_
 ; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index f7a1fb35c8106..316ba8527b595 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -37,10 +37,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX942-NEXT: v_mov_b32_e32 v1, v0
 ; GFX942-NEXT: v_mov_b32_e32 v0, s16
 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[4:5], 0
 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -49,7 +48,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -88,10 +87,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -99,7 +97,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -116,10 +114,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -128,7 +125,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -145,10 +142,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -157,7 +153,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -212,10 +208,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: v_mov_b32_e32 v1, s16
 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[4:5], 0
 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -223,7 +218,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX942-NEXT: v_max_f32_e32 v0, v0, v2
 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
 ; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -262,17 +257,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v1, s20
 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -289,10 +283,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v1, s20
 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -300,7 +293,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
 ; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -317,10 +310,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v1, s20
 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -328,7 +320,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
 ; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -402,7 +394,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
 ; GFX942-NEXT: s_mov_b64 s[2:3], exec
 ; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -414,22 +405,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX942-NEXT: ; %bb.2:
 ; GFX942-NEXT: s_mov_b64 exec, s[2:3]
 ; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_max_f32_e32 v9, v5, v5
+; GFX942-NEXT: v_max_f32_e32 v5, v5, v5
 ; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Loop Header: Depth=1
 ; GFX942-NEXT: ; Child Loop BB2_4 Depth 2
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_max_f32_e32 v4, v7, v7
-; GFX942-NEXT: v_max_f32_e32 v6, v4, v9
+; GFX942-NEXT: v_max_f32_e32 v6, v9, v9
+; GFX942-NEXT: v_max_f32_e32 v8, v6, v5
 ; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT: buffer_wbl2 sc1
 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
@@ -443,21 +433,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB2_4
 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
 ; GFX942-NEXT: s_mov_b64 exec, s[8:9]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT: s_cbranch_execnz .LBB2_3
 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -522,7 +512,6 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
 ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -534,22 +523,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX90A-NEXT: ; %bb.2:
 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5
+; GFX90A-NEXT: v_max_f32_e32 v5, v5, v5
 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
-; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9
+; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9
+; GFX90A-NEXT: v_max_f32_e32 v8, v6, v5
 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
 ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -561,27 +549,26 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
 ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -593,8 +580,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX908-NEXT: ; %bb.2:
@@ -605,11 +591,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
-; GFX908-NEXT: v_max_f32_e32 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_max_f32_e32 v5, v7, v7
+; GFX908-NEXT: v_max_f32_e32 v6, v5, v8
 ; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -621,27 +607,26 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB2_4
 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
 ; GFX908-NEXT: s_mov_b64 exec, s[12:13]
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX908-NEXT: s_cbranch_execnz .LBB2_3
 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
 ; GFX908-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX8: ; %bb.0:
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
 ; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -653,8 +638,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB2_1
 ; GFX8-NEXT: ; %bb.2:
@@ -665,11 +649,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
 ; GFX8-NEXT: ; Child Loop BB2_4 Depth 2
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GFX8-NEXT: v_max_f32_e32 v5, v4, v8
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX8-NEXT: v_max_f32_e32 v6, v5, v8
 ; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -681,21 +665,21 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp
 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB2_4
 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
 ; GFX8-NEXT: s_mov_b64 exec, s[12:13]
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX8-NEXT: s_cbranch_execnz .LBB2_3
 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
 ; GFX8-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory:
@@ -777,10 +761,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX942-NEXT: v_mov_b32_e32 v1, v0
 ; GFX942-NEXT: v_mov_b32_e32 v0, s16
 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[4:5], 0
 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -789,7 +772,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -804,11 +787,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_max_f32 v2, v1, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -819,7 +801,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -837,11 +819,10 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -851,7 +832,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -869,10 +850,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -880,7 +860,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -897,10 +877,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -909,7 +888,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -926,10 +905,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -938,7 +916,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -955,10 +933,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_mov_b32_e32 v1, v0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -967,7 +944,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1035,10 +1012,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX942-NEXT: v_mov_b32_e32 v1, v0
 ; GFX942-NEXT: v_mov_b32_e32 v0, s16
 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[4:5], 0
 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -1047,7 +1023,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1086,10 +1062,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1097,7 +1072,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1114,10 +1089,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g
 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT:
v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1126,7 +1100,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1143,10 +1117,9 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1155,7 +1128,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -1203,12 +1176,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1220,7 +1192,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] @@ -1248,12 +1220,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], 
v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1265,7 +1236,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1306,9 +1277,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1320,7 +1290,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -1339,9 +1309,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1353,7 +1322,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -1397,11 +1366,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v2, s16 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 -; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1411,7 +1378,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5] ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3] @@ -1440,11 +1407,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, s16 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 -; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1454,7 +1419,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] ; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -1494,9 +1459,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v2, s20 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -1506,7 +1470,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v9, v2 ; GFX908-NEXT: v_mov_b32_e32 v8, v1 ; GFX908-NEXT: v_mov_b32_e32 v7, v0 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] @@ -1525,9 +1489,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] -; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1537,7 +1500,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v9, v2 ; GFX8-NEXT: v_mov_b32_e32 v8, v1 ; GFX8-NEXT: v_mov_b32_e32 
v7, v0 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3] @@ -1583,10 +1546,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 ; GFX12-NEXT: s_mov_b32 s1, exec_lo ; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_readfirstlane_b32 s4, v9 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 @@ -1600,12 +1562,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048 -; GFX12-NEXT: ; implicit-def: $vgpr4 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1 ; GFX12-NEXT: ; %bb.2: ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6] +; GFX12-NEXT: v_max_num_f64_e32 v[5:6], v[5:6], v[5:6] ; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Loop Header: Depth=1 @@ -1615,7 +1576,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_mov_b32 s2, exec_lo ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5] +; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[5:6] ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1632,7 +1593,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1686,27 +1647,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 -; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4 ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048 -; GFX11-NEXT: ; implicit-def: $vgpr4 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; GFX11-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX11-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Loop Header: Depth=1 @@ -1716,7 +1676,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_mov_b32 s2, exec_lo ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1 @@ -1732,7 +1692,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], 0 offen offset:2048 glc ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_execnz .LBB7_4 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1816,7 +1776,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: v_mov_b32_e32 v7, v2 ; GFX908-NEXT: v_mov_b32_e32 v10, v1 ; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9 @@ -1829,12 +1788,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 ; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX908-NEXT: ; implicit-def: $vgpr4 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_1 ; GFX908-NEXT: ; %bb.2: ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX908-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Loop Header: Depth=1 @@ -1842,7 +1800,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX908-NEXT: s_mov_b64 s[12:13], exec -; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX908-NEXT: v_mov_b32_e32 v0, v11 ; GFX908-NEXT: v_mov_b32_e32 v1, v12 ; GFX908-NEXT: v_mov_b32_e32 v2, v13 @@ -1858,7 +1816,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; 
GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB7_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -1882,7 +1840,6 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: v_mov_b32_e32 v7, v2 ; GFX8-NEXT: v_mov_b32_e32 v10, v1 ; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9 @@ -1895,12 +1852,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048 -; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6] +; GFX8-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Loop Header: Depth=1 @@ -1908,7 +1864,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14] ; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[5:6] ; GFX8-NEXT: v_mov_b32_e32 v0, v11 ; GFX8-NEXT: v_mov_b32_e32 v1, v12 ; GFX8-NEXT: v_mov_b32_e32 v2, v13 @@ -1924,7 +1880,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB7_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1 @@ -2010,12 +1966,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2027,7 +1982,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 
th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] @@ -2055,12 +2010,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2072,7 +2026,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2091,11 +2045,10 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: s_add_i32 s4, s20, 0x800 -; GFX10-NEXT: v_mov_b32_e32 v6, s4 +; GFX10-NEXT: v_mov_b32_e32 v6, s20 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2108,7 +2061,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v1, v8 ; GFX10-NEXT: v_mov_b32_e32 v2, v9 ; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -2127,10 +2080,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x800 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: v_mov_b32_e32 v6, s20 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -2139,7 +2091,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 
v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] @@ -2158,9 +2110,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2172,7 +2123,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2191,9 +2142,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2205,7 +2155,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2223,10 +2173,9 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: v_mov_b32_e32 v0, s20 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX7-NEXT: s_add_i32 s6, s20, 0x800 ; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v6, s20 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2238,7 +2187,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-NEXT: v_mov_b32_e32 v2, v9 ; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2298,12 +2247,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: 
v_mov_b32_e32 v6, s4 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 -; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -2315,7 +2263,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] @@ -2343,12 +2291,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x800 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v6, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2360,7 +2307,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -2401,9 +2348,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v3, v1 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX908-NEXT: s_add_i32 s6, s20, 0x800 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s6 +; GFX908-NEXT: v_mov_b32_e32 v6, s20 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -2415,7 +2361,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v8 ; GFX908-NEXT: v_mov_b32_e32 v2, v9 ; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -2434,9 +2380,8 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: buffer_load_dwordx2 
v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX8-NEXT: s_add_i32 s6, s20, 0x800 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v6, s20 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2448,7 +2393,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: v_mov_b32_e32 v2, v9 ; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] @@ -6146,13 +6091,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, s16 +; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -6163,7 +6106,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 @@ -6182,10 +6125,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6195,7 +6137,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX942-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -6210,12 +6152,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v3, 
s4 +; GFX11-NEXT: v_mov_b32_e32 v3, s16 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -6226,7 +6167,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -6244,11 +6185,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -6258,7 +6198,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v1, v5 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6276,10 +6216,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -6287,7 +6226,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -6304,10 +6243,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6316,7 +6254,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX908-NEXT: v_pk_max_f16 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -6333,11 +6271,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6349,7 +6286,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -6367,7 +6304,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6375,7 +6311,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -6392,7 +6328,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__amdgpu_no ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v8, v6 ; GFX7-NEXT: v_mov_b32_e32 v7, v5 -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7 @@ -6467,10 +6403,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, s16 -; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v3, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start @@ -6481,7 +6415,7 @@ define void 
@buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 @@ -6500,10 +6434,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -6512,7 +6445,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX942-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6528,9 +6461,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v3, s4 +; GFX11-NEXT: v_mov_b32_e32 v3, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start @@ -6541,7 +6473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv @@ -6559,9 +6491,8 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start @@ -6572,7 +6503,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -6590,17 +6521,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6617,10 +6547,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -6628,7 +6557,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX908-NEXT: v_pk_max_f16 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -6645,11 +6574,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -6660,7 +6588,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -6679,7 +6607,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0 -; GFX7-NEXT: s_add_i32 s6, s20, 
0x400
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6687,7 +6614,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -6704,7 +6631,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset__amdgpu_no_fin
; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
@@ -6778,7 +6705,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-NEXT: s_mov_b32 s1, exec_lo
; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6793,8 +6719,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2:
@@ -6805,13 +6730,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: ; =>This Loop Header: Depth=1
; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v5, v7, v7
; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_pk_max_num_f16 v6, v5, v8
; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v7
; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6826,14 +6751,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_4
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX12-NEXT: s_mov_b32 exec_lo, s2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-NEXT: v_mov_b32_e32 v7, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6841,14 +6766,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX12-NEXT: s_cbranch_execnz .LBB18_3
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6860,23 +6784,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v5, v5, v5
; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB18_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v4, v7, v7
+; GFX942-NEXT: v_pk_max_f16 v6, v9, v9
; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_pk_max_f16 v6, v4, v9
+; GFX942-NEXT: v_pk_max_f16 v8, v6, v5
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6889,27 +6812,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB18_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB18_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6923,8 +6845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2:
@@ -6935,13 +6856,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: ; =>This Loop Header: Depth=1
; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v5, v7, v7
; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_pk_max_f16 v6, v5, v8
; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v7
; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6955,14 +6876,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_4
; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT: v_mov_b32_e32 v7, v5
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6971,13 +6892,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX11-NEXT: s_cbranch_execnz .LBB18_3
; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6989,8 +6909,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
@@ -7001,12 +6920,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: ; =>This Loop Header: Depth=1
; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v5, v7, v7
; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_pk_max_f16 v6, v5, v8
; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -7018,15 +6937,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_4
; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX10-NEXT: s_mov_b32 exec_lo, s6
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -7035,13 +6954,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX10-NEXT: s_cbranch_execnz .LBB18_3
; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7053,22 +6971,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX90A-NEXT: v_pk_max_f16 v5, v5, v5
; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
-; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9
+; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9
+; GFX90A-NEXT: v_pk_max_f16 v8, v6, v5
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7080,27 +6997,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
; GFX908-NEXT: s_mov_b64 s[6:7], exec
; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7112,8 +7028,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2:
@@ -7124,11 +7039,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
-; GFX908-NEXT: v_pk_max_f16 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_pk_max_f16 v5, v7, v7
+; GFX908-NEXT: v_pk_max_f16 v6, v5, v8
; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7140,27 +7055,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_4
; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_cbranch_execnz .LBB18_3
; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
; GFX8-NEXT: s_mov_b64 s[6:7], exec
; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7172,8 +7086,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2:
@@ -7185,14 +7098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: ; =>This Loop Header: Depth=1
; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
-; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_max_f16_e32 v5, v5, v9
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_max_f16_sdwa v5, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_max_f16_e32 v6, v7, v7
+; GFX8-NEXT: v_max_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_max_f16_e32 v6, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v5
; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
; GFX8-NEXT: v_readfirstlane_b32 s8, v0
@@ -7204,27 +7117,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_4
; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX8-NEXT: s_mov_b64 exec, s[12:13]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_cbranch_execnz .LBB18_3
; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7235,39 +7147,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9
; GFX7-NEXT: s_mov_b64 s[6:7], 0
; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v10
-; GFX7-NEXT: v_max_f32_e32 v7, v7, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v5
+; GFX7-NEXT: v_max_f32_e32 v7, v7, v10
+; GFX7-NEXT: v_max_f32_e32 v8, v8, v11
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v5, v8, v5
+; GFX7-NEXT: v_mov_b32_e32 v9, v6
+; GFX7-NEXT: v_mov_b32_e32 v8, v5
; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -7279,23 +7190,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB18_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -7396,13 +7307,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -7431,7 +7340,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
@@ -7452,11 +7361,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7487,7 +7394,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
@@ -7506,13 +7413,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_mov_b32_e32 v1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, s16
; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -7534,7 +7440,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -7549,12 +7455,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start
@@ -7583,7 +7488,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -7601,10 +7506,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
@@ -7635,7 +7539,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -7654,9 +7558,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
@@ -7682,7 +7585,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v0, v5
; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -7700,13 +7603,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, s20
; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -7727,7 +7629,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
@@ -7744,13 +7646,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s20
; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -7772,7 +7673,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v0, v5
; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -7789,11 +7690,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -7818,7 +7718,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v0, v5
; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -7834,7 +7734,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -7843,7 +7742,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
@@ -7858,7 +7757,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__amdgpu
; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v1
; GFX7-NEXT: v_mov_b32_e32 v5, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -7928,11 +7827,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
-; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
@@ -7958,7 +7855,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
@@ -7980,11 +7877,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
-; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
+; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
@@ -8010,7 +7905,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
@@ -8029,13 +7924,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s16
; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s4, s16, 0x400
; GFX942-NEXT: s_mov_b64 s[6:7], 0
; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX942-NEXT: s_movk_i32 s8, 0x7fff
; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX942-NEXT: s_mov_b32 s9, 0x7060302
-; GFX942-NEXT: v_mov_b32_e32 v4, s4
+; GFX942-NEXT: v_mov_b32_e32 v4, s16
; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -8056,7 +7950,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -8072,11 +7966,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0
; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0
+; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -8102,7 +7994,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
@@ -8121,11 +8013,9 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0
-; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0
; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0
+; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start
@@ -8151,7 +8041,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
@@ -8170,12 +8060,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX10-NEXT: v_mov_b32_e32 v4, s4
-; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
+; GFX10-NEXT: v_mov_b32_e32 v4, s20
; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8197,7 +8086,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -8215,13 +8104,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s20
; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s4, s20, 0x400
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -8241,7 +8129,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
@@ -8258,13 +8146,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s20
; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s4, s20, 0x400
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s4
+; GFX908-NEXT: v_mov_b32_e32 v4, s20
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -8285,7 +8172,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9
; GFX908-NEXT: v_mov_b32_e32 v6, v1
; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -8302,11 +8189,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s20
; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s4, s20, 0x400
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -8330,7 +8216,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
; GFX8-NEXT: v_mov_b32_e32 v6, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -8347,7 +8233,6 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
; GFX7-NEXT: s_mov_b64 s[4:5], 0
@@ -8356,7 +8241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -8371,7 +8256,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi
; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
@@ -8440,7 +8325,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8455,8 +8339,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-TRUE16-NEXT: ; %bb.2:
@@ -8468,30 +8351,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v5, v5, v8 :: v_dual_max_num_f32 v4, v4, v9
-; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_dual_max_num_f32 v6, v6, v8 :: v_dual_max_num_f32 v5, v5, v9
+; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
-; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8506,14 +8389,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
@@ -8521,7 +8404,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8532,7 +8415,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo
; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8547,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX12-FAKE16-NEXT: ; %bb.2:
@@ -8560,30 +8441,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
-; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_dual_max_num_f32 v6, v6, v9 :: v_dual_max_num_f32 v5, v5, v8
+; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
-; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7
; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8598,14 +8479,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v5
; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV
; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe
@@ -8613,14 +8494,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3
; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v5
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
; GFX942-NEXT: s_mov_b64 s[2:3], exec
; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8632,40 +8512,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_1
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: s_mov_b64 exec, s[2:3]
; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX942-NEXT: v_lshlrev_b32_e32 v10, 16, v5
; GFX942-NEXT: s_movk_i32 s10, 0x7fff
-; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX942-NEXT: s_mov_b32 s11, 0x7060302
; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB21_4 Depth 2
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX942-NEXT: v_max_f32_e32 v4, v4, v9
-; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10
-; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX942-NEXT: v_max_f32_e32 v6, v6, v10
+; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX942-NEXT: v_add3_u32 v7, v7, v6, s10
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX942-NEXT: s_mov_b64 s[8:9], exec
; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX942-NEXT: v_max_f32_e32 v5, v5, v10
-; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10
-; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
+; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX942-NEXT: v_max_f32_e32 v7, v7, v5
+; GFX942-NEXT: v_bfe_u32 v8, v7, 16, 1
+; GFX942-NEXT: v_add3_u32 v8, v8, v7, s10
+; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v7
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc
+; GFX942-NEXT: v_perm_b32 v8, v7, v6, s11
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -8678,27 +8557,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_cbranch_execnz .LBB21_4
; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX942-NEXT: s_mov_b64 exec, s[8:9]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
; GFX942-NEXT: buffer_inv sc1
; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX942-NEXT: s_cbranch_execnz .LBB21_3
; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8712,8 +8590,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-TRUE16-NEXT: ; %bb.2:
@@ -8726,28 +8603,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_max_f32 v5, v5, v8 :: v_dual_max_f32 v4, v4, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_dual_max_f32 v6, v6, v8 :: v_dual_max_f32 v5, v5, v9
+; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8761,14 +8638,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-TRUE16-NEXT: buffer_gl1_inv
; GFX11-TRUE16-NEXT: buffer_gl0_inv
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -8778,13 +8655,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
@@ -8798,8 +8674,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
+; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1
; GFX11-FAKE16-NEXT: ; %bb.2:
@@ -8812,28 +8687,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1
; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo
; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8
-; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_dual_max_f32 v6, v6, v9 :: v_dual_max_f32 v5, v5, v8
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7
; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -8847,14 +8722,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4
; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v5
; GFX11-FAKE16-NEXT: buffer_gl1_inv
; GFX11-FAKE16-NEXT: buffer_gl0_inv
; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -8864,13 +8739,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf
; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX11-FAKE16-NEXT:
s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8882,8 +8756,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 @@ -8895,25 +8768,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v5, v5, v8 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX10-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -8925,15 +8798,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 
exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -8942,13 +8815,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -8960,38 +8832,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v10 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_max_f32_e32 v7, v7, v5 +; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX90A-NEXT: v_perm_b32 v8, v7, v6, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: 
.LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9003,27 +8874,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9035,8 +8905,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: @@ -9050,24 +8919,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX908-NEXT: v_max_f32_e32 v5, v5, v8 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX908-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX908-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v6, 
s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9079,27 +8948,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9111,8 +8979,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: @@ -9124,27 +8991,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_max_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_max_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v8 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, 
v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v9 +; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9156,27 +9023,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -9187,8 +9053,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: @@ -9196,27 +9061,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 ; 
GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX7-NEXT: v_max_f32_e32 v4, v4, v9 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_max_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -9228,23 +9093,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9353,10 +9218,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9365,7 +9229,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) 
; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -9404,10 +9268,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9416,7 +9279,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 @@ -9434,10 +9297,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9446,7 +9308,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -9463,10 +9325,9 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -9475,7 +9336,7 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 8ac6353133e72..ed67e0227278b 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -37,10 +37,9 @@ define float 
@buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -49,7 +48,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -88,10 +87,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -99,7 +97,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -116,10 +114,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -128,7 +125,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -145,10 +142,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: 
.LBB0_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -157,7 +153,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -212,10 +208,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -223,7 +218,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -262,17 +257,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX90A-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -289,10 +283,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -300,7 +293,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: v_mov_b32_e32 v4, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 
offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -317,10 +310,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -328,7 +320,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 @@ -402,7 +394,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -414,22 +405,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_max_f32_e32 v9, v5, v5 +; GFX942-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX942-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX942-NEXT: v_min_f32_e32 v6, v4, v9 +; GFX942-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX942-NEXT: v_min_f32_e32 v8, v6, v5 ; GFX942-NEXT: s_mov_b64 s[8:9], exec -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: buffer_wbl2 sc1 ; GFX942-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 @@ -443,21 +433,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB2_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; 
GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB2_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -522,7 +512,6 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -534,22 +523,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7 -; GFX90A-NEXT: v_min_f32_e32 v6, v4, v9 +; GFX90A-NEXT: v_max_f32_e32 v6, v9, v9 +; GFX90A-NEXT: v_min_f32_e32 v8, v6, v5 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -561,27 +549,26 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -593,8 +580,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_1 ; GFX908-NEXT: ; %bb.2: @@ -605,11 +591,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_max_f32_e32 v4, v6, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v4, v8 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_max_f32_e32 v5, v7, v7 +; GFX908-NEXT: v_min_f32_e32 v6, v5, v8 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -621,27 +607,26 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB2_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB2_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -653,8 +638,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_1 ; GFX8-NEXT: ; %bb.2: @@ 
-665,11 +649,11 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB2_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v4, v8 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GFX8-NEXT: v_min_f32_e32 v6, v5, v8 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -681,21 +665,21 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB2_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB2_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -777,10 +761,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB3_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -789,7 +772,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -804,11 +787,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 -; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_max_f32 v2, v1, v1 +; GFX11-NEXT: 
buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -819,7 +801,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX11-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -837,11 +819,10 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -851,7 +832,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX10-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -869,10 +850,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -880,7 +860,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -897,10 +877,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -909,7 +888,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -926,10 +905,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -938,7 +916,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -955,10 +933,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_mov_b32_e32 v1, v0
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, s6
+; GFX7-NEXT: v_mov_b32_e32 v3, s20
 ; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -967,7 +944,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_remote
 ; GFX7-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1035,10 +1012,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX942-NEXT: v_mov_b32_e32 v1, v0
 ; GFX942-NEXT: v_mov_b32_e32 v0, s16
 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[4:5], 0
 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
 ; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -1047,7 +1023,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
 ; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1086,10 +1062,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1097,7 +1072,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
 ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1114,10 +1089,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1126,7 +1100,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1143,10 +1117,9 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s20
 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1155,7 +1128,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__amdgpu_no_fine_g
 ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2
 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -1203,12 +1176,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1220,7 +1192,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -1248,12 +1220,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1265,7 +1236,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -1306,9 +1277,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v3, v1
 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1320,7 +1290,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v1, v8
 ; GFX908-NEXT: v_mov_b32_e32 v2, v9
 ; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -1339,9 +1309,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1353,7 +1322,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
 ; GFX8-NEXT: v_mov_b32_e32 v2, v9
 ; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -1397,11 +1366,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v2, s16
 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
-; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
 ; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1411,7 +1378,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
 ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
@@ -1440,11 +1407,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v2, s16
 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
-; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1454,7 +1419,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
 ; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -1494,9 +1459,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v2, s20
 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
 ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1506,7 +1470,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v9, v2
 ; GFX908-NEXT: v_mov_b32_e32 v8, v1
 ; GFX908-NEXT: v_mov_b32_e32 v7, v0
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
@@ -1525,9 +1489,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v2, s20
 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[16:19], 0 offen offset:2048
 ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1537,7 +1500,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v9, v2
 ; GFX8-NEXT: v_mov_b32_e32 v8, v1
 ; GFX8-NEXT: v_mov_b32_e32 v7, v0
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
@@ -1583,10 +1546,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
 ; GFX12-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9
 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10
 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7
@@ -1600,12 +1562,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT: ; implicit-def: $vgpr4
 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX12-NEXT: ; %bb.2:
 ; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
+; GFX12-NEXT: v_max_num_f64_e32 v[5:6], v[5:6], v[5:6]
 ; GFX12-NEXT: s_mov_b32 s1, 0
 ; GFX12-NEXT: .LBB7_3: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Loop Header: Depth=1
@@ -1615,7 +1576,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX12-NEXT: s_mov_b32 s2, exec_lo
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[4:5]
+; GFX12-NEXT: v_min_num_f64_e32 v[11:12], v[0:1], v[5:6]
 ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
 ; GFX12-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1632,7 +1593,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_cbranch_execnz .LBB7_4
 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1686,27 +1647,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
 ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
 ; GFX11-NEXT: s_mov_b32 s1, 0
 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
 ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9
 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10
 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7
 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT: ; implicit-def: $vgpr4
 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX11-NEXT: ; %bb.2:
 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX11-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX11-NEXT: .p2align 6
 ; GFX11-NEXT: .LBB7_3: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Loop Header: Depth=1
@@ -1716,7 +1676,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX11-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6]
 ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
 ; GFX11-NEXT: .LBB7_4: ; Parent Loop BB7_3 Depth=1
@@ -1732,7 +1692,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v4, s[4:7], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_cbranch_execnz .LBB7_4
 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1816,7 +1776,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX908-NEXT: v_mov_b32_e32 v7, v2
 ; GFX908-NEXT: v_mov_b32_e32 v10, v1
 ; GFX908-NEXT: v_mov_b32_e32 v9, v0
-; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
 ; GFX908-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9
@@ -1829,12 +1788,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_nop 0
 ; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT: ; implicit-def: $vgpr4
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX908-NEXT: ; %bb.2:
 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX908-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
 ; GFX908-NEXT: .LBB7_3: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -1842,7 +1800,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
 ; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX908-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6]
 ; GFX908-NEXT: v_mov_b32_e32 v0, v11
 ; GFX908-NEXT: v_mov_b32_e32 v1, v12
 ; GFX908-NEXT: v_mov_b32_e32 v2, v13
@@ -1858,7 +1816,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB7_4
 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -1882,7 +1840,6 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX8-NEXT: v_mov_b32_e32 v7, v2
 ; GFX8-NEXT: v_mov_b32_e32 v10, v1
 ; GFX8-NEXT: v_mov_b32_e32 v9, v0
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
 ; GFX8-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9
@@ -1895,12 +1852,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_nop 0
 ; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT: ; implicit-def: $vgpr4
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB7_1
 ; GFX8-NEXT: ; %bb.2:
 ; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
+; GFX8-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
 ; GFX8-NEXT: .LBB7_3: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
@@ -1908,7 +1864,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
 ; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[4:5]
+; GFX8-NEXT: v_min_f64 v[11:12], v[0:1], v[5:6]
 ; GFX8-NEXT: v_mov_b32_e32 v0, v11
 ; GFX8-NEXT: v_mov_b32_e32 v1, v12
 ; GFX8-NEXT: v_mov_b32_e32 v2, v13
@@ -1924,7 +1880,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg
 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v4, s[8:11], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_execnz .LBB7_4
 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB7_3 Depth=1
@@ -2010,12 +1966,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2027,7 +1982,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2055,12 +2010,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2072,7 +2026,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -2091,11 +2045,10 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: s_add_i32 s4, s20, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s4
+; GFX10-NEXT: v_mov_b32_e32 v6, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -2108,7 +2061,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX10-NEXT: v_mov_b32_e32 v1, v8
 ; GFX10-NEXT: v_mov_b32_e32 v2, v9
 ; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -2127,10 +2080,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: v_mov_b32_e32 v3, v1
 ; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX90A-NEXT: v_mov_b32_e32 v6, s6
+; GFX90A-NEXT: v_mov_b32_e32 v6, s20
 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -2139,7 +2091,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5]
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1]
 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
@@ -2158,9 +2110,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX908-NEXT: v_mov_b32_e32 v3, v1
 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2172,7 +2123,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX908-NEXT: v_mov_b32_e32 v1, v8
 ; GFX908-NEXT: v_mov_b32_e32 v2, v9
 ; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2191,9 +2142,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2205,7 +2155,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
 ; GFX8-NEXT: v_mov_b32_e32 v2, v9
 ; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2223,10 +2173,9 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX7-NEXT: v_mov_b32_e32 v0, s20
 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s6
+; GFX7-NEXT: v_mov_b32_e32 v6, s20
 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2238,7 +2187,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot
 ; GFX7-NEXT: v_mov_b32_e32 v1, v8
 ; GFX7-NEXT: v_mov_b32_e32 v2, v9
 ; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2298,12 +2247,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x800
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_mov_b32_e32 v6, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2315,7 +2263,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
@@ -2343,12 +2291,11 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
 ; GFX11-NEXT: v_mov_b32_e32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2360,7 +2307,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -2401,9 +2348,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v3, v1
 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX908-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
+; GFX908-NEXT: v_mov_b32_e32 v6, s20
 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -2415,7 +2361,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX908-NEXT: v_mov_b32_e32 v1, v8
 ; GFX908-NEXT: v_mov_b32_e32 v2, v9
 ; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -2434,9 +2380,8 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048
 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
-; GFX8-NEXT: s_add_i32 s6, s20, 0x800
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mov_b32_e32 v6, s20
 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2448,7 +2393,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_
 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
 ; GFX8-NEXT: v_mov_b32_e32 v2, v9
 ; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
@@ -6146,13 +6091,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v3, s16
+; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -6163,7 +6106,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX12-NEXT: v_pk_min_num_f16 v4, v0, v2
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
@@ -6182,10 +6125,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX942-NEXT: v_mov_b32_e32 v1, v0
 ; GFX942-NEXT: v_mov_b32_e32 v0, s16
 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[4:5], 0
 ; GFX942-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
 ; GFX942-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -6195,7 +6137,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX942-NEXT: v_pk_min_f16 v4, v0, v2
 ; GFX942-NEXT: s_nop 0
 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
-; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6210,12 +6152,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -6226,7 +6167,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX11-NEXT: v_pk_min_f16 v4, v0, v2
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -6244,11 +6185,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
 ; GFX10-NEXT: v_mov_b32_e32 v0, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
+; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6258,7 +6198,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX10-NEXT: v_pk_min_f16 v4, v0, v2
 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -6276,10 +6216,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20
 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -6287,7 +6226,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
 ; GFX90A-NEXT: v_pk_min_f16 v4, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6304,10 +6243,9 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
 ; GFX908-NEXT: v_mov_b32_e32 v0, s20
 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6316,7 +6254,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX908-NEXT: v_pk_min_f16 v4, v0, v2
 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
@@ -6333,11 +6271,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, s20
 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -6349,7 +6286,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0
 ; GFX8-NEXT: v_mov_b32_e32 v0, v5
 ; GFX8-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
@@ -6367,7 +6304,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX7-NEXT: buffer_load_dword v3, v2, s[16:19], 0 offen offset:1024
 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6375,7 +6311,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, s20
 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -6392,7 +6328,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__amdgpu_no
 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
 ; GFX7-NEXT: v_mov_b32_e32 v8, v6
 ; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
@@ -6467,10 +6403,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: v_mov_b32_e32 v1, s16
-; GFX12-NEXT: s_add_co_i32 s4, s16, 0x400
 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: v_mov_b32_e32 v3, s16
 ; GFX12-NEXT: s_mov_b32 s4, 0
 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6481,7 +6415,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_pk_min_num_f16 v0, v0, v2
 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
@@ -6500,10 +6434,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT: v_mov_b32_e32 v1, s16
 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
-; GFX942-NEXT: s_add_i32 s6, s16, 0x400
 ; GFX942-NEXT: s_mov_b64 s[4:5], 0
 ; GFX942-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX942-NEXT: v_mov_b32_e32 v3, s6
+; GFX942-NEXT: v_mov_b32_e32 v3, s16
 ; GFX942-NEXT: .LBB17_1: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -6512,7 +6445,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX942-NEXT: v_pk_min_f16 v0, v0, v2
 ; GFX942-NEXT: s_nop 0
 ; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6528,9 +6461,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT: v_mov_b32_e32 v1, s16
-; GFX11-NEXT: s_add_i32 s4, s16, 0x400
 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v3, s16
 ; GFX11-NEXT: s_mov_b32 s4, 0
 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6541,7 +6473,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_pk_min_f16 v0, v0, v2
 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
@@ -6559,9 +6491,8 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: v_mov_b32_e32 v1, s20
-; GFX10-NEXT: s_add_i32 s4, s20, 0x400
 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, s4
+; GFX10-NEXT: v_mov_b32_e32 v3, s20
 ; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
@@ -6572,7 +6503,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX10-NEXT: v_pk_min_f16 v0, v0, v2
 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
@@ -6590,17 +6521,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v1, s20
 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX90A-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s20
 ; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
 ; GFX90A-NEXT: v_pk_min_f16 v0, v0, v2
 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6617,10 +6547,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT: v_mov_b32_e32 v1, s20
 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
 ; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -6628,7 +6557,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX908-NEXT: v_pk_min_f16 v0, v0, v2
 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
 ; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen glc
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[16:19], 0 offen offset:1024 glc
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
 ; GFX908-NEXT: buffer_wbinvl1
 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
@@ -6645,11 +6574,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT: v_mov_b32_e32 v1, s20
 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s20
 ; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -6660,7 +6588,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
 ; GFX8-NEXT: v_mov_b32_e32 v6, v1
 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc
 ; GFX8-NEXT: s_waitcnt vmcnt(0)
 ; GFX8-NEXT: buffer_wbinvl1
 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
@@ -6679,7 +6607,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024
 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX7-NEXT: s_add_i32 s6, s20, 0x400
 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6687,7 +6614,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s6
+; GFX7-NEXT: v_mov_b32_e32 v2, s20
 ; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -6704,7 +6631,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2f16__offset__amdgpu_no_fin
 ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
 ; GFX7-NEXT: v_mov_b32_e32 v7, v5
 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[16:19], 0 offen offset:1024 glc
 ; GFX7-NEXT: s_waitcnt vmcnt(0)
 ; GFX7-NEXT: buffer_wbinvl1
 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
@@ -6778,7 +6705,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
 ; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6793,8 +6719,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
-; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024
 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1
 ; GFX12-NEXT: ; %bb.2:
@@ -6805,13 +6730,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT: ; =>This Loop Header: Depth=1
 ; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
+; GFX12-NEXT: v_pk_max_num_f16 v5, v7, v7
 ; GFX12-NEXT: s_mov_b32 s2, exec_lo
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_pk_min_num_f16 v5, v4, v8
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_pk_min_num_f16 v6, v5, v8
 ; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v7
 ; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
@@ -6826,14 +6751,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT: s_wait_alu 0xfffe
 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT: s_cbranch_execnz .LBB18_4
 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX12-NEXT: s_mov_b32 exec_lo, s2
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX12-NEXT: v_mov_b32_e32 v7, v5
 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
 ; GFX12-NEXT: s_wait_alu 0xfffe
@@ -6841,14 +6766,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX12-NEXT: s_cbranch_execnz .LBB18_3
 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: v_mov_b32_e32 v0, v5
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX942: ; %bb.0:
 ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4
 ; GFX942-NEXT: s_mov_b64 s[2:3], exec
 ; GFX942-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6860,23 +6784,22 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
 ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX942-NEXT: ; implicit-def: $vgpr4
+; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB18_1
 ; GFX942-NEXT: ; %bb.2:
 ; GFX942-NEXT: s_mov_b64 exec, s[2:3]
 ; GFX942-NEXT: s_mov_b64 s[2:3], 0
-; GFX942-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX942-NEXT: v_pk_max_f16 v5, v5, v5
 ; GFX942-NEXT: .LBB18_3: ; %atomicrmw.start
 ; GFX942-NEXT: ; =>This Loop Header: Depth=1
 ; GFX942-NEXT: ; Child Loop BB18_4 Depth 2
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_pk_max_f16 v4, v7, v7
+; GFX942-NEXT: v_pk_max_f16 v6, v9, v9
 ; GFX942-NEXT: s_mov_b64 s[8:9], exec
-; GFX942-NEXT: v_pk_min_f16 v6, v4, v9
+; GFX942-NEXT: v_pk_min_f16 v8, v6, v5
 ; GFX942-NEXT: buffer_wbl2 sc1
-; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
 ; GFX942-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0
@@ -6889,27 +6812,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
 ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0
 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1]
 ; GFX942-NEXT: s_cbranch_execnz .LBB18_4
 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX942-NEXT: s_mov_b64 exec, s[8:9]
 ; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_mov_b32_e32 v9, v6
 ; GFX942-NEXT: buffer_inv sc1
 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GFX942-NEXT: s_cbranch_execnz .LBB18_3
 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX942-NEXT: v_mov_b32_e32 v0, v4
+; GFX942-NEXT: v_mov_b32_e32 v0, v6
 ; GFX942-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX11-NEXT: s_mov_b32 s1, 0
 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
 ; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6923,8 +6845,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024
 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1
 ; GFX11-NEXT: ; %bb.2:
@@ -6935,13 +6856,13 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT: ; =>This Loop Header: Depth=1
 ; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX11-NEXT: v_pk_max_f16 v5, v7, v7
 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_pk_min_f16 v6, v5, v8
 ; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v7
 ; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
@@ -6955,14 +6876,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc
 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT: s_cbranch_execnz .LBB18_4
 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT: v_mov_b32_e32 v7, v5
 ; GFX11-NEXT: buffer_gl1_inv
 ; GFX11-NEXT: buffer_gl0_inv
 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -6971,13 +6892,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX11-NEXT: s_cbranch_execnz .LBB18_3
 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: v_mov_b32_e32 v0, v5
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
 ; GFX10-NEXT: s_mov_b32 s5, 0
 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
 ; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
@@ -6989,8 +6909,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1
@@ -7001,12 +6920,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT: ; =>This Loop Header: Depth=1
 ; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
+; GFX10-NEXT: v_pk_max_f16 v5, v7, v7
 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_pk_min_f16 v6, v5, v8
 ; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
 ; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
@@ -7018,15 +6937,15 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_cbranch_execnz .LBB18_4
 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
 ; GFX10-NEXT: buffer_gl1_inv
 ; GFX10-NEXT: buffer_gl0_inv
 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
@@ -7035,13 +6954,12 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX10-NEXT: s_cbranch_execnz .LBB18_3
 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX90A: ; %bb.0:
 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
 ; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7053,22 +6971,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
 ; GFX90A-NEXT: ; %bb.2:
 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
+; GFX90A-NEXT: v_pk_max_f16 v5, v5, v5
 ; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
-; GFX90A-NEXT: v_pk_min_f16 v6, v4, v9
+; GFX90A-NEXT: v_pk_max_f16 v6, v9, v9
+; GFX90A-NEXT: v_pk_min_f16 v8, v6, v5
 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
 ; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -7080,27 +6997,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc
 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_mov_b32_e32 v9, v6
 ; GFX90A-NEXT: buffer_wbinvl1
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
 ; GFX908: ; %bb.0:
 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
 ; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
@@ -7112,8 +7028,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
 ; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
 ; GFX908-NEXT: s_cbranch_execnz .LBB18_1
 ; GFX908-NEXT: ; %bb.2:
@@ -7124,11 +7039,11 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall
 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
 ; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
 ; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
-; GFX908-NEXT: v_pk_min_f16 v5, v4, v8
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13],
exec +; GFX908-NEXT: v_pk_max_f16 v5, v7, v7 +; GFX908-NEXT: v_pk_min_f16 v6, v5, v8 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -7140,27 +7055,26 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB18_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB18_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7172,8 +7086,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_1 ; GFX8-NEXT: ; %bb.2: @@ -7185,14 +7098,14 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v6, v6 -; GFX8-NEXT: v_min_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v5, v5, v9 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_max_f16_sdwa v5, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v6, v7, v7 +; GFX8-NEXT: v_min_f16_sdwa v5, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v6, v6, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -7204,27 +7117,26 @@ 
define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB18_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB18_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 s[6:7], exec ; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7235,39 +7147,38 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_1 ; GFX7-NEXT: ; %bb.2: ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v9, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6 -; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v9 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 ; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v7 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v10 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v11 -; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v5 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v10 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX7-NEXT: v_or_b32_e32 v5, v7, v4 -; GFX7-NEXT: v_mov_b32_e32 v8, v6 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v8, v5 +; 
GFX7-NEXT: v_mov_b32_e32 v9, v6 +; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -7279,23 +7190,23 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB18_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v8 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB18_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v7 ; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -7396,13 +7307,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX12-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX12-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -7431,7 +7340,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 @@ -7452,11 +7361,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 +; GFX12-FAKE16-NEXT: 
v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX12-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-FAKE16-NEXT: .LBB19_1: ; %atomicrmw.start @@ -7487,7 +7394,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX12-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6 @@ -7506,13 +7413,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, s16 ; GFX942-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -7534,7 +7440,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX942-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5] ; GFX942-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[6:7] -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ -7549,12 +7455,11 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v1 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-TRUE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB19_1: ; %atomicrmw.start @@ -7583,7 +7488,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; 
GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -7601,10 +7506,9 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s16 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v1 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v1 ; GFX11-FAKE16-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 @@ -7635,7 +7539,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX11-FAKE16-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv @@ -7654,9 +7558,8 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v4, s20 ; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -7682,7 +7585,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: v_mov_b32_e32 v1, v6 -; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -7700,13 +7603,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 ; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -7727,7 +7629,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 @@ 
-7744,13 +7646,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 ; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -7772,7 +7673,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: v_mov_b32_e32 v1, v6 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -7789,11 +7690,10 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -7818,7 +7718,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6 @@ -7834,7 +7734,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v4, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -7843,7 +7742,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 ; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -7858,7 +7757,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__amdgpu ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v1 ; GFX7-NEXT: v_mov_b32_e32 v5, v0 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -7928,11 
+7827,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -7958,7 +7855,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 @@ -7980,11 +7877,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX12-FAKE16-NEXT: s_add_co_i32 s4, s16, 0x400 -; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe -; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX12-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024 ; GFX12-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 @@ -8010,7 +7905,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1 @@ -8029,13 +7924,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GFX942-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s4, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[6:7], 0 ; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX942-NEXT: s_movk_i32 s8, 0x7fff ; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX942-NEXT: s_mov_b32 s9, 0x7060302 -; GFX942-NEXT: v_mov_b32_e32 v4, s4 +; GFX942-NEXT: v_mov_b32_e32 v4, 
s16 ; GFX942-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -8056,7 +7950,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX942-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] ; GFX942-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[0:1] -; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -8072,11 +7966,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_and_b32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_lshlrev_b32 v3, 16, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB20_1: ; %atomicrmw.start @@ -8102,7 +7994,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.h ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv @@ -8121,11 +8013,9 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, s16 :: v_dual_lshlrev_b32 v2, 16, v0 -; GFX11-FAKE16-NEXT: s_add_i32 s4, s16, 0x400 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_and_b32 v3, 0xffff0000, v0 ; GFX11-FAKE16-NEXT: s_mov_b32 s5, 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB20_1: ; %atomicrmw.start @@ -8151,7 +8041,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv @@ -8170,12 +8060,11 @@ 
define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s20 -; GFX10-NEXT: s_add_i32 s4, s20, 0x400 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 +; GFX10-NEXT: v_mov_b32_e32 v4, s20 ; GFX10-NEXT: s_mov_b32 s5, 0 +; GFX10-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -8197,7 +8086,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v0 -; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv @@ -8215,13 +8104,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, s20 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s4, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX90A-NEXT: s_movk_i32 s8, 0x7fff ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: v_mov_b32_e32 v4, s20 ; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -8241,7 +8129,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1 @@ -8258,13 +8146,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v1, s20 ; GFX908-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s4, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[6:7], 0 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX908-NEXT: s_movk_i32 s8, 0x7fff ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX908-NEXT: s_mov_b32 s9, 0x7060302 -; GFX908-NEXT: v_mov_b32_e32 v4, s4 +; GFX908-NEXT: v_mov_b32_e32 v4, s20 ; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -8285,7 +8172,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s9 ; GFX908-NEXT: v_mov_b32_e32 v6, v1 ; GFX908-NEXT: v_mov_b32_e32 v5, v0 -; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; 
GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -8302,11 +8189,10 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s20 ; GFX8-NEXT: buffer_load_dword v1, v1, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s4, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[6:7], 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 ; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -8330,7 +8216,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; GFX8-NEXT: v_mov_b32_e32 v6, v1 ; GFX8-NEXT: v_mov_b32_e32 v5, v0 -; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 @@ -8347,7 +8233,6 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: buffer_load_dword v2, v2, s[16:19], 0 offen offset:1024 -; GFX7-NEXT: s_add_i32 s6, s20, 0x400 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 @@ -8356,7 +8241,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 @@ -8371,7 +8256,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16 ; GFX7-NEXT: v_mov_b32_e32 v6, v4 ; GFX7-NEXT: v_mov_b32_e32 v5, v3 -; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[16:19], 0 offen offset:1024 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 @@ -8440,7 +8325,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8455,8 +8339,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 -; GFX12-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024 ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-TRUE16-NEXT: ; %bb.2: @@ -8468,30 +8351,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: ; =>This 
Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v5, v5, v8 :: v_dual_min_num_f32 v4, v4, v9 -; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_dual_min_num_f32 v6, v6, v8 :: v_dual_min_num_f32 v5, v5, v9 +; GFX12-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7 ; GFX12-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8506,14 +8389,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -8521,7 +8404,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: 
s_cbranch_execnz .LBB21_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8532,7 +8415,6 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8547,8 +8429,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024 -; GFX12-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX12-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], null offen offset:1024 ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX12-FAKE16-NEXT: ; %bb.2: @@ -8560,30 +8441,30 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX12-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v5, v5, v9 :: v_dual_min_num_f32 v4, v4, v8 -; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_dual_min_num_f32 v6, v6, v9 :: v_dual_min_num_f32 v5, v5, v8 +; GFX12-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX12-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX12-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX12-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo -; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX12-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 +; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo +; GFX12-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX12-FAKE16-NEXT: v_mov_b32_e32 v4, v5 ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v7 ; GFX12-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8598,14 +8479,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe ; GFX12-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX12-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX12-FAKE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe @@ -8613,14 +8494,13 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB21_3 ; GFX12-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX942-NEXT: s_mov_b64 s[2:3], exec ; GFX942-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8632,40 +8512,39 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; GFX942-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024 -; GFX942-NEXT: ; implicit-def: $vgpr4 +; GFX942-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_1 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: s_mov_b64 exec, s[2:3] ; GFX942-NEXT: s_mov_b64 s[2:3], 0 -; GFX942-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX942-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX942-NEXT: s_movk_i32 s10, 0x7fff -; GFX942-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX942-NEXT: s_mov_b32 s11, 0x7060302 ; GFX942-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX942-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX942-NEXT: v_add3_u32 v5, v5, v4, s10 -; GFX942-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX942-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX942-NEXT: v_min_f32_e32 v6, v6, v10 +; GFX942-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX942-NEXT: v_add3_u32 
v7, v7, v6, s10 +; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 ; GFX942-NEXT: s_mov_b64 s[8:9], exec ; GFX942-NEXT: buffer_wbl2 sc1 -; GFX942-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX942-NEXT: v_min_f32_e32 v5, v5, v10 -; GFX942-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX942-NEXT: v_add3_u32 v6, v6, v5, s10 -; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX942-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX942-NEXT: v_min_f32_e32 v7, v7, v5 +; GFX942-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX942-NEXT: v_add3_u32 v8, v8, v7, s10 +; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX942-NEXT: v_perm_b32 v6, v5, v4, s11 -; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[6:7] +; GFX942-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX942-NEXT: v_perm_b32 v8, v7, v6, s11 +; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[8:9] ; GFX942-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX942-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX942-NEXT: v_readfirstlane_b32 s4, v0 @@ -8678,27 +8557,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX942-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX942-NEXT: s_cbranch_execnz .LBB21_4 ; GFX942-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX942-NEXT: s_mov_b64 exec, s[8:9] ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v7, v4 +; GFX942-NEXT: v_mov_b32_e32 v9, v6 ; GFX942-NEXT: buffer_inv sc1 ; GFX942-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX942-NEXT: s_cbranch_execnz .LBB21_3 ; GFX942-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_mov_b32_e32 v0, v4 +; GFX942-NEXT: v_mov_b32_e32 v0, v6 ; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8712,8 +8590,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024 ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-TRUE16-NEXT: ; %bb.2: @@ -8726,28 +8603,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB21_4 Depth 2 
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_min_f32 v5, v5, v8 :: v_dual_min_f32 v4, v4, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_dual_min_f32 v6, v6, v8 :: v_dual_min_f32 v5, v5, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7 ; GFX11-TRUE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8761,14 +8638,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -8778,13 +8655,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, 
exec_lo, s1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8798,8 +8674,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-FAKE16-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024 -; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4 +; GFX11-FAKE16-NEXT: buffer_load_b32 v7, v4, s[4:7], 0 offen offset:1024 ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_1 ; GFX11-FAKE16-NEXT: ; %bb.2: @@ -8812,28 +8687,28 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-FAKE16-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_min_f32 v5, v5, v9 :: v_dual_min_f32 v4, v4, v8 -; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_dual_min_f32 v6, v6, v9 :: v_dual_min_f32 v5, v5, v8 +; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v6, 16, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v7 ; GFX11-FAKE16-NEXT: .LBB21_4: ; Parent Loop BB21_3 
Depth=1 ; GFX11-FAKE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -8847,14 +8722,14 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc +; GFX11-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[4:7], 0 offen offset:1024 glc ; GFX11-FAKE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB21_4 ; GFX11-FAKE16-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-FAKE16-NEXT: buffer_gl1_inv ; GFX11-FAKE16-NEXT: buffer_gl0_inv ; GFX11-FAKE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -8864,13 +8739,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX11-FAKE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 @@ -8882,8 +8756,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 -; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX10-NEXT: ; implicit-def: $vgpr4 +; GFX10-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1 @@ -8895,25 +8768,25 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: ; =>This Loop Header: Depth=1 ; GFX10-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 ; GFX10-NEXT: s_mov_b32 s6, exec_lo ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX10-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4 -; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff -; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v5, v5, v8 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX10-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX10-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; 
GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo -; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302 -; GFX10-NEXT: v_mov_b32_e32 v4, v5 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX10-NEXT: v_add3_u32 v11, v11, v6, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v13, vcc_lo +; GFX10-NEXT: v_perm_b32 v6, v6, v5, 0x7060302 ; GFX10-NEXT: v_mov_b32_e32 v5, v6 +; GFX10-NEXT: v_mov_b32_e32 v6, v7 ; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0 @@ -8925,15 +8798,15 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB21_4 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX10-NEXT: s_mov_b32 exec_lo, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5 @@ -8942,13 +8815,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX10-NEXT: s_cbranch_execnz .LBB21_3 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec ; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -8960,38 +8832,37 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX90A-NEXT: ; implicit-def: $vgpr4 +; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] ; GFX90A-NEXT: s_mov_b64 s[6:7], 0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff -; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302 ; GFX90A-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX90A-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, 
v4 -; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX90A-NEXT: v_min_f32_e32 v5, v5, v10 -; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1 -; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14 -; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc -; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15 +; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v10 +; GFX90A-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX90A-NEXT: v_add3_u32 v7, v7, v6, s14 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; GFX90A-NEXT: v_min_f32_e32 v7, v7, v5 +; GFX90A-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX90A-NEXT: v_add3_u32 v8, v8, v7, s14 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v8, v11, vcc +; GFX90A-NEXT: v_perm_b32 v8, v7, v6, s15 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1] ; GFX90A-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0 @@ -9003,27 +8874,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen offset:1024 glc ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_4 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v7, v4 +; GFX90A-NEXT: v_mov_b32_e32 v9, v6 ; GFX90A-NEXT: buffer_wbinvl1 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX90A-NEXT: s_cbranch_execnz .LBB21_3 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, v6 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4 ; GFX908-NEXT: s_mov_b64 s[6:7], exec ; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9035,8 +8905,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_nop 0 -; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX908-NEXT: ; implicit-def: $vgpr4 +; GFX908-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_1 ; GFX908-NEXT: ; %bb.2: @@ -9050,24 +8919,24 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: ; =>This Loop Header: Depth=1 ; 
GFX908-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX908-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14 -; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX908-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14 -; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX908-NEXT: v_min_f32_e32 v5, v5, v8 +; GFX908-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX908-NEXT: v_add3_u32 v6, v6, v5, s14 +; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15 -; GFX908-NEXT: v_mov_b32_e32 v4, v5 -; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc +; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX908-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX908-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX908-NEXT: v_add3_u32 v10, v10, v6, s14 +; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX908-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX908-NEXT: v_perm_b32 v6, v6, v5, s15 ; GFX908-NEXT: v_mov_b32_e32 v5, v6 +; GFX908-NEXT: s_mov_b64 s[12:13], exec +; GFX908-NEXT: v_mov_b32_e32 v6, v7 ; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0 @@ -9079,27 +8948,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB21_4 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX908-NEXT: s_mov_b64 exec, s[12:13] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v6, v4 +; GFX908-NEXT: v_mov_b32_e32 v7, v5 ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX908-NEXT: s_cbranch_execnz .LBB21_3 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX908-NEXT: v_mov_b32_e32 v0, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, v5 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9111,8 +8979,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024 -; GFX8-NEXT: ; implicit-def: $vgpr4 +; GFX8-NEXT: buffer_load_dword 
v7, v4, s[8:11], 0 offen offset:1024 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_1 ; GFX8-NEXT: ; %bb.2: @@ -9124,27 +8991,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB21_4 Depth 2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_min_f32_e32 v4, v4, v8 -; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 -; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4 -; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_min_f32_e32 v5, v5, v9 -; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 -; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v8 +; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v4, v5 -; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v9 +; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v5, 16 ; GFX8-NEXT: v_mov_b32_e32 v5, v6 +; GFX8-NEXT: s_mov_b64 s[12:13], exec +; GFX8-NEXT: v_mov_b32_e32 v6, v7 ; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0 @@ -9156,27 +9023,26 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen offset:1024 glc ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB21_4 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX8-NEXT: s_mov_b64 exec, s[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v7, v5 ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_cbranch_execnz .LBB21_3 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4 ; GFX7-NEXT: s_mov_b64 
s[6:7], exec ; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -9187,8 +9053,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3] ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024 -; GFX7-NEXT: ; implicit-def: $vgpr4 +; GFX7-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_1 ; GFX7-NEXT: ; %bb.2: @@ -9196,27 +9061,27 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: s_mov_b64 s[6:7], 0 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 ; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Loop Header: Depth=1 ; GFX7-NEXT: ; Child Loop BB21_4 Depth 2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GFX7-NEXT: v_min_f32_e32 v4, v4, v9 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v8 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_min_f32_e32 v7, v7, v10 -; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16 -; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16 -; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v11 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; GFX7-NEXT: v_mov_b32_e32 v9, v6 ; GFX7-NEXT: s_mov_b64 s[12:13], exec -; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v8, v5 ; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0 @@ -9228,23 +9093,23 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc +; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v4, s[8:11], 0 offen offset:1024 glc ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB21_4 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1 ; GFX7-NEXT: s_mov_b64 exec, s[12:13] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7] ; GFX7-NEXT: s_cbranch_execnz .LBB21_3 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] 
-; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: v_mov_b32_e32 v0, v8 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory: @@ -9353,10 +9218,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024 -; GFX942-NEXT: s_add_i32 s6, s16, 0x400 ; GFX942-NEXT: s_mov_b64 s[4:5], 0 ; GFX942-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX942-NEXT: v_mov_b32_e32 v3, s6 +; GFX942-NEXT: v_mov_b32_e32 v3, s16 ; GFX942-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt vmcnt(0) @@ -9365,7 +9229,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX942-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[4:5] ; GFX942-NEXT: buffer_wbl2 sc0 sc1 -; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0 +; GFX942-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen offset:1024 sc0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: buffer_inv sc0 sc1 ; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -9404,10 +9268,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX90A-NEXT: s_add_i32 s6, s20, 0x400 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, s6 +; GFX90A-NEXT: v_mov_b32_e32 v3, s20 ; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -9416,7 +9279,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX90A-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1 @@ -9434,10 +9297,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_mov_b32_e32 v1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 ; GFX908-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX908-NEXT: s_add_i32 s6, s20, 0x400 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1 -; GFX908-NEXT: v_mov_b32_e32 v3, s6 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 ; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -9446,7 +9308,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX908-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX908-NEXT: v_mov_b32_e32 v0, v4 ; GFX908-NEXT: v_mov_b32_e32 v1, v5 -; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 @@ -9463,10 +9325,9 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: 
v_mov_b32_e32 v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 ; GFX8-NEXT: buffer_load_dword v0, v0, s[16:19], 0 offen offset:1024 -; GFX8-NEXT: s_add_i32 s6, s20, 0x400 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 ; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -9475,7 +9336,7 @@ define float @buffer_fat_ptr_system_atomic_fmin_ret_f32__offset__amdgpu_no_fine_ ; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 -; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen glc +; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[16:19], 0 offen offset:1024 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index 3c991cfb7a1aa..afd0f01580538 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -782,69 +782,90 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-LABEL: memcpy_known_medium: ; SDAG-GFX942: ; %bb.0: ; SDAG-GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; SDAG-GFX942-NEXT: s_load_dword s13, s[4:5], 0x34 +; SDAG-GFX942-NEXT: s_load_dword s17, s[4:5], 0x34 ; SDAG-GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x44 -; SDAG-GFX942-NEXT: s_load_dword s14, s[4:5], 0x54 -; SDAG-GFX942-NEXT: s_mov_b32 s12, 0 -; SDAG-GFX942-NEXT: s_mov_b32 s5, s12 -; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-GFX942-NEXT: s_load_dword s12, s[4:5], 0x54 +; SDAG-GFX942-NEXT: s_mov_b32 s16, 0 +; SDAG-GFX942-NEXT: s_mov_b32 s5, s16 ; SDAG-GFX942-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX942-NEXT: s_mov_b32 s4, s3 -; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] -; SDAG-GFX942-NEXT: s_mov_b32 s13, s2 +; SDAG-GFX942-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17] +; SDAG-GFX942-NEXT: s_mov_b32 s17, s2 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s1 -; SDAG-GFX942-NEXT: s_mov_b32 s3, s12 -; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: s_mov_b32 s13, s14 +; SDAG-GFX942-NEXT: s_mov_b32 s3, s16 +; SDAG-GFX942-NEXT: s_or_b64 s[4:5], s[2:3], s[16:17] +; SDAG-GFX942-NEXT: s_mov_b32 s17, s12 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s11 -; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[12:13] -; SDAG-GFX942-NEXT: s_mov_b32 s13, s10 +; SDAG-GFX942-NEXT: s_or_b64 s[14:15], s[2:3], s[16:17] +; SDAG-GFX942-NEXT: s_mov_b32 s17, s10 ; SDAG-GFX942-NEXT: s_mov_b32 s2, s9 -; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13] +; SDAG-GFX942-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] ; SDAG-GFX942-NEXT: .LBB1_1: ; %load-store-loop ; SDAG-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-GFX942-NEXT: v_add_u32_e32 v1, s0, v0 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[4:7], 0 offen offset:96 -; SDAG-GFX942-NEXT: 
buffer_load_dwordx4 v[30:33], v1, s[4:7], 0 offen offset:112 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[4:7], 0 offen offset:128 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[4:7], 0 offen offset:144 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[4:7], 0 offen offset:160 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[4:7], 0 offen offset:176 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 -; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 -; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse -; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[12:15], 0 offen offset:96 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[12:15], 0 offen offset:112 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[12:15], 0 offen offset:128 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[12:15], 0 offen offset:144 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[12:15], 0 offen offset:160 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[12:15], 0 offen offset:176 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208 -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224 -; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; SDAG-GFX942-NEXT: s_add_i32 s1, s0, s16 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v60, s1 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[8:11], v60, s[4:7], 0 offen +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[4:7], v60, s[4:7], 0 offen offset:16 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_add_i32 s2, s8, s16 +; SDAG-GFX942-NEXT: v_mov_b32_e32 v0, s2 +; SDAG-GFX942-NEXT: s_addk_i32 s16, 0x100 +; SDAG-GFX942-NEXT: s_cmpk_lt_u32 s16, 0x100 ; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v15 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v14 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v13 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v12 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[12:15], v60, s[4:7], 0 offen offset:48 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[16:19], v60, s[4:7], 0 offen offset:64 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[20:23], v60, s[4:7], 0 
offen offset:80 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[24:27], v60, s[4:7], 0 offen offset:96 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[28:31], v60, s[4:7], 0 offen offset:112 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[32:35], v60, s[4:7], 0 offen offset:128 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[36:39], v60, s[4:7], 0 offen offset:144 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[40:43], v60, s[4:7], 0 offen offset:160 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[44:47], v60, s[4:7], 0 offen offset:176 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[48:51], v60, s[4:7], 0 offen offset:192 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[52:55], v60, s[4:7], 0 offen offset:208 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[56:59], v60, s[4:7], 0 offen offset:224 +; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[60:63], v60, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[8:11], v0, s[12:15], 0 offen +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[4:7], v0, s[12:15], 0 offen offset:16 +; SDAG-GFX942-NEXT: s_nop 1 +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v0, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[12:15], v0, s[12:15], 0 offen offset:48 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[16:19], v0, s[12:15], 0 offen offset:64 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[20:23], v0, s[12:15], 0 offen offset:80 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[24:27], v0, s[12:15], 0 offen offset:96 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[28:31], v0, s[12:15], 0 offen offset:112 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[32:35], v0, s[12:15], 0 offen offset:128 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[36:39], v0, s[12:15], 0 offen offset:144 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[40:43], v0, s[12:15], 0 offen offset:160 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[44:47], v0, s[12:15], 0 offen offset:176 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[48:51], v0, s[12:15], 0 offen offset:192 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[52:55], v0, s[12:15], 0 offen offset:208 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[56:59], v0, s[12:15], 0 offen offset:224 +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(15) +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[60:63], v0, s[12:15], 0 offen offset:240 +; SDAG-GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; SDAG-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX942-NEXT: s_endpgm ; @@ -852,84 +873,87 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX1100: ; %bb.0: ; SDAG-GFX1100-NEXT: s_clause 0x3 ; SDAG-GFX1100-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; SDAG-GFX1100-NEXT: s_load_b32 s13, s[4:5], 0x34 +; SDAG-GFX1100-NEXT: s_load_b32 s17, s[4:5], 0x34 ; SDAG-GFX1100-NEXT: s_load_b128 s[8:11], s[4:5], 0x44 ; 
SDAG-GFX1100-NEXT: s_load_b32 s18, s[4:5], 0x54 -; SDAG-GFX1100-NEXT: s_mov_b32 s12, 0 -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-GFX1100-NEXT: s_mov_b32 s5, s12 -; SDAG-GFX1100-NEXT: s_mov_b32 s15, s12 -; SDAG-GFX1100-NEXT: s_mov_b32 s17, s12 +; SDAG-GFX1100-NEXT: s_mov_b32 s16, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: s_mov_b32 s5, s16 +; SDAG-GFX1100-NEXT: s_mov_b32 s13, s16 +; SDAG-GFX1100-NEXT: s_mov_b32 s15, s16 ; SDAG-GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-GFX1100-NEXT: s_mov_b32 s4, s3 -; SDAG-GFX1100-NEXT: s_mov_b32 s14, s1 -; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[12:13] -; SDAG-GFX1100-NEXT: s_mov_b32 s13, s2 -; SDAG-GFX1100-NEXT: s_mov_b32 s16, s11 -; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[14:15], s[12:13] -; SDAG-GFX1100-NEXT: s_mov_b32 s13, s18 +; SDAG-GFX1100-NEXT: s_mov_b32 s12, s1 +; SDAG-GFX1100-NEXT: s_or_b64 s[6:7], s[4:5], s[16:17] +; SDAG-GFX1100-NEXT: s_mov_b32 s17, s2 +; SDAG-GFX1100-NEXT: s_mov_b32 s14, s11 +; SDAG-GFX1100-NEXT: s_or_b64 s[4:5], s[12:13], s[16:17] +; SDAG-GFX1100-NEXT: s_mov_b32 s17, s18 ; SDAG-GFX1100-NEXT: s_mov_b32 s2, s9 -; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[16:17], s[12:13] -; SDAG-GFX1100-NEXT: s_mov_b32 s13, s10 -; SDAG-GFX1100-NEXT: s_mov_b32 s3, s12 +; SDAG-GFX1100-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; SDAG-GFX1100-NEXT: s_mov_b32 s17, s10 +; SDAG-GFX1100-NEXT: s_mov_b32 s3, s16 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[12:13] +; SDAG-GFX1100-NEXT: s_or_b64 s[12:13], s[2:3], s[16:17] ; SDAG-GFX1100-NEXT: .LBB1_1: ; %load-store-loop ; SDAG-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 -; SDAG-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 -; SDAG-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0 -; SDAG-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; SDAG-GFX1100-NEXT: s_add_i32 s1, s0, s16 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v60, s1 +; SDAG-GFX1100-NEXT: s_add_i32 s1, s8, s16 +; SDAG-GFX1100-NEXT: s_addk_i32 s16, 0x100 +; SDAG-GFX1100-NEXT: v_mov_b32_e32 v64, s1 +; SDAG-GFX1100-NEXT: s_cmpk_lt_u32 s16, 0x100 ; SDAG-GFX1100-NEXT: s_clause 0xf -; SDAG-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen -; SDAG-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[9:12], v61, s[4:7], 0 offen offset:32 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[13:16], v61, s[4:7], 0 offen offset:48 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[17:20], v61, s[4:7], 0 offen offset:64 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[21:24], v61, s[4:7], 0 offen offset:80 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[25:28], v61, s[4:7], 0 offen offset:96 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[29:32], v61, s[4:7], 0 offen offset:112 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[33:36], v61, s[4:7], 0 offen offset:128 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[37:40], v61, s[4:7], 0 offen offset:144 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[41:44], v61, s[4:7], 0 offen offset:160 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[45:48], v61, s[4:7], 0 offen offset:176 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[49:52], v61, s[4:7], 0 offen offset:192 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224 -; SDAG-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240 +; 
SDAG-GFX1100-NEXT: buffer_load_b128 v[0:3], v60, s[4:7], 0 offen +; SDAG-GFX1100-NEXT: buffer_load_b128 v[4:7], v60, s[4:7], 0 offen offset:16 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[8:11], v60, s[4:7], 0 offen offset:32 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[12:15], v60, s[4:7], 0 offen offset:48 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[16:19], v60, s[4:7], 0 offen offset:64 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[20:23], v60, s[4:7], 0 offen offset:80 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[24:27], v60, s[4:7], 0 offen offset:96 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[28:31], v60, s[4:7], 0 offen offset:112 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[32:35], v60, s[4:7], 0 offen offset:128 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[36:39], v60, s[4:7], 0 offen offset:144 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[40:43], v60, s[4:7], 0 offen offset:160 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[44:47], v60, s[4:7], 0 offen offset:176 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[48:51], v60, s[4:7], 0 offen offset:192 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[52:55], v60, s[4:7], 0 offen offset:208 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[56:59], v60, s[4:7], 0 offen offset:224 +; SDAG-GFX1100-NEXT: buffer_load_b128 v[60:63], v60, s[4:7], 0 offen offset:240 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(15) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen +; SDAG-GFX1100-NEXT: buffer_store_b128 v[0:3], v64, s[12:15], 0 offen ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(14) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[5:8], v65, s[12:15], 0 offen offset:16 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[4:7], v64, s[12:15], 0 offen offset:16 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(13) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[9:12], v65, s[12:15], 0 offen offset:32 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[8:11], v64, s[12:15], 0 offen offset:32 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(12) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[13:16], v65, s[12:15], 0 offen offset:48 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[12:15], v64, s[12:15], 0 offen offset:48 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(11) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[17:20], v65, s[12:15], 0 offen offset:64 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[16:19], v64, s[12:15], 0 offen offset:64 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(10) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[21:24], v65, s[12:15], 0 offen offset:80 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[20:23], v64, s[12:15], 0 offen offset:80 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(9) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[25:28], v65, s[12:15], 0 offen offset:96 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[24:27], v64, s[12:15], 0 offen offset:96 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(8) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[29:32], v65, s[12:15], 0 offen offset:112 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[28:31], v64, s[12:15], 0 offen offset:112 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(7) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[33:36], v65, s[12:15], 0 offen offset:128 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[32:35], v64, s[12:15], 0 offen offset:128 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(6) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[37:40], v65, s[12:15], 0 offen offset:144 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[36:39], v64, s[12:15], 0 offen offset:144 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(5) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[41:44], v65, s[12:15], 0 offen offset:160 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[40:43], v64, s[12:15], 0 offen offset:160 ; SDAG-GFX1100-NEXT: s_waitcnt 
vmcnt(4) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[45:48], v65, s[12:15], 0 offen offset:176 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[44:47], v64, s[12:15], 0 offen offset:176 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(3) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[49:52], v65, s[12:15], 0 offen offset:192 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[48:51], v64, s[12:15], 0 offen offset:192 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(2) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[53:56], v65, s[12:15], 0 offen offset:208 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[52:55], v64, s[12:15], 0 offen offset:208 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(1) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[56:59], v64, s[12:15], 0 offen offset:224 ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 -; SDAG-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 +; SDAG-GFX1100-NEXT: buffer_store_b128 v[60:63], v64, s[12:15], 0 offen offset:240 +; SDAG-GFX1100-NEXT: s_cbranch_scc1 .LBB1_1 ; SDAG-GFX1100-NEXT: ; %bb.2: ; %memcpy-split ; SDAG-GFX1100-NEXT: s_endpgm ; @@ -957,52 +981,50 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: s_mov_b32 s2, s7 ; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX942-NEXT: s_or_b64 s[6:7], s[6:7], s[2:3] -; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100 +; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16 ; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop ; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v1, s0, v0 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v1, s[8:11], 0 offen offset:96 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v1, s[8:11], 0 offen offset:112 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v1, s[8:11], 0 offen offset:128 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v1, s[8:11], 0 offen offset:144 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v1, s[8:11], 0 offen offset:160 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v1, s[8:11], 0 offen offset:176 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 -; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 -; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1 -; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen 
offset:32 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[26:29], v62, s[8:11], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[30:33], v62, s[8:11], 0 offen offset:112 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[34:37], v62, s[8:11], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[38:41], v62, s[8:11], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[42:45], v62, s[8:11], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[46:49], v62, s[8:11], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse -; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v62, s[4:7], 0 offen offset:96 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v62, s[4:7], 0 offen offset:112 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[34:37], v62, s[4:7], 0 offen offset:128 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v62, s[4:7], 0 offen offset:144 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v62, s[4:7], 0 offen offset:160 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v62, s[4:7], 0 offen offset:176 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208 -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[26:29], v63, s[4:7], 0 offen offset:96 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[30:33], v63, s[4:7], 0 offen offset:112 +; GISEL-GFX942-NEXT: 
buffer_store_dwordx4 v[34:37], v63, s[4:7], 0 offen offset:128 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[38:41], v63, s[4:7], 0 offen offset:144 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[42:45], v63, s[4:7], 0 offen offset:160 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[46:49], v63, s[4:7], 0 offen offset:176 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 +; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload ; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split ; GISEL-GFX942-NEXT: s_endpgm @@ -1037,8 +1059,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v61, s0, v0 ; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v65, s8, v0 -; GISEL-GFX1100-NEXT: v_add_co_u32 v0, s1, 0x100, v0 -; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1 +; GISEL-GFX1100-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 ; GISEL-GFX1100-NEXT: s_clause 0xf ; GISEL-GFX1100-NEXT: buffer_load_b128 v[1:4], v61, s[4:7], 0 offen ; GISEL-GFX1100-NEXT: buffer_load_b128 v[5:8], v61, s[4:7], 0 offen offset:16 @@ -1056,7 +1077,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: buffer_load_b128 v[53:56], v61, s[4:7], 0 offen offset:208 ; GISEL-GFX1100-NEXT: buffer_load_b128 v[57:60], v61, s[4:7], 0 offen offset:224 ; GISEL-GFX1100-NEXT: buffer_load_b128 v[61:64], v61, s[4:7], 0 offen offset:240 -; GISEL-GFX1100-NEXT: s_xor_b32 s1, s1, -1 ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(15) ; GISEL-GFX1100-NEXT: buffer_store_b128 v[1:4], v65, s[12:15], 0 offen ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(14) @@ -1089,7 +1109,7 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX1100-NEXT: buffer_store_b128 v[57:60], v65, s[12:15], 0 offen offset:224 ; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX1100-NEXT: buffer_store_b128 v[61:64], v65, s[12:15], 0 offen offset:240 -; GISEL-GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1 +; GISEL-GFX1100-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x100, v0 ; GISEL-GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 ; GISEL-GFX1100-NEXT: ; %bb.2: ; %memcpy-split ; GISEL-GFX1100-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index bc3d3785a68a4..3aa36635a0ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -11,9 +11,9 @@ ; GCN-O0: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lowe
r-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-al
locate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-fo
lder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments,codegenprepare,load-store-vectorizer),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu
-lower-vgpr-encoding,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 65d0102a9d0dc..6e5212580ba2e 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -232,15 +232,15 @@ ; GCN-O1-NEXT: AMDGPU Preload Kernel Arguments ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: CodeGen Prepare ; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-NEXT: AMDGPU lower intrinsics ; GCN-O1-NEXT: CallGraph Construction ; GCN-O1-NEXT: Call Graph SCC Pass Manager ; GCN-O1-NEXT: DummyCGSCCPass ; GCN-O1-NEXT: FunctionPass Manager -; GCN-O1-NEXT: Dominator Tree Construction -; GCN-O1-NEXT: Natural Loop Information -; GCN-O1-NEXT: CodeGen Prepare ; GCN-O1-NEXT: Lazy Value Information Analysis ; GCN-O1-NEXT: Lower SwitchInst's to branches ; GCN-O1-NEXT: Lower invoke and unwind, for unwindless code generators @@ -533,21 +533,21 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Preload Kernel Arguments ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: CodeGen Prepare +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O1-OPTS-NEXT: Function Alias Analysis Results +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis +; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer ; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O1-OPTS-NEXT: AMDGPU lower intrinsics ; GCN-O1-OPTS-NEXT: CallGraph Construction ; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager ; GCN-O1-OPTS-NEXT: DummyCGSCCPass ; GCN-O1-OPTS-NEXT: FunctionPass Manager -; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: CodeGen Prepare -; GCN-O1-OPTS-NEXT: Dominator Tree Construction -; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-OPTS-NEXT: Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Natural Loop Information -; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis -; GCN-O1-OPTS-NEXT: GPU Load and Store Vectorizer ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches ; GCN-O1-OPTS-NEXT: Lower invoke and unwind, for unwindless code generators @@ -852,21 +852,21 @@ ; GCN-O2-NEXT: AMDGPU Preload Kernel Arguments ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: CodeGen Prepare +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O2-NEXT: Function Alias Analysis Results +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Scalar Evolution Analysis +; GCN-O2-NEXT: GPU Load and Store Vectorizer ; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O2-NEXT: AMDGPU lower intrinsics ; GCN-O2-NEXT: CallGraph Construction ; GCN-O2-NEXT: Call Graph SCC Pass Manager ; GCN-O2-NEXT: DummyCGSCCPass ; 
GCN-O2-NEXT: FunctionPass Manager -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: CodeGen Prepare -; GCN-O2-NEXT: Dominator Tree Construction -; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Natural Loop Information -; GCN-O2-NEXT: Scalar Evolution Analysis -; GCN-O2-NEXT: GPU Load and Store Vectorizer ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches ; GCN-O2-NEXT: Lower invoke and unwind, for unwindless code generators @@ -1186,21 +1186,21 @@ ; GCN-O3-NEXT: AMDGPU Preload Kernel Arguments ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: CodeGen Prepare +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) +; GCN-O3-NEXT: Function Alias Analysis Results +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Scalar Evolution Analysis +; GCN-O3-NEXT: GPU Load and Store Vectorizer ; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources ; GCN-O3-NEXT: AMDGPU lower intrinsics ; GCN-O3-NEXT: CallGraph Construction ; GCN-O3-NEXT: Call Graph SCC Pass Manager ; GCN-O3-NEXT: DummyCGSCCPass ; GCN-O3-NEXT: FunctionPass Manager -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: CodeGen Prepare -; GCN-O3-NEXT: Dominator Tree Construction -; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Natural Loop Information -; GCN-O3-NEXT: Scalar Evolution Analysis -; GCN-O3-NEXT: GPU Load and Store Vectorizer ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches ; GCN-O3-NEXT: Lower invoke and unwind, for unwindless code generators diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll index 5d5aad76afd09..566eb1e14dc02 100644 --- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll +++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll @@ -7,16 +7,12 @@ @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 -; GCN-LABEL: unreachable: -; Function info: -; codeLenInByte = 4 define internal fastcc void @unreachable() { %fptr = load ptr, ptr addrspace(4) @gv.fptr0 call void %fptr() unreachable } - ; GCN-LABEL: entry: ; GCN-NOT: s_swappc_b64 ; GCN: s_endpgm From 3f3a20f654f913f7e251e3bf4bd5a63e73e5571a Mon Sep 17 00:00:00 2001 From: hjagasiaAMD Date: Wed, 1 Oct 2025 19:57:59 -0500 Subject: [PATCH 452/878] [PATCH] offload-tunnel-cmake with proper escape (#161552) Co-authored-by: ronlieb --- llvm/runtimes/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 839929204c064..6f98eaee241bc 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -507,10 +507,14 @@ if(build_runtimes) endif() # Forward user-provived system configuration to runtimes for requirement introspection. - # CMAKE_PREFIX_PATH is the search path for CMake packages. + # CMAKE_PREFIX_PATH is the search path for CMake packages. 
In order to pass through
+  # the command line interface, the CMake semicolon separator needs to be replaced
+  # with $<SEMICOLON>
   if(CMAKE_PREFIX_PATH)
-    list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}")
+    string(JOIN "$<SEMICOLON>" escaped_cmake_prefix_path ${CMAKE_PREFIX_PATH})
+    list(APPEND extra_cmake_args "-DCMAKE_PREFIX_PATH=${escaped_cmake_prefix_path}")
   endif()
+
   # CMAKE_PROGRAM_PATH is the search path for executables such as python.
   if(CMAKE_PROGRAM_PATH)
     list(APPEND extra_cmake_args "-DCMAKE_PROGRAM_PATH=${CMAKE_PROGRAM_PATH}")
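(As a hedged illustration of the escaping above, a standalone sketch that is not part of the patch; the project and variable names here are hypothetical. Joining a CMake list with the $<SEMICOLON> generator expression keeps it intact across a -D argument to a sub-build, whereas the raw list would be cut at its first semicolon:

  include(ExternalProject)
  set(demo_prefix_path "/opt/deps/a;/opt/deps/b")  # a two-entry CMake list
  string(JOIN "$<SEMICOLON>" demo_escaped ${demo_prefix_path})
  # With the raw list, CMAKE_ARGS would split at the literal semicolon and the
  # child would only see "/opt/deps/a". The genex is expanded only when the
  # sub-build command is generated, so the semicolon re-appears inside a single
  # argument and the child sees a two-entry CMAKE_PREFIX_PATH again. (Assumes
  # the sub-build machinery evaluates generator expressions in CMAKE_ARGS, as
  # ExternalProject does.)
  ExternalProject_Add(demo_child
    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/child
    INSTALL_COMMAND ""
    CMAKE_ARGS -DCMAKE_PREFIX_PATH=${demo_escaped})
)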
From e37a9732e1d1b55347df1ad33cf941d22ed8ab9b Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Wed, 1 Oct 2025 18:57:33 -0700
Subject: [PATCH 453/878] [JTS][NFC] Optimize guid fetching (#161612)

It's unnecessary to build the whole symtable, and on top of everything,
un-optimal to do so for every function. All we really need is the
instrumented PGO name - considering also LTO-ness - and then we can
compute the function name.
---
 .../Transforms/Scalar/JumpTableToSwitch.h     |  7 +++++-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |  4 +++-
 .../Transforms/Scalar/JumpTableToSwitch.cpp   | 22 +++++++------------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
index 61786227d7a33..dfd6e2f3d03ae 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpTableToSwitch.h
@@ -15,7 +15,12 @@ namespace llvm {

 class Function;

-struct JumpTableToSwitchPass : PassInfoMixin<JumpTableToSwitchPass> {
+class JumpTableToSwitchPass : public PassInfoMixin<JumpTableToSwitchPass> {
+  // Necessary until we switch to GUIDs as metadata, after which we can drop it.
+  const bool InLTO;
+
+public:
+  explicit JumpTableToSwitchPass(bool InLTO = false) : InLTO(InLTO) {}
   /// Run the pass over the function.
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 373b3c3ee56a9..7069e8d67c2f1 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -611,7 +611,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,

   // Jump table to switch conversion.
   if (EnableJumpTableToSwitch)
-    FPM.addPass(JumpTableToSwitchPass());
+    FPM.addPass(JumpTableToSwitchPass(
+        /*InLTO=*/Phase == ThinOrFullLTOPhase::ThinLTOPostLink ||
+        Phase == ThinOrFullLTOPhase::FullLTOPostLink));

   FPM.addPass(
       SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index 36f9bb451e23d..3c14036e509ef 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -205,14 +205,12 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F,
   PostDominatorTree *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
   DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy);
   bool Changed = false;
-  InstrProfSymtab Symtab;
-  if (auto E = Symtab.create(*F.getParent()))
-    F.getContext().emitError(
-        "Could not create indirect call table, likely corrupted IR" +
-        toString(std::move(E)));
-  DenseMap<const Function *, GlobalValue::GUID> FToGuid;
-  for (const auto &[G, FPtr] : Symtab.getIDToNameMap())
-    FToGuid.insert({FPtr, G});
+  auto FuncToGuid = [&](const Function &Fct) {
+    if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName))
+      return AssignGUIDPass::getGUID(Fct);
+
+    return Function::getGUIDAssumingExternalLinkage(getIRPGOFuncName(F, InLTO));
+  };

   for (BasicBlock &BB : make_early_inc_range(F)) {
     BasicBlock *CurrentBB = &BB;
@@ -234,12 +232,8 @@ PreservedAnalyses JumpTableToSwitchPass::run(Function &F,
       std::optional<JumpTableTy> JumpTable = parseJumpTable(GEP, PtrTy);
       if (!JumpTable)
         continue;
-      SplittedOutTail = expandToSwitch(
-          Call, *JumpTable, DTU, ORE, [&](const Function &Fct) {
-            if (Fct.getMetadata(AssignGUIDPass::GUIDMetadataName))
-              return AssignGUIDPass::getGUID(Fct);
-            return FToGuid.lookup_or(&Fct, 0U);
-          });
+      SplittedOutTail =
+          expandToSwitch(Call, *JumpTable, DTU, ORE, FuncToGuid);
       Changed = true;
       break;
     }

From 8b9e208448009814b12200ae7c06ef760f957ac7 Mon Sep 17 00:00:00 2001
From: Jim Lin
Date: Thu, 2 Oct 2025 10:01:28 +0800
Subject: [PATCH 454/878] [RISCV] Add helper function getVecPolicyOpNum in
 RISCVInsertVSETVLI.cpp. NFC. (#161476)

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 90e1c47a71c89..6a6ead2697591 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -70,6 +70,10 @@ static unsigned getSEWOpNum(const MachineInstr &MI) {
   return RISCVII::getSEWOpNum(MI.getDesc());
 }

+static unsigned getVecPolicyOpNum(const MachineInstr &MI) {
+  return RISCVII::getVecPolicyOpNum(MI.getDesc());
+}
+
 /// Get the EEW for a load or store instruction.  Return std::nullopt if MI is
 /// not a load or store which ignores SEW.
 static std::optional<unsigned> getEEWForLoadStore(const MachineInstr &MI) {
@@ -986,7 +990,7 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {

   // If there is a policy operand, use it.
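  // (Hedged editorial sketch, not a line of the patch: when
  // RISCVII::hasVecPolicyOp(TSFlags) holds, the policy immediate is the last
  // explicit operand of the pseudo, e.g. conceptually
  //   PseudoVADD_VV_M1 vd, vs2, vs1, vl, sew, policy
  // so getVecPolicyOpNum(MI) names the same slot that the open-coded
  // MI.getNumExplicitOperands() - 1 below computes, which is what makes the
  // change NFC.)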
if (RISCVII::hasVecPolicyOp(TSFlags)) { - const MachineOperand &Op = MI.getOperand(MI.getNumExplicitOperands() - 1); + const MachineOperand &Op = MI.getOperand(getVecPolicyOpNum(MI)); uint64_t Policy = Op.getImm(); assert(Policy <= (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC) && From 9f4b6375b4fc12d5ed9b4713ac70682825ec4b20 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 1 Oct 2025 19:28:18 -0700 Subject: [PATCH 455/878] [BOLT][AArch64] Skip R_AARCH64_TLSDESC_CALL relocation (#161610) R_AARCH64_TLSDESC_CALL is a relocation emitted as a hint for a linker to replace `blr r` instruction with nop. BOLT does not currently require any special handling for it. Note that previously existing extraction of the relocated value was incorrect. --- bolt/lib/Core/Relocation.cpp | 15 ++++++------- bolt/test/AArch64/tls-desc-call.s | 35 +++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 bolt/test/AArch64/tls-desc-call.s diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index f882627222242..4b827b647b06c 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -81,7 +81,6 @@ static bool isSupportedAArch64(uint32_t Type) { case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12: case ELF::R_AARCH64_TLSDESC_ADD_LO12: - case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_PREL16: case ELF::R_AARCH64_PREL32: @@ -193,7 +192,6 @@ static size_t getSizeForTypeAArch64(uint32_t Type) { case ELF::R_AARCH64_LD64_GOT_LO12_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12: case ELF::R_AARCH64_TLSDESC_ADD_LO12: - case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case ELF::R_AARCH64_PREL32: case ELF::R_AARCH64_MOVW_UABS_G0: @@ -248,7 +246,14 @@ static bool skipRelocationTypeX86(uint32_t Type) { } static bool skipRelocationTypeAArch64(uint32_t Type) { - return Type == ELF::R_AARCH64_NONE || Type == ELF::R_AARCH64_LD_PREL_LO19; + switch (Type) { + default: + return false; + case ELF::R_AARCH64_NONE: + case ELF::R_AARCH64_LD_PREL_LO19: + case ELF::R_AARCH64_TLSDESC_CALL: + return true; + } } static bool skipRelocationTypeRISCV(uint32_t Type) { @@ -362,7 +367,6 @@ static uint64_t extractValueAArch64(uint32_t Type, uint64_t Contents, return static_cast(PC) + SignExtend64<32>(Contents & 0xffffffff); case ELF::R_AARCH64_PREL64: return static_cast(PC) + Contents; - case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_CALL26: // Immediate goes in bits 25:0 of B and BL. 
@@ -552,7 +556,6 @@ static bool isGOTAArch64(uint32_t Type) { case ELF::R_AARCH64_TLSDESC_ADR_PAGE21: case ELF::R_AARCH64_TLSDESC_LD64_LO12: case ELF::R_AARCH64_TLSDESC_ADD_LO12: - case ELF::R_AARCH64_TLSDESC_CALL: return true; } } @@ -591,7 +594,6 @@ static bool isTLSAArch64(uint32_t Type) { case ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: case ELF::R_AARCH64_TLSDESC_LD64_LO12: case ELF::R_AARCH64_TLSDESC_ADD_LO12: - case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: return true; } @@ -667,7 +669,6 @@ static bool isPCRelativeAArch64(uint32_t Type) { case ELF::R_AARCH64_MOVW_UABS_G2_NC: case ELF::R_AARCH64_MOVW_UABS_G3: return false; - case ELF::R_AARCH64_TLSDESC_CALL: case ELF::R_AARCH64_CALL26: case ELF::R_AARCH64_JUMP26: case ELF::R_AARCH64_TSTBR14: diff --git a/bolt/test/AArch64/tls-desc-call.s b/bolt/test/AArch64/tls-desc-call.s new file mode 100644 index 0000000000000..05753803c3d36 --- /dev/null +++ b/bolt/test/AArch64/tls-desc-call.s @@ -0,0 +1,35 @@ +# RUN: %clang %cflags %s -o %t.so -fPIC -shared -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt --debug-only=bolt 2>&1 | FileCheck %s + +# REQUIRES: asserts + +## Verify that R_AARCH64_TLSDESC_CALL relocations are ignored + +# CHECK-NOT: Relocation {{.*}} R_AARCH64_TLSDESC_CALL + + .text + .globl get_tls_var + .p2align 2 + .type get_tls_var,@function +get_tls_var: + .cfi_startproc + str x30, [sp, #-16]! + adrp x0, :tlsdesc:tls_var + ldr x1, [x0, :tlsdesc_lo12:tls_var] + add x0, x0, :tlsdesc_lo12:tls_var + .tlsdesccall tls_var + blr x1 + mrs x8, TPIDR_EL0 + ldr w0, [x8, x0] + ldr x30, [sp], #16 + ret + .size get_tls_var, .-get_tls_var + .cfi_endproc + + .type tls_var,@object + .section .tdata,"awT",@progbits + .globl tls_var + .p2align 2, 0x0 +tls_var: + .word 42 + .size tls_var, 4 From a2330a398db398e33687a6bed71092a85312e481 Mon Sep 17 00:00:00 2001 From: Samarth Narang <70980689+snarang181@users.noreply.github.com> Date: Thu, 2 Oct 2025 07:59:52 +0530 Subject: [PATCH 456/878] [Clang][Sema] Switch diagnostics from toString to operator<< for APSInt/APInt (#161474) --- clang/lib/Sema/SemaChecking.cpp | 23 ++++----- clang/test/AST/ByteCode/const-eval.c | 2 +- clang/test/Sema/const-eval.c | 2 +- clang/test/Sema/integer-overflow.c | 2 +- clang/test/Sema/unbounded-array-bounds.c | 48 +++++++++---------- clang/test/SemaCXX/array-bounds.cpp | 4 +- .../SemaCXX/constant-expression-cxx14.cpp | 2 +- clang/test/SemaCXX/integer-overflow.cpp | 2 +- 8 files changed, 41 insertions(+), 44 deletions(-) diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 8b9e132505094..3cc61b167ba98 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -14884,13 +14884,11 @@ void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr, // Diag message shows element size in bits and in "bytes" (platform- // dependent CharUnits) DiagRuntimeBehavior(BaseExpr->getBeginLoc(), BaseExpr, - PDiag(DiagID) - << toString(index, 10, true) << AddrBits - << (unsigned)ASTC.toBits(*ElemCharUnits) - << toString(ElemBytes, 10, false) - << toString(MaxElems, 10, false) - << (unsigned)MaxElems.getLimitedValue(~0U) - << IndexExpr->getSourceRange()); + PDiag(DiagID) << index << AddrBits + << (unsigned)ASTC.toBits(*ElemCharUnits) + << ElemBytes << MaxElems + << MaxElems.getZExtValue() + << IndexExpr->getSourceRange()); const NamedDecl *ND = nullptr; // Try harder to find a NamedDecl to point at in the note. 
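A hedged sketch of the mechanism behind this Sema change (standalone, with invented values; not an excerpt of Sema itself): toString renders an APSInt as a bare digit string, while streaming the value hands it to the diagnostic printer, which applies digit grouping, as the updated test expectations below show:

  #include "llvm/ADT/APSInt.h"
  #include "llvm/ADT/StringExtras.h"
  #include <cstdio>
  #include <string>

  int main() {
    llvm::APSInt Index(llvm::APInt(64, 536870912), /*isUnsigned=*/false);
    // Old style: interpolate a plain string into the diagnostic text.
    std::string Plain = llvm::toString(Index, 10, true); // "536870912"
    std::printf("%s\n", Plain.c_str());
    // New style (sketch): Diag << Index passes the APSInt itself to the
    // diagnostic engine, whose integer formatting groups digits, so the
    // rendered message reads 536'870'912 as in the updated expectations.
    return 0;
  }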
@@ -14973,10 +14971,10 @@ void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr, unsigned CastMsg = (!ASE || BaseType == EffectiveType) ? 0 : 1; QualType CastMsgTy = ASE ? ASE->getLHS()->getType() : QualType(); - DiagRuntimeBehavior( - BaseExpr->getBeginLoc(), BaseExpr, - PDiag(DiagID) << toString(index, 10, true) << ArrayTy->desugar() - << CastMsg << CastMsgTy << IndexExpr->getSourceRange()); + DiagRuntimeBehavior(BaseExpr->getBeginLoc(), BaseExpr, + PDiag(DiagID) + << index << ArrayTy->desugar() << CastMsg + << CastMsgTy << IndexExpr->getSourceRange()); } else { unsigned DiagID = diag::warn_array_index_precedes_bounds; if (!ASE) { @@ -14985,8 +14983,7 @@ void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr, } DiagRuntimeBehavior(BaseExpr->getBeginLoc(), BaseExpr, - PDiag(DiagID) << toString(index, 10, true) - << IndexExpr->getSourceRange()); + PDiag(DiagID) << index << IndexExpr->getSourceRange()); } const NamedDecl *ND = nullptr; diff --git a/clang/test/AST/ByteCode/const-eval.c b/clang/test/AST/ByteCode/const-eval.c index c6b51d16b811e..d6cf600b378a8 100644 --- a/clang/test/AST/ByteCode/const-eval.c +++ b/clang/test/AST/ByteCode/const-eval.c @@ -144,7 +144,7 @@ EVAL_EXPR(52, &pr24622 == (void *)&PR24622); // We evaluate these by providing 2s' complement semantics in constant // expressions, like we do for integers. -void *PR28739a = (__int128)(unsigned long)-1 + &PR28739a; // both-warning {{the pointer incremented by 18446744073709551615 refers past the last possible element for an array in 64-bit address space containing 64-bit (8-byte) elements (max possible 2305843009213693952 elements)}} +void *PR28739a = (__int128)(unsigned long)-1 + &PR28739a; // both-warning {{the pointer incremented by 18'446'744'073'709'551'615 refers past the last possible element for an array in 64-bit address space containing 64-bit (8-byte) elements (max possible 2'305'843'009'213'693'952 elements)}} void *PR28739b = &PR28739b + (__int128)(unsigned long)-1; // both-warning {{refers past the last possible element}} __int128 PR28739c = (&PR28739c + (__int128)(unsigned long)-1) - &PR28739c; // both-warning {{refers past the last possible element}} diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c index 11cc7fbc0feb3..53face901d75e 100644 --- a/clang/test/Sema/const-eval.c +++ b/clang/test/Sema/const-eval.c @@ -138,7 +138,7 @@ EVAL_EXPR(52, &pr24622 == (void *)&PR24622); // We evaluate these by providing 2s' complement semantics in constant // expressions, like we do for integers. 
-void *PR28739a = (__int128)(unsigned long)-1 + &PR28739a; // expected-warning {{the pointer incremented by 18446744073709551615 refers past the last possible element for an array in 64-bit address space containing 64-bit (8-byte) elements (max possible 2305843009213693952 elements)}} +void *PR28739a = (__int128)(unsigned long)-1 + &PR28739a; // expected-warning {{the pointer incremented by 18'446'744'073'709'551'615 refers past the last possible element for an array in 64-bit address space containing 64-bit (8-byte) elements (max possible 2'305'843'009'213'693'952 elements)}} void *PR28739b = &PR28739b + (__int128)(unsigned long)-1; // expected-warning {{refers past the last possible element}} __int128 PR28739c = (&PR28739c + (__int128)(unsigned long)-1) - &PR28739c; // expected-warning {{refers past the last possible element}} void *PR28739d = &(&PR28739d)[(__int128)(unsigned long)-1]; // expected-warning {{refers past the last possible element}} diff --git a/clang/test/Sema/integer-overflow.c b/clang/test/Sema/integer-overflow.c index 30a47aa5f6ad6..ba943f0927a22 100644 --- a/clang/test/Sema/integer-overflow.c +++ b/clang/test/Sema/integer-overflow.c @@ -143,7 +143,7 @@ uint64_t check_integer_overflows(int i) { (__imag__ x) = 4608 * 1024 * 1024; // expected-warning@+4 {{overflow in expression; result is 536'870'912 with type 'int'}} -// expected-warning@+3 {{array index 536870912 is past the end of the array (that has type 'uint64_t[10]' (aka 'unsigned long long[10]'))}} +// expected-warning@+3 {{array index 536'870'912 is past the end of the array (that has type 'uint64_t[10]' (aka 'unsigned long long[10]'))}} // expected-note@+1 {{array 'a' declared here}} uint64_t a[10]; a[4608 * 1024 * 1024] = 1i; diff --git a/clang/test/Sema/unbounded-array-bounds.c b/clang/test/Sema/unbounded-array-bounds.c index b22261a3eaeb5..909286b283852 100644 --- a/clang/test/Sema/unbounded-array-bounds.c +++ b/clang/test/Sema/unbounded-array-bounds.c @@ -14,11 +14,11 @@ struct S s[]; // expected-warning {{tentative array definition}} expected-note { void f1(void) { ++s[3].a; ++s[7073650413200313099].b; - // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} - // addr32-warning@-2 {{array index 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178956970 elements)}} - // addr64-warning@-3 {{array index 7073650413200313099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576460752303423488 elements)}} + // addr16-warning@-1 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3'449 elements)}} + // addr32-warning@-2 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178'956'970 elements)}} + // addr64-warning@-3 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576'460'752'303'423'488 elements)}} ++s[7073650].c; - // addr16-warning@-1 {{array index 7073650 refers past the last possible element for an array in 16-bit address space containing 152-bit 
(19-byte) elements (max possible 3449 elements)}} + // addr16-warning@-1 {{array index 7'073'650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3'449 elements)}} } long long ll[]; // expected-warning {{tentative array definition}} expected-note {{declared here}} addr16-note {{declared here}} addr32-note {{declared here}} @@ -26,32 +26,32 @@ long long ll[]; // expected-warning {{tentative array definition}} expected-note void f2(void) { ++ll[3]; ++ll[2705843009213693952]; - // addr16-warning@-1 {{array index 2705843009213693952 refers past the last possible element for an array in 16-bit address space containing 64-bit (8-byte) elements (max possible 8192 elements)}} - // addr32-warning@-2 {{array index 2705843009213693952 refers past the last possible element for an array in 32-bit address space containing 64-bit (8-byte) elements (max possible 536870912 elements)}} - // addr64-warning@-3 {{array index 2705843009213693952 refers past the last possible element for an array in 64-bit address space containing 64-bit (8-byte) elements (max possible 2305843009213693952 elements)}} + // addr16-warning@-1 {{array index 2'705'843'009'213'693'952 refers past the last possible element for an array in 16-bit address space containing 64-bit (8-byte) elements (max possible 8'192 elements)}} + // addr32-warning@-2 {{array index 2'705'843'009'213'693'952 refers past the last possible element for an array in 32-bit address space containing 64-bit (8-byte) elements (max possible 536'870'912 elements)}} + // addr64-warning@-3 {{array index 2'705'843'009'213'693'952 refers past the last possible element for an array in 64-bit address space containing 64-bit (8-byte) elements (max possible 2'305'843'009'213'693'952 elements)}} ++ll[847073650]; - // addr16-warning@-1 {{array index 847073650 refers past the last possible element for an array in 16-bit address space containing 64-bit (8-byte) elements (max possible 8192 elements)}} - // addr32-warning@-2 {{array index 847073650 refers past the last possible element for an array in 32-bit address space containing 64-bit (8-byte) elements (max possible 536870912 elements)}} + // addr16-warning@-1 {{array index 847'073'650 refers past the last possible element for an array in 16-bit address space containing 64-bit (8-byte) elements (max possible 8'192 elements)}} + // addr32-warning@-2 {{array index 847'073'650 refers past the last possible element for an array in 32-bit address space containing 64-bit (8-byte) elements (max possible 536'870'912 elements)}} } void f3(struct S p[]) { // expected-note {{declared here}} addr16-note {{declared here}} ++p[3].a; ++p[7073650413200313099].b; - // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} - // addr32-warning@-2 {{array index 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178956970 elements)}} - // addr64-warning@-3 {{array index 7073650413200313099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576460752303423488 elements)}} + // addr16-warning@-1 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3'449 
elements)}} + // addr32-warning@-2 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178'956'970 elements)}} + // addr64-warning@-3 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576'460'752'303'423'488 elements)}} ++p[7073650].c; - // addr16-warning@-1 {{array index 7073650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} + // addr16-warning@-1 {{array index 7'073'650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3'449 elements)}} } void f4(struct S *p) { // expected-note {{declared here}} addr16-note {{declared here}} p += 3; p += 7073650413200313099; - // addr16-warning@-1 {{the pointer incremented by 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} - // addr32-warning@-2 {{the pointer incremented by 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178956970 elements)}} - // addr64-warning@-3 {{the pointer incremented by 7073650413200313099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576460752303423488 elements)}} + // addr16-warning@-1 {{the pointer incremented by 7'073'650'413'200'313'099 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3'449 elements)}} + // addr32-warning@-2 {{the pointer incremented by 7'073'650'413'200'313'099 refers past the last possible element for an array in 32-bit address space containing 192-bit (24-byte) elements (max possible 178'956'970 elements)}} + // addr64-warning@-3 {{the pointer incremented by 7'073'650'413'200'313'099 refers past the last possible element for an array in 64-bit address space containing 256-bit (32-byte) elements (max possible 576'460'752'303'423'488 elements)}} p += 7073650; - // addr16-warning@-1 {{the pointer incremented by 7073650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3449 elements)}} + // addr16-warning@-1 {{the pointer incremented by 7'073'650 refers past the last possible element for an array in 16-bit address space containing 152-bit (19-byte) elements (max possible 3'449 elements)}} } struct BQ { @@ -63,7 +63,7 @@ struct BQ bq[]; // expected-warning {{tentative array definition}} addr16-note { void f5(void) { ++bq[0].bigblock[0].a; ++bq[1].bigblock[0].a; - // addr16-warning@-1 {{array index 1 refers past the last possible element for an array in 16-bit address space containing 497952-bit (62244-byte) elements (max possible 1 element)}} + // addr16-warning@-1 {{array index 1 refers past the last possible element for an array in 16-bit address space containing 497952-bit (62'244-byte) elements (max possible 1 element)}} } void f6(void) { @@ -102,15 +102,15 @@ struct { void fam_ily() { ++fam.tail[7073650413200313099]; - // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 8-bit 
(1-byte) elements (max possible 65536 elements)}} - // addr32-warning@-2 {{array index 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 8-bit (1-byte) elements (max possible 4294967296 elements)}} + // addr16-warning@-1 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 16-bit address space containing 8-bit (1-byte) elements (max possible 65'536 elements)}} + // addr32-warning@-2 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 32-bit address space containing 8-bit (1-byte) elements (max possible 4'294'967'296 elements)}} // No warning for addr64 because the array index is inbound in that case. ++fam0.tail[7073650413200313099]; - // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 8-bit (1-byte) elements (max possible 65536 elements)}} - // addr32-warning@-2 {{array index 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 8-bit (1-byte) elements (max possible 4294967296 elements)}} + // addr16-warning@-1 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 16-bit address space containing 8-bit (1-byte) elements (max possible 65'536 elements)}} + // addr32-warning@-2 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 32-bit address space containing 8-bit (1-byte) elements (max possible 4'294'967'296 elements)}} // No warning for addr64 because the array index is inbound in that case. ++fam1.tail[7073650413200313099]; - // addr16-warning@-1 {{array index 7073650413200313099 refers past the last possible element for an array in 16-bit address space containing 8-bit (1-byte) elements (max possible 65536 elements)}} - // addr32-warning@-2 {{array index 7073650413200313099 refers past the last possible element for an array in 32-bit address space containing 8-bit (1-byte) elements (max possible 4294967296 elements)}} + // addr16-warning@-1 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 16-bit address space containing 8-bit (1-byte) elements (max possible 65'536 elements)}} + // addr32-warning@-2 {{array index 7'073'650'413'200'313'099 refers past the last possible element for an array in 32-bit address space containing 8-bit (1-byte) elements (max possible 4'294'967'296 elements)}} // No warning for addr64 because the array index is inbound in that case. } diff --git a/clang/test/SemaCXX/array-bounds.cpp b/clang/test/SemaCXX/array-bounds.cpp index b584e1e7cd453..6a40d1db0a6fd 100644 --- a/clang/test/SemaCXX/array-bounds.cpp +++ b/clang/test/SemaCXX/array-bounds.cpp @@ -237,7 +237,7 @@ void test_pr10771() { ((char*)foo)[sizeof(foo) - 1] = '\0'; // no-warning *(((char*)foo) + sizeof(foo) - 1) = '\0'; // no-warning - ((char*)foo)[sizeof(foo)] = '\0'; // expected-warning {{array index 32768 is past the end of the array (that has type 'double[4096]', cast to 'char *')}} + ((char*)foo)[sizeof(foo)] = '\0'; // expected-warning {{array index 32'768 is past the end of the array (that has type 'double[4096]', cast to 'char *')}} // TODO: This should probably warn, too. *(((char*)foo) + sizeof(foo)) = '\0'; // no-warning @@ -248,7 +248,7 @@ int test_pr11007_aux(const char * restrict, ...); // Test checking with varargs. 
 void test_pr11007() {
   double a[5]; // expected-note {{array 'a' declared here}}
-  test_pr11007_aux("foo", a[1000]); // expected-warning {{array index 1000 is past the end of the array (that has type 'double[5]')}}
+  test_pr11007_aux("foo", a[1000]); // expected-warning {{array index 1'000 is past the end of the array (that has type 'double[5]')}}
 }

 void test_rdar10916006(void)
diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index 1743e0e3ac4b5..bea90ff7eaf8a 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -1047,7 +1047,7 @@ constexpr int S = sum(Cs); // expected-error{{must be initialized by a constant
 constexpr void PR28739(int n) { // cxx14_20-error {{never produces a constant}}
   int *p = &n; // expected-note {{array 'p' declared here}}
   p += (__int128)(unsigned long)-1; // cxx14_20-note {{cannot refer to element 18446744073709551615 of non-array object in a constant expression}}
-  // expected-warning@-1 {{the pointer incremented by 18446744073709551615 refers past the last possible element for an array in 64-bit address space containing 32-bit (4-byte) elements (max possible 4611686018427387904 elements)}}
+  // expected-warning@-1 {{the pointer incremented by 18'446'744'073'709'551'615 refers past the last possible element for an array in 64-bit address space containing 32-bit (4-byte) elements (max possible 4'611'686'018'427'387'904 elements)}}
 }

 constexpr void Void(int n) {
diff --git a/clang/test/SemaCXX/integer-overflow.cpp b/clang/test/SemaCXX/integer-overflow.cpp
index 73a4e88ee6c09..214dc11bf3ead 100644
--- a/clang/test/SemaCXX/integer-overflow.cpp
+++ b/clang/test/SemaCXX/integer-overflow.cpp
@@ -171,7 +171,7 @@ uint64_t check_integer_overflows(int i) { //expected-note 0+{{declared here}}
   uint64_t a[10];
   a[4608 * 1024 * 1024] = 1;
 #if __cplusplus < 201103L
-// expected-warning@-2 {{array index 536870912 is past the end of the array (that has type 'uint64_t[10]' (aka 'unsigned long long[10]'))}}
+// expected-warning@-2 {{array index 536'870'912 is past the end of the array (that has type 'uint64_t[10]' (aka 'unsigned long long[10]'))}}
 // expected-note@-4 {{array 'a' declared here}}
 #endif

From 2d0637494936be3742750ab95b856e3cb86d1198 Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Wed, 1 Oct 2025 22:47:53 -0400
Subject: [PATCH 457/878] [mlir][arith] Add mulf(x, 0) -> 0 to mulf folder
 (#161395)

Fold `mulf(x, 0) -> 0` when (nnan | nsz)
---
 mlir/lib/Dialect/Arith/IR/ArithOps.cpp    |  7 +++++++
 mlir/test/Dialect/Arith/canonicalize.mlir | 12 ++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 7cfd6d3a98df8..898d76ce8d9b5 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -1282,6 +1282,13 @@ OpFoldResult arith::MulFOp::fold(FoldAdaptor adaptor) {
   if (matchPattern(adaptor.getRhs(), m_OneFloat()))
     return getLhs();

+  if (arith::bitEnumContainsAll(getFastmath(), arith::FastMathFlags::nnan |
+                                                   arith::FastMathFlags::nsz)) {
+    // mulf(x, 0) -> 0
+    if (matchPattern(adaptor.getRhs(), m_AnyZeroFloat()))
+      return getRhs();
+  }
+
   return constFoldBinaryOp<FloatAttr>(
       adaptor.getOperands(),
       [](const APFloat &a, const APFloat &b) { return a * b; });
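(The two fastmath bits are exactly what make the fold sound. As a hedged, standalone MLIR sketch, with function names invented for illustration: without nnan, mulf(NaN, 0.0) is NaN rather than 0.0, and without nsz, mulf(-1.0, 0.0) is -0.0 rather than +0.0, so only the flagged form may fold:

  // %x * 0.0 only folds when both nnan and nsz are present.
  func.func @fold_ok(%x: f32) -> f32 {
    %zero = arith.constant 0.0 : f32
    %r = arith.mulf %x, %zero fastmath<nnan,nsz> : f32  // folds to %zero
    return %r : f32
  }
  func.func @no_fold(%x: f32) -> f32 {
    %zero = arith.constant 0.0 : f32
    %r = arith.mulf %x, %zero : f32  // kept: %x could be NaN or negative
    return %r : f32
  }
)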
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index ca3de3a2d7703..2fe0995c9d4df 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -2216,6 +2216,18 @@ func.func @test_mulf1(%arg0 : f32, %arg1 : f32) -> (f32) {
   return %2 : f32
 }

+// CHECK-LABEL: @test_mulf2(
+func.func @test_mulf2(%arg0 : f32) -> (f32, f32) {
+  // CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
+  // CHECK-DAG: %[[C0n:.+]] = arith.constant -0.000000e+00 : f32
+  // CHECK-NEXT: return %[[C0]], %[[C0n]]
+  %c0 = arith.constant 0.0 : f32
+  %c0n = arith.constant -0.0 : f32
+  %0 = arith.mulf %c0, %arg0 fastmath<nnan,nsz> : f32
+  %1 = arith.mulf %c0n, %arg0 fastmath<nnan,nsz> : f32
+  return %0, %1 : f32, f32
+}
+
 // -----

 // CHECK-LABEL: @test_divf(

From 129d5ce14c4fd094799b50e6ebe6c8f9ca5003f1 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 1 Oct 2025 19:55:46 -0700
Subject: [PATCH 458/878] [RISCV][GISel] Use LBU for anyext i8 atomic_load.
 (#161588)

This matches what we do for regular i8 extload due to the lack of c.lb
in Zcb. This only affects global isel because SelectionDAG won't create
an anyext i8 atomic_load today.
---
 llvm/lib/Target/RISCV/RISCVGISel.td           |  5 ++-
 llvm/lib/Target/RISCV/RISCVInstrInfoA.td      |  7 ++--
 .../RISCV/GlobalISel/atomic-load-store.ll     | 40 +++++++++----------
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td
index 2e5f30f8fe35a..19d5aff023d53 100644
--- a/llvm/lib/Target/RISCV/RISCVGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVGISel.td
@@ -109,8 +109,9 @@ def : LdPat; // Prefer unsigned due to no c.lb in Zcb.
 def : StPat;

 let Predicates = [HasAtomicLdSt] in {
-  def : LdPat;
-  def : LdPat;
+  // Prefer unsigned due to no c.lb in Zcb.
+  def : LdPat;
+  def : LdPat;

   def : StPat;
   def : StPat;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 59f5aebf658d8..99992d196b43d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -165,10 +165,11 @@ class seq_cst_store
 // any ordering. This is necessary because AtomicExpandPass has added fences to
 // atomic load/stores and changed them to unordered ones.
 let Predicates = [HasAtomicLdSt] in {
-  def : LdPat, LB>;
+  // Use unsigned for aext due to no c.lb in Zcb.
+ def : LdPat, LB>; + def : LdPat, LBU>; def : LdPat, LH>; - def : LdPat, LBU>; - def : LdPat, LHU>; + def : LdPat, LHU>; def : StPat, SB, GPR, XLenVT>; def : StPat, SH, GPR, XLenVT>; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/atomic-load-store.ll b/llvm/test/CodeGen/RISCV/GlobalISel/atomic-load-store.ll index 9a1ed8f115b35..1d5d918422b28 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/atomic-load-store.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/atomic-load-store.ll @@ -37,7 +37,7 @@ define i8 @atomic_load_i8_unordered(ptr %a) nounwind { ; ; RV32IA-LABEL: atomic_load_i8_unordered: ; RV32IA: # %bb.0: -; RV32IA-NEXT: lb a0, 0(a0) +; RV32IA-NEXT: lbu a0, 0(a0) ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomic_load_i8_unordered: @@ -52,7 +52,7 @@ define i8 @atomic_load_i8_unordered(ptr %a) nounwind { ; ; RV64IA-LABEL: atomic_load_i8_unordered: ; RV64IA: # %bb.0: -; RV64IA-NEXT: lb a0, 0(a0) +; RV64IA-NEXT: lbu a0, 0(a0) ; RV64IA-NEXT: ret %1 = load atomic i8, ptr %a unordered, align 1 ret i8 %1 @@ -71,7 +71,7 @@ define i8 @atomic_load_i8_monotonic(ptr %a) nounwind { ; ; RV32IA-LABEL: atomic_load_i8_monotonic: ; RV32IA: # %bb.0: -; RV32IA-NEXT: lb a0, 0(a0) +; RV32IA-NEXT: lbu a0, 0(a0) ; RV32IA-NEXT: ret ; ; RV64I-LABEL: atomic_load_i8_monotonic: @@ -86,7 +86,7 @@ define i8 @atomic_load_i8_monotonic(ptr %a) nounwind { ; ; RV64IA-LABEL: atomic_load_i8_monotonic: ; RV64IA: # %bb.0: -; RV64IA-NEXT: lb a0, 0(a0) +; RV64IA-NEXT: lbu a0, 0(a0) ; RV64IA-NEXT: ret %1 = load atomic i8, ptr %a monotonic, align 1 ret i8 %1 @@ -105,13 +105,13 @@ define i8 @atomic_load_i8_acquire(ptr %a) nounwind { ; ; RV32IA-WMO-LABEL: atomic_load_i8_acquire: ; RV32IA-WMO: # %bb.0: -; RV32IA-WMO-NEXT: lb a0, 0(a0) +; RV32IA-WMO-NEXT: lbu a0, 0(a0) ; RV32IA-WMO-NEXT: fence r, rw ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: atomic_load_i8_acquire: ; RV32IA-TSO: # %bb.0: -; RV32IA-TSO-NEXT: lb a0, 0(a0) +; RV32IA-TSO-NEXT: lbu a0, 0(a0) ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomic_load_i8_acquire: @@ -126,35 +126,35 @@ define i8 @atomic_load_i8_acquire(ptr %a) nounwind { ; ; RV64IA-WMO-LABEL: atomic_load_i8_acquire: ; RV64IA-WMO: # %bb.0: -; RV64IA-WMO-NEXT: lb a0, 0(a0) +; RV64IA-WMO-NEXT: lbu a0, 0(a0) ; RV64IA-WMO-NEXT: fence r, rw ; RV64IA-WMO-NEXT: ret ; ; RV64IA-TSO-LABEL: atomic_load_i8_acquire: ; RV64IA-TSO: # %bb.0: -; RV64IA-TSO-NEXT: lb a0, 0(a0) +; RV64IA-TSO-NEXT: lbu a0, 0(a0) ; RV64IA-TSO-NEXT: ret ; ; RV32IA-WMO-TRAILING-FENCE-LABEL: atomic_load_i8_acquire: ; RV32IA-WMO-TRAILING-FENCE: # %bb.0: -; RV32IA-WMO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV32IA-WMO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV32IA-WMO-TRAILING-FENCE-NEXT: fence r, rw ; RV32IA-WMO-TRAILING-FENCE-NEXT: ret ; ; RV32IA-TSO-TRAILING-FENCE-LABEL: atomic_load_i8_acquire: ; RV32IA-TSO-TRAILING-FENCE: # %bb.0: -; RV32IA-TSO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV32IA-TSO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV32IA-TSO-TRAILING-FENCE-NEXT: ret ; ; RV64IA-WMO-TRAILING-FENCE-LABEL: atomic_load_i8_acquire: ; RV64IA-WMO-TRAILING-FENCE: # %bb.0: -; RV64IA-WMO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV64IA-WMO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV64IA-WMO-TRAILING-FENCE-NEXT: fence r, rw ; RV64IA-WMO-TRAILING-FENCE-NEXT: ret ; ; RV64IA-TSO-TRAILING-FENCE-LABEL: atomic_load_i8_acquire: ; RV64IA-TSO-TRAILING-FENCE: # %bb.0: -; RV64IA-TSO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV64IA-TSO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV64IA-TSO-TRAILING-FENCE-NEXT: ret %1 = load atomic i8, ptr %a acquire, align 1 ret i8 %1 @@ -174,14 +174,14 @@ define i8 
@atomic_load_i8_seq_cst(ptr %a) nounwind { ; RV32IA-WMO-LABEL: atomic_load_i8_seq_cst: ; RV32IA-WMO: # %bb.0: ; RV32IA-WMO-NEXT: fence rw, rw -; RV32IA-WMO-NEXT: lb a0, 0(a0) +; RV32IA-WMO-NEXT: lbu a0, 0(a0) ; RV32IA-WMO-NEXT: fence r, rw ; RV32IA-WMO-NEXT: ret ; ; RV32IA-TSO-LABEL: atomic_load_i8_seq_cst: ; RV32IA-TSO: # %bb.0: ; RV32IA-TSO-NEXT: fence rw, rw -; RV32IA-TSO-NEXT: lb a0, 0(a0) +; RV32IA-TSO-NEXT: lbu a0, 0(a0) ; RV32IA-TSO-NEXT: ret ; ; RV64I-LABEL: atomic_load_i8_seq_cst: @@ -197,40 +197,40 @@ define i8 @atomic_load_i8_seq_cst(ptr %a) nounwind { ; RV64IA-WMO-LABEL: atomic_load_i8_seq_cst: ; RV64IA-WMO: # %bb.0: ; RV64IA-WMO-NEXT: fence rw, rw -; RV64IA-WMO-NEXT: lb a0, 0(a0) +; RV64IA-WMO-NEXT: lbu a0, 0(a0) ; RV64IA-WMO-NEXT: fence r, rw ; RV64IA-WMO-NEXT: ret ; ; RV64IA-TSO-LABEL: atomic_load_i8_seq_cst: ; RV64IA-TSO: # %bb.0: ; RV64IA-TSO-NEXT: fence rw, rw -; RV64IA-TSO-NEXT: lb a0, 0(a0) +; RV64IA-TSO-NEXT: lbu a0, 0(a0) ; RV64IA-TSO-NEXT: ret ; ; RV32IA-WMO-TRAILING-FENCE-LABEL: atomic_load_i8_seq_cst: ; RV32IA-WMO-TRAILING-FENCE: # %bb.0: ; RV32IA-WMO-TRAILING-FENCE-NEXT: fence rw, rw -; RV32IA-WMO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV32IA-WMO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV32IA-WMO-TRAILING-FENCE-NEXT: fence r, rw ; RV32IA-WMO-TRAILING-FENCE-NEXT: ret ; ; RV32IA-TSO-TRAILING-FENCE-LABEL: atomic_load_i8_seq_cst: ; RV32IA-TSO-TRAILING-FENCE: # %bb.0: ; RV32IA-TSO-TRAILING-FENCE-NEXT: fence rw, rw -; RV32IA-TSO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV32IA-TSO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV32IA-TSO-TRAILING-FENCE-NEXT: ret ; ; RV64IA-WMO-TRAILING-FENCE-LABEL: atomic_load_i8_seq_cst: ; RV64IA-WMO-TRAILING-FENCE: # %bb.0: ; RV64IA-WMO-TRAILING-FENCE-NEXT: fence rw, rw -; RV64IA-WMO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV64IA-WMO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV64IA-WMO-TRAILING-FENCE-NEXT: fence r, rw ; RV64IA-WMO-TRAILING-FENCE-NEXT: ret ; ; RV64IA-TSO-TRAILING-FENCE-LABEL: atomic_load_i8_seq_cst: ; RV64IA-TSO-TRAILING-FENCE: # %bb.0: ; RV64IA-TSO-TRAILING-FENCE-NEXT: fence rw, rw -; RV64IA-TSO-TRAILING-FENCE-NEXT: lb a0, 0(a0) +; RV64IA-TSO-TRAILING-FENCE-NEXT: lbu a0, 0(a0) ; RV64IA-TSO-TRAILING-FENCE-NEXT: ret %1 = load atomic i8, ptr %a seq_cst, align 1 ret i8 %1 From ac0e99e19124ab720e8837d1500f0c000559a0ef Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 1 Oct 2025 20:06:15 -0700 Subject: [PATCH 459/878] [Analysis] Fix a warning This patch fixes: llvm/lib/Analysis/IR2Vec.cpp:289:14: error: unused variable 'allSameDim' [-Werror,-Wunused-variable] --- llvm/lib/Analysis/IR2Vec.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 271f004b0a787..af30422b73759 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -286,12 +286,13 @@ VocabStorage::VocabStorage(std::vector> &&SectionData) // Verify that all embeddings across all sections have the same // dimension - auto allSameDim = [ExpectedDim](const std::vector &Section) { - return std::all_of(Section.begin(), Section.end(), - [ExpectedDim](const Embedding &Emb) { - return Emb.size() == ExpectedDim; - }); - }; + [[maybe_unused]] auto allSameDim = + [ExpectedDim](const std::vector &Section) { + return std::all_of(Section.begin(), Section.end(), + [ExpectedDim](const Embedding &Emb) { + return Emb.size() == ExpectedDim; + }); + }; assert(std::all_of(Sections.begin(), Sections.end(), allSameDim) && "All embeddings must have the same dimension"); From 
From c6e280e7ed9e120ba5e8c141bd5c4fd116d076a2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 2 Oct 2025 13:36:47 +0900
Subject: [PATCH 460/878] PeepholeOpt: Fix losing subregister indexes on full copies (#161310)

Previously, if we had a subregister extract reading from a full copy, the
no-subregister incoming copy would overwrite the DefSubReg index of the
folding context.

There's one ugly RVV regression, but it's a downstream issue of this
change; an unnecessary same-class reg-to-reg full copy is now avoided.
---
 llvm/lib/CodeGen/PeepholeOptimizer.cpp        |   22 +-
 .../AMDGPU/GlobalISel/atomicrmw_fmax.ll       |  104 +-
 .../AMDGPU/GlobalISel/atomicrmw_fmin.ll       |  104 +-
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  518 +++---
 .../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll |    1 -
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 1641 ++++++++---------
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   |  131 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll    |   96 +-
 .../atomic_optimizations_global_pointer.ll    |   94 +-
 .../buffer-fat-pointer-atomicrmw-fadd.ll      |  460 ++---
 .../buffer-fat-pointer-atomicrmw-fmax.ll      |  459 +++--
 .../buffer-fat-pointer-atomicrmw-fmin.ll      |  459 +++--
 llvm/test/CodeGen/AMDGPU/div_v2i128.ll        |  126 +-
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        |   26 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll   |   76 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll   |   47 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll   |   47 +-
 .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll   |   76 +-
 .../AMDGPU/global_atomics_i64_system.ll       |  721 ++++----
 llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll    |    3 +-
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll |    2 +-
 .../AMDGPU/peephole-opt-regseq-removal.mir    |    4 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      |    6 +-
 llvm/test/CodeGen/ARM/llvm.exp10.ll           |   16 +-
 llvm/test/CodeGen/ARM/llvm.frexp.ll           |   36 +-
 llvm/test/CodeGen/Mips/no-odd-spreg-msa.ll    |    1 -
 llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll  |   20 +-
 .../PowerPC/vec_conv_i64_to_fp32_elts.ll      |   16 +-
 llvm/test/CodeGen/RISCV/rvv/expandload.ll     |   10 +-
 .../rvv/fixed-vectors-interleaved-access.ll   |  114 +-
 .../RISCV/rvv/named-vector-shuffle-reverse.ll |   72 +-
 .../RISCV/rvv/nontemporal-vp-scalable.ll      |   80 +-
 .../test/CodeGen/Thumb2/mve-soft-float-abi.ll |   41 +-
 llvm/test/CodeGen/Thumb2/mve-vld3.ll          |  414 ++---
 34 files changed, 2916 insertions(+), 3127 deletions(-)

diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 729a57ef23b1e..e1d39d64e9fb8 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1929,7 +1929,27 @@ ValueTrackerResult ValueTracker::getNextSourceFromCopy() {
   const MachineOperand &Src = Def->getOperand(1);
   if (Src.isUndef())
     return ValueTrackerResult();
-  return ValueTrackerResult(Src.getReg(), Src.getSubReg());
+
+  Register SrcReg = Src.getReg();
+  unsigned SubReg = Src.getSubReg();
+  if (DefSubReg) {
+    const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+    SubReg = TRI->composeSubRegIndices(SubReg, DefSubReg);
+
+    if (SrcReg.isVirtual()) {
+      // TODO: Try constraining on rewrite if we can
+      const TargetRegisterClass *RegRC = MRI.getRegClass(SrcReg);
+      const TargetRegisterClass *SrcWithSubRC =
+          TRI->getSubClassWithSubReg(RegRC, SubReg);
+      if (RegRC != SrcWithSubRC)
+        return ValueTrackerResult();
+    } else {
+      if (!TRI->getSubReg(SrcReg, SubReg))
+        return ValueTrackerResult();
+    }
+  }
+
+  return ValueTrackerResult(SrcReg, SubReg);
 }
 
 ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index 666523c88860c..ff618c05e2b80 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -1812,26 +1812,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], null offen ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1854,27 +1854,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], 0 offen ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: 
buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1906,28 +1906,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v8, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1937,28 +1935,26 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 
v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 351502816ae6e..007417c83e324 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -1812,26 +1812,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 -; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX12-NEXT: v_mov_b32_e32 v8, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX12-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], null offen ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1854,27 +1854,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX11: ; 
%bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, s16 -; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1 +; GFX11-NEXT: v_mov_b32_e32 v8, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], 0 offen -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], 0 offen ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc +; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1906,28 +1906,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v6, s20 -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v8, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 ; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; 
GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB14_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1937,28 +1935,26 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v6, s20 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v6, s[16:19], 0 offen -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v8, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 ; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index ba5a8e9c68a1f..9e412b6c7cd0a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -209,48 +209,48 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 ; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc ; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: 
v_cndmask_b32_e64 v1, v1, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s10, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 ; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v7 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -299,7 +299,6 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s19 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] @@ -346,30 +345,30 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v6 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mov_b32_e32 v4, s19 +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 ; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1] -; GFX9-NEXT: 
v_subrev_co_u32_e32 v8, vcc, s18, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 ; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc ; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s18, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc @@ -378,14 +377,15 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -1070,6 +1070,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1082,184 +1083,183 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2] 
-; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1] -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15 -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s14 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v2 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc -; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 -; GFX8-NEXT: v_trunc_f32_e32 v3, v2 -; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v3 -; GFX8-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v6 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v7, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc +; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; GFX8-NEXT: v_trunc_f32_e32 v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 +; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 +; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 +; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v12, v[2:3] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v3, v15, v1 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v12, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17 +; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 +; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 +; GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 
v5, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 +; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 ; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc ; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc +; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v17, vcc -; GFX8-NEXT: v_mov_b32_e32 v1, v4 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] ; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v12, v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3 -; GFX8-NEXT: v_mul_lo_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, v15, v4 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 +; GFX8-NEXT: v_mul_lo_u32 v8, v12, v5 ; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; 
GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 +; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v12, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s15 -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8] -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3 -; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc -; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 +; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s15, v11, v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v4 +; GFX8-NEXT: v_subb_u32_e64 v1, s[0:1], v1, v0, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v0 
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v1 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s14, v8 +; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v9 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v10, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v11 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v12, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v7 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v9, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1] -; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4] +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v0, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udivrem_v2i64: @@ -1355,11 +1355,11 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s17 ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v3, s17 ; 
GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v4, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 @@ -1387,7 +1387,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] @@ -1396,128 +1396,128 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] ; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 ; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v6, v12, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 ; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 ; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 ; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v6, v17 +; GFX9-NEXT: v_add_u32_e32 v5, v5, v17 ; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 ; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3 +; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, v12, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX9-NEXT: v_mul_hi_u32 v10, v12, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v20, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; 
GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v15, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 +; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc +; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] +; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 +; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v5, v8, v7, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s19, v4 -; GFX9-NEXT: v_mul_lo_u32 v8, s18, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, s19, v5 ; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, s19, v5 -; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, s18, v5 -; GFX9-NEXT: v_mul_hi_u32 v12, s19, v5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v1 +; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 +; GFX9-NEXT: v_mul_hi_u32 v7, s18, v5 +; GFX9-NEXT: v_mul_hi_u32 v13, s19, v5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v4, v6 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s6, v11, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v9, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v0, v10, v8 -; 
GFX9-NEXT: v_add3_u32 v8, v0, v1, v12 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v8, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s19 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v8, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, v9, v7 +; GFX9-NEXT: v_add3_u32 v12, v1, v12, v13 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s6, v12, v[1:2] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[0:1] -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s18, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v0, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, s18, v4 +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v0, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, s19, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s6, v8 +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11 ; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1] ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v10 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v12, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v1, v0, s[0:1] ; GFX9-NEXT: global_store_dwordx4 v13, v[2:5], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[6:9], s[14:15] ; GFX9-NEXT: 
s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll index d053425afbb6d..7cc505171da82 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -1483,7 +1483,6 @@ define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX90A-NEXT: s_cbranch_execz .LBB20_2 ; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 815b9f294be8f..df9c97fa23722 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -161654,177 +161654,175 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:120 -; GFX11-TRUE16-NEXT: s_clause 0x1a -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_store_b32 
off, v108, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:236 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:112 +; 
GFX11-TRUE16-NEXT: s_clause 0x18 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:12 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_lo16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v98, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr140_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr154_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 @@ -161838,135 +161836,135 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[116:117], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[133:134], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[27:28] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[3:4] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v7 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 24, 
v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v85 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v85 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v99 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v99 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v84 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[9:10] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[146:147], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[84:85] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v98 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v17 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[98:99] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[23:24] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.h, v5.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v111.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v138.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v108.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v153.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v137.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v154.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v152.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v127.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v142.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v143.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v141.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v18.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v32.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v20.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v22.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v24.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v26.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v27.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v84.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v84.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v85.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v98.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v98.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v99.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5 @@ -161982,148 +161980,153 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99 ; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v32 :: v_dual_lshlrev_b32 v31, 16, v18 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v32 :: 
v_dual_add_f32 v31, 0x40c00000, v31 ; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v31, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v31, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v31, v38, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v32, v37, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v17, 16, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_cndmask_b32 v32, v35, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v35, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v32 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v32 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v37, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v37, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v31, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v34 :: v_dual_add_f32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v135.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v148.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v33, v35, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v33, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v31 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v17, v34, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v149.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 8, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v147.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v17, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v17, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v20, v35 :: v_dual_and_b32 v20, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v20, v35 :: v_dual_and_b32 v18, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 24, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_f32 v22, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v22, 16, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v151, v19, v35 :: v_dual_lshlrev_b32 v22, 16, v24 -; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v149, v19, v35 :: v_dual_lshlrev_b32 v22, 16, v24 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v151.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v149.h ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v160, v17, v24 :: v_dual_lshlrev_b32 v21, 16, v23 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v17, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v36 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v19, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_f32 v20, 0x40c00000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v36 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 24, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v19, 
v23, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v26 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v17, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v151.h ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v150.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v161.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v160, v17, v23 :: v_dual_lshlrev_b32 v21, 16, v25 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162136,10 +162139,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v161, v19, v23 :: v_dual_lshlrev_b32 v22, 16, v28 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -162152,10 +162153,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v160.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v163.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v161.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v150.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162168,10 +162169,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v49 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v49 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v35 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v49 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v35 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 @@ -162184,10 +162185,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v29 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v165.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v38 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v163.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v38 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 @@ -162200,14 +162201,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v85 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v51 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v99 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v51 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v85 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v99 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v53, v17, v24 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 @@ -162216,14 +162217,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v84 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v167.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v98 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v37.l, v160.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v165.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v17, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v84 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v98 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_cndmask_b32 v52, v19, v24 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 @@ -162232,10 +162233,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v162.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 24, v53 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v53 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 @@ -162248,11 +162249,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v177.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v167.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v17, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v17, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 @@ -162265,11 +162265,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v55 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v55 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v55 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v20, 16, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v19, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -162282,10 +162282,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, 
v1, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v179.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v162.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v177.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v17, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v17, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -162300,9 +162301,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v164.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v65 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v2, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v65 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v65 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v2, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v3 @@ -162312,13 +162313,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v17, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v183.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v179.h ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v166.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[48:49] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v43, v1, v18, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v164.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v67 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v1, v18, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6 @@ -162329,13 +162330,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v43.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[50:51] ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[50:51] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 24, v67 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 8, v67 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v47, v2, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[48:49] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v67 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v2, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 @@ -162349,23 +162350,23 @@ define <128 x i8> 
@bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v47.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v62, v2, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v50 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v56, v2, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v5, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v69 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v48 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v1, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v2, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v69 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v56.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v74, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v60, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 @@ -162379,8 +162380,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v74.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v176.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v60.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v166.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v1, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 @@ -162388,21 +162389,20 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v91, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v79, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v91.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v1, v4, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v79.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 @@ -162410,10 +162410,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v178.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, v89.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[82:83] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v2, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v76.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v176.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[82:83] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v2, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 @@ -162421,29 +162421,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[54:55] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53] ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v111, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v106, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v111.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v108, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v106.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v104, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[98:99] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[96:97] ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v3, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, v108.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v99 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v104.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 @@ -162452,8 +162452,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v180.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v138, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v178.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 
v127, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 @@ -162461,19 +162461,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v137, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v125, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v114.l, v138.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v130, v3, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v40.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 @@ -162481,11 +162481,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v62.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v137.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v125.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v127.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v142, v4, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 @@ -162494,83 +162494,82 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v153.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v152, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v142.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v141, v2, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[116:117], 24, v[114:115] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[133:134], 24, v[68:69] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v42.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[112:113] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v143, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[33:34] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, 
v[130:131] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[66:67] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v145, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[66:67] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[68:69] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[64:65] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v134, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v152.h -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[146:147], 24, v[64:65] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v131 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v144, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v154.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v145 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v145 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v131 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v130 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[144:145] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[37:38] +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v141.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[33:34] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v129 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v129 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v143.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v134 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v134 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v128 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v113 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[133:134] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[128:129] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[35:36] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v144 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v115 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v115 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v114 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v99 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v98 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v83 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v83 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v82 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 8, v68 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v66 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v65 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v64 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v54 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v52 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v50 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v48 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v133 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v113 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v112 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v97 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v97 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v96 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v83 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v82 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v69 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v69 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v68 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v66 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v64 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v54 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v43, 8, v52 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v180.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v143.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v152.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v145.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v65.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v141.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v139.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v179.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v142.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v177.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v140.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v144.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v67.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.l, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v43.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v140.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v127.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v40.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v138.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v136.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v131.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v183.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v139.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v179.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v137.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v69.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v121.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v3 @@ -162578,89 +162577,89 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.l, v4.h ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.l, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v62.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v125.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v56.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v126.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v83.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v105.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v107.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h ; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v47.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v42.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v123.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v128.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v117.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v97.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v91.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v110.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v90.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v79.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v111.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v91.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v114.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v8.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v74.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v107.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v60.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v109.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v75.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v5, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v128.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v111.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v106.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v95.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v129.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v5, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v59.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v61.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v10.h ; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v89.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v76.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v93.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v133.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v100.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v5, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v134.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v11.h ; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v138.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v79.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v44.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v127.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v89.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v45.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v5, v11 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v118.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.l, v12.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, 
v13.l, v13.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v108.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v77.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v104.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v78.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v124.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v120.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v5, v12 ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v13.l, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v153.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v72.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v142.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v73.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v115.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v5, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v109.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v105.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v14.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v137.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v61.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v125.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v63.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v102.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v5, v14 @@ -162668,71 +162667,71 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v15.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v154.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v57.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v94.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v143.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v58.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v90.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v5, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v86.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v16.h ; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v152.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v46.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v141.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v47.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v78.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v74.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v5, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v17.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.l, v18.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v136.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v135.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v124.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v85.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 
0xff, v49.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v5, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v63.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v59.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.l, v18.h ; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.l, v19.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v126.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v122.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v84.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v5, v18 ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v51.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v19.l, v19.h ; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v20.l, v20.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v150.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v122.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v56.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v110.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v44.l ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v52.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v5, v19 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v20.l, v20.h ; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v120.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v108.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v41.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v183.l ; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v5, v20 ; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v54.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v21.h ; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v22.l, v22.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v160.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v106.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v150.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v94.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l ; GFX11-TRUE16-NEXT: v_and_b16 v34.l, 0xff, v55.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v5, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v181.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v180.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v22.l, v22.h ; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v23.l, v23.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v151.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v104.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v92.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 @@ -162740,71 +162739,71 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v23.h ; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.l, v24.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v162.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v92.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v88.l ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v5, v23 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v24.l, v24.h ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v25.l, v25.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v161.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v88.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v77.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v5, v24 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v25.l, v25.h ; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v26.l, v26.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v164.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v76.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v72.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v5, v25 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v26.l, v26.h ; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v27.l, v27.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v163.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v73.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v62.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v5, v26 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v27.l, v27.h ; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v28.l, v28.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v166.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v57.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v5, v27 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.l, v28.h ; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v29.l, v29.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v58.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v46.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v5, v28 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.l, v29.h ; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v30.l, v30.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v176.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v45.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v166.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v43.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v5, v29 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v30.l, v30.h ; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v31.l, v31.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v167.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v42.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v41.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v5, 
v30 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.l, v31.h ; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v32.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v178.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v40.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v176.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v182.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v5, v31 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v32.l, v32.h ; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v33.l, v33.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v177.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v182.l +; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v167.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v181.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v5, v32 ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v33.l, v33.h @@ -162820,66 +162819,64 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x1a -; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:144 -; GFX11-TRUE16-NEXT: 
scratch_load_b32 v88, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:160 -; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:168 -; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:176 -; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:184 -; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:192 -; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:200 -; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:240 -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:244 +; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 
offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:136 +; GFX11-TRUE16-NEXT: s_clause 0x18 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:176 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:184 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:192 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:200 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:208 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:236 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -186724,55 +186721,55 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 @@ -186798,24 +186795,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 @@ -186827,24 +186824,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] @@ -186906,24 +186903,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 @@ -186935,24 +186932,24 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v132, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -186990,7 +186987,7 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h @@ -186998,15 +186995,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h @@ -187014,15 +187011,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h @@ -187030,15 +187027,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 ; GFX11-TRUE16-NEXT: v_or_b16 
v39.l, v10.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h @@ -187046,15 +187043,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h @@ -187062,15 +187059,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h @@ -187078,8 +187075,8 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 @@ -187094,15 +187091,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h @@ -187110,15 +187107,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h @@ -187126,15 +187123,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h @@ -187142,15 +187139,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h @@ -187158,15 +187155,15 @@ define <128 x i8> 
@bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h @@ -187174,15 +187171,15 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h @@ -209426,55 +209423,55 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 @@ -209500,24 +209497,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v115, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 @@ -209529,24 +209526,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] @@ -209608,24 +209605,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 @@ -209637,24 +209634,24 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -209692,7 +209689,7 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h @@ -209700,15 +209697,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h @@ -209716,15 +209713,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h @@ -209732,15 +209729,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h @@ -209748,15 +209745,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h @@ -209764,15 +209761,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h @@ -209780,8 +209777,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 @@ -209796,15 +209793,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l ; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h @@ -209812,15 +209809,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h @@ -209828,15 +209825,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h @@ -209844,15 +209841,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h @@ -209860,15 +209857,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h @@ -209876,15 +209873,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 ; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h ; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index e33493c6a760e..d3fbba3cf4dd7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -85072,13 +85072,13 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 @@ -85086,20 +85086,20 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 @@ -85119,18 +85119,18 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v7 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v5 @@ -85159,19 +85159,19 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v7.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v9.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v11.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v14.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v15.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v15.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v16.h @@ -85345,29 +85345,29 @@ define <64 x i8> 
@bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v68.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v67.h ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v67.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v2, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[17:18] ; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v2, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v2, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v14 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v83.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v82.h +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 @@ -85384,82 +85384,81 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v80.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v2, v3 :: v_dual_add_f32 v2, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v82.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v28 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v27.l, v97.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v6, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v97.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v96.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v3, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v33 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v33 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v112, v4, v6 :: v_dual_add_f32 v1, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v15 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v112.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v2, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v27 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: 
v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v7, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v4, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v103.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v113.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v38 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v38 @@ -85524,7 +85523,7 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v114.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v98.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v6 @@ -85541,12 +85540,12 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v8.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v100.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v81.l ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h @@ -85560,14 +85559,14 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v96.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v10 ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v11.l, v11.h ; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v80.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v11 @@ -85581,14 +85580,14 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v13.l, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v13 ; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v14.l, v14.h ; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v113.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v14 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 67c9bfe9d9f3b..ecc715cfb52f3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -11261,8 +11261,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_lshr_b32 s5, s17, 24 ; VI-NEXT: s_lshr_b32 s8, s17, 16 -; VI-NEXT: s_lshr_b32 s9, s17, 8 -; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s9, s16, 16 ; VI-NEXT: s_lshr_b32 s11, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB85_4 ; VI-NEXT: .LBB85_2: ; %cmp.true @@ -11277,9 +11277,9 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; VI-NEXT: s_branch .LBB85_5 ; VI-NEXT: .LBB85_3: ; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: s_branch .LBB85_2 @@ -11287,8 +11287,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; VI-NEXT: v_mov_b32_e32 v8, s16 ; VI-NEXT: v_mov_b32_e32 v9, s17 ; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: v_mov_b32_e32 v6, s8 ; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v3, s4 @@ -11306,8 +11306,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_lshr_b32 s5, s17, 24 ; GFX9-NEXT: s_lshr_b32 s8, s17, 16 -; GFX9-NEXT: s_lshr_b32 s9, s17, 8 -; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s9, s16, 16 ; GFX9-NEXT: s_lshr_b32 s11, s16, 8 ; GFX9-NEXT: s_cbranch_execnz .LBB85_4 ; GFX9-NEXT: .LBB85_2: ; %cmp.true @@ -11322,9 +11322,9 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX9-NEXT: s_branch 
.LBB85_5 ; GFX9-NEXT: .LBB85_3: ; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr10 -; GFX9-NEXT: ; implicit-def: $sgpr4 ; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 ; GFX9-NEXT: s_branch .LBB85_2 @@ -11332,8 +11332,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX9-NEXT: v_mov_b32_e32 v8, s16 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 ; GFX9-NEXT: v_mov_b32_e32 v6, s8 ; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 @@ -11352,8 +11352,8 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 8 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB85_4 @@ -11370,16 +11370,16 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, i32 in ; GFX11-NEXT: s_branch .LBB85_5 ; GFX11-NEXT: .LBB85_3: ; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr2 +; GFX11-NEXT: ; implicit-def: $sgpr7 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: ; implicit-def: $sgpr3 ; GFX11-NEXT: s_branch .LBB85_2 ; GFX11-NEXT: .LBB85_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s5 ; GFX11-NEXT: v_mov_b32_e32 v7, s3 ; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: .LBB85_5: ; %end @@ -13517,8 +13517,8 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_lshr_b32 s5, s17, 24 ; GFX9-NEXT: s_lshr_b32 s8, s17, 16 -; GFX9-NEXT: s_lshr_b32 s9, s17, 8 -; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s9, s16, 16 ; GFX9-NEXT: s_lshr_b32 s11, s16, 8 ; GFX9-NEXT: s_cbranch_execnz .LBB97_4 ; GFX9-NEXT: .LBB97_2: ; %cmp.true @@ -13533,9 +13533,9 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX9-NEXT: s_branch .LBB97_5 ; GFX9-NEXT: .LBB97_3: ; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr10 -; GFX9-NEXT: ; implicit-def: $sgpr4 ; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 ; GFX9-NEXT: s_branch .LBB97_2 @@ -13543,8 +13543,8 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX9-NEXT: v_mov_b32_e32 v8, s16 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 ; 
GFX9-NEXT: v_mov_b32_e32 v6, s8 ; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 @@ -13563,8 +13563,8 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 8 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB97_4 @@ -13581,16 +13581,16 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; GFX11-NEXT: s_branch .LBB97_5 ; GFX11-NEXT: .LBB97_3: ; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr2 +; GFX11-NEXT: ; implicit-def: $sgpr7 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: ; implicit-def: $sgpr3 ; GFX11-NEXT: s_branch .LBB97_2 ; GFX11-NEXT: .LBB97_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s5 ; GFX11-NEXT: v_mov_b32_e32 v7, s3 ; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: .LBB97_5: ; %end @@ -15345,8 +15345,8 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX9-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; GFX9-NEXT: s_lshr_b32 s5, s17, 24 ; GFX9-NEXT: s_lshr_b32 s8, s17, 16 -; GFX9-NEXT: s_lshr_b32 s9, s17, 8 -; GFX9-NEXT: s_lshr_b32 s10, s16, 16 +; GFX9-NEXT: s_lshr_b32 s10, s17, 8 +; GFX9-NEXT: s_lshr_b32 s9, s16, 16 ; GFX9-NEXT: s_lshr_b32 s11, s16, 8 ; GFX9-NEXT: s_cbranch_execnz .LBB105_4 ; GFX9-NEXT: .LBB105_2: ; %cmp.true @@ -15362,9 +15362,9 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX9-NEXT: s_branch .LBB105_5 ; GFX9-NEXT: .LBB105_3: ; GFX9-NEXT: ; implicit-def: $sgpr11 -; GFX9-NEXT: ; implicit-def: $sgpr10 -; GFX9-NEXT: ; implicit-def: $sgpr4 ; GFX9-NEXT: ; implicit-def: $sgpr9 +; GFX9-NEXT: ; implicit-def: $sgpr4 +; GFX9-NEXT: ; implicit-def: $sgpr10 ; GFX9-NEXT: ; implicit-def: $sgpr8 ; GFX9-NEXT: ; implicit-def: $sgpr5 ; GFX9-NEXT: s_branch .LBB105_2 @@ -15372,8 +15372,8 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX9-NEXT: v_mov_b32_e32 v8, s16 ; GFX9-NEXT: v_mov_b32_e32 v9, s17 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s10 ; GFX9-NEXT: v_mov_b32_e32 v6, s8 ; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 @@ -15392,8 +15392,8 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX11-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 ; GFX11-NEXT: s_lshr_b32 s3, s1, 24 ; GFX11-NEXT: s_lshr_b32 s5, s1, 16 -; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 ; GFX11-NEXT: s_lshr_b32 s8, s0, 8 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 ; GFX11-NEXT: s_cbranch_vccnz .LBB105_4 @@ -15410,16 +15410,16 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; GFX11-NEXT: 
s_branch .LBB105_5 ; GFX11-NEXT: .LBB105_3: ; GFX11-NEXT: ; implicit-def: $sgpr8 -; GFX11-NEXT: ; implicit-def: $sgpr7 -; GFX11-NEXT: ; implicit-def: $sgpr2 ; GFX11-NEXT: ; implicit-def: $sgpr6 +; GFX11-NEXT: ; implicit-def: $sgpr2 +; GFX11-NEXT: ; implicit-def: $sgpr7 ; GFX11-NEXT: ; implicit-def: $sgpr5 ; GFX11-NEXT: ; implicit-def: $sgpr3 ; GFX11-NEXT: s_branch .LBB105_2 ; GFX11-NEXT: .LBB105_4: ; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1 -; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s7 -; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s5 ; GFX11-NEXT: v_mov_b32_e32 v7, s3 ; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: .LBB105_5: ; %end @@ -16493,8 +16493,8 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 ; VI-NEXT: s_lshr_b32 s8, s17, 24 ; VI-NEXT: s_lshr_b32 s5, s17, 16 -; VI-NEXT: s_lshr_b32 s9, s17, 8 -; VI-NEXT: s_lshr_b32 s10, s16, 16 +; VI-NEXT: s_lshr_b32 s10, s17, 8 +; VI-NEXT: s_lshr_b32 s9, s16, 16 ; VI-NEXT: s_lshr_b32 s11, s16, 8 ; VI-NEXT: s_cbranch_execnz .LBB109_4 ; VI-NEXT: .LBB109_2: ; %cmp.true @@ -16546,16 +16546,16 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB109_3: ; VI-NEXT: ; implicit-def: $sgpr11 -; VI-NEXT: ; implicit-def: $sgpr10 -; VI-NEXT: ; implicit-def: $sgpr4 ; VI-NEXT: ; implicit-def: $sgpr9 +; VI-NEXT: ; implicit-def: $sgpr4 +; VI-NEXT: ; implicit-def: $sgpr10 ; VI-NEXT: ; implicit-def: $sgpr5 ; VI-NEXT: ; implicit-def: $sgpr8 ; VI-NEXT: s_branch .LBB109_2 ; VI-NEXT: .LBB109_4: ; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v5, s9 +; VI-NEXT: v_mov_b32_e32 v2, s9 +; VI-NEXT: v_mov_b32_e32 v5, s10 ; VI-NEXT: v_mov_b32_e32 v7, s8 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_mov_b32_e32 v0, s16 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 97df2a0dbd44b..258bc2959f391 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -5548,7 +5548,6 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_mov_b32 s4, 0 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 @@ -5557,33 +5556,32 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_4 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: s_mov_b64 s[10:11], 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v5, s4 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mul_i32 s12, s5, 5 +; GFX7LESS-NEXT: s_mul_i32 s12, s6, 5 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s14 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s15 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 
s5 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_mov_b32 s4, s2 ; GFX7LESS-NEXT: s_mov_b32 s5, s3 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX7LESS-NEXT: v_mov_b32_e32 v9, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v8, v0 -; GFX7LESS-NEXT: v_subrev_i32_e32 v6, vcc, s12, v8 -; GFX7LESS-NEXT: v_subb_u32_e32 v7, vcc, v9, v5, vcc +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v0 +; GFX7LESS-NEXT: v_subrev_i32_e32 v5, vcc, s12, v7 +; GFX7LESS-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v8, vcc ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mov_b32_e32 v0, v6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, v7 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, v8 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, v9 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v7 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v8 ; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 -; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX7LESS-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 @@ -5611,39 +5609,37 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX8-NEXT: s_mov_b32 s4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_4 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 -; GFX8-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: s_mov_b64 s[10:11], 0 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_mul_i32 s12, s5, 5 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mul_i32 s12, s6, 5 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_mov_b32 s6, -1 ; GFX8-NEXT: s_mov_b32 s4, s2 ; GFX8-NEXT: s_mov_b32 s5, s3 ; GFX8-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: v_mov_b32_e32 v9, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, v0 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v8 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v5, vcc -; GFX8-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-NEXT: v_mov_b32_e32 v2, v8 -; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s12, v7 +; GFX8-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-NEXT: v_mov_b32_e32 v1, v6 +; GFX8-NEXT: v_mov_b32_e32 v2, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, v8 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_cbranch_execnz .LBB9_2 @@ -5670,39 +5666,37 @@ define 
amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s7, v0 -; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 -; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[6:7] +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_mov_b64 s[10:11], 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_mul_i32 s12, s5, 5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s14 -; GFX9-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mul_i32 s12, s6, 5 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v9, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s12, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s12, v7 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-NEXT: v_mov_b32_e32 v3, v8 ; GFX9-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] ; GFX9-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_cbranch_execnz .LBB9_2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 323bffe9947c8..ca50835018824 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -1836,22 +1836,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: 
buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1876,23 +1876,23 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1907,25 +1907,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, s20 +; GFX10-NEXT: v_mov_b32_e32 v10, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; 
GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -1947,25 +1947,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1977,25 +1977,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: 
buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2007,25 +2007,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s20 +; GFX7-NEXT: v_mov_b32_e32 v10, s20 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v8, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2037,27 +2037,27 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 
vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2829,22 +2829,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2869,23 +2869,23 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; 
GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2900,25 +2900,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, s20 +; GFX10-NEXT: v_mov_b32_e32 v10, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1 @@ -2931,22 +2931,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 +; GFX90A-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_mov_b32_e32 v6, s20 +; GFX90A-NEXT: v_mov_b32_e32 v10, s20 ; GFX90A-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2958,25 +2958,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB11_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2988,25 +2988,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB11_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3018,25 +3018,25 @@ define double 
@buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s20 +; GFX7-NEXT: v_mov_b32_e32 v10, s20 ; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v8, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB11_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3048,27 +3048,27 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remot ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v10, s6 ; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB11_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3090,22 +3090,22 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 
; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v10, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_add_f64_e32 v[6:7], v[8:9], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5] -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX12-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX12-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3130,23 +3130,23 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v10, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[8:9], v0, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v10, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -3161,25 +3161,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s20 ; GFX10-NEXT: 
v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, s20 +; GFX10-NEXT: v_mov_b32_e32 v10, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX10-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX10-NEXT: v_mov_b32_e32 v0, v6 +; GFX10-NEXT: v_mov_b32_e32 v1, v7 +; GFX10-NEXT: v_mov_b32_e32 v2, v8 +; GFX10-NEXT: v_mov_b32_e32 v3, v9 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1 @@ -3201,25 +3201,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: v_mov_b32_e32 v5, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v10, s20 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v0, v6 +; GFX908-NEXT: v_mov_b32_e32 v1, v7 +; GFX908-NEXT: v_mov_b32_e32 v2, v8 +; GFX908-NEXT: v_mov_b32_e32 v3, v9 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX908-NEXT: v_mov_b32_e32 v9, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v8, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB12_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3231,25 +3231,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: v_mov_b32_e32 v5, v1 -; 
GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s20 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, v7 +; GFX8-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-NEXT: v_mov_b32_e32 v3, v9 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v8, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB12_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3261,25 +3261,25 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s20 +; GFX7-NEXT: v_mov_b32_e32 v10, s20 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v6 +; GFX7-NEXT: v_mov_b32_e32 v1, v7 +; GFX7-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-NEXT: v_mov_b32_e32 v3, v9 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v8, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB12_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -3291,27 +3291,27 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v5, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v0, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: 
v_mov_b32_e32 v10, s6 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5] +; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: v_mov_b32_e32 v0, v6 +; GFX6-NEXT: v_mov_b32_e32 v1, v7 +; GFX6-NEXT: v_mov_b32_e32 v2, v8 +; GFX6-NEXT: v_mov_b32_e32 v3, v9 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v10, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX6-NEXT: v_mov_b32_e32 v9, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v8, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB12_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 316ba8527b595..1a4140cd0912b 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -1174,28 +1174,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v8, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: 
v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1218,29 +1217,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v8, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1272,29 +1270,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v8, s20 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: 
v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1304,29 +1300,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1964,28 +1958,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v8, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen 
offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2008,29 +2001,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v8, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 
vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2042,30 +2034,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, s20 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 +; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v8, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX10-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, v5 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2076,26 +2066,24 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v6, s20 +; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v8, s20 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, 
s[16:19], 0 offen offset:2048 glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2105,29 +2093,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v8, s20 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2137,29 +2123,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; 
GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2169,29 +2153,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s20 +; GFX7-NEXT: v_mov_b32_e32 v8, s20 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX7-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2201,31 +2183,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; 
GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX6-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2245,28 +2224,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v8, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: 
buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2289,29 +2267,28 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v8, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2343,29 +2320,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v8, s20 ; GFX908-NEXT: .LBB9_1: ; 
%atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2375,29 +2350,27 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index ed67e0227278b..671f42c6efd27 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ 
b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -1174,28 +1174,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v8, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1218,29 +1217,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v8, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -1272,29 +1270,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v8, s20 ; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1304,29 +1300,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: 
v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB5_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -1964,28 +1958,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v8, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: 
v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2008,29 +2001,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v8, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2042,30 +2034,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, s20 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v6, s20 +; GFX10-NEXT: v_mov_b32_e32 v2, s20 +; GFX10-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX10-NEXT: v_mov_b32_e32 v8, s20 ; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v10, v1 -; GFX10-NEXT: v_mov_b32_e32 v9, v0 +; GFX10-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX10-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v7 -; 
GFX10-NEXT: v_mov_b32_e32 v1, v8 -; GFX10-NEXT: v_mov_b32_e32 v2, v9 -; GFX10-NEXT: v_mov_b32_e32 v3, v10 -; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX10-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, v3 +; GFX10-NEXT: v_mov_b32_e32 v2, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, v5 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 @@ -2076,26 +2066,24 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s20 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 -; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX90A-NEXT: v_mov_b32_e32 v2, s20 +; GFX90A-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v6, s20 +; GFX90A-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX90A-NEXT: v_mov_b32_e32 v8, s20 ; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] -; GFX90A-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] -; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2105,29 +2093,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[6:7], 
v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v8, s20 ; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB8_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2137,29 +2123,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2169,29 +2153,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, v1 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX7-NEXT: s_mov_b64 s[4:5], 0 -; GFX7-NEXT: v_mov_b32_e32 v6, s20 +; GFX7-NEXT: v_mov_b32_e32 v8, s20 ; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v10, v1 -; GFX7-NEXT: v_mov_b32_e32 v9, v0 -; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX7-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-NEXT: v_mov_b32_e32 v1, v8 -; GFX7-NEXT: v_mov_b32_e32 v2, v9 -; GFX7-NEXT: v_mov_b32_e32 v3, v10 -; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX7-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX7-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 -; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, v1 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v0 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_cbranch_execnz .LBB8_1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2201,31 +2183,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remot ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v2, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, v1 -; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 +; GFX6-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 ; GFX6-NEXT: s_add_i32 s6, s20, 0x800 -; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX6-NEXT: s_mov_b64 s[4:5], 0 -; GFX6-NEXT: v_mov_b32_e32 v6, s6 +; GFX6-NEXT: v_mov_b32_e32 v8, s6 ; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v10, v1 -; GFX6-NEXT: v_mov_b32_e32 v9, v0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX6-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v0, v7 -; GFX6-NEXT: v_mov_b32_e32 v1, v8 -; GFX6-NEXT: v_mov_b32_e32 v2, v9 -; GFX6-NEXT: v_mov_b32_e32 v3, v10 -; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen glc +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX6-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v5, v1 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v0 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_cbranch_execnz .LBB8_1 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2245,28 +2224,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: v_mov_b32_e32 v6, s16 +; GFX12-NEXT: v_mov_b32_e32 v2, s16 +; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1] +; GFX12-NEXT: v_mov_b32_e32 v8, s16 ; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] -; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048 +; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10] -; GFX12-NEXT: v_min_num_f64_e32 v[7:8], v[0:1], v[4:5] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7] +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX12-NEXT: v_mov_b32_e32 v3, v5 +; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen offset:2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2289,29 +2267,28 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s16 -; GFX11-NEXT: v_mov_b32_e32 v6, s16 +; GFX11-NEXT: v_mov_b32_e32 v2, s16 +; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v8, s16 ; GFX11-NEXT: s_mov_b32 s4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] -; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048 +; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: s_waitcnt vmcnt(0) 
-; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0 +; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX11-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 -; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 -; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen offset:2048 glc +; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4 +; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen offset:2048 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 @@ -2343,29 +2320,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v2, v0 -; GFX908-NEXT: v_mov_b32_e32 v0, s20 -; GFX908-NEXT: v_mov_b32_e32 v3, v1 -; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX908-NEXT: v_mov_b32_e32 v2, s20 +; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX908-NEXT: s_mov_b64 s[4:5], 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s20 +; GFX908-NEXT: v_mov_b32_e32 v8, s20 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v10, v1 -; GFX908-NEXT: v_mov_b32_e32 v9, v0 -; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX908-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX908-NEXT: v_mov_b32_e32 v0, v7 -; GFX908-NEXT: v_mov_b32_e32 v1, v8 -; GFX908-NEXT: v_mov_b32_e32 v2, v9 -; GFX908-NEXT: v_mov_b32_e32 v3, v10 -; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX908-NEXT: v_mov_b32_e32 v0, v2 +; GFX908-NEXT: v_mov_b32_e32 v1, v3 +; GFX908-NEXT: v_mov_b32_e32 v2, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, v5 +; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1 -; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX908-NEXT: v_mov_b32_e32 v5, v1 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX908-NEXT: v_mov_b32_e32 v4, v0 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX908-NEXT: s_cbranch_execnz .LBB9_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end @@ -2375,29 +2350,27 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_ ; GFX8-LABEL: 
buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s20 -; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[16:19], 0 offen offset:2048 -; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[16:19], 0 offen offset:2048 +; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1] ; GFX8-NEXT: s_mov_b64 s[4:5], 0 -; GFX8-NEXT: v_mov_b32_e32 v6, s20 +; GFX8-NEXT: v_mov_b32_e32 v8, s20 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v10, v1 -; GFX8-NEXT: v_mov_b32_e32 v9, v0 -; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] -; GFX8-NEXT: v_min_f64 v[7:8], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-NEXT: v_mov_b32_e32 v1, v8 -; GFX8-NEXT: v_mov_b32_e32 v2, v9 -; GFX8-NEXT: v_mov_b32_e32 v3, v10 -; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[16:19], 0 offen offset:2048 glc +; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v8, s[16:19], 0 offen offset:2048 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1 -; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v0 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_cbranch_execnz .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 5134159e3e406..0fc54aeaef77b 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -619,43 +619,43 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_mov_b64 s[8:9], 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 -; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f -; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_mov_b32_e32 v16, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v17, 0 ; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6 ; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7 ; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12 ; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13 -; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14 -; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 +; GISEL-NEXT: v_xor_b32_e32 v12, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v13, v19, v15 ; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc ; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], v4, v19 ; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], v5, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc -; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v14, v21 -; GISEL-NEXT: v_ffbh_u32_e32 v15, v20 -; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 -; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v2, v18, vcc +; GISEL-NEXT: 
v_subb_u32_e32 v11, vcc, v3, v18, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v12, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v13, v19, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v12, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v13, v20 +; GISEL-NEXT: v_ffbh_u32_e32 v14, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v6 ; GISEL-NEXT: v_or_b32_e32 v0, v20, v4 ; GISEL-NEXT: v_or_b32_e32 v1, v21, v5 -; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 -; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_or_b32_e32 v2, v6, v10 +; GISEL-NEXT: v_or_b32_e32 v3, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 32, v13 ; GISEL-NEXT: v_ffbh_u32_e32 v26, v5 ; GISEL-NEXT: v_ffbh_u32_e32 v27, v4 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17 -; GISEL-NEXT: v_ffbh_u32_e32 v28, v13 -; GISEL-NEXT: v_ffbh_u32_e32 v29, v12 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; GISEL-NEXT: v_min_u32_e32 v0, v14, v15 +; GISEL-NEXT: v_min_u32_e32 v0, v12, v13 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27 -; GISEL-NEXT: v_min_u32_e32 v2, v16, v17 +; GISEL-NEXT: v_min_u32_e32 v2, v14, v15 ; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29 ; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 ; GISEL-NEXT: v_min_u32_e32 v1, v26, v1 @@ -665,32 +665,32 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 ; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2 +; GISEL-NEXT: v_xor_b32_e32 v12, 0x7f, v2 ; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 -; GISEL-NEXT: v_or_b32_e32 v11, v3, v1 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v0 +; GISEL-NEXT: v_or_b32_e32 v13, v3, v1 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v11, v14, v15 -; GISEL-NEXT: v_and_b32_e32 v14, 1, v11 -; GISEL-NEXT: v_or_b32_e32 v10, v11, v10 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v13, v14, v15 +; GISEL-NEXT: v_and_b32_e32 v14, 1, v13 +; GISEL-NEXT: v_or_b32_e32 v12, v13, v12 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc -; GISEL-NEXT: v_and_b32_e32 v16, 1, v10 +; GISEL-NEXT: v_and_b32_e32 v16, 1, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 ; GISEL-NEXT: 
s_and_saveexec_b64 s[12:13], s[4:5] @@ -703,22 +703,22 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v0, s[4:5] ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v30, v2 -; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], v30 ; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 -; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 +; GISEL-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 ; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc -; GISEL-NEXT: v_or_b32_e32 v0, v10, v2 -; GISEL-NEXT: v_or_b32_e32 v1, v11, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v12, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v13, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v1, v11, vcc ; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] ; GISEL-NEXT: v_mov_b32_e32 v0, s8 ; GISEL-NEXT: v_mov_b32_e32 v1, s9 @@ -730,26 +730,26 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader ; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 -; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[10:11], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 ; GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v20 ; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v21, vcc -; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 -; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[10:11], v32 ; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc ; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 ; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v3, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v7, 0 ; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_mov_b32_e32 v1, s5 @@ -757,20 +757,20 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: .LBB0_9: ; %udiv-do-while ; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13 -; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11 -; GISEL-NEXT: v_lshl_b64 
v[12:13], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v11 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v13 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 ; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 ; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc ; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 ; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 -; GISEL-NEXT: v_or_b32_e32 v10, v10, v14 -; GISEL-NEXT: v_or_b32_e32 v14, v0, v12 -; GISEL-NEXT: v_or_b32_e32 v15, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v11 ; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc ; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2 @@ -783,14 +783,14 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 ; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 -; GISEL-NEXT: v_and_b32_e32 v12, v0, v20 -; GISEL-NEXT: v_and_b32_e32 v13, v0, v21 +; GISEL-NEXT: v_and_b32_e32 v10, v0, v20 +; GISEL-NEXT: v_and_b32_e32 v11, v0, v21 ; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, v6 ; GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v2, v10 +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v3, v11, vcc ; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc ; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc ; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -800,9 +800,9 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: .LBB0_11: ; %Flow11 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 -; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 ; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15 -; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 +; GISEL-NEXT: v_or_b32_e32 v12, v12, v4 ; GISEL-NEXT: v_or_b32_e32 v14, v0, v2 ; GISEL-NEXT: v_or_b32_e32 v15, v1, v3 ; GISEL-NEXT: .LBB0_12: ; %Flow12 @@ -815,8 +815,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 ; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 ; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7 -; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v12, v7 +; GISEL-NEXT: v_xor_b32_e32 v9, v13, v7 ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7 diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index e7af7467171c3..e0421575c3174 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -20,7 +20,8 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: s_cbranch_execz .LBB0_10 ; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 -; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_mov_b32_e32 v1, -1 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc ; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc @@ -386,7 +387,8 @@ define 
i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: s_cbranch_execz .LBB1_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT: v_mov_b32_e32 v1, -1
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
@@ -749,9 +751,10 @@ define i128 @fptosi_f32_to_i128(float %x) {
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB2_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_mov_b32_e32 v1, -1
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
@@ -1100,9 +1103,10 @@ define i128 @fptoui_f32_to_i128(float %x) {
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB3_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_mov_b32_e32 v1, -1
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
@@ -1489,9 +1493,10 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB6_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_mov_b32_e32 v1, -1
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
@@ -1836,9 +1841,10 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
; SDAG-NEXT: s_cbranch_execz .LBB7_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
-; SDAG-NEXT: v_mov_b32_e32 v6, 0
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
-; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_mov_b32_e32 v1, -1
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
; SDAG-NEXT: s_movk_i32 s6, 0xff7f
; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 05403f008276c..a50791e10f5a2 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -7575,15 +7575,13 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
@@ -7593,7 +7591,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7609,15 +7609,13 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, v8
@@ -7628,7 +7626,9 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB38_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7809,15 +7809,13 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
@@ -7827,7 +7825,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7843,15 +7843,13 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, v8
@@ -7862,7 +7860,9 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB39_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8039,34 +8039,32 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
+; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0xfffff800, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
+; GFX7-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v6
+; GFX7-NEXT: v_mov_b32_e32 v1, v7
+; GFX7-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-NEXT: v_mov_b32_e32 v3, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8077,35 +8075,33 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
+; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0xfffff800, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
+; GFX6-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v6
+; GFX6-NEXT: v_mov_b32_e32 v1, v7
+; GFX6-NEXT: v_mov_b32_e32 v2, v8
+; GFX6-NEXT: v_mov_b32_e32 v3, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v9, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v8, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB40_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index ac223fd6030bd..311faac1b7c29 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4203,25 +4203,25 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
+; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX7-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v0, v6
+; GFX7-NEXT: v_mov_b32_e32 v1, v7
+; GFX7-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-NEXT: v_mov_b32_e32 v3, v9
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB24_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4237,26 +4237,25 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3]
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_max_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX6-NEXT: v_max_f64 v[6:7], v[0:1], v[10:11]
+; GFX6-NEXT: v_mov_b32_e32 v0, v6
+; GFX6-NEXT: v_mov_b32_e32 v1, v7
+; GFX6-NEXT: v_mov_b32_e32 v2, v8
+; GFX6-NEXT: v_mov_b32_e32 v3, v9
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v9, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v8, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB24_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index 5653f85c67339..e2808ee9bf706 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4203,25 +4203,25 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
+; GFX7-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX7-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v0, v6
+; GFX7-NEXT: v_mov_b32_e32 v1, v7
+; GFX7-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-NEXT: v_mov_b32_e32 v3, v9
; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB24_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4237,26 +4237,25 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_remote_memory(ptr add
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
-; GFX6-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
+; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v[4:5], s[4:7], 0 addr64
+; GFX6-NEXT: v_max_f64 v[10:11], v[2:3], v[2:3]
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11]
-; GFX6-NEXT: v_min_f64 v[8:9], v[0:1], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: v_max_f64 v[0:1], v[8:9], v[8:9]
+; GFX6-NEXT: v_min_f64 v[6:7], v[0:1], v[10:11]
+; GFX6-NEXT: v_mov_b32_e32 v0, v6
+; GFX6-NEXT: v_mov_b32_e32 v1, v7
+; GFX6-NEXT: v_mov_b32_e32 v2, v8
+; GFX6-NEXT: v_mov_b32_e32 v3, v9
; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v9, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v8, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB24_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index f0e16150c9e79..11f0f38d2b6fa 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -3913,15 +3913,13 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
@@ -3931,7 +3929,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3947,15 +3947,13 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, v8
@@ -3966,7 +3964,9 @@ define double @global_agent_atomic_fsub_ret_f64(ptr addrspace(1) %ptr, double %v
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4165,15 +4165,13 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v8
; GFX7-NEXT: v_mov_b32_e32 v1, v9
@@ -4183,7 +4181,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4199,15 +4199,13 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, v8
@@ -4218,7 +4216,9 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4413,34 +4413,32 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
+; GFX7-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0xfffff800, v0
; GFX7-NEXT: v_mov_b32_e32 v5, v3
; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
+; GFX7-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v6
+; GFX7-NEXT: v_mov_b32_e32 v1, v7
+; GFX7-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-NEXT: v_mov_b32_e32 v3, v9
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4451,35 +4449,33 @@ define double @global_agent_atomic_fsub_ret_f64__offset12b_neg(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
+; GFX6-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0xfffff800, v0
; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
+; GFX6-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], -v[4:5]
+; GFX6-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v6
+; GFX6-NEXT: v_mov_b32_e32 v1, v7
+; GFX6-NEXT: v_mov_b32_e32 v2, v8
+; GFX6-NEXT: v_mov_b32_e32 v3, v9
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[10:11], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v9, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v8, v0
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 74f0f64c935b4..6a4c2849ba4a3 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -1502,13 +1502,11 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB32_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7
; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc
; SI-NEXT: s_waitcnt expcnt(0)
@@ -1521,6 +1519,8 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB32_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1593,13 +1593,11 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB33_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7
; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc
; SI-NEXT: s_waitcnt expcnt(0)
@@ -1612,6 +1610,8 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB33_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1883,43 +1883,42 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v9, s6, 0
-; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: v_writelane_b32 v7, s6, 0
+; SI-NEXT: v_writelane_b32 v7, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v6, s35
; SI-NEXT: .LBB36_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v8, v1
-; SI-NEXT: v_mov_b32_e32 v7, v0
-; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7
-; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v5
-; SI-NEXT: v_mov_b32_e32 v1, v6
-; SI-NEXT: v_mov_b32_e32 v2, v7
-; SI-NEXT: v_mov_b32_e32 v3, v8
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_subrev_i32_e32 v2, vcc, s34, v4
+; SI-NEXT: v_subb_u32_e32 v3, vcc, v5, v6, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB36_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v9, 1
-; SI-NEXT: v_readlane_b32 s6, v9, 0
+; SI-NEXT: v_readlane_b32 s7, v7, 1
+; SI-NEXT: v_readlane_b32 s6, v7, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1985,43 +1984,42 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v9, s6, 0
-; SI-NEXT: v_writelane_b32 v9, s7, 1
+; SI-NEXT: v_writelane_b32 v7, s6, 0
+; SI-NEXT: v_writelane_b32 v7, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: v_mov_b32_e32 v4, s35
+; SI-NEXT: v_mov_b32_e32 v6, s35
; SI-NEXT: .LBB37_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v8, v1
-; SI-NEXT: v_mov_b32_e32 v7, v0
-; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7
-; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
-; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v5
-; SI-NEXT: v_mov_b32_e32 v1, v6
-; SI-NEXT: v_mov_b32_e32 v2, v7
-; SI-NEXT: v_mov_b32_e32 v3, v8
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: v_subrev_i32_e32 v2, vcc, s34, v4
+; SI-NEXT: v_subb_u32_e32 v3, vcc, v5, v6, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB37_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v9, 1
-; SI-NEXT: v_readlane_b32 s6, v9, 0
+; SI-NEXT: v_readlane_b32 s7, v7, 1
+; SI-NEXT: v_readlane_b32 s6, v7, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2342,13 +2340,11 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB42_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_and_b32_e32 v9, v11, v6
; SI-NEXT: v_and_b32_e32 v8, v10, v7
; SI-NEXT: s_waitcnt expcnt(0)
@@ -2361,6 +2357,8 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB42_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2433,13 +2431,11 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB43_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_and_b32_e32 v9, v11, v6
; SI-NEXT: v_and_b32_e32 v8, v10, v7
; SI-NEXT: s_waitcnt expcnt(0)
@@ -2452,6 +2448,8 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB43_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2726,14 +2724,11 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB46_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, s34, v5
; SI-NEXT: v_and_b32_e32 v2, s35, v4
; SI-NEXT: v_mov_b32_e32 v0, v2
@@ -2745,6 +2740,8 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg %
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB46_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2825,14 +2822,11 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB47_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v3, s34, v5
; SI-NEXT: v_and_b32_e32 v2, s35, v4
; SI-NEXT: v_mov_b32_e32 v0, v2
@@ -2844,6 +2838,8 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1)
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB47_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3182,14 +3178,11 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB52_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, v11, v6
; SI-NEXT: v_and_b32_e32 v1, v10, v7
; SI-NEXT: v_not_b32_e32 v9, v0
@@ -3203,6 +3196,8 @@ define i64 @global_atomic_nand_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB52_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3279,14 +3274,11 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB53_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, v11, v6
; SI-NEXT: v_and_b32_e32 v1, v10, v7
; SI-NEXT: v_not_b32_e32 v9, v0
@@ -3300,6 +3292,8 @@ define i64 @global_atomic_nand_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB53_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3590,14 +3584,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB56_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s34, v5
; SI-NEXT: v_and_b32_e32 v1, s35, v4
; SI-NEXT: v_not_b32_e32 v3, v0
@@ -3611,6 +3602,8 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB56_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3695,14 +3688,11 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB57_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s34, v5
; SI-NEXT: v_and_b32_e32 v1, s35, v4
; SI-NEXT: v_not_b32_e32 v3, v0
@@ -3716,6 +3706,8 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1)
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB57_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -3891,14 +3883,11 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB59_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, v11, v6
; SI-NEXT: v_and_b32_e32 v1, v10, v7
; SI-NEXT: v_not_b32_e32 v9, v0
@@ -3912,6 +3901,8 @@ define i64 @global_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr addrs
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB59_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4162,13 +4153,11 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB62_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_or_b32_e32 v9, v11, v6
; SI-NEXT: v_or_b32_e32 v8, v10, v7
; SI-NEXT: s_waitcnt expcnt(0)
@@ -4181,6 +4170,8 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB62_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4253,13 +4244,11 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB63_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_or_b32_e32 v9, v11, v6
; SI-NEXT: v_or_b32_e32 v8, v10, v7
; SI-NEXT: s_waitcnt expcnt(0)
@@ -4272,6 +4261,8 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB63_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4546,14 +4537,11 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB66_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_or_b32_e32 v3, s34, v5
; SI-NEXT: v_or_b32_e32 v2, s35, v4
; SI-NEXT: v_mov_b32_e32 v0, v2
@@ -4565,6 +4553,8 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB66_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4645,14 +4635,11 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB67_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_or_b32_e32 v3, s34, v5
; SI-NEXT: v_or_b32_e32 v2, s35, v4
; SI-NEXT: v_mov_b32_e32 v0, v2
@@ -4664,6 +4651,8 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB67_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -4990,13 +4979,11 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB72_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_xor_b32_e32 v9, v11, v6
; SI-NEXT: v_xor_b32_e32 v8, v10, v7
; SI-NEXT: s_waitcnt expcnt(0)
@@ -5009,6 +4996,8 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB72_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5081,13 +5070,11 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[4:5], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB73_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_xor_b32_e32 v9, v11, v6
; SI-NEXT: v_xor_b32_e32 v8, v10, v7
; SI-NEXT: s_waitcnt expcnt(0)
@@ -5100,6 +5087,8 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB73_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5374,14 +5363,11 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB76_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_xor_b32_e32 v3, s34, v5
; SI-NEXT: v_xor_b32_e32 v2, s35, v4
; SI-NEXT: v_mov_b32_e32 v0, v2
@@ -5393,6 +5379,8 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg %
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB76_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5473,14 +5461,11 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
; SI-NEXT: s_mov_b32 s35, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
; SI-NEXT: .LBB77_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
-; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: v_mov_b32_e32 v4, v0
-; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: v_xor_b32_e32 v3, s34, v5
; SI-NEXT: v_xor_b32_e32 v2, s35, v4
; SI-NEXT: v_mov_b32_e32 v0, v2
@@ -5492,6 +5477,8 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1)
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB77_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5824,13 +5811,11 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB82_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
@@ -5844,6 +5829,8 @@ define i64 @global_atomic_max_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB82_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -5918,13 +5905,11 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB83_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
@@ -5938,6 +5923,8 @@ define i64 @global_atomic_max_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB83_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -6223,45 +6210,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg %
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v10, s6, 0
-; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: v_mov_b32_e32 v4, s35
-; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: v_mov_b32_e32 v6, s35
+; SI-NEXT: v_mov_b32_e32 v7, s34
; SI-NEXT: .LBB86_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v1
-; SI-NEXT: v_mov_b32_e32 v8, v0
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v6
-; SI-NEXT: v_mov_b32_e32 v1, v7
-; SI-NEXT: v_mov_b32_e32 v2, v8
-; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB86_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v10, 1
-; SI-NEXT: v_readlane_b32 s6, v10, 0
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6331,45 +6318,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v10, s6, 0
-; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: v_mov_b32_e32 v4, s35
-; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: v_mov_b32_e32 v6, s35
+; SI-NEXT: v_mov_b32_e32 v7, s34
; SI-NEXT: .LBB87_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v1
-; SI-NEXT: v_mov_b32_e32 v8, v0
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v6
-; SI-NEXT: v_mov_b32_e32 v1, v7
-; SI-NEXT: v_mov_b32_e32 v2, v8
-; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB87_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v10, 1
-; SI-NEXT: v_readlane_b32 s6, v10, 0
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7176,13 +7163,11 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB96_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
@@ -7196,6 +7181,8 @@ define i64 @global_atomic_umax_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB96_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7270,13 +7257,11 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB97_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
@@ -7290,6 +7275,8 @@ define i64 @global_atomic_umax_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB97_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -7575,45 +7562,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v10, s6, 0
-; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: v_mov_b32_e32 v4, s35
-; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: v_mov_b32_e32 v6, s35
+; SI-NEXT: v_mov_b32_e32 v7, s34
; SI-NEXT: .LBB100_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v1
-; SI-NEXT: v_mov_b32_e32 v8, v0
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v6
-; SI-NEXT: v_mov_b32_e32 v1, v7
-; SI-NEXT: v_mov_b32_e32 v2, v8
-; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB100_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v10, 1
-; SI-NEXT: v_readlane_b32 s6, v10, 0
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7683,45 +7670,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v10, s6, 0
-; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: v_mov_b32_e32 v4, s35
-; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: v_mov_b32_e32 v6, s35
+; SI-NEXT: v_mov_b32_e32 v7, s34
; SI-NEXT: .LBB101_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v1
-; SI-NEXT: v_mov_b32_e32 v8, v0
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v6
-; SI-NEXT: v_mov_b32_e32 v1, v7
-; SI-NEXT: v_mov_b32_e32 v2, v8
-; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB101_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v10, 1
-; SI-NEXT: v_readlane_b32 s6, v10, 0
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8416,13 +8403,11 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB109_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
@@ -8436,6 +8421,8 @@ define i64 @global_atomic_umin_i64_ret(ptr addrspace(1) %ptr, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB109_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8510,13 +8497,11 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s6
-; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:32
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: .LBB110_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v11, v1
-; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: v_cmp_le_u64_e32 vcc, v[10:11], v[4:5]
; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc
@@ -8530,6 +8515,8 @@ define i64 @global_atomic_umin_i64_ret_offset(ptr addrspace(1) %out, i64 %in) {
; SI-NEXT: buffer_wbinvl1
; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; SI-NEXT: v_mov_b32_e32 v11, v1
+; SI-NEXT: v_mov_b32_e32 v10, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
; SI-NEXT: s_cbranch_execnz .LBB110_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -8815,45 +8802,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v10, s6, 0
-; SI-NEXT: v_writelane_b32 v10, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
; SI-NEXT: s_mov_b32 s35, s7
; SI-NEXT: s_mov_b32 s34, s6
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: v_mov_b32_e32 v4, s35
-; SI-NEXT: v_mov_b32_e32 v5, s34
+; SI-NEXT: v_mov_b32_e32 v6, s35
+; SI-NEXT: v_mov_b32_e32 v7, s34
; SI-NEXT: .LBB113_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v9, v1
-; SI-NEXT: v_mov_b32_e32 v8, v0
-; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5]
; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, v6
-; SI-NEXT: v_mov_b32_e32 v1, v7
-; SI-NEXT: v_mov_b32_e32 v2, v8
-; SI-NEXT: v_mov_b32_e32 v3, v9
+; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_mov_b32_e32 v1, v3
+; SI-NEXT: v_mov_b32_e32 v2, v4
+; SI-NEXT: v_mov_b32_e32 v3, v5
; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5]
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v5, v1
+; SI-NEXT: v_mov_b32_e32 v4, v0
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
; SI-NEXT: s_cbranch_execnz .LBB113_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v10, 1
-; SI-NEXT: v_readlane_b32 s6, v10, 0
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
; SI-NEXT: s_mov_b64 exec, s[34:35]
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8923,45 +8910,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:
s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v7, s34 ; SI-NEXT: .LBB114_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB114_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9292,13 +9279,11 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB119_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc @@ -9312,6 +9297,8 @@ define i64 @global_atomic_min_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz 
.LBB119_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9386,13 +9373,11 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB120_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: v_cmp_le_i64_e32 vcc, v[10:11], v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v9, v5, v11, vcc ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v10, vcc @@ -9406,6 +9391,8 @@ define i64 @global_atomic_min_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB120_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -9691,45 +9678,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v7, s34 ; SI-NEXT: .LBB123_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB123_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: 
s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9799,45 +9786,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v7, s34 ; SI-NEXT: .LBB124_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] -; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB124_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10645,14 +10632,11 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB133_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: 
s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc ; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] @@ -10667,6 +10651,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB133_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -10745,14 +10731,11 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[4:7], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: .LBB134_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc ; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] @@ -10767,6 +10750,8 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] ; SI-NEXT: s_cbranch_execnz .LBB134_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11065,14 +11050,11 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB137_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5] @@ -11087,6 +11069,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB137_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11173,14 +11157,11 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB138_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v5, v1 -; SI-NEXT: v_mov_b32_e32 v4, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: 
v_add_i32_e32 v0, vcc, 1, v4 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc ; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5] @@ -11195,6 +11176,8 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB138_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11557,14 +11540,11 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s10 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b64 s[6:7], 0 ; SI-NEXT: .LBB143_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 ; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc ; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] @@ -11581,6 +11561,8 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; SI-NEXT: s_cbranch_execnz .LBB143_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -11663,14 +11645,11 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: s_mov_b32 s8, s10 ; SI-NEXT: s_mov_b32 s9, s10 -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[6:7], s[8:11], 0 addr64 offset:32 ; SI-NEXT: s_mov_b64 s[6:7], 0 ; SI-NEXT: .LBB144_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 ; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc ; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] @@ -11687,6 +11666,8 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-NEXT: buffer_wbinvl1 ; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] ; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] ; SI-NEXT: s_cbranch_execnz .LBB144_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end @@ -12004,49 +11985,48 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; 
SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v7, s34 ; SI-NEXT: .LBB147_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 -; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc -; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v5, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[4:5] ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] -; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v7, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB147_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -12124,49 +12104,48 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v10, s6, 0 -; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[38:39], 0 -; SI-NEXT: v_mov_b32_e32 v4, s35 -; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: v_mov_b32_e32 v6, s35 +; SI-NEXT: v_mov_b32_e32 v7, s34 ; SI-NEXT: .LBB148_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v9, v1 -; SI-NEXT: v_mov_b32_e32 v8, v0 
-; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 -; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc -; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v5, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[4:5] ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] -; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc -; SI-NEXT: v_mov_b32_e32 v0, v6 -; SI-NEXT: v_mov_b32_e32 v1, v7 -; SI-NEXT: v_mov_b32_e32 v2, v8 -; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, v0, v7, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 ; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] ; SI-NEXT: s_cbranch_execnz .LBB148_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[38:39] -; SI-NEXT: v_readlane_b32 s7, v10, 1 -; SI-NEXT: v_readlane_b32 s6, v10, 0 +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index ba5ce8bb5fae7..8bb7274c84620 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_bfi_b32 v5, v5, 0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index c92c672dda2ad..ca4f5d22ca9a0 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -51,7 +51,7 @@ define void @issue63986(i64 %0, i64 %idxprom, ptr inreg %ptr) { ; CHECK-NEXT: v_add_co_u32_e32 v6, vcc, s4, v2 ; CHECK-NEXT: v_addc_co_u32_e32 v7, vcc, v3, v7, vcc ; CHECK-NEXT: s_add_u32 s4, s4, 1 -; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: s_addc_u32 s5, 0, s5 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_store_byte v[6:7], v10 ; CHECK-NEXT: ; %bb.7: diff --git a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir 
b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir index 0c723a09809c6..c9645c31aad75 100644 --- a/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir +++ b/llvm/test/CodeGen/AMDGPU/peephole-opt-regseq-removal.mir @@ -113,7 +113,7 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 ; GCN-NEXT: $vgpr0 = COPY [[COPY3]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -135,7 +135,7 @@ body: | ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GCN-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY [[REG_SEQUENCE]] - ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 ; GCN-NEXT: $vgpr0 = COPY [[COPY3]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index aa131ed6c9db1..760a298ce8971 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -495,8 +495,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: s_movk_i32 s0, 0x5000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x5000, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX900-NEXT: v_mov_b32_e32 v5, 0 @@ -718,8 +717,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_mov_b32_e32 v2, s35 ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x5000 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f diff --git a/llvm/test/CodeGen/ARM/llvm.exp10.ll b/llvm/test/CodeGen/ARM/llvm.exp10.ll index eb72fe8c1e1b7..49397ca386cb4 100644 --- a/llvm/test/CodeGen/ARM/llvm.exp10.ll +++ b/llvm/test/CodeGen/ARM/llvm.exp10.ll @@ -189,12 +189,13 @@ define <3 x float> @exp10_v3f32(<3 x float> %x) { ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl exp10f +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: vmov s17, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl exp10f ; CHECK-NEXT: vmov s16, r0 +; CHECK-NEXT: mov r1, r4 ; CHECK-NEXT: vmov s18, r6 -; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -207,7 +208,6 @@ define <4 x float> @exp10_v4f32(<4 x float> %x) { ; CHECK: @ %bb.0: ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: mov r0, r1 ; CHECK-NEXT: mov r4, r3 @@ -216,17 +216,15 @@ define <4 x float> @exp10_v4f32(<4 x float> %x) { ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: bl exp10f -; CHECK-NEXT: vmov s19, 
r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl exp10f -; CHECK-NEXT: vmov s18, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r6 -; CHECK-NEXT: vmov s17, r7 ; CHECK-NEXT: bl exp10f -; CHECK-NEXT: vmov s16, r0 -; CHECK-NEXT: vmov r2, r3, d9 -; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %r = call <4 x float> @llvm.exp10.v4f32(<4 x float> %x) diff --git a/llvm/test/CodeGen/ARM/llvm.frexp.ll b/llvm/test/CodeGen/ARM/llvm.frexp.ll index 376426d701b3e..80972b75cf283 100644 --- a/llvm/test/CodeGen/ARM/llvm.frexp.ll +++ b/llvm/test/CodeGen/ARM/llvm.frexp.ll @@ -362,33 +362,31 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) { define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) { ; CHECK-LABEL: test_frexp_v4f32_v4i32_only_use_fract: ; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: mov r5, r1 -; CHECK-NEXT: mov r6, r0 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: mov r0, r3 -; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: bl frexpf +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: sub sp, #20 +; CHECK-NEXT: mov r6, r1 ; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: vmov s19, r0 -; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: add r1, sp, #8 -; CHECK-NEXT: vmov s18, r0 +; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: mov r0, r5 ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: vmov s17, r0 +; CHECK-NEXT: mov r5, r0 ; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl frexpf -; CHECK-NEXT: vmov s16, r0 -; CHECK-NEXT: vmov r2, r3, d9 -; CHECK-NEXT: vmov r0, r1, d8 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: add r1, sp, #16 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: bl frexpf +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: add sp, #20 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a) %result.0 = extractvalue { <4 x float>, <4 x i32> } %result, 0 ret <4 x float> %result.0 diff --git a/llvm/test/CodeGen/Mips/no-odd-spreg-msa.ll b/llvm/test/CodeGen/Mips/no-odd-spreg-msa.ll index 7c9f375cbb9ad..40d36fbb6fe76 100644 --- a/llvm/test/CodeGen/Mips/no-odd-spreg-msa.ll +++ b/llvm/test/CodeGen/Mips/no-odd-spreg-msa.ll @@ -97,7 +97,6 @@ entry: ; ALL: lw $[[R0:[0-9]+]], %got(v4f32)( ; ALL: ld.w $w12, 0($[[R0]]) ; ALL: move.v $w[[W0:13]], $w12 -; NOODDSPREG: move.v $w[[W0:12]], $w13 ; ALL: teqi $zero, 1 ; ALL-NOT: st.w ; ALL-NOT: ld.w diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll index a2ad2946cc8ec..98314a02c23fe 100644 --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -897,31 +897,31 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) { ; P8LE-NEXT: mfvsrd r6, v2 ; P8LE-NEXT: mfvsrd r8, v3 ; P8LE-NEXT: ori r3, r3, 51289 +; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: ori r5, r5, 42889 -; P8LE-NEXT: rldic r4, r3, 36, 1 -; P8LE-NEXT: mffprd r3, f0 +; P8LE-NEXT: rldic r3, r3, 36, 1 ; P8LE-NEXT: rldic r5, r5, 35, 1 ; P8LE-NEXT: rldicl r7, r6, 63, 1 -; P8LE-NEXT: oris r4, r4, 45590 +; P8LE-NEXT: oris r3, r3, 45590 ; P8LE-NEXT: 
oris r5, r5, 1603 -; P8LE-NEXT: ori r4, r4, 17097 +; P8LE-NEXT: ori r3, r3, 17097 ; P8LE-NEXT: ori r5, r5, 21445 -; P8LE-NEXT: mulhdu r4, r3, r4 +; P8LE-NEXT: mulhdu r3, r4, r3 ; P8LE-NEXT: mulhdu r5, r7, r5 -; P8LE-NEXT: sub r7, r3, r4 +; P8LE-NEXT: sub r7, r4, r3 ; P8LE-NEXT: rldicl r5, r5, 57, 7 ; P8LE-NEXT: rldicl r7, r7, 63, 1 ; P8LE-NEXT: mulli r5, r5, 654 -; P8LE-NEXT: add r4, r7, r4 +; P8LE-NEXT: add r3, r7, r3 ; P8LE-NEXT: lis r7, -16037 ; P8LE-NEXT: ori r7, r7, 28749 -; P8LE-NEXT: rldicl r4, r4, 60, 4 +; P8LE-NEXT: rldicl r3, r3, 60, 4 ; P8LE-NEXT: sub r5, r6, r5 ; P8LE-NEXT: rldic r7, r7, 32, 0 -; P8LE-NEXT: mulli r4, r4, 23 +; P8LE-NEXT: mulli r3, r3, 23 ; P8LE-NEXT: oris r7, r7, 52170 ; P8LE-NEXT: ori r7, r7, 12109 -; P8LE-NEXT: sub r3, r3, r4 +; P8LE-NEXT: sub r3, r4, r3 ; P8LE-NEXT: mulhdu r7, r8, r7 ; P8LE-NEXT: mtfprd f1, r3 ; P8LE-NEXT: li r3, 0 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll index 435b0ab3fea6c..816b12e2d8e5b 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_i64_to_fp32_elts.ll @@ -35,12 +35,12 @@ define i64 @test2elt(<2 x i64> %a) local_unnamed_addr #0 { ; ; CHECK-BE-LABEL: test2elt: ; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xscvuxdsp f0, v2 +; CHECK-BE-NEXT: xscvdpspn v3, f0 ; CHECK-BE-NEXT: xxswapd vs0, v2 -; CHECK-BE-NEXT: xscvuxdsp f1, v2 ; CHECK-BE-NEXT: xscvuxdsp f0, f0 -; CHECK-BE-NEXT: xscvdpspn v2, f1 -; CHECK-BE-NEXT: xscvdpspn v3, f0 -; CHECK-BE-NEXT: vmrgow v2, v2, v3 +; CHECK-BE-NEXT: xscvdpspn v2, f0 +; CHECK-BE-NEXT: vmrgow v2, v3, v2 ; CHECK-BE-NEXT: mfvsrd r3, v2 ; CHECK-BE-NEXT: blr entry: @@ -327,12 +327,12 @@ define i64 @test2elt_signed(<2 x i64> %a) local_unnamed_addr #0 { ; ; CHECK-BE-LABEL: test2elt_signed: ; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: xscvsxdsp f0, v2 +; CHECK-BE-NEXT: xscvdpspn v3, f0 ; CHECK-BE-NEXT: xxswapd vs0, v2 -; CHECK-BE-NEXT: xscvsxdsp f1, v2 ; CHECK-BE-NEXT: xscvsxdsp f0, f0 -; CHECK-BE-NEXT: xscvdpspn v2, f1 -; CHECK-BE-NEXT: xscvdpspn v3, f0 -; CHECK-BE-NEXT: vmrgow v2, v2, v3 +; CHECK-BE-NEXT: xscvdpspn v2, f0 +; CHECK-BE-NEXT: vmrgow v2, v3, v2 ; CHECK-BE-NEXT: mfvsrd r3, v2 ; CHECK-BE-NEXT: blr entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll index 9173fa4622487..cc1282a9119da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll @@ -1666,20 +1666,20 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, < ; CHECK-RV32-NEXT: .LBB61_32: # %else114 ; CHECK-RV32-NEXT: slli a2, a3, 1 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vsrl.vx v16, v0, a1 +; CHECK-RV32-NEXT: vsrl.vx v24, v0, a1 ; CHECK-RV32-NEXT: bgez a2, .LBB61_34 ; CHECK-RV32-NEXT: # %bb.33: # %cond.load117 ; CHECK-RV32-NEXT: lbu a2, 0(a0) -; CHECK-RV32-NEXT: vmv8r.v v24, v8 +; CHECK-RV32-NEXT: vmv8r.v v16, v8 ; CHECK-RV32-NEXT: vmv.s.x v9, a2 ; CHECK-RV32-NEXT: vsetivli zero, 31, e8, m1, tu, ma ; CHECK-RV32-NEXT: vslideup.vi v8, v9, 30 ; CHECK-RV32-NEXT: addi a0, a0, 1 -; CHECK-RV32-NEXT: vmv1r.v v24, v8 -; CHECK-RV32-NEXT: vmv8r.v v8, v24 +; CHECK-RV32-NEXT: vmv1r.v v16, v8 +; CHECK-RV32-NEXT: vmv8r.v v8, v16 ; CHECK-RV32-NEXT: .LBB61_34: # %else118 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.x.s a2, v16 +; CHECK-RV32-NEXT: vmv.x.s a2, v24 ; CHECK-RV32-NEXT: bgez a3, .LBB61_35 ; CHECK-RV32-NEXT: j 
.LBB61_572 ; CHECK-RV32-NEXT: .LBB61_35: # %else122 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 1d691b130b3da..a2fcd7962b8b0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -661,8 +661,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t3, 36 -; RV32-NEXT: mul a7, a7, t3 +; RV32-NEXT: slli a7, a7, 5 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill @@ -682,7 +681,11 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (t1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 -; RV32-NEXT: addi t1, sp, 16 +; RV32-NEXT: csrr t1, vlenb +; RV32-NEXT: li t2, 44 +; RV32-NEXT: mul t1, t1, t2 +; RV32-NEXT: add t1, sp, t1 +; RV32-NEXT: addi t1, t1, 16 ; RV32-NEXT: vs4r.v v8, (t1) # vscale x 32-byte Folded Spill ; RV32-NEXT: vmv.s.x v0, a7 ; RV32-NEXT: addi a3, a3, 12 @@ -694,8 +697,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 ; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t1, 20 -; RV32-NEXT: mul a7, a7, t1 +; RV32-NEXT: slli a7, a7, 4 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 ; RV32-NEXT: vs8r.v v8, (a7) # vscale x 64-byte Folded Spill @@ -733,7 +735,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a7, vlenb -; RV32-NEXT: li t0, 28 +; RV32-NEXT: li t0, 24 ; RV32-NEXT: mul a7, a7, t0 ; RV32-NEXT: add a7, sp, a7 ; RV32-NEXT: addi a7, a7, 16 @@ -755,7 +757,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vmerge.vvm v8, v24, v8, v0 ; RV32-NEXT: csrr a6, vlenb -; RV32-NEXT: li a7, 44 +; RV32-NEXT: li a7, 40 ; RV32-NEXT: mul a6, a6, a7 ; RV32-NEXT: add a6, sp, a6 ; RV32-NEXT: addi a6, a6, 16 @@ -772,24 +774,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 12 -; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: vmv.s.x v0, a3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v6 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 92 @@ -812,8 +809,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x 
i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 20 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload @@ -835,12 +831,6 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 84 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 72 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 @@ -860,30 +850,36 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v28, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vmv.v.v v20, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a2, 60 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vs4r.v v20, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 60 +; RV32-NEXT: li a2, 44 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v16, (a1) # vscale x 32-byte Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vl4r.v v16, (a1) # vscale x 32-byte Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v28, v8, v3 +; RV32-NEXT: vrgatherei16.vv v20, v16, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v28, v24 +; RV32-NEXT: vmv.v.v v20, v24 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v20, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI27_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI27_4) ; RV32-NEXT: lui a2, %hi(.LCPI27_5) @@ -891,13 +887,25 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 84 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs1r.v v16, (a1) # vscale x 8-byte Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI27_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI27_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vle16.v v10, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 28 +; RV32-NEXT: li a2, 76 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: 
vs2r.v v16, (a1) # vscale x 16-byte Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -909,18 +917,29 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 84 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v7, (a1) # vscale x 8-byte Folded Reload ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v24, v20, v8 +; RV32-NEXT: vrgatherei16.vv v24, v20, v7 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v24, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 12 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 76 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl2r.v v28, (a1) # vscale x 16-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV32-NEXT: vrgatherei16.vv v16, v0, v10 +; RV32-NEXT: vrgatherei16.vv v16, v0, v28 ; RV32-NEXT: lui a1, %hi(.LCPI27_6) ; RV32-NEXT: addi a1, a1, %lo(.LCPI27_6) ; RV32-NEXT: lui a2, %hi(.LCPI27_8) @@ -934,7 +953,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vle16.v v5, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 44 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -942,12 +961,6 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v0, v20, v4 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v0, v16 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 84 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v8, v6 ; RV32-NEXT: csrr a1, vlenb @@ -968,7 +981,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 128 -; RV32-NEXT: vse32.v v28, (a1) +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 6 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v8, (a2) # vscale x 32-byte Folded Reload +; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: li a3, 60 diff --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll index d995a31f243d3..acc68491d5aee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll @@ -416,14 +416,14 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV32-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v12, v8, 1, v0 +; RV32-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; 
RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v12, v16 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v13, v16 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v14, v16 -; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v15, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v8, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v9, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v16 +; RV32-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v16 ; RV32-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-UNKNOWN-NEXT: vmsne.vi v0, v12, 0 ; RV32-BITS-UNKNOWN-NEXT: ret ; ; RV32-BITS-256-LABEL: reverse_nxv32i1: @@ -437,14 +437,14 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-256-NEXT: vrsub.vx v16, v12, a0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-256-NEXT: vmerge.vim v12, v8, 1, v0 +; RV32-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-BITS-256-NEXT: vrgather.vv v11, v12, v16 -; RV32-BITS-256-NEXT: vrgather.vv v10, v13, v16 -; RV32-BITS-256-NEXT: vrgather.vv v9, v14, v16 -; RV32-BITS-256-NEXT: vrgather.vv v8, v15, v16 +; RV32-BITS-256-NEXT: vrgather.vv v15, v8, v16 +; RV32-BITS-256-NEXT: vrgather.vv v14, v9, v16 +; RV32-BITS-256-NEXT: vrgather.vv v13, v10, v16 +; RV32-BITS-256-NEXT: vrgather.vv v12, v11, v16 ; RV32-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-256-NEXT: vmsne.vi v0, v12, 0 ; RV32-BITS-256-NEXT: ret ; ; RV32-BITS-512-LABEL: reverse_nxv32i1: @@ -458,14 +458,14 @@ define @reverse_nxv32i1( %a) { ; RV32-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV32-BITS-512-NEXT: vrsub.vx v16, v12, a0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-512-NEXT: vmerge.vim v12, v8, 1, v0 +; RV32-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV32-BITS-512-NEXT: vrgather.vv v11, v12, v16 -; RV32-BITS-512-NEXT: vrgather.vv v10, v13, v16 -; RV32-BITS-512-NEXT: vrgather.vv v9, v14, v16 -; RV32-BITS-512-NEXT: vrgather.vv v8, v15, v16 +; RV32-BITS-512-NEXT: vrgather.vv v15, v8, v16 +; RV32-BITS-512-NEXT: vrgather.vv v14, v9, v16 +; RV32-BITS-512-NEXT: vrgather.vv v13, v10, v16 +; RV32-BITS-512-NEXT: vrgather.vv v12, v11, v16 ; RV32-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV32-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV32-BITS-512-NEXT: vmsne.vi v0, v12, 0 ; RV32-BITS-512-NEXT: ret ; ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv32i1: @@ -479,14 +479,14 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-UNKNOWN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; RV64-BITS-UNKNOWN-NEXT: vrsub.vx v16, v12, a0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v12, v8, 1, v0 +; RV64-BITS-UNKNOWN-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v11, v12, v16 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v10, v13, v16 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v9, v14, v16 -; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v8, v15, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v15, v8, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v14, v9, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v13, v10, v16 +; RV64-BITS-UNKNOWN-NEXT: vrgatherei16.vv v12, v11, v16 ; RV64-BITS-UNKNOWN-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; 
RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-UNKNOWN-NEXT: vmsne.vi v0, v12, 0 ; RV64-BITS-UNKNOWN-NEXT: ret ; ; RV64-BITS-256-LABEL: reverse_nxv32i1: @@ -500,14 +500,14 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-256-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-256-NEXT: vrsub.vx v16, v12, a0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-256-NEXT: vmerge.vim v12, v8, 1, v0 +; RV64-BITS-256-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-BITS-256-NEXT: vrgather.vv v11, v12, v16 -; RV64-BITS-256-NEXT: vrgather.vv v10, v13, v16 -; RV64-BITS-256-NEXT: vrgather.vv v9, v14, v16 -; RV64-BITS-256-NEXT: vrgather.vv v8, v15, v16 +; RV64-BITS-256-NEXT: vrgather.vv v15, v8, v16 +; RV64-BITS-256-NEXT: vrgather.vv v14, v9, v16 +; RV64-BITS-256-NEXT: vrgather.vv v13, v10, v16 +; RV64-BITS-256-NEXT: vrgather.vv v12, v11, v16 ; RV64-BITS-256-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-256-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-256-NEXT: vmsne.vi v0, v12, 0 ; RV64-BITS-256-NEXT: ret ; ; RV64-BITS-512-LABEL: reverse_nxv32i1: @@ -521,14 +521,14 @@ define @reverse_nxv32i1( %a) { ; RV64-BITS-512-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; RV64-BITS-512-NEXT: vrsub.vx v16, v12, a0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-512-NEXT: vmerge.vim v12, v8, 1, v0 +; RV64-BITS-512-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m1, ta, ma -; RV64-BITS-512-NEXT: vrgather.vv v11, v12, v16 -; RV64-BITS-512-NEXT: vrgather.vv v10, v13, v16 -; RV64-BITS-512-NEXT: vrgather.vv v9, v14, v16 -; RV64-BITS-512-NEXT: vrgather.vv v8, v15, v16 +; RV64-BITS-512-NEXT: vrgather.vv v15, v8, v16 +; RV64-BITS-512-NEXT: vrgather.vv v14, v9, v16 +; RV64-BITS-512-NEXT: vrgather.vv v13, v10, v16 +; RV64-BITS-512-NEXT: vrgather.vv v12, v11, v16 ; RV64-BITS-512-NEXT: vsetvli a0, zero, e8, m4, ta, ma -; RV64-BITS-512-NEXT: vmsne.vi v0, v8, 0 +; RV64-BITS-512-NEXT: vmsne.vi v0, v12, 0 ; RV64-BITS-512-NEXT: ret %res = call @llvm.vector.reverse.nxv32i1( %a) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll index 4bc6313494d41..1ee7e138654b9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll +++ b/llvm/test/CodeGen/RISCV/rvv/nontemporal-vp-scalable.ll @@ -37772,18 +37772,18 @@ define void @test_nontemporal_vp_scatter_nxv64i8_P1( %val, %val, %val, < ; CHECK-RV32VC-LABEL: test_nontemporal_vp_scatter_nxv64i8_PALL: ; CHECK-RV32VC: # %bb.0: ; CHECK-RV32VC-NEXT: csrr a1, vlenb -; CHECK-RV32VC-NEXT: slli a5, a1, 4 +; CHECK-RV32VC-NEXT: slli a6, a1, 4 ; CHECK-RV32VC-NEXT: slli a2, a1, 2 -; CHECK-RV32VC-NEXT: slli a6, a1, 3 +; CHECK-RV32VC-NEXT: slli a5, a1, 3 ; CHECK-RV32VC-NEXT: mv a4, a3 ; CHECK-RV32VC-NEXT: bltu a3, a2, .LBB916_2 ; CHECK-RV32VC-NEXT: # %bb.1: ; CHECK-RV32VC-NEXT: mv a4, a2 ; CHECK-RV32VC-NEXT: .LBB916_2: ; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0) -; CHECK-RV32VC-NEXT: add a7, a0, a5 +; CHECK-RV32VC-NEXT: add a6, a6, a0 ; CHECK-RV32VC-NEXT: slli a1, a1, 1 -; CHECK-RV32VC-NEXT: add a0, a0, a6 +; CHECK-RV32VC-NEXT: add a0, a0, a5 ; CHECK-RV32VC-NEXT: mv a5, a4 ; CHECK-RV32VC-NEXT: bltu a4, a1, .LBB916_4 ; CHECK-RV32VC-NEXT: # %bb.3: @@ -38416,11 +38416,11 @@ define void @test_nontemporal_vp_scatter_nxv64i8_PALL( %val, < ; CHECK-RV32VC-NEXT: .LBB916_4: ; CHECK-RV32VC-NEXT: addi sp, sp, -16 ; CHECK-RV32VC-NEXT: .cfi_def_cfa_offset 16 -; CHECK-RV32VC-NEXT: csrr a6, vlenb -; CHECK-RV32VC-NEXT: slli a6, a6, 
3 -; CHECK-RV32VC-NEXT: sub sp, sp, a6 +; CHECK-RV32VC-NEXT: csrr a7, vlenb +; CHECK-RV32VC-NEXT: slli a7, a7, 3 +; CHECK-RV32VC-NEXT: sub sp, sp, a7 ; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-RV32VC-NEXT: vl8re32.v v24, (a7) +; CHECK-RV32VC-NEXT: vl8re32.v v24, (a6) ; CHECK-RV32VC-NEXT: addi a6, sp, 16 ; CHECK-RV32VC-NEXT: vs8r.v v24, (a6) # vscale x 64-byte Folded Spill ; CHECK-RV32VC-NEXT: vl8re32.v v24, (a0) @@ -39022,18 +39022,18 @@ define void @test_nontemporal_vp_scatter_nxv64i8_S1( %val, %val, %val, %val, %val ; CHECK-RV32VC-LABEL: test_nontemporal_vp_scatter_nxv64i8_DEFAULT: ; CHECK-RV32VC: # %bb.0: ; CHECK-RV32VC-NEXT: csrr a1, vlenb -; CHECK-RV32VC-NEXT: slli a5, a1, 4 +; CHECK-RV32VC-NEXT: slli a6, a1, 4 ; CHECK-RV32VC-NEXT: slli a2, a1, 2 -; CHECK-RV32VC-NEXT: slli a6, a1, 3 +; CHECK-RV32VC-NEXT: slli a5, a1, 3 ; CHECK-RV32VC-NEXT: mv a4, a3 ; CHECK-RV32VC-NEXT: bltu a3, a2, .LBB919_2 ; CHECK-RV32VC-NEXT: # %bb.1: ; CHECK-RV32VC-NEXT: mv a4, a2 ; CHECK-RV32VC-NEXT: .LBB919_2: ; CHECK-RV32VC-NEXT: vl8re32.v v0, (a0) -; CHECK-RV32VC-NEXT: add a7, a0, a5 +; CHECK-RV32VC-NEXT: add a6, a6, a0 ; CHECK-RV32VC-NEXT: slli a1, a1, 1 -; CHECK-RV32VC-NEXT: add a0, a0, a6 +; CHECK-RV32VC-NEXT: add a0, a0, a5 ; CHECK-RV32VC-NEXT: mv a5, a4 ; CHECK-RV32VC-NEXT: bltu a4, a1, .LBB919_4 ; CHECK-RV32VC-NEXT: # %bb.3: @@ -40290,11 +40290,11 @@ define void @test_nontemporal_vp_scatter_nxv64i8_DEFAULT( %val ; CHECK-RV32VC-NEXT: .LBB919_4: ; CHECK-RV32VC-NEXT: addi sp, sp, -16 ; CHECK-RV32VC-NEXT: .cfi_def_cfa_offset 16 -; CHECK-RV32VC-NEXT: csrr a6, vlenb -; CHECK-RV32VC-NEXT: slli a6, a6, 3 -; CHECK-RV32VC-NEXT: sub sp, sp, a6 +; CHECK-RV32VC-NEXT: csrr a7, vlenb +; CHECK-RV32VC-NEXT: slli a7, a7, 3 +; CHECK-RV32VC-NEXT: sub sp, sp, a7 ; CHECK-RV32VC-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-RV32VC-NEXT: vl8re32.v v24, (a7) +; CHECK-RV32VC-NEXT: vl8re32.v v24, (a6) ; CHECK-RV32VC-NEXT: addi a6, sp, 16 ; CHECK-RV32VC-NEXT: vs8r.v v24, (a6) # vscale x 64-byte Folded Spill ; CHECK-RV32VC-NEXT: vl8re32.v v24, (a0) diff --git a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll index 41d2c02a73cd0..5a79659436f34 100644 --- a/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll +++ b/llvm/test/CodeGen/Thumb2/mve-soft-float-abi.ll @@ -348,38 +348,35 @@ entry: define <4 x float> @vector_add_f32(<4 x float> %lhs, <4 x float> %rhs) { ; CHECK-MVE-LABEL: vector_add_f32: ; CHECK-MVE: @ %bb.0: @ %entry -; CHECK-MVE-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-MVE-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-MVE-NEXT: .pad #4 -; CHECK-MVE-NEXT: sub sp, #4 +; CHECK-MVE-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-MVE-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-MVE-NEXT: .vsave {d8, d9} ; CHECK-MVE-NEXT: vpush {d8, d9} -; CHECK-MVE-NEXT: mov r4, r0 +; CHECK-MVE-NEXT: mov r8, r0 ; CHECK-MVE-NEXT: add r0, sp, #40 ; CHECK-MVE-NEXT: vldrw.u32 q4, [r0] -; CHECK-MVE-NEXT: mov r6, r1 +; CHECK-MVE-NEXT: mov r7, r1 ; CHECK-MVE-NEXT: mov r0, r3 -; CHECK-MVE-NEXT: mov r5, r2 -; CHECK-MVE-NEXT: vmov r7, r1, d9 +; CHECK-MVE-NEXT: mov r6, r2 +; CHECK-MVE-NEXT: vmov r4, r1, d9 ; CHECK-MVE-NEXT: bl __aeabi_fadd -; CHECK-MVE-NEXT: vmov s19, r0 -; CHECK-MVE-NEXT: mov r0, r5 -; CHECK-MVE-NEXT: mov r1, r7 -; CHECK-MVE-NEXT: bl __aeabi_fadd -; CHECK-MVE-NEXT: vmov r5, r1, d8 -; CHECK-MVE-NEXT: vmov 
s18, r0 +; CHECK-MVE-NEXT: mov r5, r0 ; CHECK-MVE-NEXT: mov r0, r6 +; CHECK-MVE-NEXT: mov r1, r4 ; CHECK-MVE-NEXT: bl __aeabi_fadd -; CHECK-MVE-NEXT: vmov s17, r0 -; CHECK-MVE-NEXT: mov r0, r4 -; CHECK-MVE-NEXT: mov r1, r5 +; CHECK-MVE-NEXT: vmov r6, r1, d8 +; CHECK-MVE-NEXT: mov r4, r0 +; CHECK-MVE-NEXT: mov r0, r7 ; CHECK-MVE-NEXT: bl __aeabi_fadd -; CHECK-MVE-NEXT: vmov s16, r0 -; CHECK-MVE-NEXT: vmov r2, r3, d9 -; CHECK-MVE-NEXT: vmov r0, r1, d8 +; CHECK-MVE-NEXT: mov r7, r0 +; CHECK-MVE-NEXT: mov r0, r8 +; CHECK-MVE-NEXT: mov r1, r6 +; CHECK-MVE-NEXT: bl __aeabi_fadd +; CHECK-MVE-NEXT: mov r1, r7 +; CHECK-MVE-NEXT: mov r2, r4 +; CHECK-MVE-NEXT: mov r3, r5 ; CHECK-MVE-NEXT: vpop {d8, d9} -; CHECK-MVE-NEXT: add sp, #4 -; CHECK-MVE-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-MVE-NEXT: pop.w {r4, r5, r6, r7, r8, pc} ; ; CHECK-BE-LABEL: vector_add_f32: ; CHECK-BE: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 4dd9173e2d418..93b5e3f266b0a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -33,53 +33,29 @@ entry: } define void @vld3_v4i32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v4i32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9} -; CHECK-LV-NEXT: vpush {d8, d9} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #32] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LV-NEXT: vstrw.32 q0, [r1] -; CHECK-LV-NEXT: vpop {d8, d9} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v4i32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9} -; CHECK-LIS-NEXT: vpush {d8, d9} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #16] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0] -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s17, s0 -; CHECK-LIS-NEXT: vmov.f32 s18, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s16, s5 -; CHECK-LIS-NEXT: vmov.f32 s19, s14 -; CHECK-LIS-NEXT: vmov.f32 s11, s13 -; CHECK-LIS-NEXT: vadd.i32 q2, q2, q4 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vmov.f32 s2, s12 -; CHECK-LIS-NEXT: vmov.f32 s3, s15 -; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr entry: 
%l1 = load <12 x i32>, ptr %src, align 4 @@ -93,87 +69,46 @@ entry: } define void @vld3_v8i32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v8i32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v8i32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s17, s0 -; CHECK-LIS-NEXT: vmov.f32 s18, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s16, s5 -; CHECK-LIS-NEXT: vmov.f32 s19, s14 -; CHECK-LIS-NEXT: vmov.f32 s11, s13 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.i32 q2, q2, q4 -; CHECK-LIS-NEXT: vmov.f32 s2, s12 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s15 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s17, s4 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s18, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s16, s9 -; CHECK-LIS-NEXT: vmov.f32 s19, s14 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s13 -; CHECK-LIS-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vmov.f32 s6, s12 -; CHECK-LIS-NEXT: vmov.f32 s7, s15 -; CHECK-LIS-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v8i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 
s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr entry: %l1 = load <24 x i32>, ptr %src, align 4 @@ -187,155 +122,80 @@ entry: } define void @vld3_v16i32(ptr %src, ptr %dst) { -; CHECK-LV-LABEL: vld3_v16i32: -; CHECK-LV: @ %bb.0: @ %entry -; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-LV-NEXT: vmov.f32 s10, s2 -; CHECK-LV-NEXT: vmov.f32 s13, s0 -; CHECK-LV-NEXT: vmov.f32 s14, s3 -; CHECK-LV-NEXT: vmov.f32 s8, s4 -; CHECK-LV-NEXT: vmov.f32 s9, s7 -; CHECK-LV-NEXT: vmov.f32 s12, s5 -; CHECK-LV-NEXT: vmov.f32 s15, s18 -; CHECK-LV-NEXT: vmov.f32 s11, s17 -; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 -; CHECK-LV-NEXT: vmov.f32 s0, s6 -; CHECK-LV-NEXT: vmov.f32 s2, s16 -; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LV-NEXT: vmov.f32 s3, s19 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0] -; CHECK-LV-NEXT: vmov.f32 s17, s4 -; CHECK-LV-NEXT: vmov.f32 s18, s7 -; CHECK-LV-NEXT: vmov.f32 s22, s6 -; CHECK-LV-NEXT: vmov.f32 s16, s9 -; CHECK-LV-NEXT: vmov.f32 s19, s14 -; CHECK-LV-NEXT: vmov.f32 s20, s8 -; CHECK-LV-NEXT: vmov.f32 s21, s11 -; CHECK-LV-NEXT: vmov.f32 s23, s13 -; CHECK-LV-NEXT: vmov.f32 s4, s10 -; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LV-NEXT: vmov.f32 s6, s12 -; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LV-NEXT: vmov.f32 s7, s15 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LV-NEXT: vmov.f32 s18, s10 -; CHECK-LV-NEXT: vmov.f32 s21, s8 -; CHECK-LV-NEXT: vmov.f32 s22, s11 -; CHECK-LV-NEXT: vmov.f32 s16, s12 -; CHECK-LV-NEXT: vmov.f32 s17, s15 -; CHECK-LV-NEXT: vmov.f32 s20, s13 -; CHECK-LV-NEXT: vmov.f32 s23, s26 -; CHECK-LV-NEXT: vmov.f32 s19, s25 -; CHECK-LV-NEXT: vadd.i32 q4, q4, q5 -; CHECK-LV-NEXT: vmov.f32 s8, s14 -; CHECK-LV-NEXT: vmov.f32 s10, s24 -; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LV-NEXT: vmov.f32 s11, s27 -; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-LV-NEXT: vadd.i32 q2, q4, q2 -; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LV-NEXT: vmov.f32 s25, s12 -; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LV-NEXT: vmov.f32 s26, s15 -; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LV-NEXT: vmov.f32 s30, s14 -; CHECK-LV-NEXT: vstrw.32 q1, [r1] -; CHECK-LV-NEXT: vmov.f32 s24, s17 -; CHECK-LV-NEXT: vmov.f32 s27, s22 -; CHECK-LV-NEXT: vmov.f32 s28, s16 -; CHECK-LV-NEXT: vmov.f32 s29, s19 -; CHECK-LV-NEXT: vmov.f32 s31, s21 -; CHECK-LV-NEXT: vadd.i32 q6, q7, q6 -; CHECK-LV-NEXT: 
vmov.f32 s12, s18 -; CHECK-LV-NEXT: vmov.f32 s14, s20 -; CHECK-LV-NEXT: vmov.f32 s15, s23 -; CHECK-LV-NEXT: vadd.i32 q3, q6, q3 -; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LV-NEXT: bx lr -; -; CHECK-LIS-LABEL: vld3_v16i32: -; CHECK-LIS: @ %bb.0: @ %entry -; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-LIS-NEXT: vmov.f32 s10, s2 -; CHECK-LIS-NEXT: vmov.f32 s17, s0 -; CHECK-LIS-NEXT: vmov.f32 s18, s3 -; CHECK-LIS-NEXT: vmov.f32 s8, s4 -; CHECK-LIS-NEXT: vmov.f32 s9, s7 -; CHECK-LIS-NEXT: vmov.f32 s16, s5 -; CHECK-LIS-NEXT: vmov.f32 s19, s14 -; CHECK-LIS-NEXT: vmov.f32 s11, s13 -; CHECK-LIS-NEXT: vmov.f32 s0, s6 -; CHECK-LIS-NEXT: vadd.i32 q2, q2, q4 -; CHECK-LIS-NEXT: vmov.f32 s2, s12 -; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-LIS-NEXT: vmov.f32 s3, s15 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] -; CHECK-LIS-NEXT: vmov.f32 s17, s4 -; CHECK-LIS-NEXT: vmov.f32 s18, s7 -; CHECK-LIS-NEXT: vmov.f32 s22, s6 -; CHECK-LIS-NEXT: vmov.f32 s16, s9 -; CHECK-LIS-NEXT: vmov.f32 s19, s14 -; CHECK-LIS-NEXT: vmov.f32 s20, s8 -; CHECK-LIS-NEXT: vmov.f32 s21, s11 -; CHECK-LIS-NEXT: vmov.f32 s23, s13 -; CHECK-LIS-NEXT: vadd.i32 q4, q5, q4 -; CHECK-LIS-NEXT: vmov.f32 s4, s10 -; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #176] -; CHECK-LIS-NEXT: vmov.f32 s6, s12 -; CHECK-LIS-NEXT: vmov.f32 s7, s15 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-LIS-NEXT: vadd.i32 q1, q4, q1 -; CHECK-LIS-NEXT: vmov.f32 s18, s10 -; CHECK-LIS-NEXT: vmov.f32 s25, s8 -; CHECK-LIS-NEXT: vmov.f32 s26, s11 -; CHECK-LIS-NEXT: vmov.f32 s16, s12 -; CHECK-LIS-NEXT: vmov.f32 s17, s15 -; CHECK-LIS-NEXT: vmov.f32 s24, s13 -; CHECK-LIS-NEXT: vmov.f32 s27, s22 -; CHECK-LIS-NEXT: vmov.f32 s19, s21 -; CHECK-LIS-NEXT: vmov.f32 s8, s14 -; CHECK-LIS-NEXT: vadd.i32 q4, q4, q6 -; CHECK-LIS-NEXT: vmov.f32 s10, s20 -; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-LIS-NEXT: vmov.f32 s11, s23 -; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-LIS-NEXT: vadd.i32 q2, q4, q2 -; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-LIS-NEXT: vmov.f32 s25, s12 -; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-LIS-NEXT: vmov.f32 s26, s15 -; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-LIS-NEXT: vmov.f32 s30, s14 -; CHECK-LIS-NEXT: vstrw.32 q1, [r1] -; CHECK-LIS-NEXT: vmov.f32 s24, s17 -; CHECK-LIS-NEXT: vmov.f32 s27, s22 -; CHECK-LIS-NEXT: vmov.f32 s28, s16 -; CHECK-LIS-NEXT: vmov.f32 s29, s19 -; CHECK-LIS-NEXT: vmov.f32 s31, s21 -; CHECK-LIS-NEXT: vadd.i32 q6, q7, q6 -; CHECK-LIS-NEXT: vmov.f32 s12, s18 -; CHECK-LIS-NEXT: vmov.f32 s14, s20 -; CHECK-LIS-NEXT: vmov.f32 s15, s23 -; CHECK-LIS-NEXT: vadd.i32 q3, q6, q3 -; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-LIS-NEXT: bx lr +; CHECK-LABEL: vld3_v16i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-NEXT: vmov.f32 s10, s2 +; CHECK-NEXT: vmov.f32 s13, s0 +; CHECK-NEXT: 
vmov.f32 s14, s3 +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vadd.i32 q2, q2, q3 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vadd.i32 q0, q2, q0 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vmov.f32 s18, s7 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s23, s13 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s7, s15 +; CHECK-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-NEXT: vadd.i32 q1, q4, q1 +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vmov.f32 s21, s8 +; CHECK-NEXT: vmov.f32 s22, s11 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vmov.f32 s17, s15 +; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vmov.f32 s23, s26 +; CHECK-NEXT: vmov.f32 s19, s25 +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s8, s14 +; CHECK-NEXT: vmov.f32 s10, s24 +; CHECK-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-NEXT: vmov.f32 s11, s27 +; CHECK-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-NEXT: vadd.i32 q2, q4, q2 +; CHECK-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-NEXT: vmov.f32 s25, s12 +; CHECK-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-NEXT: vmov.f32 s30, s14 +; CHECK-NEXT: vstrw.32 q1, [r1] +; CHECK-NEXT: vmov.f32 s24, s17 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s28, s16 +; CHECK-NEXT: vmov.f32 s29, s19 +; CHECK-NEXT: vmov.f32 s31, s21 +; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vmov.f32 s12, s18 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vadd.i32 q3, q6, q3 +; CHECK-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr entry: %l1 = load <48 x i32>, ptr %src, align 4 From a88c83950be540f608587a4c68e0b0651e3b199b Mon Sep 17 00:00:00 2001 From: Hongyu Chen Date: Thu, 2 Oct 2025 12:53:38 +0800 Subject: [PATCH 461/878] [LLVM] Volunteer myself and Usman Nadeem as DFAJumpThreading maintainers (#161491) Both Usman Nadeem and I have constantly contributed to the DFAJumpThreading pass so far. To push DFAJumpThreading forwards and make it enabled by default, I volunteer myself and Usman Nadeem as DFAJumpThreading maintainers. 
--- llvm/Maintainers.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 5afdd1519c96f..e52259236fc19 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -123,6 +123,13 @@ a.bataev@outlook.com (email), [alexey-bataev](https://github.com/alexey-bataev) Chandler Carruth \ chandlerc@gmail.com, chandlerc@google.com (email), [chandlerc](https://github.com/chandlerc) (GitHub) +#### DFAJumpThreading + +Hongyu Chen \ +xxs\_chy@outlook.com (email), [XChy](https://github.com/XChy) (Github) \ +Usman Nadeem \ +mnadeem@quicinc.com (email), [UsmanNadeem](https://github.com/UsmanNadeem) (Github) + ### Instrumentation and sanitizers #### Sanitizers not covered by someone else From fea2cca4d6364f66a5f663d95141c9cab53dbfd2 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 2 Oct 2025 00:57:10 -0400 Subject: [PATCH 462/878] [MLIR][Python] expose Operation::setLoc (#161594) --- mlir/include/mlir-c/IR.h | 4 ++++ mlir/lib/Bindings/Python/IRCore.cpp | 12 +++++++++--- mlir/lib/CAPI/IR/IR.cpp | 4 ++++ mlir/test/python/ir/operation.py | 8 ++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h index 061d7620ba077..c464e4da66f17 100644 --- a/mlir/include/mlir-c/IR.h +++ b/mlir/include/mlir-c/IR.h @@ -634,6 +634,10 @@ MLIR_CAPI_EXPORTED MlirContext mlirOperationGetContext(MlirOperation op); /// Gets the location of the operation. MLIR_CAPI_EXPORTED MlirLocation mlirOperationGetLocation(MlirOperation op); +/// Sets the location of the operation. +MLIR_CAPI_EXPORTED void mlirOperationSetLocation(MlirOperation op, + MlirLocation loc); + /// Gets the type id of the operation. /// Returns null if the operation does not have a registered operation /// description. 
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 83a8757bb72c7..c20b2111c071e 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -3485,15 +3485,21 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, "Shortcut to get an op result if it has only one (throws an error " "otherwise).") - .def_prop_ro( + .def_prop_rw( "location", [](PyOperationBase &self) { PyOperation &operation = self.getOperation(); return PyLocation(operation.getContext(), mlirOperationGetLocation(operation.get())); }, - "Returns the source location the operation was defined or derived " - "from.") + [](PyOperationBase &self, const PyLocation &location) { + PyOperation &operation = self.getOperation(); + mlirOperationSetLocation(operation.get(), location.get()); + }, + nb::for_getter("Returns the source location the operation was " + "defined or derived from."), + nb::for_setter("Sets the source location the operation was defined " + "or derived from.")) .def_prop_ro("parent", [](PyOperationBase &self) -> std::optional> { diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index e9844a7cc1909..188186598c5c5 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -656,6 +656,10 @@ MlirLocation mlirOperationGetLocation(MlirOperation op) { return wrap(unwrap(op)->getLoc()); } +void mlirOperationSetLocation(MlirOperation op, MlirLocation loc) { + unwrap(op)->setLoc(unwrap(loc)); +} + MlirTypeID mlirOperationGetTypeID(MlirOperation op) { if (auto info = unwrap(op)->getRegisteredInfo()) return wrap(info->getTypeID()); diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index 4a3625c953d52..cb4cfc8c8a6ec 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -696,6 +696,7 @@ def testOperationPrint(): # CHECK: resource1: "0x08 module.operation.print(large_elements_limit=2) + # CHECK-LABEL: TEST: testKnownOpView @run def testKnownOpView(): @@ -969,6 +970,13 @@ def testOperationLoc(): assert op.location == loc assert op.operation.location == loc + another_loc = Location.name("another_loc") + op.location = another_loc + assert op.location == another_loc + assert op.operation.location == another_loc + # CHECK: loc("another_loc") + print(op.location) + # CHECK-LABEL: TEST: testModuleMerge @run From 9323fbbc4ebca57f7332ec84f7efc41eb88eca6e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 2 Oct 2025 14:20:57 +0900 Subject: [PATCH 463/878] RegisterCoalescer: Avoid return after else (#161622) --- llvm/lib/CodeGen/RegisterCoalescer.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 7ac1aef83777a..ebfea8e5581bf 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -584,14 +584,14 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const { return DstReg == Dst; // This is a partial register copy. Check that the parts match. return Register(TRI.getSubReg(DstReg, SrcSub)) == Dst; - } else { - // DstReg is virtual. - if (DstReg != Dst) - return false; - // Registers match, do the subregisters line up? - return TRI.composeSubRegIndices(SrcIdx, SrcSub) == - TRI.composeSubRegIndices(DstIdx, DstSub); } + + // DstReg is virtual. + if (DstReg != Dst) + return false; + // Registers match, do the subregisters line up? 
+ return TRI.composeSubRegIndices(SrcIdx, SrcSub) == + TRI.composeSubRegIndices(DstIdx, DstSub); } void RegisterCoalescerLegacy::getAnalysisUsage(AnalysisUsage &AU) const { @@ -2914,8 +2914,7 @@ JoinVals::ConflictResolution JoinVals::analyzeValue(unsigned ValNo, if ((V.ValidLanes & OtherV.ValidLanes).any()) // Overlapping lanes can't be resolved. return CR_Impossible; - else - return CR_Merge; + return CR_Merge; } // No simultaneous def. Is Other live at the def? From d39095b19357b35bda5e874d66343499985e91bf Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 1 Oct 2025 22:21:12 -0700 Subject: [PATCH 464/878] [BOLT] Remove unused parameter. NFC (#161617) `Skip` parameter not used/set inside `analyzeRelocation()`. --- bolt/include/bolt/Rewrite/RewriteInstance.h | 5 ++--- bolt/lib/Rewrite/RewriteInstance.cpp | 14 ++------------ 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 19dcce8205ebc..0fe2e32b61933 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -249,12 +249,11 @@ class RewriteInstance { /// Analyze relocation \p Rel. /// Return true if the relocation was successfully processed, false otherwise. /// The \p SymbolName, \p SymbolAddress, \p Addend and \p ExtractedValue - /// parameters will be set on success. The \p Skip argument indicates - /// that the relocation was analyzed, but it must not be processed. + /// parameters will be set on success. bool analyzeRelocation(const object::RelocationRef &Rel, uint32_t &RType, std::string &SymbolName, bool &IsSectionRelocation, uint64_t &SymbolAddress, int64_t &Addend, - uint64_t &ExtractedValue, bool &Skip) const; + uint64_t &ExtractedValue) const; /// Rewrite non-allocatable sections with modifications. 
void rewriteNoteSections(); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 8b78c53aa99b3..c13a9f016e8ae 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2274,8 +2274,7 @@ uint32_t getRelocationSymbol(const ELFObjectFileBase *Obj, bool RewriteInstance::analyzeRelocation( const RelocationRef &Rel, uint32_t &RType, std::string &SymbolName, bool &IsSectionRelocation, uint64_t &SymbolAddress, int64_t &Addend, - uint64_t &ExtractedValue, bool &Skip) const { - Skip = false; + uint64_t &ExtractedValue) const { if (!Relocation::isSupported(RType)) return false; @@ -2707,9 +2706,8 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection, int64_t Addend; uint64_t ExtractedValue; bool IsSectionRelocation; - bool Skip; if (!analyzeRelocation(Rel, RType, SymbolName, IsSectionRelocation, - SymbolAddress, Addend, ExtractedValue, Skip)) { + SymbolAddress, Addend, ExtractedValue)) { LLVM_DEBUG({ dbgs() << "BOLT-WARNING: failed to analyze relocation @ offset = " << formatv("{0:x}; type name = {1}\n", Rel.getOffset(), TypeName); @@ -2718,14 +2716,6 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection, return; } - if (Skip) { - LLVM_DEBUG({ - dbgs() << "BOLT-DEBUG: skipping relocation @ offset = " - << formatv("{0:x}; type name = {1}\n", Rel.getOffset(), TypeName); - }); - return; - } - if (!IsFromCode && !IsWritable && (IsX86 || IsAArch64) && Relocation::isPCRelative(RType)) { BinaryData *BD = BC->getBinaryDataContainingAddress(Rel.getOffset()); From bcc85f76700d3f0aebc14bf8b981476c94892ef8 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Thu, 2 Oct 2025 13:30:09 +0800 Subject: [PATCH 465/878] [compiler-rt][asan][tests] Stabilize wchar tests on Darwin/Android (#161624) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary Stabilize ASan wchar tests across Darwin and Android. NFC: test-only. Follow-up to PR #160493 (adds wchar interceptors/tests). ### Motivation - Darwin: The top frame often resolves to `libclang_rt.asan_*` rather than a source file, so strict checks that include file/line can fail. See Chromium issue [448631142](https://g-issues.chromium.org/issues/448631142). - Android: The “ERROR:” header can go to logcat instead of stderr, so FileCheck may not see it; stdout/stderr reordering also makes pre-crash markers racy. See Android Buildbot [186/12821](https://lab.llvm.org/buildbot/#/builders/186/builds/12821). ### Changes - Android: - Force reports to stderr via `%env_asan_opts=log_to_stderr=1`, avoiding the “ERROR:” header going to logcat. - Print the pre-crash “Good so far.” to stderr and `fflush(stderr)` to avoid stdout/stderr reordering. - Darwin: - Relax the stack-frame check to only require the function name (`wcscpy/wcsncpy/wcscat/wcsncat`) to tolerate `libclang_rt.asan_*` frames. - Common: - Reuse FileCheck var `[[ADDR]]` instead of redefining. - Make wide string literals `const wchar_t*` to silence `-Wwritable-strings`. ### Risk - NFC: test-only; no change to runtime behavior. ### References - Follow-up to PR #160493. - Chromium: [448631142](https://g-issues.chromium.org/issues/448631142) (Darwin failures). - Android Buildbot: [186/12821](https://lab.llvm.org/buildbot/#/builders/186/builds/12821). 
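### Example

For reference, a condensed sketch of the stabilized test shape, taken from the `wcscpy` case in the diff below (the other three tests follow the same pattern; the "good" copy is omitted here for brevity):

```cpp
// RUN: %clangxx_asan -O0 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s
#include <stdio.h>
#include <wchar.h>

int main() {
  const wchar_t *src = L"X means dog"; // const: silences -Wwritable-strings
  wchar_t badDst[7];
  fprintf(stderr, "Good so far.\n");   // stderr + flush: avoids stdout/stderr reordering
  // CHECK: Good so far.
  fflush(stderr);
  wcscpy(badDst, src); // Boom! 12 wide chars into a 7-element buffer
  // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]]
  // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0
  // CHECK: #0 {{0x[0-9a-f]+}} in wcscpy
  printf("Should have failed with ASAN error.\n");
}
```

Note that the frame check requires only the function name, so a `libclang_rt.asan_*` top frame on Darwin still matches.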
Signed-off-by: Yixuan Cao --- compiler-rt/test/asan/TestCases/wcscat.cpp | 20 ++++++++++---------- compiler-rt/test/asan/TestCases/wcscpy.cpp | 20 ++++++++++---------- compiler-rt/test/asan/TestCases/wcsncat.cpp | 20 ++++++++++---------- compiler-rt/test/asan/TestCases/wcsncpy.cpp | 20 ++++++++++---------- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/wcscat.cpp b/compiler-rt/test/asan/TestCases/wcscat.cpp index dcdff88c18ef1..f0a8ec12580b3 100644 --- a/compiler-rt/test/asan/TestCases/wcscat.cpp +++ b/compiler-rt/test/asan/TestCases/wcscat.cpp @@ -1,26 +1,26 @@ -// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O0 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O1 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O2 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O3 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK #include #include int main() { - wchar_t *start = L"X means "; - wchar_t *append = L"dog"; + const wchar_t *start = L"X means "; + const wchar_t *append = L"dog"; wchar_t goodDst[12]; wcscpy(goodDst, start); wcscat(goodDst, append); wchar_t badDst[9]; wcscpy(badDst, start); - printf("Good so far.\n"); + fprintf(stderr, "Good so far.\n"); // CHECK: Good so far. - fflush(stdout); + fflush(stderr); wcscat(badDst, append); // Boom! 
// CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}} - // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0 - // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcscat{{.*}}sanitizer_common_interceptors.inc:{{[0-9]+}} + // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0 + // CHECK: #0 {{0x[0-9a-f]+}} in wcscat printf("Should have failed with ASAN error.\n"); } \ No newline at end of file diff --git a/compiler-rt/test/asan/TestCases/wcscpy.cpp b/compiler-rt/test/asan/TestCases/wcscpy.cpp index 414d83303a960..a280d29289e37 100644 --- a/compiler-rt/test/asan/TestCases/wcscpy.cpp +++ b/compiler-rt/test/asan/TestCases/wcscpy.cpp @@ -1,23 +1,23 @@ -// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O0 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O1 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O2 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O3 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK #include #include int main() { - wchar_t *src = L"X means dog"; + const wchar_t *src = L"X means dog"; wchar_t goodDst[12]; wcscpy(goodDst, src); wchar_t badDst[7]; - printf("Good so far.\n"); + fprintf(stderr, "Good so far.\n"); // CHECK: Good so far. - fflush(stdout); + fflush(stderr); wcscpy(badDst, src); // Boom! 
- // CHECK:ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}} - // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0 - // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcscpy{{.*}}asan_interceptors.cpp:{{[0-9]+}} + // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}} + // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0 + // CHECK: #0 {{0x[0-9a-f]+}} in wcscpy printf("Should have failed with ASAN error.\n"); } \ No newline at end of file diff --git a/compiler-rt/test/asan/TestCases/wcsncat.cpp b/compiler-rt/test/asan/TestCases/wcsncat.cpp index 3ab7fc8f55d63..eb7d095e45c7a 100644 --- a/compiler-rt/test/asan/TestCases/wcsncat.cpp +++ b/compiler-rt/test/asan/TestCases/wcsncat.cpp @@ -1,14 +1,14 @@ -// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O0 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O1 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O2 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O3 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK #include #include int main() { - wchar_t *start = L"X means "; - wchar_t *append = L"dog"; + const wchar_t *start = L"X means "; + const wchar_t *append = L"dog"; wchar_t goodDst[15]; wcscpy(goodDst, start); wcsncat(goodDst, append, 5); @@ -16,12 +16,12 @@ int main() { wchar_t badDst[11]; wcscpy(badDst, start); wcsncat(badDst, append, 1); - printf("Good so far.\n"); + fprintf(stderr, "Good so far.\n"); // CHECK: Good so far. - fflush(stdout); + fflush(stderr); wcsncat(badDst, append, 3); // Boom! 
// CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}} - // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0 - // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcsncat{{.*}}sanitizer_common_interceptors.inc:{{[0-9]+}} + // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0 + // CHECK: #0 {{0x[0-9a-f]+}} in wcsncat printf("Should have failed with ASAN error.\n"); } \ No newline at end of file diff --git a/compiler-rt/test/asan/TestCases/wcsncpy.cpp b/compiler-rt/test/asan/TestCases/wcsncpy.cpp index 6177b72990a0a..1106bf5d264e5 100644 --- a/compiler-rt/test/asan/TestCases/wcsncpy.cpp +++ b/compiler-rt/test/asan/TestCases/wcsncpy.cpp @@ -1,25 +1,25 @@ -// RUN: %clangxx_asan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK -// RUN: %clangxx_asan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O0 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O1 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O2 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK +// RUN: %clangxx_asan -O3 %s -o %t && not %env_asan_opts=log_to_stderr=1 %run %t 2>&1 | FileCheck %s --check-prefix=CHECK #include #include int main() { - wchar_t *src = L"X means dog"; + const wchar_t *src = L"X means dog"; wchar_t goodDst[12]; wcsncpy(goodDst, src, 12); wchar_t badDst[7]; wcsncpy(badDst, src, 7); // This should still work. - printf("Good so far.\n"); + fprintf(stderr, "Good so far.\n"); // CHECK: Good so far. - fflush(stdout); + fflush(stderr); wcsncpy(badDst, src, 15); // Boom! - // CHECK:ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}} - // CHECK: WRITE of size {{[0-9]+}} at [[ADDR:0x[0-9a-f]+]] thread T0 - // CHECK: #0 [[ADDR:0x[0-9a-f]+]] in wcsncpy{{.*}}asan_interceptors.cpp:{{[0-9]+}} + // CHECK: ERROR: AddressSanitizer: stack-buffer-overflow on address [[ADDR:0x[0-9a-f]+]] at pc {{0x[0-9a-f]+}} bp {{0x[0-9a-f]+}} sp {{0x[0-9a-f]+}} + // CHECK: WRITE of size {{[0-9]+}} at [[ADDR]] thread T0 + // CHECK: #0 {{0x[0-9a-f]+}} in wcsncpy printf("Should have failed with ASAN error.\n"); } \ No newline at end of file From 17f6888d1771c9f61378a0a58725f3359277ddda Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 1 Oct 2025 22:38:20 -0700 Subject: [PATCH 466/878] [IR] clang-format ValueMap.h I'm planning to modify this file. --- llvm/include/llvm/IR/ValueMap.h | 105 +++++++++++++------------------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/llvm/include/llvm/IR/ValueMap.h b/llvm/include/llvm/IR/ValueMap.h index 1a11718bfcdae..97653c2282aba 100644 --- a/llvm/include/llvm/IR/ValueMap.h +++ b/llvm/include/llvm/IR/ValueMap.h @@ -42,18 +42,15 @@ namespace llvm { -template +template class ValueMapCallbackVH; -template -class ValueMapIterator; -template -class ValueMapConstIterator; +template class ValueMapIterator; +template class ValueMapConstIterator; /// This class defines the default behavior for configurable aspects of /// ValueMap<>. 
User Configs should inherit from this class to be as compatible /// as possible with future versions of ValueMap. -template -struct ValueMapConfig { +template struct ValueMapConfig { using mutex_type = MutexT; /// If FollowRAUW is true, the ValueMap will update mappings on RAUW. If it's @@ -66,21 +63,24 @@ struct ValueMapConfig { // override all the defaults. struct ExtraData {}; - template + template static void onRAUW(const ExtraDataT & /*Data*/, KeyT /*Old*/, KeyT /*New*/) {} - template - static void onDelete(const ExtraDataT &/*Data*/, KeyT /*Old*/) {} + template + static void onDelete(const ExtraDataT & /*Data*/, KeyT /*Old*/) {} /// Returns a mutex that should be acquired around any changes to the map. /// This is only acquired from the CallbackVH (and held around calls to onRAUW /// and onDelete) and not inside other ValueMap methods. NULL means that no /// mutex is necessary. - template - static mutex_type *getMutex(const ExtraDataT &/*Data*/) { return nullptr; } + template + static mutex_type *getMutex(const ExtraDataT & /*Data*/) { + return nullptr; + } }; /// See the file comment. -template> +template > class ValueMap { friend class ValueMapCallbackVH; @@ -157,9 +157,7 @@ class ValueMap { return Map.find_as(Val) == Map.end() ? 0 : 1; } - iterator find(const KeyT &Val) { - return iterator(Map.find_as(Val)); - } + iterator find(const KeyT &Val) { return iterator(Map.find_as(Val)); } const_iterator find(const KeyT &Val) const { return const_iterator(Map.find_as(Val)); } @@ -186,8 +184,7 @@ class ValueMap { } /// insert - Range insertion of pairs. - template - void insert(InputIt I, InputIt E) { + template void insert(InputIt I, InputIt E) { for (; I != E; ++I) insert(*I); } @@ -200,17 +197,13 @@ class ValueMap { Map.erase(I); return true; } - void erase(iterator I) { - return Map.erase(I.base()); - } + void erase(iterator I) { return Map.erase(I.base()); } - value_type& FindAndConstruct(const KeyT &Key) { + value_type &FindAndConstruct(const KeyT &Key) { return Map.FindAndConstruct(Wrap(Key)); } - ValueT &operator[](const KeyT &Key) { - return Map[Wrap(Key)]; - } + ValueT &operator[](const KeyT &Key) { return Map[Wrap(Key)]; } /// isPointerIntoBucketsArray - Return true if the specified pointer points /// somewhere into the ValueMap's array of buckets (i.e. either to a key or @@ -235,7 +228,7 @@ class ValueMap { // the const_cast incorrect) is if it gets inserted into the map. But then // this function must have been called from a non-const method, making the // const_cast ok. - return ValueMapCVH(key, const_cast(this)); + return ValueMapCVH(key, const_cast(this)); } }; @@ -252,7 +245,7 @@ class ValueMapCallbackVH final : public CallbackVH { ValueMapT *Map; ValueMapCallbackVH(KeyT Key, ValueMapT *Map) - : CallbackVH(const_cast(static_cast(Key))), + : CallbackVH(const_cast(static_cast(Key))), Map(Map) {} // Private constructor used to create empty/tombstone DenseMap keys. @@ -268,8 +261,8 @@ class ValueMapCallbackVH final : public CallbackVH { std::unique_lock Guard; if (M) Guard = std::unique_lock(*M); - Config::onDelete(Copy.Map->Data, Copy.Unwrap()); // May destroy *this. - Copy.Map->Map.erase(Copy); // Definitely destroys *this. + Config::onDelete(Copy.Map->Data, Copy.Unwrap()); // May destroy *this. + Copy.Map->Map.erase(Copy); // Definitely destroys *this. } void allUsesReplacedWith(Value *new_key) override { @@ -291,14 +284,14 @@ class ValueMapCallbackVH final : public CallbackVH { // removed the old mapping. 
if (I != Copy.Map->Map.end()) { ValueT Target(std::move(I->second)); - Copy.Map->Map.erase(I); // Definitely destroys *this. + Copy.Map->Map.erase(I); // Definitely destroys *this. Copy.Map->insert(std::make_pair(typed_new_key, std::move(Target))); } } } }; -template +template struct DenseMapInfo> { using VH = ValueMapCallbackVH; @@ -318,9 +311,7 @@ struct DenseMapInfo> { return DenseMapInfo::getHashValue(Val); } - static bool isEqual(const VH &LHS, const VH &RHS) { - return LHS == RHS; - } + static bool isEqual(const VH &LHS, const VH &RHS) { return LHS == RHS; } static bool isEqual(const KeyT &LHS, const VH &RHS) { return LHS == RHS.getValPtr(); @@ -347,7 +338,7 @@ template class ValueMapIterator { struct ValueTypeProxy { const KeyT first; - ValueT& second; + ValueT &second; ValueTypeProxy *operator->() { return this; } @@ -361,23 +352,19 @@ template class ValueMapIterator { return Result; } - ValueTypeProxy operator->() const { - return operator*(); - } + ValueTypeProxy operator->() const { return operator*(); } - bool operator==(const ValueMapIterator &RHS) const { - return I == RHS.I; - } - bool operator!=(const ValueMapIterator &RHS) const { - return I != RHS.I; - } + bool operator==(const ValueMapIterator &RHS) const { return I == RHS.I; } + bool operator!=(const ValueMapIterator &RHS) const { return I != RHS.I; } - inline ValueMapIterator& operator++() { // Preincrement + inline ValueMapIterator &operator++() { // Preincrement ++I; return *this; } - ValueMapIterator operator++(int) { // Postincrement - ValueMapIterator tmp = *this; ++*this; return tmp; + ValueMapIterator operator++(int) { // Postincrement + ValueMapIterator tmp = *this; + ++*this; + return tmp; } }; @@ -397,13 +384,13 @@ template class ValueMapConstIterator { ValueMapConstIterator() : I() {} ValueMapConstIterator(BaseT I) : I(I) {} ValueMapConstIterator(ValueMapIterator Other) - : I(Other.base()) {} + : I(Other.base()) {} BaseT base() const { return I; } struct ValueTypeProxy { const KeyT first; - const ValueT& second; + const ValueT &second; ValueTypeProxy *operator->() { return this; } operator std::pair() const { return std::make_pair(first, second); @@ -415,23 +402,19 @@ template class ValueMapConstIterator { return Result; } - ValueTypeProxy operator->() const { - return operator*(); - } + ValueTypeProxy operator->() const { return operator*(); } - bool operator==(const ValueMapConstIterator &RHS) const { - return I == RHS.I; - } - bool operator!=(const ValueMapConstIterator &RHS) const { - return I != RHS.I; - } + bool operator==(const ValueMapConstIterator &RHS) const { return I == RHS.I; } + bool operator!=(const ValueMapConstIterator &RHS) const { return I != RHS.I; } - inline ValueMapConstIterator& operator++() { // Preincrement + inline ValueMapConstIterator &operator++() { // Preincrement ++I; return *this; } - ValueMapConstIterator operator++(int) { // Postincrement - ValueMapConstIterator tmp = *this; ++*this; return tmp; + ValueMapConstIterator operator++(int) { // Postincrement + ValueMapConstIterator tmp = *this; + ++*this; + return tmp; } }; From eedfbbe986467b6d3b968f3af4ab7424af0303a7 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 1 Oct 2025 23:14:26 -0700 Subject: [PATCH 467/878] [AMDGPU] Update gfx1250 VOP1 tests to t16 syntax. 
NFC (#161603) --- llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 588 +++++++++---------- llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 576 +++++++++--------- llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 152 ++--- 3 files changed, 658 insertions(+), 658 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index a313741ffe22d..40fcd6f4f6955 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -73,98 +73,98 @@ v_tanh_f32 v5, src_scc v_tanh_f32 v255, 0xaf123456 // GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_tanh_f16 v5, v1 -// GFX1250: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e] +v_tanh_f16 v5.l, v1.l +// GFX1250: v_tanh_f16_e32 v5.l, v1.l ; encoding: [0x01,0x3f,0x0a,0x7e] -v_tanh_f16 v5, v127 -// GFX1250: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e] +v_tanh_f16 v5.l, v127.l +// GFX1250: v_tanh_f16_e32 v5.l, v127.l ; encoding: [0x7f,0x3f,0x0a,0x7e] -v_tanh_f16 v5, s1 -// GFX1250: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, s1 +// GFX1250: v_tanh_f16_e32 v5.l, s1 ; encoding: [0x01,0x3e,0x0a,0x7e] -v_tanh_f16 v5, s105 -// GFX1250: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, s105 +// GFX1250: v_tanh_f16_e32 v5.l, s105 ; encoding: [0x69,0x3e,0x0a,0x7e] -v_tanh_f16 v5, vcc_lo -// GFX1250: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, vcc_lo +// GFX1250: v_tanh_f16_e32 v5.l, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e] -v_tanh_f16 v5, vcc_hi -// GFX1250: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, vcc_hi +// GFX1250: v_tanh_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e] -v_tanh_f16 v5, ttmp15 -// GFX1250: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, ttmp15 +// GFX1250: v_tanh_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e] -v_tanh_f16 v5, m0 -// GFX1250: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, m0 +// GFX1250: v_tanh_f16_e32 v5.l, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e] -v_tanh_f16 v5, exec_lo -// GFX1250: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, exec_lo +// GFX1250: v_tanh_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e] -v_tanh_f16 v5, exec_hi -// GFX1250: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, exec_hi +// GFX1250: v_tanh_f16_e32 v5.l, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e] -v_tanh_f16 v5, null -// GFX1250: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, null +// GFX1250: v_tanh_f16_e32 v5.l, null ; encoding: [0x7c,0x3e,0x0a,0x7e] -v_tanh_f16 v5, -1 -// GFX1250: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, -1 +// GFX1250: v_tanh_f16_e32 v5.l, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e] -v_tanh_f16 v5, 0.5 -// GFX1250: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, 0.5 +// GFX1250: v_tanh_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e] -v_tanh_f16 v5, src_scc -// GFX1250: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e] +v_tanh_f16 v5.l, src_scc +// GFX1250: v_tanh_f16_e32 v5.l, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e] -v_tanh_f16 v127, 0x8000 -// GFX1250: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_tanh_f16 v127.l, 0x8000 +// GFX1250: v_tanh_f16_e32 v127.l, 0x8000 ; encoding: 
[0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00] v_tanh_f16 v5.h, v1.h // GFX1250: v_tanh_f16_e32 v5.h, v1.h ; encoding: [0x81,0x3f,0x0a,0x7f] -v_tanh_bf16 v5, v1 -// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e] +v_tanh_bf16 v5.l, v1.l +// GFX1250: v_tanh_bf16_e32 v5.l, v1.l ; encoding: [0x01,0x95,0x0a,0x7e] -v_tanh_bf16 v5, v127 -// GFX1250: v_tanh_bf16_e32 v5, v127 ; encoding: [0x7f,0x95,0x0a,0x7e] +v_tanh_bf16 v5.l, v127.l +// GFX1250: v_tanh_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0x95,0x0a,0x7e] -v_tanh_bf16 v5, s1 -// GFX1250: v_tanh_bf16_e32 v5, s1 ; encoding: [0x01,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, s1 +// GFX1250: v_tanh_bf16_e32 v5.l, s1 ; encoding: [0x01,0x94,0x0a,0x7e] -v_tanh_bf16 v5, s105 -// GFX1250: v_tanh_bf16_e32 v5, s105 ; encoding: [0x69,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, s105 +// GFX1250: v_tanh_bf16_e32 v5.l, s105 ; encoding: [0x69,0x94,0x0a,0x7e] -v_tanh_bf16 v5, vcc_lo -// GFX1250: v_tanh_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, vcc_lo +// GFX1250: v_tanh_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0x94,0x0a,0x7e] -v_tanh_bf16 v5, vcc_hi -// GFX1250: v_tanh_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, vcc_hi +// GFX1250: v_tanh_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0x94,0x0a,0x7e] -v_tanh_bf16 v5, ttmp15 -// GFX1250: v_tanh_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, ttmp15 +// GFX1250: v_tanh_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0x94,0x0a,0x7e] -v_tanh_bf16 v5, m0 -// GFX1250: v_tanh_bf16_e32 v5, m0 ; encoding: [0x7d,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, m0 +// GFX1250: v_tanh_bf16_e32 v5.l, m0 ; encoding: [0x7d,0x94,0x0a,0x7e] -v_tanh_bf16 v5, exec_lo -// GFX1250: v_tanh_bf16_e32 v5, exec_lo ; encoding: [0x7e,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, exec_lo +// GFX1250: v_tanh_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0x94,0x0a,0x7e] -v_tanh_bf16 v5, exec_hi -// GFX1250: v_tanh_bf16_e32 v5, exec_hi ; encoding: [0x7f,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, exec_hi +// GFX1250: v_tanh_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0x94,0x0a,0x7e] -v_tanh_bf16 v5, null -// GFX1250: v_tanh_bf16_e32 v5, null ; encoding: [0x7c,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, null +// GFX1250: v_tanh_bf16_e32 v5.l, null ; encoding: [0x7c,0x94,0x0a,0x7e] -v_tanh_bf16 v5, -1 -// GFX1250: v_tanh_bf16_e32 v5, -1 ; encoding: [0xc1,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, -1 +// GFX1250: v_tanh_bf16_e32 v5.l, -1 ; encoding: [0xc1,0x94,0x0a,0x7e] -v_tanh_bf16 v5, 0.5 -// GFX1250: v_tanh_bf16_e32 v5, 0.5 ; encoding: [0xf0,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, 0.5 +// GFX1250: v_tanh_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0x94,0x0a,0x7e] -v_tanh_bf16 v5, src_scc -// GFX1250: v_tanh_bf16_e32 v5, src_scc ; encoding: [0xfd,0x94,0x0a,0x7e] +v_tanh_bf16 v5.l, src_scc +// GFX1250: v_tanh_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0x94,0x0a,0x7e] -v_tanh_bf16 v127, 0x8000 -// GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_tanh_bf16 v127.l, 0x8000 +// GFX1250: v_tanh_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00] v_tanh_bf16 v5.h, v1.h // GFX1250: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f] @@ -214,347 +214,347 @@ v_prng_b32 v5, src_scc v_prng_b32 v255, 0xaf123456 // GFX1250: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf] -v_rcp_bf16 v5, v1 -// GFX1250: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e] +v_rcp_bf16 v5.l, v1.l +// GFX1250: v_rcp_bf16_e32 v5.l, v1.l ; encoding: 
[0x01,0xf3,0x0a,0x7e]

-v_rcp_bf16 v5, v127
-// GFX1250: v_rcp_bf16_e32 v5, v127 ; encoding: [0x7f,0xf3,0x0a,0x7e]
+v_rcp_bf16 v5.l, v127.l
+// GFX1250: v_rcp_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf3,0x0a,0x7e]

-v_rcp_bf16 v5, s1
-// GFX1250: v_rcp_bf16_e32 v5, s1 ; encoding: [0x01,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, s1
+// GFX1250: v_rcp_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, s105
-// GFX1250: v_rcp_bf16_e32 v5, s105 ; encoding: [0x69,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, s105
+// GFX1250: v_rcp_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, vcc_lo
-// GFX1250: v_rcp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, vcc_lo
+// GFX1250: v_rcp_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, vcc_hi
-// GFX1250: v_rcp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, vcc_hi
+// GFX1250: v_rcp_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, ttmp15
-// GFX1250: v_rcp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, ttmp15
+// GFX1250: v_rcp_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, m0
-// GFX1250: v_rcp_bf16_e32 v5, m0 ; encoding: [0x7d,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, m0
+// GFX1250: v_rcp_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, exec_lo
-// GFX1250: v_rcp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, exec_lo
+// GFX1250: v_rcp_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, exec_hi
-// GFX1250: v_rcp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, exec_hi
+// GFX1250: v_rcp_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, null
-// GFX1250: v_rcp_bf16_e32 v5, null ; encoding: [0x7c,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, null
+// GFX1250: v_rcp_bf16_e32 v5.l, null ; encoding: [0x7c,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, -1
-// GFX1250: v_rcp_bf16_e32 v5, -1 ; encoding: [0xc1,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, -1
+// GFX1250: v_rcp_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, 0.5
-// GFX1250: v_rcp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, 0.5
+// GFX1250: v_rcp_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf2,0x0a,0x7e]

-v_rcp_bf16 v5, src_scc
-// GFX1250: v_rcp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf2,0x0a,0x7e]
+v_rcp_bf16 v5.l, src_scc
+// GFX1250: v_rcp_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf2,0x0a,0x7e]

-v_rcp_bf16 v127, 0x8000
-// GFX1250: v_rcp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_rcp_bf16 v127.l, 0x8000
+// GFX1250: v_rcp_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00]

 v_rcp_bf16 v5.h, v1.h
 // GFX1250: v_rcp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf3,0x0a,0x7f]

-v_sqrt_bf16 v5, v1
-// GFX1250: v_sqrt_bf16_e32 v5, v1 ; encoding: [0x01,0xf5,0x0a,0x7e]
+v_sqrt_bf16 v5.l, v1.l
+// GFX1250: v_sqrt_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf5,0x0a,0x7e]

-v_sqrt_bf16 v5, v127
-// GFX1250: v_sqrt_bf16_e32 v5, v127 ; encoding: [0x7f,0xf5,0x0a,0x7e]
+v_sqrt_bf16 v5.l, v127.l
+// GFX1250: v_sqrt_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf5,0x0a,0x7e]

-v_sqrt_bf16 v5, s1
-// GFX1250: v_sqrt_bf16_e32 v5, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, s1
+// GFX1250: v_sqrt_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, s105
-// GFX1250: v_sqrt_bf16_e32 v5, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, s105
+// GFX1250: v_sqrt_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, vcc_lo
-// GFX1250: v_sqrt_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, vcc_lo
+// GFX1250: v_sqrt_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, vcc_hi
-// GFX1250: v_sqrt_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, vcc_hi
+// GFX1250: v_sqrt_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, ttmp15
-// GFX1250: v_sqrt_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, ttmp15
+// GFX1250: v_sqrt_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, m0
-// GFX1250: v_sqrt_bf16_e32 v5, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, m0
+// GFX1250: v_sqrt_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, exec_lo
-// GFX1250: v_sqrt_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, exec_lo
+// GFX1250: v_sqrt_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, exec_hi
-// GFX1250: v_sqrt_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, exec_hi
+// GFX1250: v_sqrt_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, null
-// GFX1250: v_sqrt_bf16_e32 v5, null ; encoding: [0x7c,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, null
+// GFX1250: v_sqrt_bf16_e32 v5.l, null ; encoding: [0x7c,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, -1
-// GFX1250: v_sqrt_bf16_e32 v5, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, -1
+// GFX1250: v_sqrt_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, 0.5
-// GFX1250: v_sqrt_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, 0.5
+// GFX1250: v_sqrt_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v5, src_scc
-// GFX1250: v_sqrt_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]
+v_sqrt_bf16 v5.l, src_scc
+// GFX1250: v_sqrt_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf4,0x0a,0x7e]

-v_sqrt_bf16 v127, 0x8000
-// GFX1250: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_sqrt_bf16 v127.l, 0x8000
+// GFX1250: v_sqrt_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00]

 v_sqrt_bf16 v5.h, v1.h
 // GFX1250: v_sqrt_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf5,0x0a,0x7f]

-v_rsq_bf16 v5, v1
-// GFX1250: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e]
+v_rsq_bf16 v5.l, v1.l
+// GFX1250: v_rsq_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf7,0x0a,0x7e]

-v_rsq_bf16 v5, v127
-// GFX1250: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e]
+v_rsq_bf16 v5.l, v127.l
+// GFX1250: v_rsq_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf7,0x0a,0x7e]

-v_rsq_bf16 v5, s1
-// GFX1250: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, s1
+// GFX1250: v_rsq_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, s105
-// GFX1250: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, s105
+// GFX1250: v_rsq_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, vcc_lo
-// GFX1250: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, vcc_lo
+// GFX1250: v_rsq_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, vcc_hi
-// GFX1250: v_rsq_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, vcc_hi
+// GFX1250: v_rsq_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, ttmp15
-// GFX1250: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, ttmp15
+// GFX1250: v_rsq_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, m0
-// GFX1250: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, m0
+// GFX1250: v_rsq_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, exec_lo
-// GFX1250: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, exec_lo
+// GFX1250: v_rsq_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, exec_hi
-// GFX1250: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, exec_hi
+// GFX1250: v_rsq_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, null
-// GFX1250: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, null
+// GFX1250: v_rsq_bf16_e32 v5.l, null ; encoding: [0x7c,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, -1
-// GFX1250: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, -1
+// GFX1250: v_rsq_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, 0.5
-// GFX1250: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, 0.5
+// GFX1250: v_rsq_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e]

-v_rsq_bf16 v5, src_scc
-// GFX1250: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]
+v_rsq_bf16 v5.l, src_scc
+// GFX1250: v_rsq_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e]

-v_rsq_bf16 v127, 0x8000
-// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_rsq_bf16 v127.l, 0x8000
+// GFX1250: v_rsq_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]

 v_rsq_bf16 v5.h, v1.h
 // GFX1250: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f]

-v_log_bf16 v5, v1
-// GFX1250: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e]
+v_log_bf16 v5.l, v1.l
+// GFX1250: v_log_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf9,0x0a,0x7e]

-v_log_bf16 v5, v127
-// GFX1250: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e]
+v_log_bf16 v5.l, v127.l
+// GFX1250: v_log_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf9,0x0a,0x7e]

-v_log_bf16 v5, s1
-// GFX1250: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, s1
+// GFX1250: v_log_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf8,0x0a,0x7e]

-v_log_bf16 v5, s105
-// GFX1250: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, s105
+// GFX1250: v_log_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf8,0x0a,0x7e]

-v_log_bf16 v5, vcc_lo
-// GFX1250: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, vcc_lo
+// GFX1250: v_log_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e]

-v_log_bf16 v5, vcc_hi
-// GFX1250: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, vcc_hi
+// GFX1250: v_log_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e]

-v_log_bf16 v5, ttmp15
-// GFX1250: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, ttmp15
+// GFX1250: v_log_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e]

-v_log_bf16 v5, m0
-// GFX1250: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, m0
+// GFX1250: v_log_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e]

-v_log_bf16 v5, exec_lo
-// GFX1250: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, exec_lo
+// GFX1250: v_log_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e]

-v_log_bf16 v5, exec_hi
-// GFX1250: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, exec_hi
+// GFX1250: v_log_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e]

-v_log_bf16 v5, null
-// GFX1250: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, null
+// GFX1250: v_log_bf16_e32 v5.l, null ; encoding: [0x7c,0xf8,0x0a,0x7e]

-v_log_bf16 v5, -1
-// GFX1250: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, -1
+// GFX1250: v_log_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e]

-v_log_bf16 v5, 0.5
-// GFX1250: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, 0.5
+// GFX1250: v_log_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e]

-v_log_bf16 v5, src_scc
-// GFX1250: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e]
+v_log_bf16 v5.l, src_scc
+// GFX1250: v_log_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e]

-v_log_bf16 v127, 0x8000
-// GFX1250: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_log_bf16 v127.l, 0x8000
+// GFX1250: v_log_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00]

 v_log_bf16 v5.h, v1.h
 // GFX1250: v_log_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf9,0x0a,0x7f]

-v_exp_bf16 v5, v1
-// GFX1250: v_exp_bf16_e32 v5, v1 ; encoding: [0x01,0xfb,0x0a,0x7e]
+v_exp_bf16 v5.l, v1.l
+// GFX1250: v_exp_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xfb,0x0a,0x7e]

-v_exp_bf16 v5, v127
-// GFX1250: v_exp_bf16_e32 v5, v127 ; encoding: [0x7f,0xfb,0x0a,0x7e]
+v_exp_bf16 v5.l, v127.l
+// GFX1250: v_exp_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xfb,0x0a,0x7e]

-v_exp_bf16 v5, s1
-// GFX1250: v_exp_bf16_e32 v5, s1 ; encoding: [0x01,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, s1
+// GFX1250: v_exp_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, s105
-// GFX1250: v_exp_bf16_e32 v5, s105 ; encoding: [0x69,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, s105
+// GFX1250: v_exp_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, vcc_lo
-// GFX1250: v_exp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, vcc_lo
+// GFX1250: v_exp_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, vcc_hi
-// GFX1250: v_exp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, vcc_hi
+// GFX1250: v_exp_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, ttmp15
-// GFX1250: v_exp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, ttmp15
+// GFX1250: v_exp_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, m0
-// GFX1250: v_exp_bf16_e32 v5, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, m0
+// GFX1250: v_exp_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, exec_lo
-// GFX1250: v_exp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, exec_lo
+// GFX1250: v_exp_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, exec_hi
-// GFX1250: v_exp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, exec_hi
+// GFX1250: v_exp_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, null
-// GFX1250: v_exp_bf16_e32 v5, null ; encoding: [0x7c,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, null
+// GFX1250: v_exp_bf16_e32 v5.l, null ; encoding: [0x7c,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, -1
-// GFX1250: v_exp_bf16_e32 v5, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, -1
+// GFX1250: v_exp_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, 0.5
-// GFX1250: v_exp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, 0.5
+// GFX1250: v_exp_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e]

-v_exp_bf16 v5, src_scc
-// GFX1250: v_exp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e]
+v_exp_bf16 v5.l, src_scc
+// GFX1250: v_exp_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e]

-v_exp_bf16 v127, 0x8000
-// GFX1250: v_exp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_exp_bf16 v127.l, 0x8000
+// GFX1250: v_exp_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00]

 v_exp_bf16 v5.h, v1.h
 // GFX1250: v_exp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfb,0x0a,0x7f]

-v_sin_bf16 v5, v1
-// GFX1250: v_sin_bf16_e32 v5, v1 ; encoding: [0x01,0xfd,0x0a,0x7e]
+v_sin_bf16 v5.l, v1.l
+// GFX1250: v_sin_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xfd,0x0a,0x7e]

-v_sin_bf16 v5, v127
-// GFX1250: v_sin_bf16_e32 v5, v127 ; encoding: [0x7f,0xfd,0x0a,0x7e]
+v_sin_bf16 v5.l, v127.l
+// GFX1250: v_sin_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xfd,0x0a,0x7e]

-v_sin_bf16 v5, s1
-// GFX1250: v_sin_bf16_e32 v5, s1 ; encoding: [0x01,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, s1
+// GFX1250: v_sin_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, s105
-// GFX1250: v_sin_bf16_e32 v5, s105 ; encoding: [0x69,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, s105
+// GFX1250: v_sin_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, vcc_lo
-// GFX1250: v_sin_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, vcc_lo
+// GFX1250: v_sin_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, vcc_hi
-// GFX1250: v_sin_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, vcc_hi
+// GFX1250: v_sin_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, ttmp15
-// GFX1250: v_sin_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, ttmp15
+// GFX1250: v_sin_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, m0
-// GFX1250: v_sin_bf16_e32 v5, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, m0
+// GFX1250: v_sin_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, exec_lo
-// GFX1250: v_sin_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, exec_lo
+// GFX1250: v_sin_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, exec_hi
-// GFX1250: v_sin_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, exec_hi
+// GFX1250: v_sin_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, null
-// GFX1250: v_sin_bf16_e32 v5, null ; encoding: [0x7c,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, null
+// GFX1250: v_sin_bf16_e32 v5.l, null ; encoding: [0x7c,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, -1
-// GFX1250: v_sin_bf16_e32 v5, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, -1
+// GFX1250: v_sin_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, 0.5
-// GFX1250: v_sin_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, 0.5
+// GFX1250: v_sin_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e]

-v_sin_bf16 v5, src_scc
-// GFX1250: v_sin_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e]
+v_sin_bf16 v5.l, src_scc
+// GFX1250: v_sin_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e]

-v_sin_bf16 v127, 0x8000
-// GFX1250: v_sin_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_sin_bf16 v127.l, 0x8000
+// GFX1250: v_sin_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00]

 v_sin_bf16 v5.h, v1.h
 // GFX1250: v_sin_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfd,0x0a,0x7f]

-v_cos_bf16 v5, v1
-// GFX1250: v_cos_bf16_e32 v5, v1 ; encoding: [0x01,0xff,0x0a,0x7e]
+v_cos_bf16 v5.l, v1.l
+// GFX1250: v_cos_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xff,0x0a,0x7e]

-v_cos_bf16 v5, v127
-// GFX1250: v_cos_bf16_e32 v5, v127 ; encoding: [0x7f,0xff,0x0a,0x7e]
+v_cos_bf16 v5.l, v127.l
+// GFX1250: v_cos_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xff,0x0a,0x7e]

-v_cos_bf16 v5, s1
-// GFX1250: v_cos_bf16_e32 v5, s1 ; encoding: [0x01,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, s1
+// GFX1250: v_cos_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, s105
-// GFX1250: v_cos_bf16_e32 v5, s105 ; encoding: [0x69,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, s105
+// GFX1250: v_cos_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, vcc_lo
-// GFX1250: v_cos_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, vcc_lo
+// GFX1250: v_cos_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, vcc_hi
-// GFX1250: v_cos_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, vcc_hi
+// GFX1250: v_cos_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, ttmp15
-// GFX1250: v_cos_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, ttmp15
+// GFX1250: v_cos_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, m0
-// GFX1250: v_cos_bf16_e32 v5, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, m0
+// GFX1250: v_cos_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, exec_lo
-// GFX1250: v_cos_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, exec_lo
+// GFX1250: v_cos_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, exec_hi
-// GFX1250: v_cos_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, exec_hi
+// GFX1250: v_cos_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, null
-// GFX1250: v_cos_bf16_e32 v5, null ; encoding: [0x7c,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, null
+// GFX1250: v_cos_bf16_e32 v5.l, null ; encoding: [0x7c,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, -1
-// GFX1250: v_cos_bf16_e32 v5, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, -1
+// GFX1250: v_cos_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, 0.5
-// GFX1250: v_cos_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, 0.5
+// GFX1250: v_cos_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e]

-v_cos_bf16 v5, src_scc
-// GFX1250: v_cos_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e]
+v_cos_bf16 v5.l, src_scc
+// GFX1250: v_cos_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e]

-v_cos_bf16 v127, 0x8000
-// GFX1250: v_cos_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_cos_bf16 v127.l, 0x8000
+// GFX1250: v_cos_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00]

 v_cos_bf16 v5.h, v1.h
 // GFX1250: v_cos_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xff,0x0a,0x7f]

-v_cvt_f32_bf16 v5, v1
-// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
+v_cvt_f32_bf16 v5, v1.l
+// GFX1250: v_cvt_f32_bf16_e32 v5, v1.l ; encoding: [0x01,0xe5,0x0a,0x7e]

-v_cvt_f32_bf16 v5, v127
-// GFX1250: v_cvt_f32_bf16_e32 v5, v127 ; encoding: [0x7f,0xe5,0x0a,0x7e]
+v_cvt_f32_bf16 v5, v127.l
+// GFX1250: v_cvt_f32_bf16_e32 v5, v127.l ; encoding: [0x7f,0xe5,0x0a,0x7e]

 v_cvt_f32_bf16 v5, s1
 // GFX1250: v_cvt_f32_bf16_e32 v5, s1 ; encoding: [0x01,0xe4,0x0a,0x7e]
@@ -676,11 +676,11 @@ v_cvt_pk_f32_bf8_e32 v[2:3], 3
 v_cvt_pk_f32_bf8_e32 v[4:5], 3
 // GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], 3 ; encoding: [0x83,0xde,0x08,0x7e]

-v_cvt_pk_f32_bf8_e32 v[2:3], v3
-// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e]
+v_cvt_pk_f32_bf8_e32 v[2:3], v3.l
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], v3.l ; encoding: [0x03,0xdf,0x04,0x7e]

-v_cvt_pk_f32_bf8_e32 v[4:5], v3
-// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v3 ; encoding: [0x03,0xdf,0x08,0x7e]
+v_cvt_pk_f32_bf8_e32 v[4:5], v3.l
+// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v3.l ; encoding: [0x03,0xdf,0x08,0x7e]

 v_cvt_pk_f32_bf8_e32 v[4:5], v127.h
 // GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v127.h ; encoding: [0xff,0xdf,0x08,0x7e]
@@ -703,32 +703,32 @@ v_cvt_pk_f32_fp8_e32 v[4:5], v127.h
 v_cvt_pk_f32_fp8_e32 v[4:5], v127.l
 // GFX1250: v_cvt_pk_f32_fp8_e32 v[4:5], v127.l ; encoding: [0x7f,0xdd,0x08,0x7e]

-v_sat_pk4_i4_i8 v1, v2
-// GFX1250: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e]
+v_sat_pk4_i4_i8 v1.l, v2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1.l, v2 ; encoding: [0x02,0xe7,0x02,0x7e]

-v_sat_pk4_i4_i8 v1, s2
-// GFX1250: v_sat_pk4_i4_i8_e32 v1, s2 ; encoding: [0x02,0xe6,0x02,0x7e]
+v_sat_pk4_i4_i8 v1.l, s2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1.l, s2 ; encoding: [0x02,0xe6,0x02,0x7e]

-v_sat_pk4_i4_i8 v1, 2
-// GFX1250: v_sat_pk4_i4_i8_e32 v1, 2 ; encoding: [0x82,0xe6,0x02,0x7e]
+v_sat_pk4_i4_i8 v1.l, 2
+// GFX1250: v_sat_pk4_i4_i8_e32 v1.l, 2 ; encoding: [0x82,0xe6,0x02,0x7e]

-v_sat_pk4_i4_i8 v1, 0x1234
-// GFX1250: v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00]
+v_sat_pk4_i4_i8 v1.l, 0x1234
+// GFX1250: v_sat_pk4_i4_i8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00]

 v_sat_pk4_i4_i8 v1.h, v2
 // GFX1250: v_sat_pk4_i4_i8_e32 v1.h, v2 ; encoding: [0x02,0xe7,0x02,0x7f]

-v_sat_pk4_u4_u8 v1, v2
-// GFX1250: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e]
+v_sat_pk4_u4_u8 v1.l, v2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1.l, v2 ; encoding: [0x02,0xe9,0x02,0x7e]

-v_sat_pk4_u4_u8 v1, s2
-// GFX1250: v_sat_pk4_u4_u8_e32 v1, s2 ; encoding: [0x02,0xe8,0x02,0x7e]
+v_sat_pk4_u4_u8 v1.l, s2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1.l, s2 ; encoding: [0x02,0xe8,0x02,0x7e]

-v_sat_pk4_u4_u8 v1, 2
-// GFX1250: v_sat_pk4_u4_u8_e32 v1, 2 ; encoding: [0x82,0xe8,0x02,0x7e]
+v_sat_pk4_u4_u8 v1.l, 2
+// GFX1250: v_sat_pk4_u4_u8_e32 v1.l, 2 ; encoding: [0x82,0xe8,0x02,0x7e]

-v_sat_pk4_u4_u8 v1, 0x1234
-// GFX1250: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]
+v_sat_pk4_u4_u8 v1.l, 0x1234
+// GFX1250: v_sat_pk4_u4_u8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00]

 v_sat_pk4_u4_u8 v1.h, v2
 // GFX1250: v_sat_pk4_u4_u8_e32 v1.h, v2 ; encoding: [0x02,0xe9,0x02,0x7f]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
index 0a46f2f074e10..592619f41b7b5 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
@@ -58,120 +58,120 @@ v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi
 // GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_tanh_f16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_tanh_f16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_mirror
-// GFX1250: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_mirror
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_half_mirror
-// GFX1250: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_half_mirror
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_shl:1
-// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_shl:1
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_shl:15
-// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_shl:15
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_shr:1
-// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_shr:1
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_shr:15
-// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_shr:15
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_ror:1
-// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_ror:1
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_ror:15
-// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_ror:15
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_tanh_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_tanh_f16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_tanh_f16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_tanh_f16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_tanh_f16 v5.h, v1.h quad_perm:[3,2,1,0]
 // GFX1250: v_tanh_f16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7f,0x81,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_tanh_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_tanh_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_mirror
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x40,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_half_mirror
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x41,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_shl:1
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x01,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_shl:15
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_shr:1
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x11,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_shr:15
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_ror:1
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x21,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_ror:15
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_tanh_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_tanh_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_tanh_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_tanh_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x60,0x09,0x13]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_tanh_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_tanh_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_tanh_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
@@ -230,480 +230,480 @@ v_prng_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 // GFX1250: v_prng_b32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rcp_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rcp_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_mirror
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x40,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_half_mirror
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x41,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_shl:1
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x01,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_shl:15
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_shr:1
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x11,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_shr:15
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_ror:1
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x21,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_ror:15
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rcp_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rcp_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rcp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rcp_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x60,0x09,0x13]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rcp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rcp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rcp_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_rcp_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_rcp_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
 // GFX1250: v_rcp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7f,0x81,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sqrt_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_sqrt_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sqrt_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_mirror
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x40,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_half_mirror
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x41,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_shl:1
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x01,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_shl:15
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_shr:1
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x11,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_shr:15
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_ror:1
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x21,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_ror:15
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sqrt_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sqrt_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sqrt_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sqrt_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf4,0x0a,0x7e,0x01,0x60,0x09,0x13]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_sqrt_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sqrt_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_sqrt_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_sqrt_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_sqrt_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
 // GFX1250: v_sqrt_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_rsq_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_rsq_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_mirror
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_half_mirror
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_shl:1
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_shl:15
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_shr:1
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_shr:15
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_ror:1
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_ror:15
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_rsq_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_rsq_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_rsq_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_rsq_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_rsq_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_rsq_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
 // GFX1250: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_log_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_log_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_log_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_log_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_mirror
-// GFX1250: v_log_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_half_mirror
-// GFX1250: v_log_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_shl:1
-// GFX1250: v_log_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_shl:15
-// GFX1250: v_log_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_shr:1
-// GFX1250: v_log_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_shr:15
-// GFX1250: v_log_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_ror:1
-// GFX1250: v_log_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_ror:15
-// GFX1250: v_log_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_log_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_log_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_log_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_log_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_log_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_log_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_log_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_log_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_log_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_log_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

 v_log_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
 // GFX1250: v_log_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_exp_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_exp_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_exp_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_exp_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

-v_exp_bf16 v5, v1 row_mirror
-// GFX1250: v_exp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff]
 // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_half_mirror
-// GFX1250: v_exp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_shl:1
-// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_shl:15
-// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_shr:1
-// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_shr:15
-// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_ror:1
-// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_ror:15
-// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_exp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_exp_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_exp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_exp_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_exp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_exp_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_exp_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_exp_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_exp_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
// GFX1250: v_exp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_sin_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_sin_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_mirror
-// GFX1250: v_sin_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_half_mirror
-// GFX1250: v_sin_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_shl:1
-// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_shl:15
-// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_shr:1
-// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_shr:15
-// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_ror:1
-// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_ror:15
-// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_sin_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_sin_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_sin_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_sin_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_sin_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_sin_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_sin_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_sin_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sin_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
// GFX1250: v_sin_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cos_bf16 v5.l, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cos_bf16 v5.l, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_mirror
-// GFX1250: v_cos_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_mirror
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_half_mirror
-// GFX1250: v_cos_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_half_mirror
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_shl:1
-// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_shl:1
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_shl:15
-// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_shl:15
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_shr:1
-// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_shr:1
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_shr:15
-// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_shr:15
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_ror:1
-// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_ror:1
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_ror:15
-// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_ror:15
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_cos_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cos_bf16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_cos_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cos_bf16 v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_cos_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cos_bf16 v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_cos_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_cos_bf16 v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_cos_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cos_bf16 v5.h, v1.h quad_perm:[3,2,1,0]
// GFX1250: v_cos_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0]
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+v_cvt_f32_bf16 v5, v1.l quad_perm:[3,2,1,0]
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 quad_perm:[0,1,2,3]
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+v_cvt_f32_bf16 v5, v1.l quad_perm:[0,1,2,3]
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0xe4,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_mirror
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_mirror
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x40,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_half_mirror
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_half_mirror
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x41,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_shl:1
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_shl:1
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x01,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_shl:15
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_shl:15
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x0f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_shr:1
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_shr:1
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x11,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_shr:15
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_shr:15
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_ror:1
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_ror:1
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x21,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_ror:15
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_ror:15
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x2f,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff]
+v_cvt_f32_bf16 v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x50,0x01,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+v_cvt_f32_bf16 v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x5f,0x01,0x01]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13]
+v_cvt_f32_bf16 v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x60,0x09,0x13]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX1250: v_cvt_f32_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+v_cvt_f32_bf16 v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_f32_bf16 v5, v1.h quad_perm:[3,2,1,0]
@@ -750,24 +750,24 @@ v_cvt_pk_f16_fp8 v1, v2.h quad_perm:[0,1,2,3]
// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
-// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff]
+v_sat_pk4_i4_i8 v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_i4_i8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
-// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff]
+v_sat_pk4_i4_i8 v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
+// GFX1250: v_sat_pk4_i4_i8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sat_pk4_i4_i8 v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
// GFX1250: v_sat_pk4_i4_i8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7f,0x02,0x39,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
-// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff]
+v_sat_pk4_u4_u8 v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
+// GFX1250: v_sat_pk4_u4_u8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
-// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff]
+v_sat_pk4_u4_u8 v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1
+// GFX1250: v_sat_pk4_u4_u8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sat_pk4_u4_u8 v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s
index 359aadc49ccc4..2aabe39383d12 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s
@@ -14,32 +14,32 @@ v_tanh_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_tanh_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_tanh_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_tanh_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_tanh_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_tanh_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_tanh_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_tanh_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_tanh_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_tanh_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_tanh_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_tanh_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_tanh_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_tanh_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_tanh_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_tanh_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_tanh_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
@@ -58,152 +58,152 @@ v_prng_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX1250: v_prng_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rcp_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rcp_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_rcp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_rcp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rcp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rcp_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_rcp_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_rcp_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rcp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sqrt_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sqrt_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sqrt_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sqrt_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sqrt_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sqrt_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf4,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sqrt_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sqrt_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_sqrt_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_sqrt_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sqrt_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sqrt_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rsq_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_rsq_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_rsq_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_rsq_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_rsq_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_rsq_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_rsq_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_rsq_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_log_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_log_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_log_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_log_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_log_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_log_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_log_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_log_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_log_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_exp_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_exp_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_exp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_exp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_exp_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_exp_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_exp_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_exp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sin_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_sin_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sin_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sin_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_sin_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_sin_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sin_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sin_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cos_bf16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_cos_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cos_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cos_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_cos_bf16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_cos_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cos_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cos_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7f,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_f32_bf16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
+v_cvt_f32_bf16 v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_cvt_f32_bf16_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f32_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX1250: v_cvt_f32_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
+v_cvt_f32_bf16 v127, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_f32_bf16 v5, v1.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_f32_bf16_dpp v5, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x81,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f16_bf8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cvt_f16_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_cvt_f16_bf8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cvt_f16_bf8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f16_bf8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cvt_f16_bf8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_cvt_f16_bf8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_cvt_f16_bf8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf0,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_f16_bf8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_f16_bf8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf0,0x02,0x7f,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f16_fp8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_cvt_f16_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xee,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_cvt_f16_fp8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_cvt_f16_fp8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xee,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_cvt_f16_fp8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_cvt_f16_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xee,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_cvt_f16_fp8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_cvt_f16_fp8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xee,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_cvt_f16_fp8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0]
@@ -226,24 +226,24 @@ v_cvt_pk_f16_fp8 v1, v2.h dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_sat_pk4_i4_i8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_i4_i8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_sat_pk4_i4_i8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_i4_i8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sat_pk4_i4_i8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX1250: v_sat_pk4_i4_i8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7f,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_sat_pk4_u4_u8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1250: v_sat_pk4_u4_u8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
+v_sat_pk4_u4_u8 v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX1250: v_sat_pk4_u4_u8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_sat_pk4_u4_u8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0]

From bd0f9db06ab5b4e62bd298834e933d9fac5105ed Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 1 Oct 2025 23:14:52 -0700
Subject: [PATCH 468/878] [AMDGPU] Update gfx1250 vop3_from_vop1 tests to t16
 syntax. NFC (#161609)

---
 .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s    | 2008 ++++++++---------
 .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s |  564 ++---
 .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s  |  164 +-
 3 files changed, 1368 insertions(+), 1368 deletions(-)

diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
index 8e73ecb4232e0..5ac9eb47381d6 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s
@@ -46,50 +46,50 @@ v_bfrev_b32_e64 v5, src_scc
v_bfrev_b32_e64 v255, 0xaf123456
// GFX1250: v_bfrev_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_ceil_f16_e64 v5, v1
-// GFX1250: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00]
+v_ceil_f16_e64 v5.l, v1.l
+// GFX1250: v_ceil_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00]
-v_ceil_f16_e64 v5, v255
-// GFX1250: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00]
+v_ceil_f16_e64 v5.l, v255.l
+// GFX1250: v_ceil_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00]
-v_ceil_f16_e64 v5, s1
-// GFX1250: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, s1
+// GFX1250: v_ceil_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, s105
-// GFX1250: v_ceil_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, s105
+// GFX1250: v_ceil_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, vcc_lo
-// GFX1250: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, vcc_lo
+// GFX1250: v_ceil_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, vcc_hi
-// GFX1250: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, vcc_hi
+// GFX1250: v_ceil_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, ttmp15
-// GFX1250: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, ttmp15
+// GFX1250: v_ceil_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, m0
-// GFX1250: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, m0
+// GFX1250: v_ceil_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, exec_lo
-// GFX1250: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, exec_lo
+// GFX1250: v_ceil_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, exec_hi
-// GFX1250: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, exec_hi
+// GFX1250: v_ceil_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, null
-// GFX1250: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, null
+// GFX1250: v_ceil_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, -1
-// GFX1250: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00]
+v_ceil_f16_e64 v5.l, -1
+// GFX1250: v_ceil_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00]
-v_ceil_f16_e64 v5, 0.5 mul:2
-// GFX1250: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08]
+v_ceil_f16_e64 v5.l, 0.5 mul:2
+// GFX1250: v_ceil_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08]
-v_ceil_f16_e64 v5, src_scc mul:4
-// GFX1250: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10]
+v_ceil_f16_e64 v5.l, src_scc mul:4
+// GFX1250: v_ceil_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10]
-v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX1250: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_ceil_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX1250: v_ceil_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
v_ceil_f16 v5.l, v128.l
// GFX1250: v_ceil_f16_e64 v5.l, v128.l ; encoding: [0x05,0x00,0xdc,0xd5,0x80,0x01,0x00,0x00]
@@ -268,50 +268,50 @@ v_clz_i32_u32_e64 v5, src_scc
v_clz_i32_u32_e64 v255, 0xaf123456
// GFX1250: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf]
-v_cos_f16_e64 v5, v1
-// GFX1250: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v1.l
+// GFX1250: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00]
-v_cos_f16_e64 v5, v255
-// GFX1250: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
+v_cos_f16_e64 v5.l, v255.l
+// GFX1250: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00]
-v_cos_f16_e64 v5, s1
-// GFX1250: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s1
+// GFX1250: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00]
-v_cos_f16_e64 v5, s105
-// GFX1250: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, s105
+// GFX1250: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00]
-v_cos_f16_e64 v5, vcc_lo
-// GFX1250: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_lo
+// GFX1250: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00]
-v_cos_f16_e64 v5, vcc_hi
-// GFX1250: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, vcc_hi
+// GFX1250: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00]
-v_cos_f16_e64 v5, ttmp15
-// GFX1250: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, ttmp15
+// GFX1250: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00]
-v_cos_f16_e64 v5, m0
-// GFX1250: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, m0
+// GFX1250: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00]
-v_cos_f16_e64 v5, exec_lo
-// GFX1250: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_lo
+// GFX1250: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00]
-v_cos_f16_e64 v5, exec_hi
-// GFX1250: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, exec_hi
+// GFX1250: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00]
-v_cos_f16_e64 v5, null
-// GFX1250: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, null
+// GFX1250: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00]
-v_cos_f16_e64 v5, -1
-// GFX1250: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
+v_cos_f16_e64 v5.l, -1
+// GFX1250: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00]
-v_cos_f16_e64 v5, 0.5 mul:2
-// GFX1250: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
+v_cos_f16_e64 v5.l, 0.5 mul:2
+// GFX1250: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08]
-v_cos_f16_e64 v5, src_scc mul:4
-// GFX1250: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
+v_cos_f16_e64 v5.l, src_scc mul:4
+// GFX1250: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10]
-v_cos_f16_e64 v255, -|0xfe0b| clamp div:2
-// GFX1250: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
+v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2
+// GFX1250: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00]
v_cos_f16 v5.l, v128.l
// GFX1250: v_cos_f16_e64 v5.l, v128.l ; encoding: [0x05,0x00,0xe1,0xd5,0x80,0x01,0x00,0x00]
@@ -502,11 +502,11 @@ v_cvt_pk_f32_bf8_e64 v[2:3], 3
v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0]
// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_pk_f32_bf8_e64 v[2:3], v3
-// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
+v_cvt_pk_f32_bf8_e64 v[2:3], v3.l
+// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3.l ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0]
-// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
+v_cvt_pk_f32_bf8_e64 v[2:3], v3.h op_sel:[1,0]
+// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3.h op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00]
v_cvt_pk_f32_bf8 v[2:3], v128.h
// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v128.h op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x80,0x01,0x00,0x00]
@@ -526,11 +526,11 @@ v_cvt_pk_f32_fp8_e64 v[2:3], 3
v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0]
// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00]
-v_cvt_pk_f32_fp8_e64 v[2:3], v3
-// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
+v_cvt_pk_f32_fp8_e64 v[2:3], v3.l
+// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3.l ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0]
-// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
+v_cvt_pk_f32_fp8_e64 v[2:3], v3.h op_sel:[1,0]
+// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3.h op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
v_cvt_pk_f32_fp8 v[2:3], v128.h
// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v128.h op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x80,0x01,0x00,0x00]
@@ -568,50 +568,50 @@ v_cvt_pk_f32_fp8_e64 v[4:5], v3
v_cvt_pk_f32_fp8_e64 v[4:5], v3 op_sel:[1,0]
// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], v3 op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x03,0x01,0x00,0x00]
-v_cvt_f16_f32_e64 v5, v1
-// GFX1250: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, v1
+// GFX1250: v_cvt_f16_f32_e64 v5.l, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00]
-v_cvt_f16_f32_e64 v5, v255
-// GFX1250: v_cvt_f16_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8a,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, v255
+// GFX1250: v_cvt_f16_f32_e64 v5.l, v255 ; encoding: [0x05,0x00,0x8a,0xd5,0xff,0x01,0x00,0x00]
-v_cvt_f16_f32_e64 v5, s1
-// GFX1250: v_cvt_f16_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, s1
+// GFX1250: v_cvt_f16_f32_e64 v5.l, s1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, s105
-// GFX1250: v_cvt_f16_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8a,0xd5,0x69,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, s105
+// GFX1250: v_cvt_f16_f32_e64 v5.l, s105 ; encoding: [0x05,0x00,0x8a,0xd5,0x69,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, vcc_lo
-// GFX1250: v_cvt_f16_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x6a,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, vcc_lo
+// GFX1250: v_cvt_f16_f32_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x6a,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, vcc_hi
-// GFX1250: v_cvt_f16_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x6b,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, vcc_hi
+// GFX1250: v_cvt_f16_f32_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x6b,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, ttmp15
-// GFX1250: v_cvt_f16_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8a,0xd5,0x7b,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, ttmp15
+// GFX1250: v_cvt_f16_f32_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0x8a,0xd5,0x7b,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, m0
-// GFX1250: v_cvt_f16_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8a,0xd5,0x7d,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, m0
+// GFX1250: v_cvt_f16_f32_e64 v5.l, m0 ; encoding: [0x05,0x00,0x8a,0xd5,0x7d,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, exec_lo
-// GFX1250: v_cvt_f16_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x7e,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, exec_lo
+// GFX1250: v_cvt_f16_f32_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x7e,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, exec_hi
-// GFX1250: v_cvt_f16_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x7f,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, exec_hi
+// GFX1250: v_cvt_f16_f32_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x7f,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, null
-// GFX1250: v_cvt_f16_f32_e64 v5, null ; encoding: [0x05,0x00,0x8a,0xd5,0x7c,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, null
+// GFX1250: v_cvt_f16_f32_e64 v5.l, null ; encoding: [0x05,0x00,0x8a,0xd5,0x7c,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, -1
-// GFX1250: v_cvt_f16_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8a,0xd5,0xc1,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5.l, -1
+// GFX1250: v_cvt_f16_f32_e64 v5.l, -1 ; encoding: [0x05,0x00,0x8a,0xd5,0xc1,0x00,0x00,0x00]
-v_cvt_f16_f32_e64 v5, 0.5 mul:2
-// GFX1250: v_cvt_f16_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8a,0xd5,0xf0,0x00,0x00,0x08]
+v_cvt_f16_f32_e64 v5.l, 0.5 mul:2
+// GFX1250: v_cvt_f16_f32_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0x8a,0xd5,0xf0,0x00,0x00,0x08]
-v_cvt_f16_f32_e64 v5, src_scc mul:4
-// GFX1250: v_cvt_f16_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8a,0xd5,0xfd,0x00,0x00,0x10]
+v_cvt_f16_f32_e64 v5.l, src_scc mul:4
+// GFX1250: v_cvt_f16_f32_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0x8a,0xd5,0xfd,0x00,0x00,0x10]
-v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2
-// GFX1250: v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x8a,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
+v_cvt_f16_f32_e64 v255.l, -|0xaf123456| clamp div:2
+// GFX1250: v_cvt_f16_f32_e64 v255.l, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x8a,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
v_cvt_f16_f32 v128.l, v15
// GFX1250: v_cvt_f16_f32_e64 v128.l, v15 ; encoding: [0x80,0x00,0x8a,0xd5,0x0f,0x01,0x00,0x00]
@@ -619,50 +619,50 @@ v_cvt_f16_f32 v128.l, v15
v_cvt_f16_f32 v128.h, v15
// GFX1250: v_cvt_f16_f32_e64 v128.h, v15 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0x0f,0x01,0x00,0x00]
-v_cvt_f16_i16_e64 v5, v1
-// GFX1250: v_cvt_f16_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, v1.l
+// GFX1250: v_cvt_f16_i16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x01,0x00,0x00]
-v_cvt_f16_i16_e64 v5, v255
-// GFX1250: v_cvt_f16_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, v255.l
+// GFX1250: v_cvt_f16_i16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x01,0x00,0x00]
-v_cvt_f16_i16_e64 v5, s1
-// GFX1250: v_cvt_f16_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, s1
+// GFX1250: v_cvt_f16_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, s105
-// GFX1250: v_cvt_f16_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xd1,0xd5,0x69,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, s105
+// GFX1250: v_cvt_f16_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd1,0xd5,0x69,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, vcc_lo
-// GFX1250: v_cvt_f16_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x6a,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, vcc_lo
+// GFX1250: v_cvt_f16_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x6a,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, vcc_hi
-// GFX1250: v_cvt_f16_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x6b,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, vcc_hi
+// GFX1250: v_cvt_f16_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x6b,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, ttmp15
-// GFX1250: v_cvt_f16_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd1,0xd5,0x7b,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, ttmp15
+// GFX1250: v_cvt_f16_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd1,0xd5,0x7b,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, m0
-// GFX1250: v_cvt_f16_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xd1,0xd5,0x7d,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, m0
+// GFX1250: v_cvt_f16_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd1,0xd5,0x7d,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, exec_lo
-// GFX1250: v_cvt_f16_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x7e,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, exec_lo
+// GFX1250: v_cvt_f16_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x7e,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, exec_hi
-// GFX1250: v_cvt_f16_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x7f,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, exec_hi
+// GFX1250: v_cvt_f16_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x7f,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, null
-// GFX1250: v_cvt_f16_i16_e64 v5, null ; encoding: [0x05,0x00,0xd1,0xd5,0x7c,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, null
+// GFX1250: v_cvt_f16_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xd1,0xd5,0x7c,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, -1
-// GFX1250: v_cvt_f16_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00]
+v_cvt_f16_i16_e64 v5.l, -1
+// GFX1250: v_cvt_f16_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00]
-v_cvt_f16_i16_e64 v5, 0.5 mul:2
-// GFX1250: v_cvt_f16_i16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xf0,0x00,0x00,0x08]
+v_cvt_f16_i16_e64 v5.l, 0.5 mul:2
+// GFX1250: v_cvt_f16_i16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xf0,0x00,0x00,0x08]
-v_cvt_f16_i16_e64 v5, src_scc mul:4
-// GFX1250: v_cvt_f16_i16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10]
+v_cvt_f16_i16_e64 v5.l, src_scc mul:4
+// GFX1250: v_cvt_f16_i16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10]
-v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2
-// GFX1250: v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd1,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00]
+v_cvt_f16_i16_e64 v255.l, 0xfe0b clamp div:2
+// GFX1250: v_cvt_f16_i16_e64 v255.l, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd1,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00]
v_cvt_f16_i16 v128.l, v15.l
// GFX1250: v_cvt_f16_i16_e64 v128.l, v15.l ; encoding: [0x80,0x00,0xd1,0xd5,0x0f,0x01,0x00,0x00]
@@ -670,50 +670,50 @@ v_cvt_f16_i16 v128.l, v15.l
v_cvt_f16_i16 v128.h, v15.h
// GFX1250: v_cvt_f16_i16_e64 v128.h, v15.h op_sel:[1,1] ; encoding: [0x80,0x48,0xd1,0xd5,0x0f,0x01,0x00,0x00]
-v_cvt_f16_u16_e64 v5, v1
-// GFX1250: v_cvt_f16_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x01,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, v1.l
+// GFX1250: v_cvt_f16_u16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x01,0x00,0x00]
-v_cvt_f16_u16_e64 v5, v255
-// GFX1250: v_cvt_f16_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x01,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, v255.l
+// GFX1250: v_cvt_f16_u16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x01,0x00,0x00]
-v_cvt_f16_u16_e64 v5, s1
-// GFX1250: v_cvt_f16_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x00,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, s1
+// GFX1250: v_cvt_f16_u16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x00,0x00,0x00]
-v_cvt_f16_u16_e64 v5, s105
-// GFX1250: v_cvt_f16_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xd0,0xd5,0x69,0x00,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, s105
+// GFX1250: v_cvt_f16_u16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd0,0xd5,0x69,0x00,0x00,0x00]
-v_cvt_f16_u16_e64 v5, vcc_lo
-// GFX1250: v_cvt_f16_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x6a,0x00,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, vcc_lo
+// GFX1250: v_cvt_f16_u16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x6a,0x00,0x00,0x00]
-v_cvt_f16_u16_e64 v5, vcc_hi
-// GFX1250: v_cvt_f16_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x6b,0x00,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, vcc_hi
+// GFX1250: v_cvt_f16_u16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x6b,0x00,0x00,0x00]
-v_cvt_f16_u16_e64 v5, ttmp15
-// GFX1250: v_cvt_f16_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd0,0xd5,0x7b,0x00,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, ttmp15
+// GFX1250: v_cvt_f16_u16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd0,0xd5,0x7b,0x00,0x00,0x00]
-v_cvt_f16_u16_e64 v5, m0
-// GFX1250: v_cvt_f16_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xd0,0xd5,0x7d,0x00,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, m0
+// GFX1250: v_cvt_f16_u16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd0,0xd5,0x7d,0x00,0x00,0x00]
-v_cvt_f16_u16_e64 v5, exec_lo
-// GFX1250: v_cvt_f16_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x7e,0x00,0x00,0x00]
+v_cvt_f16_u16_e64 v5.l, exec_lo
+//
GFX1250: v_cvt_f16_u16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_f16_u16_e64 v5, exec_hi -// GFX1250: v_cvt_f16_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x7f,0x00,0x00,0x00] +v_cvt_f16_u16_e64 v5.l, exec_hi +// GFX1250: v_cvt_f16_u16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_f16_u16_e64 v5, null -// GFX1250: v_cvt_f16_u16_e64 v5, null ; encoding: [0x05,0x00,0xd0,0xd5,0x7c,0x00,0x00,0x00] +v_cvt_f16_u16_e64 v5.l, null +// GFX1250: v_cvt_f16_u16_e64 v5.l, null ; encoding: [0x05,0x00,0xd0,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_f16_u16_e64 v5, -1 -// GFX1250: v_cvt_f16_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] +v_cvt_f16_u16_e64 v5.l, -1 +// GFX1250: v_cvt_f16_u16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_f16_u16_e64 v5, 0.5 mul:2 -// GFX1250: v_cvt_f16_u16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xf0,0x00,0x00,0x08] +v_cvt_f16_u16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_cvt_f16_u16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xf0,0x00,0x00,0x08] -v_cvt_f16_u16_e64 v5, src_scc mul:4 -// GFX1250: v_cvt_f16_u16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] +v_cvt_f16_u16_e64 v5.l, src_scc mul:4 +// GFX1250: v_cvt_f16_u16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] -v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 -// GFX1250: v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd0,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] +v_cvt_f16_u16_e64 v255.l, 0xfe0b clamp div:2 +// GFX1250: v_cvt_f16_u16_e64 v255.l, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd0,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] v_cvt_f16_u16 v128.l, v15.l // GFX1250: v_cvt_f16_u16_e64 v128.l, v15.l ; encoding: [0x80,0x00,0xd0,0xd5,0x0f,0x01,0x00,0x00] @@ -721,11 +721,11 @@ v_cvt_f16_u16 v128.l, v15.l v_cvt_f16_u16 v128.h, v15.h // GFX1250: v_cvt_f16_u16_e64 v128.h, v15.h op_sel:[1,1] ; encoding: [0x80,0x48,0xd0,0xd5,0x0f,0x01,0x00,0x00] -v_cvt_f32_f16_e64 v5, v1 -// GFX1250: v_cvt_f32_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x01,0x00,0x00] +v_cvt_f32_f16_e64 v5, v1.l +// GFX1250: v_cvt_f32_f16_e64 v5, v1.l ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x01,0x00,0x00] -v_cvt_f32_f16_e64 v5, v255 -// GFX1250: v_cvt_f32_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x8b,0xd5,0xff,0x01,0x00,0x00] +v_cvt_f32_f16_e64 v5, v255.l +// GFX1250: v_cvt_f32_f16_e64 v5, v255.l ; encoding: [0x05,0x00,0x8b,0xd5,0xff,0x01,0x00,0x00] v_cvt_f32_f16_e64 v5, s1 // GFX1250: v_cvt_f32_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x00,0x00,0x00] @@ -1303,50 +1303,50 @@ v_cvt_flr_i32_f32_e64 v5, src_scc v_cvt_flr_i32_f32_e64 v255, -|0xaf123456| // GFX1250: v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8d,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] -v_cvt_i16_f16_e64 v5, v1 -// GFX1250: v_cvt_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x01,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, v1.l +// GFX1250: v_cvt_i16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x01,0x00,0x00] -v_cvt_i16_f16_e64 v5, v255 -// GFX1250: v_cvt_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd3,0xd5,0xff,0x01,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, v255.l +// GFX1250: v_cvt_i16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd3,0xd5,0xff,0x01,0x00,0x00] -v_cvt_i16_f16_e64 v5, s1 -// GFX1250: v_cvt_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, s1 +// GFX1250: 
v_cvt_i16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, s105 -// GFX1250: v_cvt_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd3,0xd5,0x69,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, s105 +// GFX1250: v_cvt_i16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd3,0xd5,0x69,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, vcc_lo -// GFX1250: v_cvt_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x6a,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, vcc_lo +// GFX1250: v_cvt_i16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x6a,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, vcc_hi -// GFX1250: v_cvt_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x6b,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, vcc_hi +// GFX1250: v_cvt_i16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x6b,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, ttmp15 -// GFX1250: v_cvt_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd3,0xd5,0x7b,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, ttmp15 +// GFX1250: v_cvt_i16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd3,0xd5,0x7b,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, m0 -// GFX1250: v_cvt_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd3,0xd5,0x7d,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, m0 +// GFX1250: v_cvt_i16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd3,0xd5,0x7d,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, exec_lo -// GFX1250: v_cvt_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x7e,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, exec_lo +// GFX1250: v_cvt_i16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, exec_hi -// GFX1250: v_cvt_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x7f,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, exec_hi +// GFX1250: v_cvt_i16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, null -// GFX1250: v_cvt_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd3,0xd5,0x7c,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, null +// GFX1250: v_cvt_i16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd3,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, -1 -// GFX1250: v_cvt_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd3,0xd5,0xc1,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, -1 +// GFX1250: v_cvt_i16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd3,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, 0.5 -// GFX1250: v_cvt_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd3,0xd5,0xf0,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, 0.5 +// GFX1250: v_cvt_i16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xd3,0xd5,0xf0,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v5, src_scc -// GFX1250: v_cvt_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd3,0xd5,0xfd,0x00,0x00,0x00] +v_cvt_i16_f16_e64 v5.l, src_scc +// GFX1250: v_cvt_i16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xd3,0xd5,0xfd,0x00,0x00,0x00] -v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp -// GFX1250: v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +v_cvt_i16_f16_e64 v255.l, -|0xfe0b| clamp +// GFX1250: v_cvt_i16_f16_e64 v255.l, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] v_cvt_i16_f16 v1.l, v128.l // GFX1250: v_cvt_i16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xd3,0xd5,0x80,0x01,0x00,0x00] @@ -1435,11 +1435,11 @@ v_cvt_i32_f64_e64 v5, -|src_scc| v_cvt_i32_f64_e64 v255, 0xaf123456 clamp // GFX1250: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 
-v_cvt_i32_i16_e64 v5, v1 -// GFX1250: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v1.l +// GFX1250: v_cvt_i32_i16_e64 v5, v1.l ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] -v_cvt_i32_i16_e64 v5, v255 -// GFX1250: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +v_cvt_i32_i16_e64 v5, v255.l +// GFX1250: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] v_cvt_i32_i16_e64 v5, s1 // GFX1250: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] @@ -1531,50 +1531,50 @@ v_cvt_nearest_i32_f32_e64 v5, src_scc v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| // GFX1250: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] -v_cvt_norm_i16_f16_e64 v5, v1 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x01,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, v1.l +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x01,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, v255 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe3,0xd5,0xff,0x01,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, v255.l +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe3,0xd5,0xff,0x01,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, s1 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, s1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, s105 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe3,0xd5,0x69,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, s105 +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe3,0xd5,0x69,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, vcc_lo -// GFX1250: v_cvt_norm_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x6a,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, vcc_lo +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x6a,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, vcc_hi -// GFX1250: v_cvt_norm_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x6b,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, vcc_hi +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x6b,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, ttmp15 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe3,0xd5,0x7b,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, ttmp15 +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe3,0xd5,0x7b,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, m0 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe3,0xd5,0x7d,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, m0 +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe3,0xd5,0x7d,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, exec_lo -// GFX1250: v_cvt_norm_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x7e,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, exec_lo +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, exec_hi -// GFX1250: v_cvt_norm_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x7f,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, exec_hi +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, exec_hi ; encoding: 
[0x05,0x00,0xe3,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, null -// GFX1250: v_cvt_norm_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe3,0xd5,0x7c,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, null +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe3,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, -1 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe3,0xd5,0xc1,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, -1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe3,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, 0.5 -// GFX1250: v_cvt_norm_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe3,0xd5,0xf0,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, 0.5 +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe3,0xd5,0xf0,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v5, src_scc -// GFX1250: v_cvt_norm_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe3,0xd5,0xfd,0x00,0x00,0x00] +v_cvt_norm_i16_f16_e64 v5.l, src_scc +// GFX1250: v_cvt_norm_i16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe3,0xd5,0xfd,0x00,0x00,0x00] -v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| -// GFX1250: v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +v_cvt_norm_i16_f16_e64 v255.l, -|0xfe0b| +// GFX1250: v_cvt_norm_i16_f16_e64 v255.l, -|0xfe0b| ; encoding: [0xff,0x01,0xe3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] v_cvt_norm_i16_f16 v1.l, v128.l // GFX1250: v_cvt_norm_i16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xe3,0xd5,0x80,0x01,0x00,0x00] @@ -1582,50 +1582,50 @@ v_cvt_norm_i16_f16 v1.l, v128.l v_cvt_norm_i16_f16 v1.l, v128.h // GFX1250: v_cvt_norm_i16_f16_e64 v1.l, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xe3,0xd5,0x80,0x01,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, v1 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x01,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, v1.l +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x01,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, v255 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe4,0xd5,0xff,0x01,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, v255.l +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe4,0xd5,0xff,0x01,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, s1 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, s1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, s105 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe4,0xd5,0x69,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, s105 +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe4,0xd5,0x69,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, vcc_lo -// GFX1250: v_cvt_norm_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x6a,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, vcc_lo +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x6a,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, vcc_hi -// GFX1250: v_cvt_norm_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x6b,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, vcc_hi +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x6b,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, ttmp15 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe4,0xd5,0x7b,0x00,0x00,0x00] 
+v_cvt_norm_u16_f16_e64 v5.l, ttmp15 +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe4,0xd5,0x7b,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, m0 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe4,0xd5,0x7d,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, m0 +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe4,0xd5,0x7d,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, exec_lo -// GFX1250: v_cvt_norm_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x7e,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, exec_lo +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, exec_hi -// GFX1250: v_cvt_norm_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x7f,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, exec_hi +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, null -// GFX1250: v_cvt_norm_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe4,0xd5,0x7c,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, null +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe4,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, -1 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe4,0xd5,0xc1,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, -1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe4,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, 0.5 -// GFX1250: v_cvt_norm_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe4,0xd5,0xf0,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, 0.5 +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe4,0xd5,0xf0,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v5, src_scc -// GFX1250: v_cvt_norm_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe4,0xd5,0xfd,0x00,0x00,0x00] +v_cvt_norm_u16_f16_e64 v5.l, src_scc +// GFX1250: v_cvt_norm_u16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe4,0xd5,0xfd,0x00,0x00,0x00] -v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| -// GFX1250: v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe4,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +v_cvt_norm_u16_f16_e64 v255.l, -|0xfe0b| +// GFX1250: v_cvt_norm_u16_f16_e64 v255.l, -|0xfe0b| ; encoding: [0xff,0x01,0xe4,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] v_cvt_norm_u16_f16 v1.l, v128.l // GFX1250: v_cvt_norm_u16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xe4,0xd5,0x80,0x01,0x00,0x00] @@ -1723,50 +1723,50 @@ v_cvt_rpi_i32_f32_e64 v5, src_scc v_cvt_rpi_i32_f32_e64 v255, -|0xaf123456| // GFX1250: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] -v_cvt_u16_f16_e64 v5, v1 -// GFX1250: v_cvt_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x01,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, v1.l +// GFX1250: v_cvt_u16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x01,0x00,0x00] -v_cvt_u16_f16_e64 v5, v255 -// GFX1250: v_cvt_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd2,0xd5,0xff,0x01,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, v255.l +// GFX1250: v_cvt_u16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd2,0xd5,0xff,0x01,0x00,0x00] -v_cvt_u16_f16_e64 v5, s1 -// GFX1250: v_cvt_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, s1 +// GFX1250: v_cvt_u16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, s105 -// GFX1250: v_cvt_u16_f16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xd2,0xd5,0x69,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, s105 +// GFX1250: v_cvt_u16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd2,0xd5,0x69,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, vcc_lo -// GFX1250: v_cvt_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x6a,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, vcc_lo +// GFX1250: v_cvt_u16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x6a,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, vcc_hi -// GFX1250: v_cvt_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x6b,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, vcc_hi +// GFX1250: v_cvt_u16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x6b,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, ttmp15 -// GFX1250: v_cvt_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd2,0xd5,0x7b,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, ttmp15 +// GFX1250: v_cvt_u16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd2,0xd5,0x7b,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, m0 -// GFX1250: v_cvt_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd2,0xd5,0x7d,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, m0 +// GFX1250: v_cvt_u16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd2,0xd5,0x7d,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, exec_lo -// GFX1250: v_cvt_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x7e,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, exec_lo +// GFX1250: v_cvt_u16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, exec_hi -// GFX1250: v_cvt_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x7f,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, exec_hi +// GFX1250: v_cvt_u16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, null -// GFX1250: v_cvt_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd2,0xd5,0x7c,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, null +// GFX1250: v_cvt_u16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd2,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, -1 -// GFX1250: v_cvt_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd2,0xd5,0xc1,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, -1 +// GFX1250: v_cvt_u16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd2,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, 0.5 -// GFX1250: v_cvt_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd2,0xd5,0xf0,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, 0.5 +// GFX1250: v_cvt_u16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xd2,0xd5,0xf0,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v5, src_scc -// GFX1250: v_cvt_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd2,0xd5,0xfd,0x00,0x00,0x00] +v_cvt_u16_f16_e64 v5.l, src_scc +// GFX1250: v_cvt_u16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xd2,0xd5,0xfd,0x00,0x00,0x00] -v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp -// GFX1250: v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd2,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +v_cvt_u16_f16_e64 v255.l, -|0xfe0b| clamp +// GFX1250: v_cvt_u16_f16_e64 v255.l, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd2,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] v_cvt_u16_f16 v1.l, v128.l // GFX1250: v_cvt_u16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xd2,0xd5,0x80,0x01,0x00,0x00] @@ -1855,11 +1855,11 @@ v_cvt_u32_f64_e64 v5, -|src_scc| v_cvt_u32_f64_e64 v255, 0xaf123456 clamp // GFX1250: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cvt_u32_u16_e64 v5, v1 -// GFX1250: v_cvt_u32_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00] +v_cvt_u32_u16_e64 v5, v1.l +// GFX1250: v_cvt_u32_u16_e64 
v5, v1.l ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00] -v_cvt_u32_u16_e64 v5, v255 -// GFX1250: v_cvt_u32_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00] +v_cvt_u32_u16_e64 v5, v255.l +// GFX1250: v_cvt_u32_u16_e64 v5, v255.l ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00] v_cvt_u32_u16_e64 v5, s1 // GFX1250: v_cvt_u32_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00] @@ -1906,50 +1906,50 @@ v_cvt_u32_u16 v1, v128.l v_cvt_u32_u16 v1, v128.h // GFX1250: v_cvt_u32_u16_e64 v1, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xeb,0xd5,0x80,0x01,0x00,0x00] -v_exp_f16_e64 v5, v1 -// GFX1250: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] +v_exp_f16_e64 v5.l, v1.l +// GFX1250: v_exp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] -v_exp_f16_e64 v5, v255 -// GFX1250: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] +v_exp_f16_e64 v5.l, v255.l +// GFX1250: v_exp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] -v_exp_f16_e64 v5, s1 -// GFX1250: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, s1 +// GFX1250: v_exp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] -v_exp_f16_e64 v5, s105 -// GFX1250: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, s105 +// GFX1250: v_exp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] -v_exp_f16_e64 v5, vcc_lo -// GFX1250: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, vcc_lo +// GFX1250: v_exp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] -v_exp_f16_e64 v5, vcc_hi -// GFX1250: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, vcc_hi +// GFX1250: v_exp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] -v_exp_f16_e64 v5, ttmp15 -// GFX1250: v_exp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, ttmp15 +// GFX1250: v_exp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] -v_exp_f16_e64 v5, m0 -// GFX1250: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, m0 +// GFX1250: v_exp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] -v_exp_f16_e64 v5, exec_lo -// GFX1250: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, exec_lo +// GFX1250: v_exp_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] -v_exp_f16_e64 v5, exec_hi -// GFX1250: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, exec_hi +// GFX1250: v_exp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] -v_exp_f16_e64 v5, null -// GFX1250: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, null +// GFX1250: v_exp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] -v_exp_f16_e64 v5, -1 -// GFX1250: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] +v_exp_f16_e64 v5.l, -1 +// GFX1250: v_exp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] -v_exp_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_exp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] 
+v_exp_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_exp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] -v_exp_f16_e64 v5, src_scc mul:4 -// GFX1250: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] +v_exp_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_exp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] -v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_exp_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_exp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_exp_f16 v1.h, v128.l // GFX1250: v_exp_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd8,0xd5,0x80,0x01,0x00,0x00] @@ -2137,50 +2137,50 @@ v_ffbl_b32_e64 v5, src_scc v_ffbl_b32_e64 v255, 0xaf123456 // GFX1250: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_floor_f16_e64 v5, v1 -// GFX1250: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] +v_floor_f16_e64 v5.l, v1.l +// GFX1250: v_floor_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] -v_floor_f16_e64 v5, v255 -// GFX1250: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] +v_floor_f16_e64 v5.l, v255.l +// GFX1250: v_floor_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] -v_floor_f16_e64 v5, s1 -// GFX1250: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, s1 +// GFX1250: v_floor_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] -v_floor_f16_e64 v5, s105 -// GFX1250: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, s105 +// GFX1250: v_floor_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] -v_floor_f16_e64 v5, vcc_lo -// GFX1250: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, vcc_lo +// GFX1250: v_floor_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] -v_floor_f16_e64 v5, vcc_hi -// GFX1250: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, vcc_hi +// GFX1250: v_floor_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] -v_floor_f16_e64 v5, ttmp15 -// GFX1250: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, ttmp15 +// GFX1250: v_floor_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] -v_floor_f16_e64 v5, m0 -// GFX1250: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, m0 +// GFX1250: v_floor_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] -v_floor_f16_e64 v5, exec_lo -// GFX1250: v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, exec_lo +// GFX1250: v_floor_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] -v_floor_f16_e64 v5, exec_hi -// GFX1250: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, exec_hi +// GFX1250: v_floor_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] -v_floor_f16_e64 v5, null -// GFX1250: 
v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, null +// GFX1250: v_floor_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] -v_floor_f16_e64 v5, -1 -// GFX1250: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] +v_floor_f16_e64 v5.l, -1 +// GFX1250: v_floor_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] -v_floor_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] +v_floor_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_floor_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] -v_floor_f16_e64 v5, src_scc mul:4 -// GFX1250: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] +v_floor_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_floor_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] -v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_floor_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_floor_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_floor_f16 v1.h, v128.l // GFX1250: v_floor_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdb,0xd5,0x80,0x01,0x00,0x00] @@ -2269,50 +2269,50 @@ v_floor_f64_e64 v[6:7], -|src_scc| mul:4 v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX1250: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_fract_f16_e64 v5, v1 -// GFX1250: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v1.l +// GFX1250: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] -v_fract_f16_e64 v5, v255 -// GFX1250: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +v_fract_f16_e64 v5.l, v255.l +// GFX1250: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] -v_fract_f16_e64 v5, s1 -// GFX1250: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s1 +// GFX1250: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] -v_fract_f16_e64 v5, s105 -// GFX1250: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, s105 +// GFX1250: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_lo -// GFX1250: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_lo +// GFX1250: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] -v_fract_f16_e64 v5, vcc_hi -// GFX1250: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, vcc_hi +// GFX1250: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] -v_fract_f16_e64 v5, ttmp15 -// GFX1250: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, ttmp15 +// GFX1250: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] -v_fract_f16_e64 v5, m0 -// GFX1250: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, m0 +// 
GFX1250: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_lo -// GFX1250: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_lo +// GFX1250: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] -v_fract_f16_e64 v5, exec_hi -// GFX1250: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, exec_hi +// GFX1250: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] -v_fract_f16_e64 v5, null -// GFX1250: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, null +// GFX1250: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] -v_fract_f16_e64 v5, -1 -// GFX1250: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +v_fract_f16_e64 v5.l, -1 +// GFX1250: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] -v_fract_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +v_fract_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] -v_fract_f16_e64 v5, src_scc mul:4 -// GFX1250: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +v_fract_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] -v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_fract_f16 v1.h, v128.l // GFX1250: v_fract_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdf,0xd5,0x80,0x01,0x00,0x00] @@ -2401,50 +2401,50 @@ v_fract_f64_e64 v[6:7], -|src_scc| mul:4 v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX1250: v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbe,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_frexp_exp_i16_f16_e64 v5, v1 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x01,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, v1.l +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x01,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, v255 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xda,0xd5,0xff,0x01,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, v255.l +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xda,0xd5,0xff,0x01,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, s1 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, s1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, s105 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xda,0xd5,0x69,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, s105 +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xda,0xd5,0x69,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, vcc_lo -// GFX1250: v_frexp_exp_i16_f16_e64 v5, vcc_lo ; encoding: 
[0x05,0x00,0xda,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, vcc_lo +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xda,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, vcc_hi -// GFX1250: v_frexp_exp_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xda,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, vcc_hi +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xda,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, ttmp15 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xda,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, ttmp15 +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xda,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, m0 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xda,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, m0 +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xda,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, exec_lo -// GFX1250: v_frexp_exp_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xda,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, exec_lo +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xda,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, exec_hi -// GFX1250: v_frexp_exp_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xda,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, exec_hi +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xda,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, null -// GFX1250: v_frexp_exp_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xda,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, null +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xda,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, -1 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xda,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, -1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xda,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, 0.5 -// GFX1250: v_frexp_exp_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xda,0xd5,0xf0,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, 0.5 +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xda,0xd5,0xf0,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v5, src_scc -// GFX1250: v_frexp_exp_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xda,0xd5,0xfd,0x00,0x00,0x00] +v_frexp_exp_i16_f16_e64 v5.l, src_scc +// GFX1250: v_frexp_exp_i16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xda,0xd5,0xfd,0x00,0x00,0x00] -v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| -// GFX1250: v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xda,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +v_frexp_exp_i16_f16_e64 v255.l, -|0xfe0b| +// GFX1250: v_frexp_exp_i16_f16_e64 v255.l, -|0xfe0b| ; encoding: [0xff,0x01,0xda,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] v_frexp_exp_i16_f16 v1.h, v128.l // GFX1250: v_frexp_exp_i16_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xda,0xd5,0x80,0x01,0x00,0x00] @@ -2533,50 +2533,50 @@ v_frexp_exp_i32_f64_e64 v5, -|src_scc| v_frexp_exp_i32_f64_e64 v255, 0xaf123456 // GFX1250: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] -v_frexp_mant_f16_e64 v5, v1 -// GFX1250: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v1.l +// GFX1250: 
v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, v255 -// GFX1250: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, v255.l +// GFX1250: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] -v_frexp_mant_f16_e64 v5, s1 -// GFX1250: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s1 +// GFX1250: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, s105 -// GFX1250: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, s105 +// GFX1250: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_lo -// GFX1250: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_lo +// GFX1250: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, vcc_hi -// GFX1250: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, vcc_hi +// GFX1250: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, ttmp15 -// GFX1250: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, ttmp15 +// GFX1250: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, m0 -// GFX1250: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, m0 +// GFX1250: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_lo -// GFX1250: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_lo +// GFX1250: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, exec_hi -// GFX1250: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, exec_hi +// GFX1250: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, null -// GFX1250: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, null +// GFX1250: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, -1 -// GFX1250: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +v_frexp_mant_f16_e64 v5.l, -1 +// GFX1250: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] -v_frexp_mant_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] -v_frexp_mant_f16_e64 v5, src_scc mul:4 -// GFX1250: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +v_frexp_mant_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] 
-v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_frexp_mant_f16 v1.h, v128.l // GFX1250: v_frexp_mant_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd9,0xd5,0x80,0x01,0x00,0x00] @@ -2665,50 +2665,50 @@ v_frexp_mant_f64_e64 v[6:7], -|src_scc| mul:4 v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX1250: v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_log_f16_e64 v5, v1 -// GFX1250: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] +v_log_f16_e64 v5.l, v1.l +// GFX1250: v_log_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] -v_log_f16_e64 v5, v255 -// GFX1250: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] +v_log_f16_e64 v5.l, v255.l +// GFX1250: v_log_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] -v_log_f16_e64 v5, s1 -// GFX1250: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] +v_log_f16_e64 v5.l, s1 +// GFX1250: v_log_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] -v_log_f16_e64 v5, s105 -// GFX1250: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] +v_log_f16_e64 v5.l, s105 +// GFX1250: v_log_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] -v_log_f16_e64 v5, vcc_lo -// GFX1250: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] +v_log_f16_e64 v5.l, vcc_lo +// GFX1250: v_log_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] -v_log_f16_e64 v5, vcc_hi -// GFX1250: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] +v_log_f16_e64 v5.l, vcc_hi +// GFX1250: v_log_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] -v_log_f16_e64 v5, ttmp15 -// GFX1250: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] +v_log_f16_e64 v5.l, ttmp15 +// GFX1250: v_log_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] -v_log_f16_e64 v5, m0 -// GFX1250: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] +v_log_f16_e64 v5.l, m0 +// GFX1250: v_log_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] -v_log_f16_e64 v5, exec_lo -// GFX1250: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] +v_log_f16_e64 v5.l, exec_lo +// GFX1250: v_log_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] -v_log_f16_e64 v5, exec_hi -// GFX1250: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] +v_log_f16_e64 v5.l, exec_hi +// GFX1250: v_log_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] -v_log_f16_e64 v5, null -// GFX1250: v_log_f16_e64 v5, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] +v_log_f16_e64 v5.l, null +// GFX1250: v_log_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] -v_log_f16_e64 v5, -1 -// GFX1250: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] +v_log_f16_e64 v5.l, -1 +// GFX1250: v_log_f16_e64 v5.l, -1 ; encoding: 
[0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] -v_log_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] +v_log_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_log_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] -v_log_f16_e64 v5, src_scc mul:4 -// GFX1250: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] +v_log_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_log_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] -v_log_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_log_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_log_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_log_f16 v1.h, v128.l // GFX1250: v_log_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd7,0xd5,0x80,0x01,0x00,0x00] @@ -2872,50 +2872,50 @@ v_movrelsd_b32_e64 v255, v255 v_nop_e64 // GFX1250: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] -v_not_b16_e64 v5, v1 -// GFX1250: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v1.l +// GFX1250: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] -v_not_b16_e64 v5, v255 -// GFX1250: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +v_not_b16_e64 v5.l, v255.l +// GFX1250: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] -v_not_b16_e64 v5, s1 -// GFX1250: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s1 +// GFX1250: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] -v_not_b16_e64 v5, s105 -// GFX1250: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +v_not_b16_e64 v5.l, s105 +// GFX1250: v_not_b16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_lo -// GFX1250: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_lo +// GFX1250: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] -v_not_b16_e64 v5, vcc_hi -// GFX1250: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, vcc_hi +// GFX1250: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] -v_not_b16_e64 v5, ttmp15 -// GFX1250: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +v_not_b16_e64 v5.l, ttmp15 +// GFX1250: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] -v_not_b16_e64 v5, m0 -// GFX1250: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +v_not_b16_e64 v5.l, m0 +// GFX1250: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_lo -// GFX1250: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_lo +// GFX1250: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] -v_not_b16_e64 v5, exec_hi -// GFX1250: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +v_not_b16_e64 v5.l, exec_hi +// GFX1250: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] -v_not_b16_e64 v5, null -// 
GFX1250: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +v_not_b16_e64 v5.l, null +// GFX1250: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] -v_not_b16_e64 v5, -1 -// GFX1250: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +v_not_b16_e64 v5.l, -1 +// GFX1250: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] -v_not_b16_e64 v5, 0.5 -// GFX1250: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] +v_not_b16_e64 v5.l, 0.5 +// GFX1250: v_not_b16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] -v_not_b16_e64 v5, src_scc -// GFX1250: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +v_not_b16_e64 v5.l, src_scc +// GFX1250: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] -v_not_b16_e64 v255, 0xfe0b -// GFX1250: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_not_b16_e64 v255.l, 0xfe0b +// GFX1250: v_not_b16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] v_not_b16 v1.h, v128.l // GFX1250: v_not_b16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xe9,0xd5,0x80,0x01,0x00,0x00] @@ -2971,50 +2971,50 @@ v_not_b32_e64 v255, 0xaf123456 v_pipeflush_e64 // GFX1250: v_pipeflush ; encoding: [0x00,0x00,0x9b,0xd5,0x00,0x00,0x00,0x00] -v_rcp_f16_e64 v5, v1 -// GFX1250: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] +v_rcp_f16_e64 v5.l, v1.l +// GFX1250: v_rcp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] -v_rcp_f16_e64 v5, v255 -// GFX1250: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] +v_rcp_f16_e64 v5.l, v255.l +// GFX1250: v_rcp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] -v_rcp_f16_e64 v5, s1 -// GFX1250: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, s1 +// GFX1250: v_rcp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] -v_rcp_f16_e64 v5, s105 -// GFX1250: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, s105 +// GFX1250: v_rcp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] -v_rcp_f16_e64 v5, vcc_lo -// GFX1250: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, vcc_lo +// GFX1250: v_rcp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] -v_rcp_f16_e64 v5, vcc_hi -// GFX1250: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, vcc_hi +// GFX1250: v_rcp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] -v_rcp_f16_e64 v5, ttmp15 -// GFX1250: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, ttmp15 +// GFX1250: v_rcp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] -v_rcp_f16_e64 v5, m0 -// GFX1250: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, m0 +// GFX1250: v_rcp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] -v_rcp_f16_e64 v5, exec_lo -// GFX1250: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, exec_lo +// GFX1250: v_rcp_f16_e64 v5.l, exec_lo ; encoding: 
[0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] -v_rcp_f16_e64 v5, exec_hi -// GFX1250: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, exec_hi +// GFX1250: v_rcp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] -v_rcp_f16_e64 v5, null -// GFX1250: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, null +// GFX1250: v_rcp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] -v_rcp_f16_e64 v5, -1 -// GFX1250: v_rcp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] +v_rcp_f16_e64 v5.l, -1 +// GFX1250: v_rcp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] +v_rcp_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_rcp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] -v_rcp_f16_e64 v5, src_scc mul:4 -// GFX1250: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] +v_rcp_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_rcp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] -v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_rcp_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_rcp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_rcp_f16 v1.h, v128.l // GFX1250: v_rcp_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd4,0xd5,0x80,0x01,0x00,0x00] @@ -3148,50 +3148,50 @@ v_rcp_iflag_f32_e64 v5, src_scc mul:4 v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 // GFX1250: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] -v_rndne_f16_e64 v5, v1 -// GFX1250: v_rndne_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00] +v_rndne_f16_e64 v5.l, v1.l +// GFX1250: v_rndne_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00] -v_rndne_f16_e64 v5, v255 -// GFX1250: v_rndne_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00] +v_rndne_f16_e64 v5.l, v255.l +// GFX1250: v_rndne_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00] -v_rndne_f16_e64 v5, s1 -// GFX1250: v_rndne_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, s1 +// GFX1250: v_rndne_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00] -v_rndne_f16_e64 v5, s105 -// GFX1250: v_rndne_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, s105 +// GFX1250: v_rndne_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00] -v_rndne_f16_e64 v5, vcc_lo -// GFX1250: v_rndne_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, vcc_lo +// GFX1250: v_rndne_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00] -v_rndne_f16_e64 v5, vcc_hi -// GFX1250: v_rndne_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, vcc_hi +// GFX1250: v_rndne_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00] -v_rndne_f16_e64 v5, ttmp15 -// GFX1250: v_rndne_f16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, ttmp15 +// GFX1250: v_rndne_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00] -v_rndne_f16_e64 v5, m0 -// GFX1250: v_rndne_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, m0 +// GFX1250: v_rndne_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00] -v_rndne_f16_e64 v5, exec_lo -// GFX1250: v_rndne_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, exec_lo +// GFX1250: v_rndne_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00] -v_rndne_f16_e64 v5, exec_hi -// GFX1250: v_rndne_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, exec_hi +// GFX1250: v_rndne_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00] -v_rndne_f16_e64 v5, null -// GFX1250: v_rndne_f16_e64 v5, null ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, null +// GFX1250: v_rndne_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00] -v_rndne_f16_e64 v5, -1 -// GFX1250: v_rndne_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00] +v_rndne_f16_e64 v5.l, -1 +// GFX1250: v_rndne_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00] -v_rndne_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_rndne_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08] +v_rndne_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_rndne_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08] -v_rndne_f16_e64 v5, src_scc mul:4 -// GFX1250: v_rndne_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10] +v_rndne_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_rndne_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10] -v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_rndne_f16 v1.h, v128.l // GFX1250: v_rndne_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xde,0xd5,0x80,0x01,0x00,0x00] @@ -3280,50 +3280,50 @@ v_rndne_f64_e64 v[6:7], -|src_scc| mul:4 v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX1250: v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_rsq_f16_e64 v5, v1 -// GFX1250: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] +v_rsq_f16_e64 v5.l, v1.l +// GFX1250: v_rsq_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] -v_rsq_f16_e64 v5, v255 -// GFX1250: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] +v_rsq_f16_e64 v5.l, v255.l +// GFX1250: v_rsq_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] -v_rsq_f16_e64 v5, s1 -// GFX1250: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, s1 +// GFX1250: v_rsq_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] -v_rsq_f16_e64 v5, s105 -// GFX1250: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, s105 +// GFX1250: v_rsq_f16_e64 v5.l, s105 ; encoding: 
[0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] -v_rsq_f16_e64 v5, vcc_lo -// GFX1250: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, vcc_lo +// GFX1250: v_rsq_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] -v_rsq_f16_e64 v5, vcc_hi -// GFX1250: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, vcc_hi +// GFX1250: v_rsq_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] -v_rsq_f16_e64 v5, ttmp15 -// GFX1250: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, ttmp15 +// GFX1250: v_rsq_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] -v_rsq_f16_e64 v5, m0 -// GFX1250: v_rsq_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, m0 +// GFX1250: v_rsq_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] -v_rsq_f16_e64 v5, exec_lo -// GFX1250: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, exec_lo +// GFX1250: v_rsq_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] -v_rsq_f16_e64 v5, exec_hi -// GFX1250: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, exec_hi +// GFX1250: v_rsq_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] -v_rsq_f16_e64 v5, null -// GFX1250: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, null +// GFX1250: v_rsq_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] -v_rsq_f16_e64 v5, -1 -// GFX1250: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] +v_rsq_f16_e64 v5.l, -1 +// GFX1250: v_rsq_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] +v_rsq_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_rsq_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] -v_rsq_f16_e64 v5, src_scc mul:4 -// GFX1250: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] +v_rsq_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_rsq_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] -v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_rsq_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_rsq_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_rsq_f16 v1.h, v128.l // GFX1250: v_rsq_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd6,0xd5,0x80,0x01,0x00,0x00] @@ -3412,50 +3412,50 @@ v_rsq_f64_e64 v[6:7], -|src_scc| mul:4 v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX1250: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_sat_pk_u8_i16_e64 v5, v1 -// GFX1250: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, v1 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, v255 -// GFX1250: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] 
+v_sat_pk_u8_i16_e64 v5.l, v255 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, s1 -// GFX1250: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, s1 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, s105 -// GFX1250: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, s105 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, vcc_lo -// GFX1250: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, vcc_lo +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, vcc_hi -// GFX1250: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, vcc_hi +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, ttmp15 -// GFX1250: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, ttmp15 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, m0 -// GFX1250: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, m0 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, exec_lo -// GFX1250: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, exec_lo +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, exec_hi -// GFX1250: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, exec_hi +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, null -// GFX1250: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, null +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, -1 -// GFX1250: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, -1 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, 0.5 -// GFX1250: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, 0.5 +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v5, src_scc -// GFX1250: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5.l, src_scc +// GFX1250: v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] -v_sat_pk_u8_i16_e64 v255, 0xfe0b -// GFX1250: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +v_sat_pk_u8_i16_e64 v255.l, 0xfe0b +// GFX1250: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] 
v_sat_pk_u8_i16 v128.l, v1 // GFX1250: v_sat_pk_u8_i16_e64 v128.l, v1 ; encoding: [0x80,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] @@ -3463,50 +3463,50 @@ v_sat_pk_u8_i16 v128.l, v1 v_sat_pk_u8_i16 v128.h, v1 // GFX1250: v_sat_pk_u8_i16_e64 v128.h, v1 op_sel:[0,1] ; encoding: [0x80,0x40,0xe2,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16_e64 v5, v1 -// GFX1250: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v1.l +// GFX1250: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_sin_f16_e64 v5, v255 -// GFX1250: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +v_sin_f16_e64 v5.l, v255.l +// GFX1250: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] -v_sin_f16_e64 v5, s1 -// GFX1250: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s1 +// GFX1250: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_sin_f16_e64 v5, s105 -// GFX1250: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, s105 +// GFX1250: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_lo -// GFX1250: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_lo +// GFX1250: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_sin_f16_e64 v5, vcc_hi -// GFX1250: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, vcc_hi +// GFX1250: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_sin_f16_e64 v5, ttmp15 -// GFX1250: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, ttmp15 +// GFX1250: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_sin_f16_e64 v5, m0 -// GFX1250: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, m0 +// GFX1250: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_lo -// GFX1250: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_lo +// GFX1250: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_sin_f16_e64 v5, exec_hi -// GFX1250: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, exec_hi +// GFX1250: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_sin_f16_e64 v5, null -// GFX1250: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, null +// GFX1250: v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_sin_f16_e64 v5, -1 -// GFX1250: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16_e64 v5.l, -1 +// GFX1250: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_sin_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +v_sin_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] -v_sin_f16_e64 v5, src_scc mul:4 -// GFX1250: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +v_sin_f16_e64 v5.l, src_scc mul:4 +// GFX1250: 
v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10]
+v_sin_f16_e64 v5.l, src_scc mul:4
+// GFX1250: 
v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] -v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_sin_f16 v1.h, v128.l // GFX1250: v_sin_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xe0,0xd5,0x80,0x01,0x00,0x00] @@ -3559,50 +3559,50 @@ v_sin_f32_e64 v5, src_scc mul:4 v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 // GFX1250: v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] -v_sqrt_f16_e64 v5, v1 -// GFX1250: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] +v_sqrt_f16_e64 v5.l, v1.l +// GFX1250: v_sqrt_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] -v_sqrt_f16_e64 v5, v255 -// GFX1250: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] +v_sqrt_f16_e64 v5.l, v255.l +// GFX1250: v_sqrt_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] -v_sqrt_f16_e64 v5, s1 -// GFX1250: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, s1 +// GFX1250: v_sqrt_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, s105 -// GFX1250: v_sqrt_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, s105 +// GFX1250: v_sqrt_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, vcc_lo -// GFX1250: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, vcc_lo +// GFX1250: v_sqrt_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, vcc_hi -// GFX1250: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, vcc_hi +// GFX1250: v_sqrt_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, ttmp15 -// GFX1250: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, ttmp15 +// GFX1250: v_sqrt_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, m0 -// GFX1250: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, m0 +// GFX1250: v_sqrt_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, exec_lo -// GFX1250: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, exec_lo +// GFX1250: v_sqrt_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, exec_hi -// GFX1250: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, exec_hi +// GFX1250: v_sqrt_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, null -// GFX1250: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] +v_sqrt_f16_e64 v5.l, null +// GFX1250: v_sqrt_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, -1 -// GFX1250: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] +v_sqrt_f16_e64 
v5.l, -1 +// GFX1250: v_sqrt_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] +v_sqrt_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_sqrt_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] -v_sqrt_f16_e64 v5, src_scc mul:4 -// GFX1250: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] +v_sqrt_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_sqrt_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] -v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_sqrt_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_sqrt_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_sqrt_f16 v1.h, v128.l // GFX1250: v_sqrt_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd5,0xd5,0x80,0x01,0x00,0x00] @@ -3691,50 +3691,50 @@ v_sqrt_f64_e64 v[6:7], -|src_scc| mul:4 v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 // GFX1250: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_trunc_f16_e64 v5, v1 -// GFX1250: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v1.l +// GFX1250: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] -v_trunc_f16_e64 v5, v255 -// GFX1250: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +v_trunc_f16_e64 v5.l, v255.l +// GFX1250: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] -v_trunc_f16_e64 v5, s1 -// GFX1250: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s1 +// GFX1250: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] -v_trunc_f16_e64 v5, s105 -// GFX1250: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, s105 +// GFX1250: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_lo -// GFX1250: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_lo +// GFX1250: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] -v_trunc_f16_e64 v5, vcc_hi -// GFX1250: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, vcc_hi +// GFX1250: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, ttmp15 -// GFX1250: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, ttmp15 +// GFX1250: v_trunc_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] -v_trunc_f16_e64 v5, m0 -// GFX1250: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, m0 +// GFX1250: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] -v_trunc_f16_e64 v5, exec_lo -// GFX1250: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_lo +// GFX1250: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] 
-v_trunc_f16_e64 v5, exec_hi -// GFX1250: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, exec_hi +// GFX1250: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] -v_trunc_f16_e64 v5, null -// GFX1250: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, null +// GFX1250: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] -v_trunc_f16_e64 v5, -1 -// GFX1250: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +v_trunc_f16_e64 v5.l, -1 +// GFX1250: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] -v_trunc_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +v_trunc_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] -v_trunc_f16_e64 v5, src_scc mul:4 -// GFX1250: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +v_trunc_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] -v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 -// GFX1250: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 +// GFX1250: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] v_trunc_f16 v1.h, v128.l // GFX1250: v_trunc_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdd,0xd5,0x80,0x01,0x00,0x00] @@ -3868,98 +3868,98 @@ v_tanh_f32_e64 v5, src_scc mul:4 v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 // GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] -v_tanh_f16_e64 v5, v1 -// GFX1250: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] +v_tanh_f16_e64 v5.l, v1.l +// GFX1250: v_tanh_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] -v_tanh_f16_e64 v5, v255 -// GFX1250: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] +v_tanh_f16_e64 v5.l, v255.l +// GFX1250: v_tanh_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] -v_tanh_f16_e64 v5, s1 -// GFX1250: v_tanh_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, s1 +// GFX1250: v_tanh_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] -v_tanh_f16_e64 v5, s105 -// GFX1250: v_tanh_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, s105 +// GFX1250: v_tanh_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] -v_tanh_f16_e64 v5, vcc_lo -// GFX1250: v_tanh_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, vcc_lo +// GFX1250: v_tanh_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] -v_tanh_f16_e64 v5, vcc_hi -// GFX1250: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, vcc_hi +// GFX1250: v_tanh_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] -v_tanh_f16_e64 v5, ttmp15 -// GFX1250: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, 
ttmp15 +// GFX1250: v_tanh_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] -v_tanh_f16_e64 v5, m0 -// GFX1250: v_tanh_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, m0 +// GFX1250: v_tanh_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] -v_tanh_f16_e64 v5, exec_lo -// GFX1250: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, exec_lo +// GFX1250: v_tanh_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] -v_tanh_f16_e64 v5, exec_hi -// GFX1250: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, exec_hi +// GFX1250: v_tanh_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] -v_tanh_f16_e64 v5, null -// GFX1250: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, null +// GFX1250: v_tanh_f16_e64 v5.l, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] -v_tanh_f16_e64 v5, -1 -// GFX1250: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] +v_tanh_f16_e64 v5.l, -1 +// GFX1250: v_tanh_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_f16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] +v_tanh_f16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_tanh_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] -v_tanh_f16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] +v_tanh_f16_e64 v5.l, src_scc mul:4 +// GFX1250: v_tanh_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] -v_tanh_f16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_tanh_f16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_tanh_f16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_tanh_f16 v5.l, v128.h // GFX1250: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, v1 -// GFX1250: v_tanh_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x01,0x00,0x00] +v_tanh_bf16_e64 v5.l, v1.l +// GFX1250: v_tanh_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, v255 -// GFX1250: v_tanh_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xca,0xd5,0xff,0x01,0x00,0x00] +v_tanh_bf16_e64 v5.l, v255.l +// GFX1250: v_tanh_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xca,0xd5,0xff,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, s1 -// GFX1250: v_tanh_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, s1 +// GFX1250: v_tanh_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, s105 -// GFX1250: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, s105 +// GFX1250: v_tanh_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, vcc_lo -// GFX1250: v_tanh_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xca,0xd5,0x6a,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, vcc_lo +// GFX1250: v_tanh_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xca,0xd5,0x6a,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, vcc_hi -// GFX1250: 
v_tanh_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xca,0xd5,0x6b,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, vcc_hi +// GFX1250: v_tanh_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xca,0xd5,0x6b,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, ttmp15 -// GFX1250: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, ttmp15 +// GFX1250: v_tanh_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, m0 -// GFX1250: v_tanh_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xca,0xd5,0x7d,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, m0 +// GFX1250: v_tanh_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xca,0xd5,0x7d,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, exec_lo -// GFX1250: v_tanh_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xca,0xd5,0x7e,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, exec_lo +// GFX1250: v_tanh_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xca,0xd5,0x7e,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, exec_hi -// GFX1250: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, exec_hi +// GFX1250: v_tanh_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, null -// GFX1250: v_tanh_bf16_e64 v5, null ; encoding: [0x05,0x00,0xca,0xd5,0x7c,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, null +// GFX1250: v_tanh_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xca,0xd5,0x7c,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, -1 -// GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] +v_tanh_bf16_e64 v5.l, -1 +// GFX1250: v_tanh_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] +v_tanh_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_tanh_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] +v_tanh_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_tanh_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_tanh_bf16 v5.l, v128.h // GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] @@ -4000,347 +4000,347 @@ v_prng_b32_e64 v5, null v_prng_b32_e64 v5, -1 // GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, v1 -// GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] +v_rcp_bf16_e64 v5.l, v1.l +// GFX1250: v_rcp_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] -v_rcp_bf16_e64 v5, v255 -// GFX1250: v_rcp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf9,0xd5,0xff,0x01,0x00,0x00] +v_rcp_bf16_e64 v5.l, v255.l +// GFX1250: v_rcp_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xf9,0xd5,0xff,0x01,0x00,0x00] -v_rcp_bf16_e64 v5, s1 -// GFX1250: v_rcp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, s1 +// GFX1250: v_rcp_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x00,0x00,0x00] 
-v_rcp_bf16_e64 v5, s105 -// GFX1250: v_rcp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, s105 +// GFX1250: v_rcp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xf9,0xd5,0x69,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, vcc_lo -// GFX1250: v_rcp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x6a,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, vcc_lo +// GFX1250: v_rcp_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x6a,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, vcc_hi -// GFX1250: v_rcp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x6b,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, vcc_hi +// GFX1250: v_rcp_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x6b,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, ttmp15 -// GFX1250: v_rcp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, ttmp15 +// GFX1250: v_rcp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xf9,0xd5,0x7b,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, m0 -// GFX1250: v_rcp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xf9,0xd5,0x7d,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, m0 +// GFX1250: v_rcp_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xf9,0xd5,0x7d,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, exec_lo -// GFX1250: v_rcp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x7e,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, exec_lo +// GFX1250: v_rcp_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xf9,0xd5,0x7e,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, exec_hi -// GFX1250: v_rcp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, exec_hi +// GFX1250: v_rcp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xf9,0xd5,0x7f,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, null -// GFX1250: v_rcp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xf9,0xd5,0x7c,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, null +// GFX1250: v_rcp_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xf9,0xd5,0x7c,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, -1 -// GFX1250: v_rcp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] +v_rcp_bf16_e64 v5.l, -1 +// GFX1250: v_rcp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xf9,0xd5,0xc1,0x00,0x00,0x00] -v_rcp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rcp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] +v_rcp_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_rcp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xf9,0xd5,0xf0,0x00,0x00,0x08] -v_rcp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rcp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] +v_rcp_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_rcp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xf9,0xd5,0xfd,0x00,0x00,0x10] -v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_rcp_bf16 v5.h, v128.h // GFX1250: v_rcp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xf9,0xd5,0x80,0x01,0x00,0x00] -v_sqrt_bf16_e64 v5, v1 -// GFX1250: v_sqrt_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] +v_sqrt_bf16_e64 v5.l, v1.l +// GFX1250: v_sqrt_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x01,0x00,0x00] -v_sqrt_bf16_e64 v5, v255 -// GFX1250: v_sqrt_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfa,0xd5,0xff,0x01,0x00,0x00] +v_sqrt_bf16_e64 v5.l, v255.l +// 
GFX1250: v_sqrt_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfa,0xd5,0xff,0x01,0x00,0x00] -v_sqrt_bf16_e64 v5, s1 -// GFX1250: v_sqrt_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, s1 +// GFX1250: v_sqrt_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfa,0xd5,0x01,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, s105 -// GFX1250: v_sqrt_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, s105 +// GFX1250: v_sqrt_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfa,0xd5,0x69,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, vcc_lo -// GFX1250: v_sqrt_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x6a,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, vcc_lo +// GFX1250: v_sqrt_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x6a,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, vcc_hi -// GFX1250: v_sqrt_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x6b,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, vcc_hi +// GFX1250: v_sqrt_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x6b,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, ttmp15 -// GFX1250: v_sqrt_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, ttmp15 +// GFX1250: v_sqrt_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfa,0xd5,0x7b,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, m0 -// GFX1250: v_sqrt_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfa,0xd5,0x7d,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, m0 +// GFX1250: v_sqrt_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfa,0xd5,0x7d,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, exec_lo -// GFX1250: v_sqrt_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x7e,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, exec_lo +// GFX1250: v_sqrt_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfa,0xd5,0x7e,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, exec_hi -// GFX1250: v_sqrt_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, exec_hi +// GFX1250: v_sqrt_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfa,0xd5,0x7f,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, null -// GFX1250: v_sqrt_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfa,0xd5,0x7c,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, null +// GFX1250: v_sqrt_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfa,0xd5,0x7c,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, -1 -// GFX1250: v_sqrt_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] +v_sqrt_bf16_e64 v5.l, -1 +// GFX1250: v_sqrt_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfa,0xd5,0xc1,0x00,0x00,0x00] -v_sqrt_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sqrt_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] +v_sqrt_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_sqrt_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfa,0xd5,0xf0,0x00,0x00,0x08] -v_sqrt_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sqrt_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] +v_sqrt_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_sqrt_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfa,0xd5,0xfd,0x00,0x00,0x10] -v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_sqrt_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_sqrt_bf16 v5.h, v128.h // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] 
-v_rsq_bf16_e64 v5, v1 -// GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] +v_rsq_bf16_e64 v5.l, v1.l +// GFX1250: v_rsq_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] -v_rsq_bf16_e64 v5, v255 -// GFX1250: v_rsq_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] +v_rsq_bf16_e64 v5.l, v255.l +// GFX1250: v_rsq_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] -v_rsq_bf16_e64 v5, s1 -// GFX1250: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, s1 +// GFX1250: v_rsq_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, s105 -// GFX1250: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, s105 +// GFX1250: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, vcc_lo -// GFX1250: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, vcc_lo +// GFX1250: v_rsq_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, vcc_hi -// GFX1250: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, vcc_hi +// GFX1250: v_rsq_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, ttmp15 -// GFX1250: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, ttmp15 +// GFX1250: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, m0 -// GFX1250: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, m0 +// GFX1250: v_rsq_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, exec_lo -// GFX1250: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, exec_lo +// GFX1250: v_rsq_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, exec_hi -// GFX1250: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, exec_hi +// GFX1250: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, null -// GFX1250: v_rsq_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, null +// GFX1250: v_rsq_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, -1 -// GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] +v_rsq_bf16_e64 v5.l, -1 +// GFX1250: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] -v_rsq_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] +v_rsq_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] -v_rsq_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] +v_rsq_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] -v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: 
[0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_rsq_bf16 v5.h, v128.h // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] -v_log_bf16_e64 v5, v1 -// GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] +v_log_bf16_e64 v5.l, v1.l +// GFX1250: v_log_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] -v_log_bf16_e64 v5, v255 -// GFX1250: v_log_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] +v_log_bf16_e64 v5.l, v255.l +// GFX1250: v_log_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] -v_log_bf16_e64 v5, s1 -// GFX1250: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, s1 +// GFX1250: v_log_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] -v_log_bf16_e64 v5, s105 -// GFX1250: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, s105 +// GFX1250: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] -v_log_bf16_e64 v5, vcc_lo -// GFX1250: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, vcc_lo +// GFX1250: v_log_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] -v_log_bf16_e64 v5, vcc_hi -// GFX1250: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, vcc_hi +// GFX1250: v_log_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] -v_log_bf16_e64 v5, ttmp15 -// GFX1250: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, ttmp15 +// GFX1250: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] -v_log_bf16_e64 v5, m0 -// GFX1250: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, m0 +// GFX1250: v_log_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] -v_log_bf16_e64 v5, exec_lo -// GFX1250: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, exec_lo +// GFX1250: v_log_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] -v_log_bf16_e64 v5, exec_hi -// GFX1250: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, exec_hi +// GFX1250: v_log_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] -v_log_bf16_e64 v5, null -// GFX1250: v_log_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, null +// GFX1250: v_log_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] -v_log_bf16_e64 v5, -1 -// GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] +v_log_bf16_e64 v5.l, -1 +// GFX1250: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] -v_log_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] +v_log_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] -v_log_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_log_bf16_e64 v5, 
src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] +v_log_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] -v_log_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_log_bf16 v5.h, v128.h // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] -v_exp_bf16_e64 v5, v1 -// GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] +v_exp_bf16_e64 v5.l, v1.l +// GFX1250: v_exp_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] -v_exp_bf16_e64 v5, v255 -// GFX1250: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] +v_exp_bf16_e64 v5.l, v255.l +// GFX1250: v_exp_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] -v_exp_bf16_e64 v5, s1 -// GFX1250: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, s1 +// GFX1250: v_exp_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] -v_exp_bf16_e64 v5, s105 -// GFX1250: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, s105 +// GFX1250: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] -v_exp_bf16_e64 v5, vcc_lo -// GFX1250: v_exp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, vcc_lo +// GFX1250: v_exp_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] -v_exp_bf16_e64 v5, vcc_hi -// GFX1250: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, vcc_hi +// GFX1250: v_exp_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] -v_exp_bf16_e64 v5, ttmp15 -// GFX1250: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, ttmp15 +// GFX1250: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] -v_exp_bf16_e64 v5, m0 -// GFX1250: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, m0 +// GFX1250: v_exp_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] -v_exp_bf16_e64 v5, exec_lo -// GFX1250: v_exp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, exec_lo +// GFX1250: v_exp_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] -v_exp_bf16_e64 v5, exec_hi -// GFX1250: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, exec_hi +// GFX1250: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] -v_exp_bf16_e64 v5, null -// GFX1250: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, null +// GFX1250: v_exp_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] -v_exp_bf16_e64 v5, -1 -// GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] +v_exp_bf16_e64 v5.l, -1 +// GFX1250: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] 
-v_exp_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] +v_exp_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] -v_exp_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] +v_exp_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] -v_exp_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_exp_bf16 v5.h, v128.h // GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] -v_sin_bf16_e64 v5, v1 -// GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] +v_sin_bf16_e64 v5.l, v1.l +// GFX1250: v_sin_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] -v_sin_bf16_e64 v5, v255 -// GFX1250: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] +v_sin_bf16_e64 v5.l, v255.l +// GFX1250: v_sin_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] -v_sin_bf16_e64 v5, s1 -// GFX1250: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, s1 +// GFX1250: v_sin_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] -v_sin_bf16_e64 v5, s105 -// GFX1250: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, s105 +// GFX1250: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] -v_sin_bf16_e64 v5, vcc_lo -// GFX1250: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, vcc_lo +// GFX1250: v_sin_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] -v_sin_bf16_e64 v5, vcc_hi -// GFX1250: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, vcc_hi +// GFX1250: v_sin_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] -v_sin_bf16_e64 v5, ttmp15 -// GFX1250: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, ttmp15 +// GFX1250: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] -v_sin_bf16_e64 v5, m0 -// GFX1250: v_sin_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, m0 +// GFX1250: v_sin_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] -v_sin_bf16_e64 v5, exec_lo -// GFX1250: v_sin_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, exec_lo +// GFX1250: v_sin_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] -v_sin_bf16_e64 v5, exec_hi -// GFX1250: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, exec_hi +// GFX1250: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] -v_sin_bf16_e64 v5, null -// GFX1250: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, null +// 
GFX1250: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00]
+v_sin_bf16_e64 v5.l, null
+// 
GFX1250: v_sin_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] -v_sin_bf16_e64 v5, -1 -// GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] +v_sin_bf16_e64 v5.l, -1 +// GFX1250: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] -v_sin_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] +v_sin_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] -v_sin_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] +v_sin_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] -v_sin_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_sin_bf16 v5.h, v128.h // GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] -v_cos_bf16_e64 v5, v1 -// GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] +v_cos_bf16_e64 v5.l, v1.l +// GFX1250: v_cos_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] -v_cos_bf16_e64 v5, v255 -// GFX1250: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] +v_cos_bf16_e64 v5.l, v255.l +// GFX1250: v_cos_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] -v_cos_bf16_e64 v5, s1 -// GFX1250: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, s1 +// GFX1250: v_cos_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] -v_cos_bf16_e64 v5, s105 -// GFX1250: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, s105 +// GFX1250: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] -v_cos_bf16_e64 v5, vcc_lo -// GFX1250: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, vcc_lo +// GFX1250: v_cos_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] -v_cos_bf16_e64 v5, vcc_hi -// GFX1250: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, vcc_hi +// GFX1250: v_cos_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] -v_cos_bf16_e64 v5, ttmp15 -// GFX1250: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, ttmp15 +// GFX1250: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] -v_cos_bf16_e64 v5, m0 -// GFX1250: v_cos_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, m0 +// GFX1250: v_cos_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] -v_cos_bf16_e64 v5, exec_lo -// GFX1250: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, exec_lo +// GFX1250: v_cos_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] -v_cos_bf16_e64 v5, exec_hi -// GFX1250: v_cos_bf16_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, exec_hi +// GFX1250: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] -v_cos_bf16_e64 v5, null -// GFX1250: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, null +// GFX1250: v_cos_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] -v_cos_bf16_e64 v5, -1 -// GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] +v_cos_bf16_e64 v5.l, -1 +// GFX1250: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] -v_cos_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] +v_cos_bf16_e64 v5.l, 0.5 mul:2 +// GFX1250: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] -v_cos_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] +v_cos_bf16_e64 v5.l, src_scc mul:4 +// GFX1250: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] -v_cos_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 +// GFX1250: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] v_cos_bf16_e64 v5.h, v128.h // GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] -v_cvt_f32_bf16_e64 v5, v1 -// GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] +v_cvt_f32_bf16_e64 v5, v1.l +// GFX1250: v_cvt_f32_bf16_e64 v5, v1.l ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] -v_cvt_f32_bf16_e64 v5, v255 -// GFX1250: v_cvt_f32_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xf2,0xd5,0xff,0x01,0x00,0x00] +v_cvt_f32_bf16_e64 v5, v255.l +// GFX1250: v_cvt_f32_bf16_e64 v5, v255.l ; encoding: [0x05,0x00,0xf2,0xd5,0xff,0x01,0x00,0x00] v_cvt_f32_bf16_e64 v5, s1 // GFX1250: v_cvt_f32_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x00,0x00,0x00] @@ -4372,11 +4372,11 @@ v_cvt_f32_bf16_e64 v5, null v_cvt_f32_bf16_e64 v5, -1 // GFX1250: v_cvt_f32_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf2,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_f32_bf16_e64 v5, v1 op_sel:[1] -// GFX1250: v_cvt_f32_bf16_e64 v5, v1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x01,0x00,0x00] +v_cvt_f32_bf16_e64 v5, v1.h op_sel:[1,0] +// GFX1250: v_cvt_f32_bf16_e64 v5, v1.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x01,0x00,0x00] -v_cvt_f32_bf16_e64 v5, v255 op_sel:[1] -// GFX1250: v_cvt_f32_bf16_e64 v5, v255 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] +v_cvt_f32_bf16_e64 v5, v255.h op_sel:[1,0] +// GFX1250: v_cvt_f32_bf16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] v_cvt_f32_bf16_e64 v5, s1 op_sel:[1] // GFX1250: v_cvt_f32_bf16_e64 v5, s1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x00,0x00,0x00] @@ -4492,32 +4492,32 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] // GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] -v_sat_pk4_i4_i8 v150, v2 -// GFX1250: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00] +v_sat_pk4_i4_i8 v150.l, v2 +// GFX1250: v_sat_pk4_i4_i8_e64 
v150.l, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00] -v_sat_pk4_i4_i8 v150, s2 -// GFX1250: v_sat_pk4_i4_i8_e64 v150, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00] +v_sat_pk4_i4_i8 v150.l, s2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00] -v_sat_pk4_i4_i8 v150, 2 -// GFX1250: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00] +v_sat_pk4_i4_i8 v150.l, 2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00] -v_sat_pk4_i4_i8 v150, 0x1234 -// GFX1250: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +v_sat_pk4_i4_i8 v150.l, 0x1234 +// GFX1250: v_sat_pk4_i4_i8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] v_sat_pk4_i4_i8 v150.h, v2 // GFX1250: v_sat_pk4_i4_i8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf3,0xd5,0x02,0x01,0x00,0x00] -v_sat_pk4_u4_u8 v150, v2 -// GFX1250: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] +v_sat_pk4_u4_u8 v150.l, v2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] -v_sat_pk4_u4_u8 v150, s2 -// GFX1250: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00] +v_sat_pk4_u4_u8 v150.l, s2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00] -v_sat_pk4_u4_u8 v150, 2 -// GFX1250: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00] +v_sat_pk4_u4_u8 v150.l, 2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00] -v_sat_pk4_u4_u8 v150, 0x1234 -// GFX1250: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +v_sat_pk4_u4_u8 v150.l, 0x1234 +// GFX1250: v_sat_pk4_u4_u8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] v_sat_pk4_u4_u8 v150.h, v2 // GFX1250: v_sat_pk4_u4_u8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index f14705fa9143c..d1638565a386a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -58,120 +58,120 @@ v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask // GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_tanh_f16_e64_dpp v5.l, 
v1.l quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_mirror -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_ror:15 -// GFX1250: 
v_tanh_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_tanh_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_tanh_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_tanh_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_tanh_f16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp 
v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, 
v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] @@ -222,468 +222,468 @@ v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf // GFX1250: v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // 
GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_rcp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: 
error: instruction not supported on this GPU v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_log_bf16_e64_dpp v255, 
-|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_sin_bf16_e64_dpp v5, 
v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] 
+v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_mirror +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_half_mirror +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_shl:1 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_shl:15 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_shr:1 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_shr:15 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_ror:1 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_ror:15 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_mirror -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_mirror +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_half_mirror -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_half_mirror +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:1 -// GFX1250: 
v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_shl:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:15 -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_shl:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:1 -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_shr:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:15 -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_shr:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:1 -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_ror:1 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:15 -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_ror:15 +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +v_cvt_f32_bf16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_cvt_f32_bf16_e64_dpp v5, v128.h quad_perm:[3,2,1,0] @@ -766,24 +766,24 @@ v_cvt_pk_f16_fp8 v1, v128.h quad_perm:[0,1,2,3] // GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_i4_i8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf -// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +v_sat_pk4_i4_i8 v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_i4_i8 v150, v2 row_share:1 fi:1 -// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +v_sat_pk4_i4_i8 v150.l, v2 row_share:1 fi:1 +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sat_pk4_i4_i8 v150.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf // GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_u4_u8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf -// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +v_sat_pk4_u4_u8 v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_u4_u8 v150, v2 row_share:1 fi:1 -// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +v_sat_pk4_u4_u8 v150.l, v2 row_share:1 fi:1 +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sat_pk4_u4_u8 v150.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 0414421f0a906..6ec4d5f48f8b1 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -18,40 +18,40 @@ v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_tanh_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_tanh_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_tanh_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_tanh_f16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] @@ -62,140 +62,140 @@ v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_rcp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sqrt_bf16_e64_dpp 
v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; 
encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // 
GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] @@ -262,8 +262,8 @@ v_cvt_f16_fp8 v128.l, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f16_fp8_e64_dpp v128.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x80,0x00,0xf7,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +v_cvt_f32_bf16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_cvt_f32_bf16_e64_dpp v5, v128.h dpp8:[7,6,5,4,3,2,1,0] @@ -298,24 +298,24 @@ v_cvt_pk_f16_fp8 v1, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +v_sat_pk4_i4_i8 v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: 
v_sat_pk4_i4_i8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +v_sat_pk4_i4_i8 v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sat_pk4_i4_i8 v150.h, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +v_sat_pk4_u4_u8 v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +v_sat_pk4_u4_u8 v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_sat_pk4_u4_u8 v150.h, v2 dpp8:[7,6,5,4,3,2,1,0] From 912a92a8098a425c63be3cf3251f3cab68d229c4 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 1 Oct 2025 23:15:11 -0700 Subject: [PATCH 469/878] [AMDGPU] Update VOP3 gfx1250 tests to t16 syntax. 
NFC (#161611) --- llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 84 ++++++++++---------- llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s | 72 ++++++++--------- 2 files changed, 78 insertions(+), 78 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s index b4d4e365d0453..98d07ac1ece27 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s @@ -52,32 +52,32 @@ v_bitop3_b32 v255, 0xaf123456, vcc_hi, null bitop3:103 v_bitop3_b16 v5.l, v1.l, v2.l, s3 // GFX1250: v_bitop3_b16 v5.l, v1.l, v2.l, s3 ; encoding: [0x05,0x00,0x33,0xd6,0x01,0x05,0x0e,0x00] -v_bitop3_b16 v5, v1, v2, s3 bitop3:161 -// GFX1250: v_bitop3_b16 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0x05,0x0e,0x30] +v_bitop3_b16 v5.l, v1.l, v2.l, s3 bitop3:161 +// GFX1250: v_bitop3_b16 v5.l, v1.l, v2.l, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0x05,0x0e,0x30] -v_bitop3_b16 v5, v255, s2, s105 bitop3:0x27 -// GFX1250: v_bitop3_b16 v5, v255, s2, s105 bitop3:0x27 ; encoding: [0x05,0x04,0x33,0xd6,0xff,0x05,0xa4,0xe1] +v_bitop3_b16 v5.l, v255.l, s2, s105 bitop3:0x27 +// GFX1250: v_bitop3_b16 v5.l, v255.l, s2, s105 bitop3:0x27 ; encoding: [0x05,0x04,0x33,0xd6,0xff,0x05,0xa4,0xe1] -v_bitop3_b16 v5, s1, v255, exec_hi bitop3:100 -// GFX1250: v_bitop3_b16 v5, s1, v255, exec_hi bitop3:0x64 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0xfe,0xff,0x89] +v_bitop3_b16 v5.l, s1, v255.l, exec_hi bitop3:100 +// GFX1250: v_bitop3_b16 v5.l, s1, v255.l, exec_hi bitop3:0x64 ; encoding: [0x05,0x04,0x33,0xd6,0x01,0xfe,0xff,0x89] -v_bitop3_b16 v5, s105, s105, exec_lo bitop3:0 -// GFX1250: v_bitop3_b16 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x33,0xd6,0x69,0xd2,0xf8,0x01] +v_bitop3_b16 v5.l, s105, s105, exec_lo bitop3:0 +// GFX1250: v_bitop3_b16 v5.l, s105, s105, exec_lo ; encoding: [0x05,0x00,0x33,0xd6,0x69,0xd2,0xf8,0x01] -v_bitop3_b16 v5, vcc_lo, ttmp15, v3 bitop3:0x15 -// GFX1250: v_bitop3_b16 v5, vcc_lo, ttmp15, v3 bitop3:0x15 ; encoding: [0x05,0x02,0x33,0xd6,0x6a,0xf6,0x0c,0xa4] +v_bitop3_b16 v5.l, vcc_lo, ttmp15, v3.l bitop3:0x15 +// GFX1250: v_bitop3_b16 v5.l, vcc_lo, ttmp15, v3.l bitop3:0x15 ; encoding: [0x05,0x02,0x33,0xd6,0x6a,0xf6,0x0c,0xa4] -v_bitop3_b16 v5, vcc_hi, 0xfe0b, v255 bitop3:63 -// GFX1250: v_bitop3_b16 v5, vcc_hi, 0xfe0b, v255 bitop3:0x3f ; encoding: [0x05,0x07,0x33,0xd6,0x6b,0xfe,0xfd,0xe7,0x0b,0xfe,0x00,0x00] +v_bitop3_b16 v5.l, vcc_hi, 0xfe0b, v255.l bitop3:63 +// GFX1250: v_bitop3_b16 v5.l, vcc_hi, 0xfe0b, v255.l bitop3:0x3f ; encoding: [0x05,0x07,0x33,0xd6,0x6b,0xfe,0xfd,0xe7,0x0b,0xfe,0x00,0x00] -v_bitop3_b16 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 -// GFX1250: v_bitop3_b16 v5, ttmp15, src_scc, ttmp15 bitop3:0x24 ; encoding: [0x05,0x04,0x33,0xd6,0x7b,0xfa,0xed,0x81] +v_bitop3_b16 v5.l, ttmp15, src_scc, ttmp15 bitop3:0x24 +// GFX1250: v_bitop3_b16 v5.l, ttmp15, src_scc, ttmp15 bitop3:0x24 ; encoding: [0x05,0x04,0x33,0xd6,0x7b,0xfa,0xed,0x81] -v_bitop3_b16 v5, m0, 0.5, m0 bitop3:5 -// GFX1250: v_bitop3_b16 v5, m0, 0.5, m0 bitop3:5 ; encoding: [0x05,0x00,0x33,0xd6,0x7d,0xe0,0xf5,0xa1] +v_bitop3_b16 v5.l, m0, 0.5, m0 bitop3:5 +// GFX1250: v_bitop3_b16 v5.l, m0, 0.5, m0 bitop3:5 ; encoding: [0x05,0x00,0x33,0xd6,0x7d,0xe0,0xf5,0xa1] -v_bitop3_b16 v5, exec_lo, -1, vcc_hi bitop3:6 -// GFX1250: v_bitop3_b16 v5, exec_lo, -1, vcc_hi bitop3:6 ; encoding: [0x05,0x00,0x33,0xd6,0x7e,0x82,0xad,0xc1] +v_bitop3_b16 v5.l, exec_lo, -1, vcc_hi bitop3:6 +// GFX1250: v_bitop3_b16 v5.l, exec_lo, -1, vcc_hi bitop3:6 ; encoding: 
[0x05,0x00,0x33,0xd6,0x7e,0x82,0xad,0xc1] v_bitop3_b16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] // GFX1250: v_bitop3_b16 v5.h, exec_hi, null, vcc_lo op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x33,0xd6,0x7f,0xf8,0xa8,0x01] @@ -563,17 +563,17 @@ v_cvt_sr_bf8_f16 v1, v2.l, v3 v_cvt_sr_bf8_f16 v1, v2.h, v3 // GFX1250: v_cvt_sr_bf8_f16 v1, v2.h, v3 op_sel:[1,0,0] ; encoding: [0x01,0x08,0x75,0xd7,0x02,0x07,0x02,0x00] -v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:0 -// GFX1250: v_cvt_sr_bf8_f16 v1, v2, v3 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x02,0x00] +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:0 +// GFX1250: v_cvt_sr_bf8_f16 v1, v2.l, v3 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x02,0x00] -v_cvt_sr_bf8_f16 v1, v2, s3 -// GFX1250: v_cvt_sr_bf8_f16 v1, v2, s3 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x00,0x00] +v_cvt_sr_bf8_f16 v1, v2.l, s3 +// GFX1250: v_cvt_sr_bf8_f16 v1, v2.l, s3 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x00,0x00] -v_cvt_sr_bf8_f16 v1, v2, 0x1234 -// GFX1250: v_cvt_sr_bf8_f16 v1, v2, 0x1234 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0xff,0x01,0x00,0x34,0x12,0x00,0x00] +v_cvt_sr_bf8_f16 v1, v2.l, 0x1234 +// GFX1250: v_cvt_sr_bf8_f16 v1, v2.l, 0x1234 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0xff,0x01,0x00,0x34,0x12,0x00,0x00] -v_cvt_sr_bf8_f16 v1, -v2, v3 -// GFX1250: v_cvt_sr_bf8_f16 v1, -v2, v3 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x02,0x20] +v_cvt_sr_bf8_f16 v1, -v2.l, v3 +// GFX1250: v_cvt_sr_bf8_f16 v1, -v2.l, v3 ; encoding: [0x01,0x00,0x75,0xd7,0x02,0x07,0x02,0x20] v_cvt_sr_bf8_f16 v1, |v2.l|, v3 // GFX1250: v_cvt_sr_bf8_f16 v1, |v2.l|, v3 ; encoding: [0x01,0x01,0x75,0xd7,0x02,0x07,0x02,0x00] @@ -605,14 +605,14 @@ v_cvt_sr_fp8_f16 v1, v2.l, v3 v_cvt_sr_fp8_f16 v1, v2.h, v3 // GFX1250: v_cvt_sr_fp8_f16 v1, v2.h, v3 op_sel:[1,0,0] ; encoding: [0x01,0x08,0x74,0xd7,0x02,0x07,0x02,0x00] -v_cvt_sr_fp8_f16 v1, v2, s3 -// GFX1250: v_cvt_sr_fp8_f16 v1, v2, s3 ; encoding: [0x01,0x00,0x74,0xd7,0x02,0x07,0x00,0x00] +v_cvt_sr_fp8_f16 v1, v2.l, s3 +// GFX1250: v_cvt_sr_fp8_f16 v1, v2.l, s3 ; encoding: [0x01,0x00,0x74,0xd7,0x02,0x07,0x00,0x00] -v_cvt_sr_fp8_f16 v1, v2, 0x1234 -// GFX1250: v_cvt_sr_fp8_f16 v1, v2, 0x1234 ; encoding: [0x01,0x00,0x74,0xd7,0x02,0xff,0x01,0x00,0x34,0x12,0x00,0x00] +v_cvt_sr_fp8_f16 v1, v2.l, 0x1234 +// GFX1250: v_cvt_sr_fp8_f16 v1, v2.l, 0x1234 ; encoding: [0x01,0x00,0x74,0xd7,0x02,0xff,0x01,0x00,0x34,0x12,0x00,0x00] -v_cvt_sr_fp8_f16 v1, -v2, v3 -// GFX1250: v_cvt_sr_fp8_f16 v1, -v2, v3 ; encoding: [0x01,0x00,0x74,0xd7,0x02,0x07,0x02,0x20] +v_cvt_sr_fp8_f16 v1, -v2.l, v3 +// GFX1250: v_cvt_sr_fp8_f16 v1, -v2.l, v3 ; encoding: [0x01,0x00,0x74,0xd7,0x02,0x07,0x02,0x20] v_cvt_sr_fp8_f16 v1, |v2.l|, v3 // GFX1250: v_cvt_sr_fp8_f16 v1, |v2.l|, v3 ; encoding: [0x01,0x01,0x74,0xd7,0x02,0x07,0x02,0x00] @@ -644,11 +644,11 @@ v_cvt_pk_fp8_f32 v1.l, v2, v3 v_cvt_pk_fp8_f32 v1.h, v2, v3 // GFX1250: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00] -v_cvt_pk_fp8_f32 v1, -v2, |v3| -// GFX1250: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +v_cvt_pk_fp8_f32 v1.l, -v2, |v3| +// GFX1250: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] -v_cvt_pk_fp8_f32 v1, s2, 3 -// GFX1250: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] +v_cvt_pk_fp8_f32 v1.l, s2, 3 +// GFX1250: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp // GFX1250: v_cvt_pk_fp8_f32 v1.l, 
v2, v3 clamp ; encoding: [0x01,0x80,0x69,0xd7,0x02,0x07,0x02,0x00] @@ -656,14 +656,14 @@ v_cvt_pk_fp8_f32 v1.l, v2, v3 clamp v_cvt_pk_fp8_f32 v1.h, v2, v3 clamp // GFX1250: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] clamp ; encoding: [0x01,0xc0,0x69,0xd7,0x02,0x07,0x02,0x00] -v_cvt_pk_bf8_f32 v1, v2, v3 -// GFX1250: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] +v_cvt_pk_bf8_f32 v1.l, v2, v3 +// GFX1250: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] -v_cvt_pk_bf8_f32 v1, -v2, |v3| -// GFX1250: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +v_cvt_pk_bf8_f32 v1.l, -v2, |v3| +// GFX1250: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] -v_cvt_pk_bf8_f32 v1, s2, 3 -// GFX1250: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] +v_cvt_pk_bf8_f32 v1.l, s2, 3 +// GFX1250: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] v_cvt_sr_fp8_f32 v1, v2, v3 // GFX1250: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s index f766e52b39f2c..fc0ea8b2d927f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s @@ -62,60 +62,60 @@ v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] // GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:161 quad_perm:[0,1,2,3] -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0xa1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x34,0x01,0xe4,0x00,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:161 quad_perm:[0,1,2,3] +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0xa1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x34,0x01,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0x27 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0xe4,0x01,0x40,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0x27 row_mirror +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0x27 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0xe4,0x01,0x40,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:100 row_half_mirror -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, v3 bitop3:0x64 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x8c,0x01,0x41,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:100 row_half_mirror +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v3.l bitop3:0x64 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x0e,0x8c,0x01,0x41,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, v255 bitop3:0 row_shl:1 -// GFX1250: 
v_bitop3_b16_e64_dpp v5, v1, v2, v255 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v255.l bitop3:0 row_shl:1 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, v255.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0x07,0x01,0x01,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, s105 bitop3:0x16 row_shl:15 -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, s105 bitop3:0x16 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x33,0xd6,0xfa,0x04,0xa6,0xc1,0x01,0x0f,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, s105 bitop3:0x16 row_shl:15 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, s105 bitop3:0x16 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x02,0x33,0xd6,0xfa,0x04,0xa6,0xc1,0x01,0x0f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, vcc_hi bitop3:63 row_shr:1 -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, vcc_hi bitop3:0x3f row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x07,0x33,0xd6,0xfa,0x04,0xae,0xe1,0x01,0x11,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi bitop3:63 row_shr:1 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi bitop3:0x3f row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x07,0x33,0xd6,0xfa,0x04,0xae,0xe1,0x01,0x11,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, vcc_lo bitop3:0x24 row_shr:15 -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, vcc_lo bitop3:0x24 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0xaa,0x81,0x01,0x1f,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo bitop3:0x24 row_shr:15 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo bitop3:0x24 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0xaa,0x81,0x01,0x1f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, ttmp15 bitop3:5 row_ror:1 -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, ttmp15 bitop3:5 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xee,0xa1,0x01,0x21,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, ttmp15 bitop3:5 row_ror:1 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, ttmp15 bitop3:5 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xee,0xa1,0x01,0x21,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:6 row_ror:15 -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, exec_hi bitop3:6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0xc1,0x01,0x2f,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_hi bitop3:6 row_ror:15 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_hi bitop3:6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfe,0xc1,0x01,0x2f,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, 
exec_lo row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x33,0xd6,0xfa,0x04,0xfa,0x01,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:77 row_share:0 row_mask:0xf bank_mask:0xf -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, exec_lo bitop3:0x4d row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x33,0xd6,0xfa,0x04,0xfa,0xa9,0x01,0x50,0x01,0xff] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo bitop3:77 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, exec_lo bitop3:0x4d row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x01,0x33,0xd6,0xfa,0x04,0xfa,0xa9,0x01,0x50,0x01,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:88 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, null bitop3:0x58 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x03,0x33,0xd6,0xfa,0x04,0xf2,0x09,0x01,0x5f,0x01,0x01] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, null bitop3:88 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, null bitop3:0x58 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x03,0x33,0xd6,0xfa,0x04,0xf2,0x09,0x01,0x5f,0x01,0x01] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:99 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX1250: v_bitop3_b16_e64_dpp v5, v1, v2, -1 bitop3:0x63 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x06,0x6b,0x01,0x60,0x09,0x13] +v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:99 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_bitop3_b16_e64_dpp v5.l, v1.l, v2.l, -1 bitop3:0x63 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x04,0x33,0xd6,0xfa,0x04,0x06,0x6b,0x01,0x60,0x09,0x13] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:101 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX1250: v_bitop3_b16_e64_dpp v255, v255, v255, src_scc bitop3:0x65 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x04,0x33,0xd6,0xfa,0xfe,0xf7,0xab,0xff,0x6f,0x05,0x30] +v_bitop3_b16_e64_dpp v255.l, v255.l, v255.l, src_scc bitop3:101 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_bitop3_b16_e64_dpp v255.l, v255.l, v255.l, src_scc bitop3:0x65 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x04,0x33,0xd6,0xfa,0xfe,0xf7,0xab,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_bitop3_b16_e64_dpp v5.h, v1.h, v2.h, exec_hi op_sel:[1,1,1,1] row_ror:15 row_mask:0xf bank_mask:0xf @@ -470,12 +470,12 @@ v_cvt_sr_bf8_f16 v1, v2.h, v3 quad_perm:[0,1,2,3] fi:1 // GFX1250: v_cvt_sr_bf8_f16_e64_dpp v1, v2.h, v3 op_sel:[1,0,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x08,0x75,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x04,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] -// GFX1250: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x01,0x40,0x75,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x75,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] -// GFX1250: v_cvt_sr_bf8_f16_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x75,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] +v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_sr_bf8_f16_e64_dpp v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x75,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_cvt_sr_bf8_f16 v1, v2.l, v3 byte_sel:3 quad_perm:[0,1,2,3] @@ -494,12 +494,12 @@ v_cvt_sr_fp8_f16 v1, v2.h, v3 quad_perm:[0,1,2,3] fi:1 // GFX1250: v_cvt_sr_fp8_f16_e64_dpp v1, v2.h, v3 op_sel:[1,0,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x08,0x74,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x04,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] -// GFX1250: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x74,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x40,0x74,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_sr_fp8_f16 v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] -// GFX1250: v_cvt_sr_fp8_f16_e64_dpp v1, v2, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x74,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] +v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] +// GFX1250: v_cvt_sr_fp8_f16_e64_dpp v1, v2.l, v3 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x20,0x74,0xd7,0xfa,0x06,0x02,0x00,0x02,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU v_cvt_sr_fp8_f16 v1, v2.l, v3 byte_sel:3 quad_perm:[0,1,2,3]

From a4767e63eebfa1ae0065bdd8813df3839f62b461 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 2 Oct 2025 09:16:08 +0200
Subject: [PATCH 470/878] [MemorySanitizer] Use getelementptr instead of
 ptrtoint+add+inttoptr (#161392)

MemorySanitizer currently does a lot of pointer arithmetic using
ptrtoint+add+inttoptr instead of using getelementptr. As far as I can
tell, there is no need to use this pattern -- msan is not trying to
synthesize pointers with different provenance here. The pointers in
question stay within one object (like the TLS parameter area).

I suspect that this is just a leftover from pre-opaque-pointer times,
when this was a natural way to perform offset arithmetic. Nowadays we
should just emit a getelementptr i8, aka ptradd.
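As a minimal sketch of the two IRBuilder patterns being swapped (not
part of the patch itself: the helper names emitOffsetOld/emitOffsetNew
and the Base/IntptrTy/Offset parameters are invented for illustration;
only CreatePtrAdd is the API the patch actually adopts):

  // Sketch only; assumes an in-scope IRBuilder and LLVM >= 17 for
  // CreatePtrAdd. Both helpers compute "Base + Offset bytes".
  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Old msan pattern: round-trip the pointer through the integer domain.
  static Value *emitOffsetOld(IRBuilder<> &IRB, Value *Base, Type *IntptrTy,
                              int64_t Offset) {
    Value *Int = IRB.CreatePtrToInt(Base, IntptrTy);
    Int = IRB.CreateAdd(Int, ConstantInt::get(IntptrTy, Offset));
    return IRB.CreateIntToPtr(Int, IRB.getPtrTy(0));
  }

  // New pattern: a byte-wise GEP ("getelementptr i8", aka ptradd).
  // On a constant base such as @__msan_param_tls this constant-folds to
  // "getelementptr (i8, ptr @__msan_param_tls, i64 N)", which is exactly
  // what the updated CHECK lines in the tests below expect.
  static Value *emitOffsetNew(IRBuilder<> &IRB, Value *Base, Type *IntptrTy,
                              int64_t Offset) {
    return IRB.CreatePtrAdd(Base, ConstantInt::get(IntptrTy, Offset));
  }

Both forms yield the same address; the GEP form simply avoids the detour
through i64, keeping the arithmetic in the pointer domain where the
optimizer can see it.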
--- .../Instrumentation/MemorySanitizer.cpp | 48 +- .../MemorySanitizer/AArch64/arm64-ld1.ll | 72 +- .../MemorySanitizer/AArch64/arm64-smaxv.ll | 12 +- .../MemorySanitizer/AArch64/arm64-sminv.ll | 12 +- .../MemorySanitizer/AArch64/arm64-st1.ll | 346 +-- .../MemorySanitizer/AArch64/arm64-st1_lane.ll | 358 +-- .../AArch64/arm64-st1_origins.ll | 42 +- .../MemorySanitizer/AArch64/arm64-tbl.ll | 160 +- .../MemorySanitizer/AArch64/arm64-umaxv.ll | 15 +- .../MemorySanitizer/AArch64/arm64-uminv.ll | 15 +- .../MemorySanitizer/AArch64/arm64-vadd.ll | 238 +- .../MemorySanitizer/AArch64/arm64-vaddv.ll | 30 +- .../MemorySanitizer/AArch64/arm64-vcvt.ll | 6 +- .../MemorySanitizer/AArch64/arm64-vmax.ll | 135 +- .../MemorySanitizer/AArch64/arm64-vmovn.ll | 24 +- .../MemorySanitizer/AArch64/arm64-vmul.ll | 386 +-- .../MemorySanitizer/AArch64/arm64-vshift.ll | 296 +-- .../MemorySanitizer/AArch64/neon_vst_float.ll | 434 ++-- .../MemorySanitizer/AArch64/vararg.ll | 26 +- .../MemorySanitizer/AArch64/vararg_shadow.ll | 358 +-- .../MemorySanitizer/ARM32/vararg-arm32.ll | 416 +-- .../LoongArch/vararg-loongarch64.ll | 22 +- .../MemorySanitizer/Mips/vararg-mips64.ll | 24 +- .../MemorySanitizer/Mips/vararg-mips64el.ll | 22 +- .../MemorySanitizer/Mips32/vararg-mips.ll | 416 +-- .../MemorySanitizer/Mips32/vararg-mipsel.ll | 414 +-- .../MemorySanitizer/PowerPC/vararg-ppc64.ll | 32 +- .../MemorySanitizer/PowerPC/vararg-ppc64le.ll | 31 +- .../MemorySanitizer/PowerPC32/kernel-ppcle.ll | 260 +- .../MemorySanitizer/PowerPC32/vararg-ppc.ll | 440 ++-- .../MemorySanitizer/PowerPC32/vararg-ppcle.ll | 440 ++-- .../MemorySanitizer/RISCV32/vararg-riscv32.ll | 416 +-- .../MemorySanitizer/X86/avx-intrinsics-x86.ll | 108 +- .../X86/avx10_2_512ni-intrinsics.ll | 120 +- .../X86/avx10_2ni-intrinsics.ll | 188 +- .../X86/avx2-intrinsics-x86.ll | 236 +- .../X86/avx512-gfni-intrinsics.ll | 72 +- .../X86/avx512-intrinsics-upgrade.ll | 2228 ++++++++--------- .../MemorySanitizer/X86/avx512-intrinsics.ll | 1458 +++++------ .../X86/avx512bw-intrinsics-upgrade.ll | 794 +++--- .../X86/avx512bw-intrinsics.ll | 370 +-- .../X86/avx512fp16-arith-intrinsics.ll | 235 +- .../X86/avx512fp16-arith-vl-intrinsics.ll | 221 +- .../X86/avx512fp16-intrinsics.ll | 264 +- .../X86/avx512vl-intrinsics.ll | 1430 +++++------ .../X86/avx512vl_vnni-intrinsics-upgrade.ll | 96 +- .../X86/avx512vl_vnni-intrinsics.ll | 96 +- .../X86/avx512vnni-intrinsics-upgrade.ll | 48 +- .../X86/avx512vnni-intrinsics.ll | 48 +- .../X86/avx_vnni-intrinsics.ll | 32 +- .../X86/avxvnniint16-intrinsics.ll | 48 +- .../X86/avxvnniint8-intrinsics.ll | 72 +- .../MemorySanitizer/X86/f16c-intrinsics.ll | 11 +- .../MemorySanitizer/X86/mmx-intrinsics.ll | 148 +- .../MemorySanitizer/X86/sse-intrinsics-x86.ll | 56 +- .../X86/sse2-intrinsics-x86.ll | 78 +- .../X86/sse41-intrinsics-x86.ll | 36 +- .../MemorySanitizer/X86/vararg-too-large.ll | 6 +- .../MemorySanitizer/X86/vararg_call.ll | 18 +- .../MemorySanitizer/X86/vararg_shadow.ll | 236 +- .../MemorySanitizer/X86/x86-vpermi2.ll | 76 +- .../MemorySanitizer/array_types.ll | 12 +- .../Instrumentation/MemorySanitizer/bmi.ll | 16 +- .../MemorySanitizer/byval-alignment.ll | 2 +- .../Instrumentation/MemorySanitizer/byval.ll | 24 +- .../expand-experimental-reductions.ll | 12 +- .../MemorySanitizer/funnel_shift.ll | 144 +- .../i386/avx-intrinsics-i386.ll | 108 +- .../i386/avx2-intrinsics-i386.ll | 236 +- .../MemorySanitizer/i386/mmx-intrinsics.ll | 148 +- .../i386/msan_i386intrinsics.ll | 12 +- .../i386/sse-intrinsics-i386.ll | 36 +- 
.../i386/sse2-intrinsics-i386.ll | 78 +- .../i386/sse41-intrinsics-i386.ll | 36 +- .../MemorySanitizer/i386/vararg-too-large.ll | 396 +-- .../MemorySanitizer/i386/vararg_call.ll | 60 +- .../MemorySanitizer/i386/vararg_shadow.ll | 118 +- .../MemorySanitizer/masked-store-load.ll | 48 +- .../MemorySanitizer/msan_basic.ll | 482 ++-- .../MemorySanitizer/msan_debug_info.ll | 610 +++-- .../MemorySanitizer/msan_eager.ll | 14 +- .../MemorySanitizer/msan_kernel_basic.ll | 267 +- .../MemorySanitizer/opaque-ptr.ll | 4 +- .../Instrumentation/MemorySanitizer/or.ll | 6 +- .../MemorySanitizer/overflow.ll | 14 +- .../MemorySanitizer/pr32842.ll | 2 +- .../MemorySanitizer/saturating.ll | 14 +- .../Instrumentation/MemorySanitizer/scmp.ll | 44 +- .../Instrumentation/MemorySanitizer/ucmp.ll | 38 +- .../MemorySanitizer/vector-reduce-fadd.ll | 16 +- .../MemorySanitizer/vector-reduce-fmul.ll | 16 +- .../MemorySanitizer/vector_arith.ll | 8 +- .../Instrumentation/MemorySanitizer/vscale.ll | 6 +- 93 files changed, 8761 insertions(+), 9041 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index cf076b9ad70ee..eff6f0caf0c05 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1923,20 +1923,17 @@ struct MemorySanitizerVisitor : public InstVisitor { /// /// Shadow = ParamTLS+ArgOffset. Value *getShadowPtrForArgument(IRBuilder<> &IRB, int ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy); - if (ArgOffset) - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, IRB.getPtrTy(0), "_msarg"); + return IRB.CreatePtrAdd(MS.ParamTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), "_msarg"); } /// Compute the origin address for a given function argument. Value *getOriginPtrForArgument(IRBuilder<> &IRB, int ArgOffset) { if (!MS.TrackOrigins) return nullptr; - Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy); - if (ArgOffset) - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, IRB.getPtrTy(0), "_msarg_o"); + return IRB.CreatePtrAdd(MS.ParamOriginTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), + "_msarg_o"); } /// Compute the shadow address for a retval. @@ -7219,9 +7216,8 @@ struct VarArgHelperBase : public VarArgHelper { /// Compute the shadow address for a given va_arg. Value *getShadowPtrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy); - Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_s"); + return IRB.CreatePtrAdd( + MS.VAArgTLS, ConstantInt::get(MS.IntptrTy, ArgOffset), "_msarg_va_s"); } /// Compute the shadow address for a given va_arg. @@ -7235,12 +7231,12 @@ struct VarArgHelperBase : public VarArgHelper { /// Compute the origin address for a given va_arg. Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) { - Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy); // getOriginPtrForVAArgument() is always called after // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never // overflow. 
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset)); - return IRB.CreateIntToPtr(Base, MS.PtrTy, "_msarg_va_o"); + return IRB.CreatePtrAdd(MS.VAArgOriginTLS, + ConstantInt::get(MS.IntptrTy, ArgOffset), + "_msarg_va_o"); } void CleanUnusedTLS(IRBuilder<> &IRB, Value *ShadowBase, @@ -7467,10 +7463,8 @@ struct VarArgAMD64Helper : public VarArgHelperBase { NextNodeIRBuilder IRB(OrigInst); Value *VAListTag = OrigInst->getArgOperand(0); - Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, 16)), - MS.PtrTy); + Value *RegSaveAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, 16)); Value *RegSaveAreaPtr = IRB.CreateLoad(MS.PtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; const Align Alignment = Align(16); @@ -7482,10 +7476,8 @@ struct VarArgAMD64Helper : public VarArgHelperBase { if (MS.TrackOrigins) IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy, Alignment, AMD64FpEndOffset); - Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, 8)), - MS.PtrTy); + Value *OverflowArgAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, 8)); Value *OverflowArgAreaPtr = IRB.CreateLoad(MS.PtrTy, OverflowArgAreaPtrPtr); Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr; @@ -7615,19 +7607,15 @@ struct VarArgAArch64Helper : public VarArgHelperBase { // Retrieve a va_list field of 'void*' size. Value *getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) { - Value *SaveAreaPtrPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, offset)), - MS.PtrTy); + Value *SaveAreaPtrPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, offset)); return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr); } // Retrieve a va_list field of 'int' size. 
Value *getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) { - Value *SaveAreaPtr = IRB.CreateIntToPtr( - IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy), - ConstantInt::get(MS.IntptrTy, offset)), - MS.PtrTy); + Value *SaveAreaPtr = + IRB.CreatePtrAdd(VAListTag, ConstantInt::get(MS.IntptrTy, offset)); Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr); return IRB.CreateSExt(SaveArea32, MS.IntptrTy); } diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll index 99e9ab939847c..864f6a973334e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-ld1.ll @@ -877,7 +877,7 @@ define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, ptr ; CHECK-LABEL: define %struct.__neon_int8x16x2_t @ld2lane_16b( ; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576 @@ -904,8 +904,8 @@ define %struct.__neon_int8x16x3_t @ld3lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 ; CHECK-LABEL: define %struct.__neon_int8x16x3_t @ld3lane_16b( ; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 @@ -936,9 +936,9 @@ define %struct.__neon_int8x16x4_t @ld4lane_16b(<16 x i8> %L1, <16 x i8> %L2, <16 ; CHECK-LABEL: define %struct.__neon_int8x16x4_t @ld4lane_16b( ; CHECK-SAME: <16 x i8> [[L1:%.*]], <16 x i8> [[L2:%.*]], <16 x i8> [[L3:%.*]], <16 x i8> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 @@ -977,7 +977,7 @@ define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, ptr ; CHECK-LABEL: define %struct.__neon_int16x8x2_t @ld2lane_8h( ; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576 @@ -1004,8 +1004,8 @@ define %struct.__neon_int16x8x3_t @ld3lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x ; CHECK-LABEL: define %struct.__neon_int16x8x3_t @ld3lane_8h( ; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 @@ -1036,9 +1036,9 @@ define %struct.__neon_int16x8x4_t @ld4lane_8h(<8 x i16> %L1, <8 x i16> %L2, <8 x ; CHECK-LABEL: define %struct.__neon_int16x8x4_t @ld4lane_8h( ; CHECK-SAME: <8 x i16> [[L1:%.*]], <8 x i16> [[L2:%.*]], <8 x i16> [[L3:%.*]], <8 x i16> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 @@ -1077,7 +1077,7 @@ define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, ptr ; CHECK-LABEL: define %struct.__neon_int32x4x2_t @ld2lane_4s( ; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 
8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576 @@ -1104,8 +1104,8 @@ define %struct.__neon_int32x4x3_t @ld3lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x ; CHECK-LABEL: define %struct.__neon_int32x4x3_t @ld3lane_4s( ; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 @@ -1136,9 +1136,9 @@ define %struct.__neon_int32x4x4_t @ld4lane_4s(<4 x i32> %L1, <4 x i32> %L2, <4 x ; CHECK-LABEL: define %struct.__neon_int32x4x4_t @ld4lane_4s( ; CHECK-SAME: <4 x i32> [[L1:%.*]], <4 x i32> [[L2:%.*]], <4 x i32> [[L3:%.*]], <4 x i32> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 @@ -1177,7 +1177,7 @@ define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, ptr ; CHECK-LABEL: define %struct.__neon_int64x2x2_t @ld2lane_2d( ; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576 @@ -1204,8 +1204,8 @@ define %struct.__neon_int64x2x3_t @ld3lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x ; CHECK-LABEL: define 
%struct.__neon_int64x2x3_t @ld3lane_2d( ; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 @@ -1236,9 +1236,9 @@ define %struct.__neon_int64x2x4_t @ld4lane_2d(<2 x i64> %L1, <2 x i64> %L2, <2 x ; CHECK-LABEL: define %struct.__neon_int64x2x4_t @ld4lane_2d( ; CHECK-SAME: <2 x i64> [[L1:%.*]], <2 x i64> [[L2:%.*]], <2 x i64> [[L3:%.*]], <2 x i64> [[L4:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 @@ -2304,7 +2304,7 @@ define <16 x i8> @ld1_16b(<16 x i8> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <16 x i8> @ld1_16b( ; CHECK-SAME: <16 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2332,7 +2332,7 @@ define <8 x i16> @ld1_8h(<8 x i16> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <8 x i16> @ld1_8h( ; CHECK-SAME: <8 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2360,7 +2360,7 @@ define <4 x i32> @ld1_4s(<4 x i32> %V, ptr %bar) #0 { ; Make sure we are using 
the operands defined by the ABI ; CHECK-LABEL: define <4 x i32> @ld1_4s( ; CHECK-SAME: <4 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2388,7 +2388,7 @@ define <4 x float> @ld1_4s_float(<4 x float> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <4 x float> @ld1_4s_float( ; CHECK-SAME: <4 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2416,7 +2416,7 @@ define <2 x i64> @ld1_2d(<2 x i64> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <2 x i64> @ld1_2d( ; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2444,7 +2444,7 @@ define <2 x double> @ld1_2d_double(<2 x double> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <2 x double> @ld1_2d_double( ; CHECK-SAME: <2 x double> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2496,7 +2496,7 @@ define <8 x i8> @ld1_8b(<8 x i8> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <8 x i8> @ld1_8b( ; CHECK-SAME: <8 x i8> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2524,7 +2524,7 @@ define <4 x i16> @ld1_4h(<4 x i16> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <4 x i16> @ld1_4h( ; CHECK-SAME: <4 x i16> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2552,7 +2552,7 @@ define <2 x i32> @ld1_2s(<2 x i32> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <2 x i32> @ld1_2s( ; CHECK-SAME: <2 x i32> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2580,7 +2580,7 @@ define <2 x float> @ld1_2s_float(<2 x float> %V, ptr %bar) #0 { ; Make sure we are using the operands defined by the ABI ; CHECK-LABEL: define <2 x float> @ld1_2s_float( ; CHECK-SAME: <2 x float> [[V:%.*]], ptr [[BAR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2611,8 +2611,8 @@ define void @ld1r_2s_from_dup(ptr nocapture %a, ptr nocapture %b, ptr nocapture ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], ptr captures(none) [[DIFF:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-smaxv.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-smaxv.ll index 632268e08022c..1319544bf6662 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-smaxv.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-smaxv.ll @@ -122,7 +122,7 @@ define <8 x i8> @test_vmaxv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 { ; CHECK-LABEL: define <8 x i8> @test_vmaxv_s8_used_by_laneop( ; CHECK-SAME: <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 
@llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]]) @@ -146,7 +146,7 @@ define <4 x i16> @test_vmaxv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0 ; CHECK-LABEL: define <4 x i16> @test_vmaxv_s16_used_by_laneop( ; CHECK-SAME: <4 x i16> [[A1:%.*]], <4 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]]) @@ -170,7 +170,7 @@ define <2 x i32> @test_vmaxv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0 ; CHECK-LABEL: define <2 x i32> @test_vmaxv_s32_used_by_laneop( ; CHECK-SAME: <2 x i32> [[A1:%.*]], <2 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]]) @@ -190,7 +190,7 @@ define <16 x i8> @test_vmaxvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0 ; CHECK-LABEL: define <16 x i8> @test_vmaxvq_s8_used_by_laneop( ; CHECK-SAME: <16 x i8> [[A1:%.*]], <16 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]]) @@ -214,7 +214,7 @@ define <8 x i16> @test_vmaxvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) # ; CHECK-LABEL: define <8 x i16> @test_vmaxvq_s16_used_by_laneop( ; CHECK-SAME: <8 x i16> [[A1:%.*]], <8 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]]) @@ -238,7 +238,7 @@ define <4 x i32> @test_vmaxvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) # ; CHECK-LABEL: define <4 x i32> @test_vmaxvq_s32_used_by_laneop( ; CHECK-SAME: <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]]) diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-sminv.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-sminv.ll index 267061027cd52..272a910f2fc20 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-sminv.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-sminv.ll @@ -122,7 +122,7 @@ define <8 x i8> @test_vminv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 { ; CHECK-LABEL: define <8 x i8> @test_vminv_s8_used_by_laneop( ; CHECK-SAME: <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]]) @@ -146,7 +146,7 @@ define <4 x i16> @test_vminv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0 ; CHECK-LABEL: define <4 x i16> @test_vminv_s16_used_by_laneop( ; CHECK-SAME: <4 x i16> [[A1:%.*]], <4 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]]) @@ -170,7 +170,7 @@ define <2 x i32> @test_vminv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0 ; CHECK-LABEL: define <2 x i32> @test_vminv_s32_used_by_laneop( ; CHECK-SAME: <2 x i32> [[A1:%.*]], <2 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]]) @@ -190,7 +190,7 @@ define <16 x i8> @test_vminvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0 ; CHECK-LABEL: define <16 x i8> @test_vminvq_s8_used_by_laneop( ; CHECK-SAME: <16 x i8> [[A1:%.*]], <16 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]]) @@ -214,7 +214,7 @@ define <8 x i16> @test_vminvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) # ; CHECK-LABEL: define <8 x i16> @test_vminvq_s16_used_by_laneop( ; CHECK-SAME: <8 x i16> [[A1:%.*]], <8 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: 
[[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]]) @@ -238,7 +238,7 @@ define <4 x i32> @test_vminvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) # ; CHECK-LABEL: define <4 x i32> @test_vminvq_s32_used_by_laneop( ; CHECK-SAME: <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]]) diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1.ll index deeb1d4b6ff85..fedf45f0d9166 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1.ll @@ -15,9 +15,9 @@ define void @st2_8b(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_memory { ; ; CHECK-LABEL: define void @st2_8b( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 @@ -41,8 +41,8 @@ define void @st2_8b_undefA(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m ; ; CHECK-LABEL: define void @st2_8b_undefA( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576 @@ -66,7 +66,7 @@ define void @st2_8b_undefB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_m ; ; CHECK-LABEL: define void @st2_8b_undefB( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64 @@ -91,7 +91,7 @@ define void @st2_8b_undefAB(<8 x i8> %A, <8 x i8> %B, ptr %P) nounwind sanitize_ ; ; CHECK-LABEL: define void @st2_8b_undefAB( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576 @@ -115,10 +115,10 @@ define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwind sani ; ; CHECK-LABEL: define void @st3_8b( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 @@ -142,9 +142,9 @@ define void @st3_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi ; ; CHECK-LABEL: define void @st3_8b_undefA( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 @@ -168,9 +168,9 @@ define void @st3_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi ; ; CHECK-LABEL: define void @st3_8b_undefB( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr 
[[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -194,9 +194,9 @@ define void @st3_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounwi
;
; CHECK-LABEL: define void @st3_8b_undefC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -220,8 +220,8 @@ define void @st3_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
;
; CHECK-LABEL: define void @st3_8b_undefAB(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -245,8 +245,8 @@ define void @st3_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
;
; CHECK-LABEL: define void @st3_8b_undefAC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -270,7 +270,7 @@ define void @st3_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) nounw
;
; CHECK-LABEL: define void @st3_8b_undefBC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
@@ -295,7 +295,7 @@ define void @st3_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %P) noun
;
; CHECK-LABEL: define void @st3_8b_undefABC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
@@ -319,11 +319,11 @@ define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %P)
;
; CHECK-LABEL: define void @st4_8b(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -347,10 +347,10 @@ define void @st4_8b_undefA(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
;
; CHECK-LABEL: define void @st4_8b_undefA(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -374,10 +374,10 @@ define void @st4_8b_undefB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
;
; CHECK-LABEL: define void @st4_8b_undefB(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -401,10 +401,10 @@ define void @st4_8b_undefC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
;
; CHECK-LABEL: define void @st4_8b_undefC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -428,10 +428,10 @@ define void @st4_8b_undefD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, p
;
; CHECK-LABEL: define void @st4_8b_undefD(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -455,9 +455,9 @@ define void @st4_8b_undefAB(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefAB(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -481,9 +481,9 @@ define void @st4_8b_undefAC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefAC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -507,9 +507,9 @@ define void @st4_8b_undefBC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefBC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -533,9 +533,9 @@ define void @st4_8b_undefBD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefBD(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -559,8 +559,8 @@ define void @st4_8b_undefABC(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefABC(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -584,8 +584,8 @@ define void @st4_8b_undefABD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefABD(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -609,8 +609,8 @@ define void @st4_8b_undefACD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefACD(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -634,7 +634,7 @@ define void @st4_8b_undefBCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D,
;
; CHECK-LABEL: define void @st4_8b_undefBCD(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
@@ -659,7 +659,7 @@ define void @st4_8b_undefABCD(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D
;
; CHECK-LABEL: define void @st4_8b_undefABCD(
; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
@@ -689,9 +689,9 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor
;
; CHECK-LABEL: define void @st2_16b(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -715,10 +715,10 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind
;
; CHECK-LABEL: define void @st3_16b(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -742,11 +742,11 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr
;
; CHECK-LABEL: define void @st4_16b(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -776,9 +776,9 @@ define void @st2_4h(<4 x i16> %A, <4 x i16> %B, ptr %P) nounwind sanitize_memory
;
; CHECK-LABEL: define void @st2_4h(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -802,10 +802,10 @@ define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %P) nounwind s
;
; CHECK-LABEL: define void @st3_4h(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -829,11 +829,11 @@ define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr
;
; CHECK-LABEL: define void @st4_4h(
; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -863,9 +863,9 @@ define void @st2_8h(<8 x i16> %A, <8 x i16> %B, ptr %P) nounwind sanitize_memory
;
; CHECK-LABEL: define void @st2_8h(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -889,10 +889,10 @@ define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %P) nounwind s
;
; CHECK-LABEL: define void @st3_8h(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -916,11 +916,11 @@ define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr
;
; CHECK-LABEL: define void @st4_8h(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -950,9 +950,9 @@ define void @st2_2s(<2 x i32> %A, <2 x i32> %B, ptr %P) nounwind sanitize_memory
;
; CHECK-LABEL: define void @st2_2s(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -976,10 +976,10 @@ define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %P) nounwind s
;
; CHECK-LABEL: define void @st3_2s(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1003,11 +1003,11 @@ define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr
;
; CHECK-LABEL: define void @st4_2s(
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -1035,9 +1035,9 @@ define void @st2_4s(<4 x i32> %A, <4 x i32> %B, ptr %P) nounwind sanitize_memory
;
; CHECK-LABEL: define void @st2_4s(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1061,10 +1061,10 @@ define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %P) nounwind s
;
; CHECK-LABEL: define void @st3_4s(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1088,11 +1088,11 @@ define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr
;
; CHECK-LABEL: define void @st4_4s(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -1123,9 +1123,9 @@ define void @st2_1d(<1 x i64> %A, <1 x i64> %B, ptr %P) nounwind sanitize_memory
;
; CHECK-LABEL: define void @st2_1d(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1149,10 +1149,10 @@ define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %P) nounwind s
;
; CHECK-LABEL: define void @st3_1d(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1176,11 +1176,11 @@ define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr
;
; CHECK-LABEL: define void @st4_1d(
; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -1210,9 +1210,9 @@ define void @st2_2d(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize_memory
;
; CHECK-LABEL: define void @st2_2d(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1236,8 +1236,8 @@ define void @st2_2d_undefA(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
;
; CHECK-LABEL: define void @st2_2d_undefA(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -1261,7 +1261,7 @@ define void @st2_2d_undefB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitize
;
; CHECK-LABEL: define void @st2_2d_undefB(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
@@ -1286,7 +1286,7 @@ define void @st2_2d_undefAB(<2 x i64> %A, <2 x i64> %B, ptr %P) nounwind sanitiz
;
; CHECK-LABEL: define void @st2_2d_undefAB(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
@@ -1310,10 +1310,10 @@ define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nounwind s
;
; CHECK-LABEL: define void @st3_2d(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1337,9 +1337,9 @@ define void @st3_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
;
; CHECK-LABEL: define void @st3_2d_undefA(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1363,9 +1363,9 @@ define void @st3_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
;
; CHECK-LABEL: define void @st3_2d_undefB(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1389,9 +1389,9 @@ define void @st3_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) nou
;
; CHECK-LABEL: define void @st3_2d_undefC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1415,8 +1415,8 @@ define void @st3_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
;
; CHECK-LABEL: define void @st3_2d_undefAB(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -1440,8 +1440,8 @@ define void @st3_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
;
; CHECK-LABEL: define void @st3_2d_undefAC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -1465,7 +1465,7 @@ define void @st3_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) no
;
; CHECK-LABEL: define void @st3_2d_undefBC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
@@ -1490,7 +1490,7 @@ define void @st3_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %P) n
;
; CHECK-LABEL: define void @st3_2d_undefABC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
@@ -1514,11 +1514,11 @@ define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr
;
; CHECK-LABEL: define void @st4_2d(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
@@ -1546,10 +1546,10 @@ define void @st4_2d_undefA(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
;
; CHECK-LABEL: define void @st4_2d_undefA(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1573,10 +1573,10 @@ define void @st4_2d_undefB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
;
; CHECK-LABEL: define void @st4_2d_undefB(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1600,10 +1600,10 @@ define void @st4_2d_undefC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
;
; CHECK-LABEL: define void @st4_2d_undefC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1627,10 +1627,10 @@ define void @st4_2d_undefD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %
;
; CHECK-LABEL: define void @st4_2d_undefD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
@@ -1654,9 +1654,9 @@ define void @st4_2d_undefAB(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefAB(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1680,9 +1680,9 @@ define void @st4_2d_undefAC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefAC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1706,9 +1706,9 @@ define void @st4_2d_undefAD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefAD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1732,9 +1732,9 @@ define void @st4_2d_undefBC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefBC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1758,9 +1758,9 @@ define void @st4_2d_undefBD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefBD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1784,9 +1784,9 @@ define void @st4_2d_undefCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefCD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
@@ -1810,8 +1810,8 @@ define void @st4_2d_undefABC(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefABC(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -1835,8 +1835,8 @@ define void @st4_2d_undefABD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefABD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -1860,8 +1860,8 @@ define void @st4_2d_undefACD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefACD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 193514046488576
@@ -1885,7 +1885,7 @@ define void @st4_2d_undefBCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64>
;
; CHECK-LABEL: define void @st4_2d_undefBCD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
@@ -1910,7 +1910,7 @@ define void @st4_2d_undefABCD(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64
;
; CHECK-LABEL: define void @st4_2d_undefABCD(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 193514046488576
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_lane.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_lane.ll
index 9ed364df3e677..0617c8c7027fe 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_lane.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_lane.ll
@@ -13,7 +13,7 @@ target triple = "aarch64--linux-android9001"
define void @st1lane_16b(<16 x i8> %A, ptr %D) sanitize_memory {
; CHECK-LABEL: define void @st1lane_16b(
; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0
@@ -21,7 +21,7 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) sanitize_memory {
; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1
; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 1
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]]
; CHECK: 3:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
; CHECK-NEXT: unreachable
@@ -42,7 +42,7 @@ define void @st1lane_16b(<16 x i8> %A, ptr %D) sanitize_memory {
define void @st1lane0_16b(<16 x i8> %A, ptr %D) sanitize_memory {
; CHECK-LABEL: define void @st1lane0_16b(
; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -50,7 +50,7 @@ define void @st1lane0_16b(<16 x i8> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -71,7 +71,7 @@ define void @st1lane0_16b(<16 x i8> %A, ptr %D) sanitize_memory { define void @st1lane0u_16b(<16 x i8> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_16b( ; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -79,7 +79,7 @@ define void @st1lane0u_16b(<16 x i8> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -100,8 +100,8 @@ define void @st1lane0u_16b(<16 x i8> %A, ptr %D) sanitize_memory { define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_16b( ; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -109,7 +109,7 @@ define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label 
[[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -130,8 +130,8 @@ define void @st1lane_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_16b( ; CHECK-SAME: <16 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -139,7 +139,7 @@ define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <16 x i8> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <16 x i8> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -160,7 +160,7 @@ define void @st1lane0_ro_16b(<16 x i8> %A, ptr %D, i64 %offset) sanitize_memory define void @st1lane_8h(<8 x i16> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -168,7 +168,7 @@ define void @st1lane_8h(<8 x i16> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -189,7 +189,7 @@ define void @st1lane_8h(<8 x i16> %A, ptr %D) sanitize_memory { define void @st1lane0_8h(<8 x i16> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; 
CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -197,7 +197,7 @@ define void @st1lane0_8h(<8 x i16> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -218,7 +218,7 @@ define void @st1lane0_8h(<8 x i16> %A, ptr %D) sanitize_memory { define void @st1lane0u_8h(<8 x i16> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -226,7 +226,7 @@ define void @st1lane0u_8h(<8 x i16> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -247,8 +247,8 @@ define void @st1lane0u_8h(<8 x i16> %A, ptr %D) sanitize_memory { define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -256,7 +256,7 @@ define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -277,8 +277,8 @@ define void @st1lane_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) 
sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -286,7 +286,7 @@ define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i16> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -307,7 +307,7 @@ define void @st1lane0_ro_8h(<8 x i16> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane_4s(<4 x i32> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -315,7 +315,7 @@ define void @st1lane_4s(<4 x i32> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -336,7 +336,7 @@ define void @st1lane_4s(<4 x i32> %A, ptr %D) sanitize_memory { define void @st1lane0_4s(<4 x i32> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -344,7 +344,7 @@ define void @st1lane0_4s(<4 x i32> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -365,7 +365,7 @@ define void @st1lane0_4s(<4 x i32> %A, ptr %D) sanitize_memory { define void @st1lane0u_4s(<4 x i32> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -373,7 +373,7 @@ define void @st1lane0u_4s(<4 x i32> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -394,8 +394,8 @@ define void @st1lane0u_4s(<4 x i32> %A, ptr %D) sanitize_memory { define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -403,7 +403,7 @@ define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -424,8 +424,8 @@ define void @st1lane_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -433,7 +433,7 @@ define void @st1lane0_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i32> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -454,7 +454,7 @@ define void @st1lane0_ro_4s(<4 x i32> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane_4s_float(<4 x float> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_4s_float( ; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -462,7 +462,7 @@ define void @st1lane_4s_float(<4 x float> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -483,7 +483,7 @@ define void @st1lane_4s_float(<4 x float> %A, ptr %D) sanitize_memory { define void @st1lane0_4s_float(<4 x float> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_4s_float( ; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -491,7 +491,7 @@ define void @st1lane0_4s_float(<4 x float> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -512,7 +512,7 @@ define void 
@st1lane0_4s_float(<4 x float> %A, ptr %D) sanitize_memory { define void @st1lane0u_4s_float(<4 x float> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_4s_float( ; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -520,7 +520,7 @@ define void @st1lane0u_4s_float(<4 x float> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -541,8 +541,8 @@ define void @st1lane0u_4s_float(<4 x float> %A, ptr %D) sanitize_memory { define void @st1lane_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_4s_float( ; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -550,7 +550,7 @@ define void @st1lane_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_m ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -571,8 +571,8 @@ define void @st1lane_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_m define void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_4s_float( ; CHECK-SAME: <4 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -580,7 +580,7 @@ define void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_ ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x float> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -601,7 +601,7 @@ define void @st1lane0_ro_4s_float(<4 x float> %A, ptr %D, i64 %offset) sanitize_ define void @st1lane_2d(<2 x i64> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -609,7 +609,7 @@ define void @st1lane_2d(<2 x i64> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -630,7 +630,7 @@ define void @st1lane_2d(<2 x i64> %A, ptr %D) sanitize_memory { define void @st1lane0_2d(<2 x i64> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -638,7 +638,7 @@ define void @st1lane0_2d(<2 x i64> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -659,7 +659,7 @@ define void @st1lane0_2d(<2 x i64> %A, ptr %D) sanitize_memory { define void @st1lane0u_2d(<2 x i64> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint 
(ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -667,7 +667,7 @@ define void @st1lane0u_2d(<2 x i64> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -688,8 +688,8 @@ define void @st1lane0u_2d(<2 x i64> %A, ptr %D) sanitize_memory { define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -697,7 +697,7 @@ define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -718,8 +718,8 @@ define void @st1lane_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -727,7 +727,7 @@ define void @st1lane0_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; CHECK-NEXT: 
[[TMP:%.*]] = extractelement <2 x i64> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -748,7 +748,7 @@ define void @st1lane0_ro_2d(<2 x i64> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane_2d_double(<2 x double> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_2d_double( ; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -756,7 +756,7 @@ define void @st1lane_2d_double(<2 x double> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -777,7 +777,7 @@ define void @st1lane_2d_double(<2 x double> %A, ptr %D) sanitize_memory { define void @st1lane0_2d_double(<2 x double> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_2d_double( ; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -785,7 +785,7 @@ define void @st1lane0_2d_double(<2 x double> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -806,7 +806,7 @@ define void @st1lane0_2d_double(<2 x double> %A, ptr %D) sanitize_memory { define void @st1lane0u_2d_double(<2 x double> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_2d_double( ; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -814,7 +814,7 @@ define void @st1lane0u_2d_double(<2 x double> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -835,8 +835,8 @@ define void @st1lane0u_2d_double(<2 x double> %A, ptr %D) sanitize_memory { define void @st1lane_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_2d_double( ; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -844,7 +844,7 @@ define void @st1lane_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -865,8 +865,8 @@ define void @st1lane_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_2d_double( ; CHECK-SAME: <2 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -874,7 +874,7 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitiz ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x double> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof 
[[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -895,7 +895,7 @@ define void @st1lane0_ro_2d_double(<2 x double> %A, ptr %D, i64 %offset) sanitiz define void @st1lane_8b(<8 x i8> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_8b( ; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -903,7 +903,7 @@ define void @st1lane_8b(<8 x i8> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -924,8 +924,8 @@ define void @st1lane_8b(<8 x i8> %A, ptr %D) sanitize_memory { define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_8b( ; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -933,7 +933,7 @@ define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -954,8 +954,8 @@ define void @st1lane_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_8b( ; CHECK-SAME: <8 x i8> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -963,7 +963,7 @@ define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <8 x i8> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <8 x i8> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -984,7 +984,7 @@ define void @st1lane0_ro_8b(<8 x i8> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane_4h(<4 x i16> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -992,7 +992,7 @@ define void @st1lane_4h(<4 x i16> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1013,7 +1013,7 @@ define void @st1lane_4h(<4 x i16> %A, ptr %D) sanitize_memory { define void @st1lane0_4h(<4 x i16> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1021,7 +1021,7 @@ define void @st1lane0_4h(<4 x i16> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1042,7 +1042,7 @@ define void @st1lane0_4h(<4 x i16> %A, ptr %D) sanitize_memory { define void @st1lane0u_4h(<4 x i16> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void 
@st1lane0u_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1050,7 +1050,7 @@ define void @st1lane0u_4h(<4 x i16> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1071,8 +1071,8 @@ define void @st1lane0u_4h(<4 x i16> %A, ptr %D) sanitize_memory { define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1080,7 +1080,7 @@ define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1101,8 +1101,8 @@ define void @st1lane_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1110,7 +1110,7 @@ define void 
@st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <4 x i16> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1131,7 +1131,7 @@ define void @st1lane0_ro_4h(<4 x i16> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane_2s(<2 x i32> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1139,7 +1139,7 @@ define void @st1lane_2s(<2 x i32> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1160,7 +1160,7 @@ define void @st1lane_2s(<2 x i32> %A, ptr %D) sanitize_memory { define void @st1lane0_2s(<2 x i32> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1168,7 +1168,7 @@ define void @st1lane0_2s(<2 x i32> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1189,7 +1189,7 @@ define void @st1lane0_2s(<2 x i32> %A, ptr %D) sanitize_memory { define void @st1lane0u_2s(<2 x i32> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr 
@__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1197,7 +1197,7 @@ define void @st1lane0u_2s(<2 x i32> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1218,8 +1218,8 @@ define void @st1lane0u_2s(<2 x i32> %A, ptr %D) sanitize_memory { define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1227,7 +1227,7 @@ define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1248,8 +1248,8 @@ define void @st1lane_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1257,7 +1257,7 @@ define void @st1lane0_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x i32> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1278,7 +1278,7 @@ define void @st1lane0_ro_2s(<2 x i32> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane_2s_float(<2 x float> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane_2s_float( ; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1286,7 +1286,7 @@ define void @st1lane_2s_float(<2 x float> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1307,7 +1307,7 @@ define void @st1lane_2s_float(<2 x float> %A, ptr %D) sanitize_memory { define void @st1lane0_2s_float(<2 x float> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_2s_float( ; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1315,7 +1315,7 @@ define void @st1lane0_2s_float(<2 x float> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1336,7 +1336,7 @@ define void @st1lane0_2s_float(<2 x float> %A, ptr %D) sanitize_memory { define void @st1lane0u_2s_float(<2 x float> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_2s_float( ; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1344,7 +1344,7 @@ define void @st1lane0u_2s_float(<2 x float> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 
0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1365,8 +1365,8 @@ define void @st1lane0u_2s_float(<2 x float> %A, ptr %D) sanitize_memory { define void @st1lane_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane_ro_2s_float( ; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1374,7 +1374,7 @@ define void @st1lane_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_m ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 1 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1395,8 +1395,8 @@ define void @st1lane_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_m define void @st1lane0_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_2s_float( ; CHECK-SAME: <2 x float> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1404,7 +1404,7 @@ define void @st1lane0_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_ ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <2 x float> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1425,7 +1425,7 @@ define void 
@st1lane0_ro_2s_float(<2 x float> %A, ptr %D, i64 %offset) sanitize_ define void @st1lane0_1d(<1 x i64> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_1d( ; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1433,7 +1433,7 @@ define void @st1lane0_1d(<1 x i64> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1454,7 +1454,7 @@ define void @st1lane0_1d(<1 x i64> %A, ptr %D) sanitize_memory { define void @st1lane0u_1d(<1 x i64> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_1d( ; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1462,7 +1462,7 @@ define void @st1lane0u_1d(<1 x i64> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1483,8 +1483,8 @@ define void @st1lane0u_1d(<1 x i64> %A, ptr %D) sanitize_memory { define void @st1lane0_ro_1d(<1 x i64> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_1d( ; CHECK-SAME: <1 x i64> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1492,7 +1492,7 @@ define void @st1lane0_ro_1d(<1 x i64> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0 ; 
CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x i64> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1513,7 +1513,7 @@ define void @st1lane0_ro_1d(<1 x i64> %A, ptr %D, i64 %offset) sanitize_memory { define void @st1lane0_1d_double(<1 x double> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_1d_double( ; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1521,7 +1521,7 @@ define void @st1lane0_1d_double(<1 x double> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1542,7 +1542,7 @@ define void @st1lane0_1d_double(<1 x double> %A, ptr %D) sanitize_memory { define void @st1lane0u_1d_double(<1 x double> %A, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st1lane0u_1d_double( ; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], 0 @@ -1550,7 +1550,7 @@ define void @st1lane0u_1d_double(<1 x double> %A, ptr %D) sanitize_memory { ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] ; CHECK: 3: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1571,8 +1571,8 @@ define void @st1lane0u_1d_double(<1 x double> %A, ptr %D) sanitize_memory { define void @st1lane0_ro_1d_double(<1 x double> %A, ptr %D, i64 %offset) sanitize_memory { ; CHECK-LABEL: define void @st1lane0_ro_1d_double( ; CHECK-SAME: <1 x double> [[A:%.*]], ptr [[D:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: 
[[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] @@ -1580,7 +1580,7 @@ define void @st1lane0_ro_1d_double(<1 x double> %A, ptr %D, i64 %offset) sanitiz ; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP:%.*]] = extractelement <1 x double> [[A]], i32 0 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1601,16 +1601,16 @@ define void @st1lane0_ro_1d_double(<1 x double> %A, ptr %D, i64 %offset) sanitiz define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st2lane_16b( ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 1, ptr [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1625,16 +1625,16 @@ define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, ptr %D) sanitize_memory { define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st2lane_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr 
i64 [[TMP5]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 1, ptr [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1649,16 +1649,16 @@ define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, ptr %D) sanitize_memory { define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st2lane_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 1, ptr [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1673,16 +1673,16 @@ define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, ptr %D) sanitize_memory { define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st2lane_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[TMP6]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ 
-1702,17 +1702,17 @@ declare void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64>, <2 x i64>, i64, ptr) define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st3lane_16b( ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 1, ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1727,17 +1727,17 @@ define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %D) sanit define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st3lane_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], i64 1, ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label 
[[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1752,17 +1752,17 @@ define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %D) saniti define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st3lane_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i64 1, ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1777,17 +1777,17 @@ define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %D) saniti define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %D) sanitize_memory { ; CHECK-LABEL: define void @st3lane_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[D]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], i64 1, ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br 
i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1807,18 +1807,18 @@ declare void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64> define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %E) sanitize_memory { ; CHECK-LABEL: define void @st4lane_16b( ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[E]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 1, ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1833,18 +1833,18 @@ define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %E) sanitize_memory { ; CHECK-LABEL: define void @st4lane_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x 
i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[E]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i64 1, ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1859,18 +1859,18 @@ define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %E) sanitize_memory { ; CHECK-LABEL: define void @st4lane_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[E]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i64 1, ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1885,18 +1885,18 @@ define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %E) sanitize_memory { ; CHECK-LABEL: define void @st4lane_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[E:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = 
load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[E]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i64 1, ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1913,5 +1913,5 @@ declare void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16>, <8 x i16>, <8 x i16> declare void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, ptr) nounwind readnone declare void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, ptr) nounwind readnone ;. -; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} ;. 
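Note for reviewers (commentary, not part of the applied diff): the mechanical change running through these regenerated MSan checks is the spelling of the constant address of a TLS shadow slot. The old checks computed it with integer arithmetic, inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 N) to ptr), while the new checks use a plain byte-offset GEP, getelementptr (i8, ptr @__msan_param_tls, i64 N). Both constants denote the same address; the GEP form is shorter and sidesteps the integer-arithmetic constant expressions that LLVM has been phasing out. A minimal sketch of the new spelling, parseable on its own (not taken from this patch; the [100 x i64] array size and the function name are illustrative assumptions only):

  @__msan_param_tls = external thread_local global [100 x i64]

  ; Load the shadow of the second 16-byte parameter slot (byte offset 16).
  ; This GEP constant names the same address the old ptrtoint/add/inttoptr
  ; expression did; only the notation differs.
  define i64 @load_shadow_slot_16() {
    %s = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
    ret i64 %s
  }

The accompanying switch from [[PROF0]] to [[PROF1]] in the same checks is just a renumbering, presumably because another metadata node now precedes the branch-weights entry in the regenerated output; the payload itself, !{!"branch_weights", i32 1, i32 1048575}, is unchanged.
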
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_origins.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_origins.ll index 52283811e3065..a121df9b195ac 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_origins.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-st1_origins.ll @@ -17,12 +17,12 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor ; ; CHECK-LABEL: define void @st2_16b ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 193514046488576 @@ -50,7 +50,7 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, ptr %P) nounwind sanitize_memor ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP12]], i32 7 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP22]], align 4 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: 23: ; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: unreachable @@ -67,14 +67,14 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind ; ; CHECK-LABEL: define void @st3_16b ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 48), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, 
ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 193514046488576 @@ -113,7 +113,7 @@ define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %P) nounwind ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[TMP14]], i32 11 ; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP31]], align 4 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP32:%.*]], label [[TMP33:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP32:%.*]], label [[TMP33:%.*]], !prof [[PROF1]] ; CHECK: 32: ; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -130,16 +130,16 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr ; ; CHECK-LABEL: define void @st4_16b ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 64) to ptr), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 64), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 48) to ptr), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 +; CHECK-NEXT: 
[[TMP7:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 48), align 4 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 193514046488576 @@ -189,7 +189,7 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr ; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[TMP16]], i32 15 ; CHECK-NEXT: store i32 [[TMP25]], ptr [[TMP40]], align 4 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF1]] ; CHECK: 41: ; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP2]]) #[[ATTR4]] ; CHECK-NEXT: unreachable diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-tbl.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-tbl.ll index b0c71dc8b0851..3d6e7fa9ed4b8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-tbl.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-tbl.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool build/bin/opt --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; Test memory sanitizer instrumentation for Arm NEON tbl instructions. 
; ; RUN: opt < %s -passes=msan -S | FileCheck %s @@ -14,7 +14,7 @@ define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i8> @tbl1_8b ; CHECK-SAME: (<16 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[TMP1]], <8 x i8> [[B]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP2]], [[TMP3]] @@ -30,7 +30,7 @@ define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind sanitize_memory ; CHECK-LABEL: define <16 x i8> @tbl1_16b ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[B]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] @@ -46,8 +46,8 @@ define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) sanitize_memor ; CHECK-LABEL: define <8 x i8> @tbl2_8b ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP3]], [[TMP4]] @@ -63,8 +63,8 @@ define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) sanitize_me ; CHECK-LABEL: define <16 x i8> @tbl2_16b ; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x 
i8> [[C]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]]
@@ -80,9 +80,9 @@ define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
; CHECK-LABEL: define <8 x i8> @tbl3_8b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[D]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]]
@@ -98,9 +98,9 @@ define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %
; CHECK-LABEL: define <16 x i8> @tbl3_16b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[D]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
@@ -116,10 +116,10 @@ define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
; CHECK-LABEL: define <8 x i8> @tbl4_8b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[E]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP5]], [[TMP6]]
@@ -135,10 +135,10 @@ define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %
; CHECK-LABEL: define <16 x i8> @tbl4_16b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[E]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP5]], [[TMP6]]
@@ -156,9 +156,9 @@ define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8(<16 x i8> %a, <16 x i8> %b, <16 x i8
; CHECK-LABEL: define <8 x i8> @shuffled_tbl2_to_tbl4_v8i8
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> )
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> zeroinitializer, [[TMP5]]
@@ -183,9 +183,9 @@ define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> )
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]]
@@ -208,11 +208,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c
define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> splat (i8 -1), i8 [[TMP1]], i32 0
; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
@@ -283,11 +283,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask(<16 x
define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 1, i32 0
; CHECK-NEXT: [[INS_1:%.*]] = insertelement <16 x i8> [[INS_0]], i8 1, i32 1
@@ -347,11 +347,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_first_mask2(<16 x
define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> splat (i8 -1), i8 [[TMP1]], i32 0
; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
@@ -423,11 +423,11 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask(<16 x
define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, i8 %v) sanitize_memory {
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_nonconst_second_mask2
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], i8 [[V:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i8> splat (i8 -1), i8 [[TMP1]], i32 0
; CHECK-NEXT: [[INS_0:%.*]] = insertelement <16 x i8> poison, i8 [[V]], i32 0
@@ -500,9 +500,9 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle(<16 x i8> %a, <16 x i8> %b
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_shuffle
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> )
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]]
@@ -527,9 +527,9 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1(<16 x i8> %a, <16 x i8>
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask1
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> )
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]]
@@ -554,9 +554,9 @@ define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2(<16 x i8> %a, <16 x i8>
; CHECK-LABEL: define <16 x i8> @shuffled_tbl2_to_tbl4_mixed_tbl2_mask2
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> )
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> zeroinitializer, [[TMP5]]
@@ -588,8 +588,8 @@ define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind saniti
; CHECK-LABEL: define <8 x i8> @tbx1_8b
; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP3]], [[TMP4]]
@@ -605,8 +605,8 @@ define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind sa
; CHECK-LABEL: define <16 x i8> @tbx1_16b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP3]], [[TMP4]]
@@ -622,9 +622,9 @@ define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) s
; CHECK-LABEL: define <8 x i8> @tbx2_8b
; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <8 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[D]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP4]], [[TMP5]]
@@ -640,9 +640,9 @@ define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %
; CHECK-LABEL: define <16 x i8> @tbx2_16b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[D]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP4]], [[TMP5]]
@@ -658,10 +658,10 @@ define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
; CHECK-LABEL: define <8 x i8> @tbx3_8b
; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <8 x i8> [[E:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[E]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP5]], [[TMP6]]
@@ -677,10 +677,10 @@ define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %
; CHECK-LABEL: define <16 x i8> @tbx3_16b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[E]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP5]], [[TMP6]]
@@ -696,11 +696,11 @@ define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D,
; CHECK-LABEL: define <8 x i8> @tbx4_8b
; CHECK-SAME: (<8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <8 x i8> [[F:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <8 x i8> [[F]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP6]], [[TMP7]]
@@ -716,11 +716,11 @@ define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %
; CHECK-LABEL: define <16 x i8> @tbx4_16b
; CHECK-SAME: (<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> [[E:%.*]], <16 x i8> [[F:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[F]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP7]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-umaxv.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-umaxv.ll
index 95f11a05d9d2d..7f4213968b3f8 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-umaxv.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-umaxv.ll
@@ -216,7 +216,7 @@ define <8 x i8> @test_vmaxv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 {
; CHECK-LABEL: define <8 x i8> @test_vmaxv_u8_used_by_laneop(
; CHECK-SAME: <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
@@ -240,7 +240,7 @@ define <4 x i16> @test_vmaxv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0
; CHECK-LABEL: define <4 x i16> @test_vmaxv_u16_used_by_laneop(
; CHECK-SAME: <4 x i16> [[A1:%.*]], <4 x i16> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
@@ -264,7 +264,7 @@ define <2 x i32> @test_vmaxv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0
; CHECK-LABEL: define <2 x i32> @test_vmaxv_u32_used_by_laneop(
; CHECK-SAME: <2 x i32> [[A1:%.*]], <2 x i32> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
@@ -284,7 +284,7 @@ define <16 x i8> @test_vmaxvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0
; CHECK-LABEL: define <16 x i8> @test_vmaxvq_u8_used_by_laneop(
; CHECK-SAME: <16 x i8> [[A1:%.*]], <16 x i8> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]])
@@ -308,7 +308,7 @@ define <8 x i16> @test_vmaxvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) #
; CHECK-LABEL: define <8 x i16> @test_vmaxvq_u16_used_by_laneop(
; CHECK-SAME: <8 x i16> [[A1:%.*]], <8 x i16> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]])
@@ -332,7 +332,7 @@ define <4 x i32> @test_vmaxvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) #
; CHECK-LABEL: define <4 x i32> @test_vmaxvq_u32_used_by_laneop(
; CHECK-SAME: <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]])
@@ -356,3 +356,6 @@ declare i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32>) nounwind readnone
attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-uminv.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-uminv.ll
index ad51395691050..441c21b1e5753 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-uminv.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-uminv.ll
@@ -216,7 +216,7 @@ define <8 x i8> @test_vminv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 {
; CHECK-LABEL: define <8 x i8> @test_vminv_u8_used_by_laneop(
; CHECK-SAME: <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]])
@@ -240,7 +240,7 @@ define <4 x i16> @test_vminv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0
; CHECK-LABEL: define <4 x i16> @test_vminv_u16_used_by_laneop(
; CHECK-SAME: <4 x i16> [[A1:%.*]], <4 x i16> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]])
@@ -264,7 +264,7 @@ define <2 x i32> @test_vminv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0
; CHECK-LABEL: define <2 x i32> @test_vminv_u32_used_by_laneop(
; CHECK-SAME: <2 x i32> [[A1:%.*]], <2 x i32> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]])
@@ -284,7 +284,7 @@ define <16 x i8> @test_vminvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0
; CHECK-LABEL: define <16 x i8> @test_vminvq_u8_used_by_laneop(
; CHECK-SAME: <16 x i8> [[A1:%.*]], <16 x i8> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]])
@@ -308,7 +308,7 @@ define <8 x i16> @test_vminvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) #
; CHECK-LABEL: define <8 x i16> @test_vminvq_u16_used_by_laneop(
; CHECK-SAME: <8 x i16> [[A1:%.*]], <8 x i16> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]])
@@ -332,7 +332,7 @@ define <4 x i32> @test_vminvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) #
; CHECK-LABEL: define <4 x i32> @test_vminvq_u32_used_by_laneop(
; CHECK-SAME: <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]])
@@ -355,3 +355,6 @@ declare i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32>) nounwind readnone
declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>) nounwind readnone
attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll
index ad0856d38c1e9..5338031383d2f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vadd.ll
@@ -17,7 +17,7 @@ define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <8 x i8> @addhn8b(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]]
@@ -65,7 +65,7 @@ define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <4 x i16> @addhn4h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -113,7 +113,7 @@ define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <2 x i32> @addhn2s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -161,7 +161,7 @@ define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind #0 {
; CHECK-LABEL: define <16 x i8> @addhn2_16b(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -199,7 +199,7 @@ define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @addhn2_8h(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -237,7 +237,7 @@ define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @addhn2_4s(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -280,7 +280,7 @@ define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <8 x i8> @raddhn8b(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -328,7 +328,7 @@ define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <4 x i16> @raddhn4h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -376,7 +376,7 @@ define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <2 x i32> @raddhn2s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -424,7 +424,7 @@ define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind #0 {
; CHECK-LABEL: define <16 x i8> @raddhn2_16b(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -462,7 +462,7 @@ define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @raddhn2_8h(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -500,7 +500,7 @@ define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @raddhn2_4s(
; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -542,7 +542,7 @@ define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @saddl8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -587,7 +587,7 @@ define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @saddl4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -632,7 +632,7 @@ define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <2 x i64> @saddl2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -677,9 +677,9 @@ define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b, <2 x i64> %param1, <2 x
; CHECK-LABEL: define <8 x i16> @saddl2_8h(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <2 x i64> [[PARAM1:%.*]], <2 x i64> [[PARAM2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
@@ -718,9 +718,9 @@ define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b, <2 x i64> %param1, <2 x
; CHECK-LABEL: define <4 x i32> @saddl2_4s(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <2 x i64> [[PARAM1:%.*]], <2 x i64> [[PARAM2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
@@ -759,9 +759,9 @@ define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b, <2 x i64> %param1, <2 x
; CHECK-LABEL: define <2 x i64> @saddl2_2d(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[PARAM1:%.*]], <2 x i64> [[PARAM2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
@@ -800,7 +800,7 @@ define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @uaddl8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -845,7 +845,7 @@ define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @uaddl4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -890,7 +890,7 @@ define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <2 x i64> @uaddl2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -936,9 +936,9 @@ define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b, <2 x i64> %param1, <2 x
; CHECK-LABEL: define <8 x i16> @uaddl2_8h(
; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <2 x i64> [[PARAM1:%.*]], <2 x i64> [[PARAM2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
@@ -977,9 +977,9 @@ define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b, <2 x i64> %param1, <2 x
; CHECK-LABEL: define <4 x i32> @uaddl2_4s(
; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <2 x i64> [[PARAM1:%.*]], <2 x i64> [[PARAM2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
@@ -1018,9 +1018,9 @@ define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b, <2 x i64> %param1, <2 x
; CHECK-LABEL: define <2 x i64> @uaddl2_2d(
; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[PARAM1:%.*]], <2 x i64> [[PARAM2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
@@ -1059,7 +1059,7 @@ define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @uaddw8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -1101,7 +1101,7 @@ define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @uaddw4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -1143,7 +1143,7 @@ define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <2 x i64> @uaddw2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -1185,8 +1185,8 @@ define <8 x i16> @uaddw2_8h(ptr %A, ptr %B, <16 x i8> %param1) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @uaddw2_8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], <16 x i8> [[PARAM1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
@@ -1233,8 +1233,8 @@ define <4 x i32> @uaddw2_4s(ptr %A, ptr %B, <8 x i16> %param1) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @uaddw2_4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], <8 x i16> [[PARAM1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
@@ -1281,8 +1281,8 @@ define <2 x i64> @uaddw2_2d(ptr %A, ptr %B, <4 x i32> %param1) nounwind #0 {
; CHECK-LABEL: define <2 x i64> @uaddw2_2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], <4 x i32> [[PARAM1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
@@ -1329,7 +1329,7 @@ define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @saddw8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -1371,7 +1371,7 @@ define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @saddw4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -1413,7 +1413,7 @@ define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <2 x i64> @saddw2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -1455,8 +1455,8 @@ define <8 x i16> @saddw2_8h(ptr %A, ptr %B, <16 x i8> %param1) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @saddw2_8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], <16 x i8> [[PARAM1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
@@ -1503,8 +1503,8 @@ define <4 x i32> @saddw2_4s(ptr %A, ptr %B, <8 x i16> %param1) nounwind #0 {
; CHECK-LABEL: define <4 x i32> @saddw2_4s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], <8 x i16> [[PARAM1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
@@ -1551,8 +1551,8 @@ define <2 x i64> @saddw2_2d(ptr %A, ptr %B, <4 x i32> %param1) nounwind #0 {
; CHECK-LABEL: define <2 x i64> @saddw2_2d(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], <4 x i32> [[PARAM1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
@@ -1963,7 +1963,7 @@ define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <4 x i16> @sadalp4h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
@@ -2008,7 +2008,7 @@ define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <2 x i32> @sadalp2s(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
@@ -2053,7 +2053,7 @@ define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind #0 {
; CHECK-LABEL: define <8 x i16> @sadalp8h(
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
@@ -2098,7 +2098,7 @@ define <4 x i32> @sadalp4s(ptr %A, ptr
[[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -2143,7 +2143,7 @@ define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x i64> @sadalp2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -2188,7 +2188,7 @@ define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i16> @uadalp4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -2233,7 +2233,7 @@ define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x i32> @uadalp2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -2278,7 +2278,7 @@ define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i16> @uadalp8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -2323,7 +2323,7 @@ define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @uadalp4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) 
to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -2368,7 +2368,7 @@ define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x i64> @uadalp2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] @@ -2413,7 +2413,7 @@ define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i8> @addp_8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2454,7 +2454,7 @@ define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <16 x i8> @addp_16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2495,7 +2495,7 @@ define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i16> @addp_4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2536,7 +2536,7 @@ define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i16> @addp_8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2577,7 +2577,7 @@ define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x i32> @addp_2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2618,7 +2618,7 @@ define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @addp_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2659,7 +2659,7 @@ define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x i64> @addp_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2708,7 +2708,7 @@ define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x float> @faddp_2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2749,7 +2749,7 @@ define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x float> @faddp_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2790,7 +2790,7 @@ define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x double> 
@faddp_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2834,10 +2834,10 @@ declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 x i32> %param2) #0 { ; CHECK-LABEL: define <2 x i64> @uaddl_duprhs( ; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[RHS]], i32 0 @@ -2869,10 +2869,10 @@ define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 x define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 x i32> %param2) #0 { ; CHECK-LABEL: define <2 x i64> @uaddl2_duprhs( ; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[RHS]], i32 0 @@ -2904,10 +2904,10 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs, <2 x i32> %param1, <4 x i32> 
%param2) #0 { ; CHECK-LABEL: define <2 x i64> @saddl_duplhs( ; CHECK-SAME: i32 [[LHS:%.*]], <4 x i32> [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[LHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[LHS]], i32 0 @@ -2939,10 +2939,10 @@ define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs, <2 x i32> %param1, <4 x define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs, <2 x i32> %param1, <4 x i32> %param2) #0 { ; CHECK-LABEL: define <2 x i64> @saddl2_duplhs( ; CHECK-SAME: i32 [[LHS:%.*]], <4 x i32> [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[LHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[LHS]], i32 0 @@ -2974,10 +2974,10 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs, <2 x i32> %param1, <4 define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 x i32> %param2) #0 { ; CHECK-LABEL: define <2 x i64> @usubl_duprhs( ; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[RHS]], i32 0 @@ -3009,10 +3009,10 @@ define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 x define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 x i32> %param2) #0 { ; CHECK-LABEL: define <2 x i64> @usubl2_duprhs( ; CHECK-SAME: <4 x i32> [[LHS:%.*]], i32 [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[RHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[RHS]], i32 0 @@ -3044,10 +3044,10 @@ define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs, <2 x i32> %param1, <4 define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs, <2 x i32> %param1, <4 x i32> %param2) #0 { ; CHECK-LABEL: define <2 x i64> @ssubl_duplhs( ; CHECK-SAME: i32 [[LHS:%.*]], <4 x i32> [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[LHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[LHS]], i32 0 @@ -3079,10 +3079,10 @@ define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs, <2 x i32> %param1, <4 x define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs, <2 x i32> %param1, <4 x i32> %param2) #0 { ; CHECK-LABEL: define <2 x i64> @ssubl2_duplhs( ; CHECK-SAME: i32 [[LHS:%.*]], <4 x 
i32> [[RHS:%.*]], <2 x i32> [[PARAM1:%.*]], <4 x i32> [[PARAM2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 0 ; CHECK-NEXT: [[LHSVEC_TMP:%.*]] = insertelement <2 x i32> [[PARAM1]], i32 [[LHS]], i32 0 @@ -3115,7 +3115,7 @@ define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i8> @addhn8b_natural( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3161,7 +3161,7 @@ define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i16> @addhn4h_natural( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3207,7 +3207,7 @@ define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x i32> @addhn2s_natural( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3252,8 +3252,8 @@ define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind #0 { define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <16 x i8> @addhn2_16b_natural( ; CHECK-SAME: <8 x i8> [[LOW:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3302,8 +3302,8 @@ define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind #0 define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i16> @addhn2_8h_natural( ; CHECK-SAME: <4 x i16> [[LOW:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3352,8 +3352,8 @@ define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind #0 define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @addhn2_4s_natural( ; CHECK-SAME: <2 x i32> [[LOW:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3403,9 +3403,9 @@ define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @addhn_addhn2_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -3488,7 +3488,7 @@ define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i8> @subhn8b_natural( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3534,7 +3534,7 @@ define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i16> @subhn4h_natural( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3580,7 +3580,7 @@ define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <2 x i32> @subhn2s_natural( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3625,8 +3625,8 @@ define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind #0 { define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <16 x i8> @subhn2_16b_natural( ; CHECK-SAME: <8 x i8> [[LOW:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3675,8 +3675,8 @@ define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind #0 define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i16> @subhn2_8h_natural( ; CHECK-SAME: <4 x i16> [[LOW:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), 
align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3725,8 +3725,8 @@ define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind #0 define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @subhn2_4s_natural( ; CHECK-SAME: <2 x i32> [[LOW:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll index 3a2ecfefd209e..4ee7e4f20f0b3 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vaddv.ll @@ -30,7 +30,7 @@ define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 { ; CHECK-LABEL: define <8 x i8> @test_vaddv_s8_used_by_laneop( ; CHECK-SAME: <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]]) @@ -74,7 +74,7 @@ define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0 ; CHECK-LABEL: define <4 x i16> @test_vaddv_s16_used_by_laneop( ; CHECK-SAME: <4 x i16> [[A1:%.*]], <4 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]]) @@ -115,7 +115,7 @@ define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0 ; CHECK-LABEL: define <2 x i32> @test_vaddv_s32_used_by_laneop( ; CHECK-SAME: <2 x i32> [[A1:%.*]], <2 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]]) @@ -151,7 +151,7 @@ define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) #0 ; CHECK-LABEL: define <2 x i64> @test_vaddv_s64_used_by_laneop( ; CHECK-SAME: <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]]) @@ -191,7 +191,7 @@ define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) #0 { ; CHECK-LABEL: define <8 x i8> @test_vaddv_u8_used_by_laneop( ; CHECK-SAME: <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> [[TMP0]]) @@ -259,7 +259,7 @@ define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) #0 ; CHECK-LABEL: define <4 x i16> @test_vaddv_u16_used_by_laneop( ; CHECK-SAME: <4 x i16> [[A1:%.*]], <4 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP0]]) @@ -324,7 +324,7 @@ define <2 x i32> @test_vaddv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) #0 ; CHECK-LABEL: define <2 x i32> @test_vaddv_u32_used_by_laneop( ; CHECK-SAME: <2 x i32> [[A1:%.*]], <2 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP0]]) @@ -408,7 +408,7 @@ define <2 x i64> @test_vaddv_u64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) #0 ; CHECK-LABEL: define <2 x i64> @test_vaddv_u64_used_by_laneop( ; CHECK-SAME: <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]]) @@ -429,7 +429,7 @@ define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1, <1 x i64> %param1) #0 { ; CHECK-SAME: <2 x i64> [[A1:%.*]], <1 x i64> [[PARAM1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP0]]) ; CHECK-NEXT: [[VADDV_I:%.*]] = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A1]]) @@ -468,7 +468,7 @@ define <16 x i8> @test_vaddvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0 ; CHECK-LABEL: define <16 x i8> @test_vaddvq_s8_used_by_laneop( ; CHECK-SAME: <16 x i8> [[A1:%.*]], <16 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]]) @@ -512,7 +512,7 @@ define <8 x i16> @test_vaddvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) # ; CHECK-LABEL: define <8 x i16> @test_vaddvq_s16_used_by_laneop( ; CHECK-SAME: <8 x i16> [[A1:%.*]], <8 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]]) @@ -552,7 +552,7 @@ define <4 x i32> @test_vaddvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) # ; CHECK-LABEL: define <4 x i32> @test_vaddvq_s32_used_by_laneop( ; CHECK-SAME: <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]]) @@ -592,7 +592,7 @@ define <16 x i8> @test_vaddvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) #0 ; CHECK-LABEL: define <16 x i8> @test_vaddvq_u8_used_by_laneop( ; CHECK-SAME: <16 x i8> [[A1:%.*]], <16 x i8> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP0]]) @@ -636,7 +636,7 @@ define <8 x i16> @test_vaddvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) # ; CHECK-LABEL: define <8 x i16> @test_vaddvq_u16_used_by_laneop( ; CHECK-SAME: <8 x i16> [[A1:%.*]], <8 x i16> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[TMP0]]) @@ -676,7 +676,7 @@ define <4 x i32> @test_vaddvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) # ; CHECK-LABEL: define <4 x i32> @test_vaddvq_u32_used_by_laneop( ; CHECK-SAME: <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]]) diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll index 93a75df4b76cc..03f6113de0762 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vcvt.ll @@ -1083,7 +1083,7 @@ define <2 x float> @fcvtxn_2s(<2 x double> %A) nounwind #0 { define <4 x float> @fcvtxn_4s(<2 x float> %ret, <2 x double> %A) nounwind #0 { ; CHECK-LABEL: define <4 x float> @fcvtxn_4s( ; CHECK-SAME: <2 x float> [[RET:%.*]], <2 x double> [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer @@ -1358,7 +1358,7 @@ define void @autogen_SD28458(<8 x double> %val.f64, ptr %addr.f32) #0 { ; CHECK-LABEL: define void @autogen_SD28458( ; CHECK-SAME: <8 x double> [[VAL_F64:%.*]], ptr [[ADDR_F32:%.*]]) #[[ATTR3:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> ; CHECK-NEXT: [[TR53:%.*]] = fptrunc <8 x double> [[VAL_F64]] 
to <8 x float> @@ -1383,7 +1383,7 @@ define void @autogen_SD28458(<8 x double> %val.f64, ptr %addr.f32) #0 { define void @autogen_SD19225(ptr %addr.f64, ptr %addr.f32) #0 { ; CHECK-LABEL: define void @autogen_SD19225( ; CHECK-SAME: ptr [[ADDR_F64:%.*]], ptr [[ADDR_F32:%.*]]) #[[ATTR3]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmax.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmax.ll index e2457c0a51d46..d6d88956a4f68 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmax.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmax.ll @@ -29,7 +29,7 @@ define <8 x i8> @smax_8b(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i8> @smax_8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1:![0-9]+]] @@ -68,7 +68,7 @@ define <16 x i8> @smax_16b(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <16 x i8> @smax_16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] @@ -107,7 +107,7 @@ define <4 x i16> @smax_4h(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <4 x i16> @smax_4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] @@ -146,7 +146,7 @@ define <8 x i16> @smax_8h(ptr %A, ptr %B) nounwind #0 { ; CHECK-LABEL: define <8 x i16> @smax_8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -185,7 +185,7 @@ define <2 x i32> @smax_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @smax_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -224,7 +224,7 @@ define <4 x i32> @smax_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @smax_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -270,7 +270,7 @@ define <8 x i8> @umax_8b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i8> @umax_8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -309,7 +309,7 @@ define <16 x i8> @umax_16b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @umax_16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -348,7 +348,7 @@ define <4 x i16> @umax_4h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i16> @umax_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -387,7 +387,7 @@ define <8 x i16> @umax_8h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @umax_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -426,7 +426,7 @@ define <2 x i32> @umax_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @umax_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -465,7 +465,7 @@ define <4 x i32> @umax_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @umax_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -511,7 +511,7 @@ define <8 x i8> @smin_8b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i8> @smin_8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -550,7 +550,7 @@ define <16 x i8> @smin_16b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @smin_16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -589,7 +589,7 @@ define <4 x i16> @smin_4h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i16> @smin_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -628,7 +628,7 @@ define <8 x i16> @smin_8h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @smin_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -667,7 +667,7 @@ define <2 x i32> @smin_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @smin_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -706,7 +706,7 @@ define <4 x i32> @smin_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @smin_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -752,7 +752,7 @@ define <8 x i8> @umin_8b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i8> @umin_8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -791,7 +791,7 @@ define <16 x i8> @umin_16b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @umin_16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -830,7 +830,7 @@ define <4 x i16> @umin_4h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i16> @umin_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -869,7 +869,7 @@ define <8 x i16> @umin_8h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @umin_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -908,7 +908,7 @@ define <2 x i32> @umin_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @umin_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -947,7 +947,7 @@ define <4 x i32> @umin_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @umin_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -994,7 +994,7 @@ define <8 x i8> @smaxp_8b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i8> @smaxp_8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1035,7 +1035,7 @@ define <16 x i8> @smaxp_16b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @smaxp_16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1076,7 +1076,7 @@ define <4 x i16> @smaxp_4h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i16> @smaxp_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1117,7 +1117,7 @@ define <8 x i16> @smaxp_8h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @smaxp_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1158,7 +1158,7 @@ define <2 x i32> @smaxp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @smaxp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1199,7 +1199,7 @@ define <4 x i32> @smaxp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @smaxp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1247,7 +1247,7 @@ define <8 x i8> @umaxp_8b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i8> @umaxp_8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1288,7 +1288,7 @@ define <16 x i8> @umaxp_16b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @umaxp_16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1329,7 +1329,7 @@ define <4 x i16> @umaxp_4h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i16> @umaxp_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1370,7 +1370,7 @@ define <8 x i16> @umaxp_8h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @umaxp_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1411,7 +1411,7 @@ define <2 x i32> @umaxp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @umaxp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1452,7 +1452,7 @@ define <4 x i32> @umaxp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @umaxp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1501,7 +1501,7 @@ define <8 x i8> @sminp_8b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i8> @sminp_8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1542,7 +1542,7 @@ define <16 x i8> @sminp_16b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @sminp_16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1583,7 +1583,7 @@ define <4 x i16> @sminp_4h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i16> @sminp_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1624,7 +1624,7 @@ define <8 x i16> @sminp_8h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @sminp_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1665,7 +1665,7 @@ define <2 x i32> @sminp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @sminp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1706,7 +1706,7 @@ define <4 x i32> @sminp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @sminp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1754,7 +1754,7 @@ define <8 x i8> @uminp_8b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i8> @uminp_8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1795,7 +1795,7 @@ define <16 x i8> @uminp_16b(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @uminp_16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1836,7 +1836,7 @@ define <4 x i16> @uminp_4h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i16> @uminp_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1877,7 +1877,7 @@ define <8 x i16> @uminp_8h(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @uminp_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1918,7 +1918,7 @@ define <2 x i32> @uminp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x i32> @uminp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1959,7 +1959,7 @@ define <4 x i32> @uminp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @uminp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2007,7 +2007,7 @@ define <2 x float> @fmax_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x float> @fmax_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2046,7 +2046,7 @@ define <4 x float> @fmax_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @fmax_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2085,7 +2085,7 @@ define <2 x double> @fmax_2d(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x double> @fmax_2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2128,7 +2128,7 @@ define <2 x float> @fmaxp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x float> @fmaxp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2169,7 +2169,7 @@ define <4 x float> @fmaxp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @fmaxp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2210,7 +2210,7 @@ define <2 x double> @fmaxp_2d(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x double> @fmaxp_2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2255,7 +2255,7 @@ define <2 x float> @fmin_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x float> @fmin_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2294,7 +2294,7 @@ define <4 x float> @fmin_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @fmin_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2333,7 +2333,7 @@ define <2 x double> @fmin_2d(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x double> @fmin_2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2376,7 +2376,7 @@ define <2 x float> @fminp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x float> @fminp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2417,7 +2417,7 @@ define <4 x float> @fminp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @fminp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2458,7 +2458,7 @@ define <2 x double> @fminp_2d(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x double> @fminp_2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2503,7 +2503,7 @@ define <2 x float> @fminnmp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x float> @fminnmp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2544,7 +2544,7 @@ define <4 x float> @fminnmp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @fminnmp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2585,7 +2585,7 @@ define <2 x double> @fminnmp_2d(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x double> @fminnmp_2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2630,7 +2630,7 @@ define <2 x float> @fmaxnmp_2s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x float> @fmaxnmp_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2671,7 +2671,7 @@ define <4 x float> @fmaxnmp_4s(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @fmaxnmp_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2712,7 +2712,7 @@ define <2 x double> @fmaxnmp_2d(ptr %A, ptr %B) nounwind #0 {
 ; CHECK-LABEL: define <2 x double> @fmaxnmp_2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -2754,3 +2754,6 @@ declare <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) n
 declare <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
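Every hunk in this file and in the two test files that follow is the same mechanical substitution: MSan keeps argument shadows in a thread-local parameter block, the second argument's shadow sits at byte offset 8, and the constant expression addressing it is now printed as a byte-typed getelementptr instead of the older ptrtoint/add/inttoptr round trip. A minimal standalone sketch of the new form in LLVM IR (the function name is hypothetical; the global and the offset are taken from the CHECK lines above):

; Sketch only, not part of the patch. The runtime declares the parameter
; shadow block as an array of 100 i64s (800 bytes); the second argument's
; shadow is the i64 at byte offset 8, addressed by a GEP constant expression.
@__msan_param_tls = external thread_local(initialexec) global [100 x i64]

define i64 @second_param_shadow() {
  %s = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
  ret i64 %s
}

Both spellings denote the same address; only the canonical printing of the constant changed, which is why the expected strings were regenerated with no functional change to the instrumentation.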
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll
index 8e9110fa836c7..ced0138ab747c 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmovn.ll
@@ -51,7 +51,7 @@ define <2 x i32> @xtn2s(<2 x i64> %A) nounwind #0 {
 define <16 x i8> @xtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @xtn2_16b(
 ; CHECK-SAME: <8 x i8> [[RET:%.*]], <8 x i16> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
@@ -69,7 +69,7 @@ define <16 x i8> @xtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 define <8 x i16> @xtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @xtn2_8h(
 ; CHECK-SAME: <4 x i16> [[RET:%.*]], <4 x i32> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
@@ -87,7 +87,7 @@ define <8 x i16> @xtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 define <4 x i32> @xtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @xtn2_4s(
 ; CHECK-SAME: <2 x i32> [[RET:%.*]], <2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
@@ -150,7 +150,7 @@ define <2 x i32> @sqxtn2s(<2 x i64> %A) nounwind #0 {
 define <16 x i8> @sqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @sqxtn2_16b(
 ; CHECK-SAME: <8 x i8> [[RET:%.*]], <8 x i16> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
@@ -169,7 +169,7 @@ define <16 x i8> @sqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 define <8 x i16> @sqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @sqxtn2_8h(
 ; CHECK-SAME: <4 x i16> [[RET:%.*]], <4 x i32> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
@@ -188,7 +188,7 @@ define <8 x i16> @sqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 define <4 x i32> @sqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @sqxtn2_4s(
 ; CHECK-SAME: <2 x i32> [[RET:%.*]], <2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
@@ -256,7 +256,7 @@ define <2 x i32> @uqxtn2s(<2 x i64> %A) nounwind #0 {
 define <16 x i8> @uqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @uqxtn2_16b(
 ; CHECK-SAME: <8 x i8> [[RET:%.*]], <8 x i16> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
@@ -275,7 +275,7 @@ define <16 x i8> @uqxtn2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 define <8 x i16> @uqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @uqxtn2_8h(
 ; CHECK-SAME: <4 x i16> [[RET:%.*]], <4 x i32> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
@@ -294,7 +294,7 @@ define <8 x i16> @uqxtn2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 define <4 x i32> @uqxtn2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @uqxtn2_4s(
 ; CHECK-SAME: <2 x i32> [[RET:%.*]], <2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
@@ -362,7 +362,7 @@ define <2 x i32> @sqxtun2s(<2 x i64> %A) nounwind #0 {
 define <16 x i8> @sqxtun2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 ; CHECK-LABEL: define <16 x i8> @sqxtun2_16b(
 ; CHECK-SAME: <8 x i8> [[RET:%.*]], <8 x i16> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
@@ -381,7 +381,7 @@ define <16 x i8> @sqxtun2_16b(<8 x i8> %ret, <8 x i16> %A) nounwind #0 {
 define <8 x i16> @sqxtun2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 ; CHECK-LABEL: define <8 x i16> @sqxtun2_8h(
 ; CHECK-SAME: <4 x i16> [[RET:%.*]], <4 x i32> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
@@ -400,7 +400,7 @@ define <8 x i16> @sqxtun2_8h(<4 x i16> %ret, <4 x i32> %A) nounwind #0 {
 define <4 x i32> @sqxtun2_4s(<2 x i32> %ret, <2 x i64> %A) nounwind #0 {
 ; CHECK-LABEL: define <4 x i32> @sqxtun2_4s(
 ; CHECK-SAME: <2 x i32> [[RET:%.*]], <2 x i64> [[A:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmul.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmul.ll
index 38d6669671509..e9bb743b189fe 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmul.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vmul.ll
@@ -13,7 +13,7 @@ define <8 x i16> @smull8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @smull8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1:![0-9]+]]
@@ -54,7 +54,7 @@ define <4 x i32> @smull4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @smull4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -95,7 +95,7 @@ define <2 x i64> @smull2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @smull2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -140,7 +140,7 @@ define <8 x i16> @umull8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @umull8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -181,7 +181,7 @@ define <4 x i32> @umull4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @umull4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -222,7 +222,7 @@ define <2 x i64> @umull2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @umull2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -267,7 +267,7 @@ define <4 x i32> @sqdmull4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sqdmull4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -315,7 +315,7 @@ define <2 x i64> @sqdmull2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @sqdmull2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -363,7 +363,7 @@ define <4 x i32> @sqdmull2_4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sqdmull2_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -417,7 +417,7 @@ define <2 x i64> @sqdmull2_2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @sqdmull2_2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -475,7 +475,7 @@ define <8 x i16> @pmull8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @pmull8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -518,7 +518,7 @@ define <4 x i16> @sqdmulh_4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @sqdmulh_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -557,7 +557,7 @@ define <8 x i16> @sqdmulh_8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @sqdmulh_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -596,7 +596,7 @@ define <2 x i32> @sqdmulh_2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @sqdmulh_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -635,7 +635,7 @@ define <4 x i32> @sqdmulh_4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sqdmulh_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -674,7 +674,7 @@ define i32 @sqdmulh_1s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqdmulh_1s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -719,7 +719,7 @@ define <4 x i16> @sqrdmulh_4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @sqrdmulh_4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -758,7 +758,7 @@ define <8 x i16> @sqrdmulh_8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @sqrdmulh_8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -797,7 +797,7 @@ define <2 x i32> @sqrdmulh_2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @sqrdmulh_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -836,7 +836,7 @@ define <4 x i32> @sqrdmulh_4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sqrdmulh_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -875,7 +875,7 @@ define i32 @sqrdmulh_1s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqrdmulh_1s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -920,7 +920,7 @@ define <2 x float> @fmulx_2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x float> @fmulx_2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -960,7 +960,7 @@ define <4 x float> @fmulx_4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x float> @fmulx_4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
@@ -1000,7 +1000,7 @@ define <2 x double> @fmulx_2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2
x double> @fmulx_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] @@ -1044,8 +1044,8 @@ define <4 x i32> @smlal4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @smlal4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1101,8 +1101,8 @@ define <2 x i64> @smlal2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @smlal2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1208,8 +1208,8 @@ define <4 x i32> @smlsl4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @smlsl4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1265,8 +1265,8 @@ define <2 x i64> @smlsl2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @smlsl2d( ; CHECK-SAME: ptr [[A:%.*]], 
ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1377,8 +1377,8 @@ define <4 x i32> @sqdmlal4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqdmlal4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1441,8 +1441,8 @@ define <2 x i64> @sqdmlal2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @sqdmlal2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1505,8 +1505,8 @@ define <4 x i32> @sqdmlal2_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqdmlal2_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], 
label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1575,8 +1575,8 @@ define <2 x i64> @sqdmlal2_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @sqdmlal2_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1645,8 +1645,8 @@ define <4 x i32> @sqdmlsl4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqdmlsl4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1709,8 +1709,8 @@ define <2 x i64> @sqdmlsl2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @sqdmlsl2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1773,8 +1773,8 @@ define <4 x i32> @sqdmlsl2_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqdmlsl2_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; 
CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1843,8 +1843,8 @@ define <2 x i64> @sqdmlsl2_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @sqdmlsl2_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1913,8 +1913,8 @@ define <4 x i32> @umlal4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @umlal4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -1970,8 +1970,8 @@ define <2 x i64> @umlal2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @umlal2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2077,8 +2077,8 @@ define <4 x i32> @umlsl4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @umlsl4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: 
[[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2134,8 +2134,8 @@ define <2 x i64> @umlsl2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @umlsl2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2241,8 +2241,8 @@ define <2 x float> @fmla_2s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x float> @fmla_2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2294,8 +2294,8 @@ define <4 x float> @fmla_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x float> @fmla_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2347,8 +2347,8 @@ define <2 x double> @fmla_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x double> @fmla_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; 
CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2404,8 +2404,8 @@ define <2 x float> @fmls_2s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x float> @fmls_2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2460,8 +2460,8 @@ define <4 x float> @fmls_4s(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x float> @fmls_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2516,8 +2516,8 @@ define <2 x double> @fmls_2d(ptr %A, ptr %B, ptr %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x double> @fmls_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof 
[[PROF1]] @@ -2572,8 +2572,8 @@ define <2 x float> @fmls_commuted_neg_2s(ptr %A, ptr %B, ptr %C) nounwind saniti ; CHECK-LABEL: define <2 x float> @fmls_commuted_neg_2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2628,8 +2628,8 @@ define <4 x float> @fmls_commuted_neg_4s(ptr %A, ptr %B, ptr %C) nounwind saniti ; CHECK-LABEL: define <4 x float> @fmls_commuted_neg_4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2684,8 +2684,8 @@ define <2 x double> @fmls_commuted_neg_2d(ptr %A, ptr %B, ptr %C) nounwind sanit ; CHECK-LABEL: define <2 x double> @fmls_commuted_neg_2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -2969,7 +2969,7 @@ declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @mul_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> @@ -2987,7 +2987,7 @@ define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory { define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @mul_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> @@ -3005,7 +3005,7 @@ define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory { define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @mul_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3023,7 +3023,7 @@ define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory { define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @mul_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> @@ -3042,7 +3042,7 @@ define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @mul_2d( ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i64> [[A]], [[B]] @@ -3056,7 +3056,7 @@ define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind sanitize_memory { define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x float> @fmul_lane_2s( ; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: 
[[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3074,7 +3074,7 @@ define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind saniti define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x float> @fmul_lane_4s( ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> @@ -3092,7 +3092,7 @@ define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind saniti define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x double> @fmul_lane_2d( ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <2 x i32> @@ -3110,7 +3110,7 @@ define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind san define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind sanitize_memory { ; CHECK-LABEL: define float @fmul_lane_s( ; CHECK-SAME: float [[A:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 @@ -3128,7 +3128,7 @@ define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind sanitize_memory { define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind sanitize_memory { ; CHECK-LABEL: define double @fmul_lane_d( ; CHECK-SAME: double [[A:%.*]], <2 x double> [[VEC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 @@ -3148,7 +3148,7 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind sanitize_memor define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind sanitize_memory { ; 
CHECK-LABEL: define <2 x float> @fmulx_lane_2s( ; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3167,7 +3167,7 @@ define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind sanit define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x float> @fmulx_lane_4s( ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> @@ -3186,7 +3186,7 @@ define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind sanit define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x double> @fmulx_lane_2d( ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <2 x i32> @@ -3205,7 +3205,7 @@ define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind sa define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @sqdmulh_lane_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> @@ -3223,7 +3223,7 @@ define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_ define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @sqdmulh_lane_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr 
@__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> @@ -3241,7 +3241,7 @@ define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_ define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @sqdmulh_lane_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3259,7 +3259,7 @@ define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_ define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqdmulh_lane_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> @@ -3277,7 +3277,7 @@ define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_ define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define i32 @sqdmulh_lane_1s( ; CHECK-SAME: i32 [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 @@ -3295,7 +3295,7 @@ define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind sanitize_memory { define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @sqrdmulh_lane_4h( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> @@ -3313,7 +3313,7 @@ define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind sanitize define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @sqrdmulh_lane_8h( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> @@ -3331,7 +3331,7 @@ define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind sanitize define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @sqrdmulh_lane_2s( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3349,7 +3349,7 @@ define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind sanitize define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqrdmulh_lane_4s( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <4 x i32> @@ -3367,7 +3367,7 @@ define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind sanitize define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define i32 @sqrdmulh_lane_1s( ; CHECK-SAME: i32 [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 @@ -3385,7 +3385,7 @@ define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind sanitize_memory { define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqdmull_lane_4s( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> @@ -3412,7 +3412,7 @@ define <4 x i32> 
@sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_ define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @sqdmull_lane_2d( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3440,7 +3440,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind sanitize ; CHECK-LABEL: define <4 x i32> @sqdmull2_lane_4s( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> splat (i16 -1), <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32> @@ -3470,7 +3470,7 @@ define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind sanitize ; CHECK-LABEL: define <2 x i64> @sqdmull2_lane_2d( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> splat (i32 -1), <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32> @@ -3499,7 +3499,7 @@ define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind sanitize define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @umull_lane_4s( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> @@ -3519,7 +3519,7 @@ define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_me define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @umull_lane_2d( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x 
i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3539,7 +3539,7 @@ define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_me define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @smull_lane_4s( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> @@ -3559,7 +3559,7 @@ define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind sanitize_me define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @smull_lane_2d( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32> @@ -3579,9 +3579,9 @@ define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind sanitize_me define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @smlal_lane_4s( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> @@ -3603,9 +3603,9 @@ define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @smlal_lane_2d( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32>
@@ -3627,9 +3627,9 @@ define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwi
 define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sqdmlal_lane_4s(
 ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32>
@@ -3658,9 +3658,9 @@ define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) noun
 define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @sqdmlal_lane_2d(
 ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32>
@@ -3690,8 +3690,8 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nou
 ; CHECK-LABEL: define <4 x i32> @sqdmlal2_lane_4s(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> splat (i16 -1), <4 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32>
@@ -3724,8 +3724,8 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nou
 ; CHECK-LABEL: define <2 x i64> @sqdmlal2_lane_2d(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32>
@@ -3757,8 +3757,8 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nou
 define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqdmlal_lane_1s(
 ; CHECK-SAME: i32 [[A:%.*]], i16 [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP1]], i32 0
@@ -3794,8 +3794,8 @@ declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
 define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqdmlsl_lane_1s(
 ; CHECK-SAME: i32 [[A:%.*]], i16 [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP1]], i32 0
@@ -3831,8 +3831,8 @@ declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
 define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqadd_lane1_sqdmull4s(
 ; CHECK-SAME: i32 [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
@@ -3861,8 +3861,8 @@ define i32 @sqadd_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind s
 define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqsub_lane1_sqdmull4s(
 ; CHECK-SAME: i32 [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to i64
@@ -3891,8 +3891,8 @@ define i32 @sqsub_lane1_sqdmull4s(i32 %A, <4 x i16> %B, <4 x i16> %C) nounwind s
 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @sqdmlal_lane_1d(
 ; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
@@ -3922,8 +3922,8 @@ declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @sqdmlsl_lane_1d(
 ; CHECK-SAME: i64 [[A:%.*]], i32 [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
@@ -3953,9 +3953,9 @@ declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
 define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @umlal_lane_4s(
 ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32>
@@ -3977,9 +3977,9 @@ define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi
 define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @umlal_lane_2d(
 ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32>
@@ -4002,9 +4002,9 @@ define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwi
 define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @smlsl_lane_4s(
 ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32>
@@ -4026,9 +4026,9 @@ define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi
 define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @smlsl_lane_2d(
 ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32>
@@ -4050,9 +4050,9 @@ define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwi
 define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sqdmlsl_lane_4s(
 ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32>
@@ -4081,9 +4081,9 @@ define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) noun
 define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @sqdmlsl_lane_2d(
 ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32>
@@ -4113,8 +4113,8 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nou
 ; CHECK-LABEL: define <4 x i32> @sqdmlsl2_lane_4s(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> splat (i16 -1), <4 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> undef, <4 x i32>
@@ -4147,8 +4147,8 @@ define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nou
 ; CHECK-LABEL: define <2 x i64> @sqdmlsl2_lane_2d(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> undef, <2 x i32>
@@ -4180,9 +4180,9 @@ define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nou
 define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @umlsl_lane_4s(
 ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> splat (i16 -1), <4 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32>
@@ -4204,9 +4204,9 @@ define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwi
 define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @umlsl_lane_2d(
 ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i64> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> splat (i32 -1), <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32>
@@ -4230,7 +4230,7 @@ define float @fmulxs(float %a, float %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define float @fmulxs(
 ; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0
@@ -4246,7 +4246,7 @@ define double @fmulxd(double %a, double %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define double @fmulxd(
 ; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0
@@ -4261,7 +4261,7 @@ define double @fmulxd(double %a, double %b) nounwind sanitize_memory {
 define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind sanitize_memory {
 ; CHECK-LABEL: define float @fmulxs_lane(
 ; CHECK-SAME: float [[A:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
@@ -4280,7 +4280,7 @@ define float @fmulxs_lane(float %a, <4 x float> %vec) nounwind sanitize_memory {
 define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind sanitize_memory {
 ; CHECK-LABEL: define double @fmulxd_lane(
 ; CHECK-SAME: double [[A:%.*]], <2 x double> [[VEC:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
@@ -4304,7 +4304,7 @@ define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind sanitize
 ; CHECK-LABEL: define <8 x i16> @smull2_8h_simple(
 ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> splat (i8 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32>
@@ -4327,7 +4327,7 @@ define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @foo0(
 ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
 ; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
@@ -4362,7 +4362,7 @@ define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @foo1(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP8]] to <2 x i64>
 ; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
@@ -4397,7 +4397,7 @@ define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @foo2(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
 ; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
@@ -4432,7 +4432,7 @@ define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @foo3(
 ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
 ; CHECK-NEXT: [[TMP:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
@@ -4467,7 +4467,7 @@ define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @foo4(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP8]] to <2 x i64>
 ; CHECK-NEXT: [[TMP:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
@@ -4502,7 +4502,7 @@ define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @foo5(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
 ; CHECK-NEXT: [[TMP:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
@@ -4713,8 +4713,8 @@ entry:
 define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @bar0(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
@@ -4752,8 +4752,8 @@ define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind saniti
 define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @bar1(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP9]] to <2 x i64>
@@ -4791,8 +4791,8 @@ define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind saniti
 define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @bar2(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP9]] to <2 x i64>
@@ -4830,8 +4830,8 @@ define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind saniti
 define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @bar3(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
@@ -4869,8 +4869,8 @@ define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind saniti
 define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @bar4(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP9]] to <2 x i64>
@@ -4908,8 +4908,8 @@ define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind saniti
 define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @bar5(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP9]] to <2 x i64>
@@ -4947,8 +4947,8 @@ define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind saniti
 define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @mlal2_1(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> splat (i16 -1), <8 x i32>
@@ -4989,8 +4989,8 @@ define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind san
 define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @mlal2_2(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> splat (i32 -1), <4 x i32>
@@ -5031,8 +5031,8 @@ define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind san
 define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @mlal2_4(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> splat (i16 -1), <8 x i32>
@@ -5073,8 +5073,8 @@ define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind san
 define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @mlal2_5(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <4 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> splat (i32 -1), <4 x i32> zeroinitializer
@@ -5960,7 +5960,7 @@ define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind
 ; CHECK-LABEL: define <1 x double> @test_fmul_v1f64(
 ; CHECK-SAME: <1 x double> [[L:%.*]], <1 x double> [[R:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[PROD:%.*]] = fmul <1 x double> [[L]], [[R]]
@@ -5975,7 +5975,7 @@ define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind
 ; CHECK-LABEL: define <1 x double> @test_fdiv_v1f64(
 ; CHECK-SAME: <1 x double> [[L:%.*]], <1 x double> [[R:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[PROD:%.*]] = fdiv <1 x double> [[L]], [[R]]
@@ -5990,8 +5990,8 @@ define i32 @sqdmlal_s(i16 %A, i16 %B, i32 %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqdmlal_s(
 ; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP6]], i64 0
 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[A]], i64 0
@@ -6026,8 +6026,8 @@ define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @sqdmlal_d(
 ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
@@ -6052,8 +6052,8 @@ define i32 @sqdmlsl_s(i16 %A, i16 %B, i32 %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i32 @sqdmlsl_s(
 ; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i16> splat (i16 -1), i16 [[TMP6]], i64 0
 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[A]], i64 0
@@ -6088,8 +6088,8 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @sqdmlsl_d(
 ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
@@ -6114,7 +6114,7 @@ define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @test_pmull_64(
 ; CHECK-SAME: i64 [[L:%.*]], i64 [[R:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0
@@ -6132,7 +6132,7 @@ define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind saniti
 ; CHECK-LABEL: define <16 x i8> @test_pmull_high_64(
 ; CHECK-SAME: <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
 ; CHECK-NEXT: [[L_HI:%.*]] = extractelement <2 x i64> [[L]], i32 1
@@ -6158,7 +6158,7 @@ define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind saniti
 ; CHECK-LABEL: define <1 x i64> @test_mul_v1i64(
 ; CHECK-SAME: <1 x i64> [[LHS:%.*]], <1 x i64> [[RHS:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[PROD:%.*]] = mul <1 x i64> [[LHS]], [[RHS]]
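Every hunk in these regenerated MemorySanitizer tests makes the same mechanical substitution: the address of an argument's shadow slot, previously spelled as an inttoptr/add/ptrtoint constant expression over @__msan_param_tls, is now spelled as a byte-offset getelementptr. A minimal LLVM IR sketch of the two spellings, assuming the usual MSan declaration of the parameter-shadow TLS block; the function name and the 8-byte offset are illustrative only, not taken from these tests:

; Parameter-shadow TLS block as MemorySanitizer declares it (100 x 8 = 800 bytes).
@__msan_param_tls = external thread_local global [100 x i64]

define i64 @load_shadow_of_second_arg() {
  ; Old spelling, as in the removed CHECK lines: round-trip the TLS base
  ; through i64 arithmetic and back to a pointer:
  ;   ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr)
  ; New spelling: the same address as a plain offset in bytes (i8 elements):
  %shadow = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
  ret i64 %shadow
}

Both forms denote the same address; the getelementptr form expresses the constant byte offset directly instead of going through integer arithmetic, which is why only the address spelling changes in every hunk while the surrounding shadow-propagation logic stays identical.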
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll
index 7fa9b412b0f03..42d2351a88cc2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll
@@ -11,7 +11,7 @@ define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @sqshl8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]]
@@ -55,7 +55,7 @@ define <4 x i16> @sqshl4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @sqshl4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -99,7 +99,7 @@ define <2 x i32> @sqshl2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @sqshl2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -143,7 +143,7 @@ define <1 x i64> @sqshl1d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <1 x i64> @sqshl1d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -214,7 +214,7 @@ define i64 @sqshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @sqshl_scalar(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -283,7 +283,7 @@ define <8 x i8> @uqshl8b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @uqshl8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -327,7 +327,7 @@ define <4 x i16> @uqshl4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @uqshl4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -371,7 +371,7 @@ define <2 x i32> @uqshl2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @uqshl2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -415,7 +415,7 @@ define <16 x i8> @sqshl16b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @sqshl16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -460,7 +460,7 @@ define <8 x i16> @sqshl8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @sqshl8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -505,7 +505,7 @@ define <4 x i32> @sqshl4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sqshl4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -550,7 +550,7 @@ define <2 x i64> @sqshl2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @sqshl2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -595,7 +595,7 @@ define <16 x i8> @uqshl16b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @uqshl16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -640,7 +640,7 @@ define <8 x i16> @uqshl8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @uqshl8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -685,7 +685,7 @@ define <4 x i32> @uqshl4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @uqshl4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -730,7 +730,7 @@ define <2 x i64> @uqshl2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @uqshl2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -775,7 +775,7 @@ define <1 x i64> @uqshl1d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <1 x i64> @uqshl1d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -846,7 +846,7 @@ define i64 @uqshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @uqshl_scalar(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -938,7 +938,7 @@ define <8 x i8> @srshl8b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @srshl8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -982,7 +982,7 @@ define <4 x i16> @srshl4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @srshl4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1026,7 +1026,7 @@ define <2 x i32> @srshl2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @srshl2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1070,7 +1070,7 @@ define <1 x i64> @srshl1d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <1 x i64> @srshl1d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1141,7 +1141,7 @@ define i64 @srshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @srshl_scalar(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1210,7 +1210,7 @@ define <8 x i8> @urshl8b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @urshl8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1254,7 +1254,7 @@ define <4 x i16> @urshl4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @urshl4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1298,7 +1298,7 @@ define <2 x i32> @urshl2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @urshl2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1342,7 +1342,7 @@ define <1 x i64> @urshl1d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <1 x i64> @urshl1d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1413,7 +1413,7 @@ define i64 @urshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define i64 @urshl_scalar(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1482,7 +1482,7 @@ define <16 x i8> @srshl16b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @srshl16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1527,7 +1527,7 @@ define <8 x i16> @srshl8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @srshl8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1572,7 +1572,7 @@ define <4 x i32> @srshl4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @srshl4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1617,7 +1617,7 @@ define <2 x i64> @srshl2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @srshl2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1662,7 +1662,7 @@ define <16 x i8> @urshl16b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @urshl16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1707,7 +1707,7 @@ define <8 x i16> @urshl8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @urshl8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1752,7 +1752,7 @@ define <4 x i32> @urshl4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @urshl4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align
8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1797,7 +1797,7 @@ define <2 x i64> @urshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @urshl2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1864,7 +1864,7 @@ define <8 x i8> @sqrshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i8> @sqrshl8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1908,7 +1908,7 @@ define <4 x i16> @sqrshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @sqrshl4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1952,7 +1952,7 @@ define <2 x i32> @sqrshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @sqrshl2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1996,7 +1996,7 @@ define <8 x i8> @uqrshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i8> @uqrshl8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2040,7 +2040,7 @@ define <4 x i16> @uqrshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @uqrshl4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2084,7 +2084,7 @@ define <2 x i32> @uqrshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @uqrshl2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2128,7 +2128,7 @@ define <16 x i8> @sqrshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @sqrshl16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2173,7 +2173,7 @@ define <8 x i16> @sqrshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @sqrshl8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2218,7 +2218,7 @@ define <4 x i32> @sqrshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqrshl4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ 
-2263,7 +2263,7 @@ define <2 x i64> @sqrshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @sqrshl2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2308,7 +2308,7 @@ define <1 x i64> @sqrshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <1 x i64> @sqrshl1d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2379,7 +2379,7 @@ define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define i64 @sqrshl_scalar( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2448,7 +2448,7 @@ define <16 x i8> @uqrshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @uqrshl16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2493,7 +2493,7 @@ define <8 x i16> @uqrshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @uqrshl8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2538,7 +2538,7 @@ define <4 x i32> @uqrshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @uqrshl4s( ; CHECK-SAME: ptr 
[[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2583,7 +2583,7 @@ define <2 x i64> @uqrshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @uqrshl2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2628,7 +2628,7 @@ define <1 x i64> @uqrshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <1 x i64> @uqrshl1d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -2699,7 +2699,7 @@ define i64 @uqrshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define i64 @uqrshl_scalar( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3639,7 +3639,7 @@ define <16 x i8> @rshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @rshrn16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3682,7 +3682,7 @@ define <8 x i16> @rshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @rshrn8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load 
i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3725,7 +3725,7 @@ define <4 x i32> @rshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @rshrn4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3862,7 +3862,7 @@ define <16 x i8> @shrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @shrn16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3908,7 +3908,7 @@ define <8 x i16> @shrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @shrn8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -3954,7 +3954,7 @@ define <4 x i32> @shrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @shrn4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4101,7 +4101,7 @@ define <16 x i8> @sqshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @sqshrn16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, 
ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4144,7 +4144,7 @@ define <8 x i16> @sqshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @sqshrn8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4187,7 +4187,7 @@ define <4 x i32> @sqshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqshrn4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4331,7 +4331,7 @@ define <16 x i8> @sqshrun16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @sqshrun16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4374,7 +4374,7 @@ define <8 x i16> @sqshrun8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @sqshrun8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4417,7 +4417,7 @@ define <4 x i32> @sqshrun4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqshrun4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4561,7 +4561,7 @@ define <16 x i8> @sqrshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @sqrshrn16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4604,7 +4604,7 @@ define <8 x i16> @sqrshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @sqrshrn8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4647,7 +4647,7 @@ define <4 x i32> @sqrshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqrshrn4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4791,7 +4791,7 @@ define <16 x i8> @sqrshrun16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @sqrshrun16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -4834,7 +4834,7 @@ define <8 x i16> @sqrshrun8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @sqrshrun8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label 
[[TMP4:%.*]], !prof [[PROF1]] @@ -4877,7 +4877,7 @@ define <4 x i32> @sqrshrun4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @sqrshrun4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -5021,7 +5021,7 @@ define <16 x i8> @uqrshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @uqrshrn16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -5064,7 +5064,7 @@ define <8 x i16> @uqrshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @uqrshrn8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -5107,7 +5107,7 @@ define <4 x i32> @uqrshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @uqrshrn4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -5251,7 +5251,7 @@ define <16 x i8> @uqshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @uqshrn16b( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -5294,7 +5294,7 @@ define <8 x i16> @uqshrn8h(ptr %ret, ptr %A) nounwind 
sanitize_memory { ; CHECK-LABEL: define <8 x i16> @uqshrn8h( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -5337,7 +5337,7 @@ define <4 x i32> @uqshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @uqshrn4s( ; CHECK-SAME: ptr [[RET:%.*]], ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -6845,7 +6845,7 @@ define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i8> @ursra8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -6888,7 +6888,7 @@ define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @ursra4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -6931,7 +6931,7 @@ define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @ursra2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -6974,7 +6974,7 @@ define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @ursra16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load 
i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7017,7 +7017,7 @@ define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @ursra8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7060,7 +7060,7 @@ define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @ursra4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7103,7 +7103,7 @@ define <2 x i64> @ursra2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @ursra2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7146,7 +7146,7 @@ define <1 x i64> @ursra1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <1 x i64> @ursra1d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7189,7 +7189,7 @@ define i64 @ursra_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define i64 @ursra_scalar( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), 
align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7232,7 +7232,7 @@ define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i8> @srsra8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7275,7 +7275,7 @@ define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @srsra4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7318,7 +7318,7 @@ define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @srsra2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7361,7 +7361,7 @@ define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @srsra16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7404,7 +7404,7 @@ define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @srsra8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7447,7 +7447,7 @@ define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @srsra4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7490,7 +7490,7 @@ define <2 x i64> @srsra2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @srsra2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7533,7 +7533,7 @@ define <1 x i64> @srsra1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <1 x i64> @srsra1d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7576,7 +7576,7 @@ define i64 @srsra_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define i64 @srsra_scalar( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7619,7 +7619,7 @@ define <8 x i8> @usra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i8> @usra8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7662,7 +7662,7 @@ 
define <4 x i16> @usra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @usra4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7705,7 +7705,7 @@ define <2 x i32> @usra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @usra2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7748,7 +7748,7 @@ define <16 x i8> @usra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @usra16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7791,7 +7791,7 @@ define <8 x i16> @usra8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i16> @usra8h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7834,7 +7834,7 @@ define <4 x i32> @usra4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @usra4s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7877,7 +7877,7 @@ define <2 x i64> @usra2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @usra2d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7920,7 +7920,7 @@ define <1 x i64> @usra1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <1 x i64> @usra1d( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -7963,7 +7963,7 @@ define <8 x i8> @ssra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <8 x i8> @ssra8b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -8006,7 +8006,7 @@ define <4 x i16> @ssra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i16> @ssra4h( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -8049,7 +8049,7 @@ define <2 x i32> @ssra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i32> @ssra2s( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -8092,7 +8092,7 @@ define <16 x i8> @ssra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-LABEL: define <16 x i8> @ssra16b( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8135,7 +8135,7 @@ define <8 x i16> @ssra8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @ssra8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8178,7 +8178,7 @@ define <4 x i32> @ssra4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @ssra4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8221,7 +8221,7 @@ define <2 x i64> @ssra2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @ssra2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8264,7 +8264,7 @@ define <8 x i8> @shr_orr8b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @shr_orr8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8313,7 +8313,7 @@ define <4 x i16> @shr_orr4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @shr_orr4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8362,7 +8362,7 @@ define <2 x i32> @shr_orr2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @shr_orr2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8411,7 +8411,7 @@ define <16 x i8> @shr_orr16b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shr_orr16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8460,7 +8460,7 @@ define <8 x i16> @shr_orr8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @shr_orr8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8509,7 +8509,7 @@ define <4 x i32> @shr_orr4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @shr_orr4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8558,7 +8558,7 @@ define <2 x i64> @shr_orr2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @shr_orr2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8607,7 +8607,7 @@ define <8 x i8> @shl_orr8b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @shl_orr8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8656,7 +8656,7 @@ define <4 x i16> @shl_orr4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @shl_orr4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8705,7 +8705,7 @@ define <2 x i32> @shl_orr2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @shl_orr2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8754,7 +8754,7 @@ define <16 x i8> @shl_orr16b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @shl_orr16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8803,7 +8803,7 @@ define <8 x i16> @shl_orr8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @shl_orr8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8852,7 +8852,7 @@ define <4 x i32> @shl_orr4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @shl_orr4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8901,7 +8901,7 @@ define <2 x i64> @shl_orr2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @shl_orr2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -8989,7 +8989,7 @@ define <8 x i8> @sli8b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i8> @sli8b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9037,7 +9037,7 @@ define <4 x i16> @sli4h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i16> @sli4h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9085,7 +9085,7 @@ define <2 x i32> @sli2s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i32> @sli2s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9133,7 +9133,7 @@ define <1 x i64> @sli1d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <1 x i64> @sli1d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9181,7 +9181,7 @@ define <16 x i8> @sli16b(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <16 x i8> @sli16b(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9229,7 +9229,7 @@ define <8 x i16> @sli8h(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <8 x i16> @sli8h(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9277,7 +9277,7 @@ define <4 x i32> @sli4s(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @sli4s(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9325,7 +9325,7 @@ define <2 x i64> @sli2d(ptr %A, ptr %B) nounwind sanitize_memory {
 ; CHECK-LABEL: define <2 x i64> @sli2d(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -9383,7 +9383,7 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) sanitize_memory {
 ; CHECK-LABEL: define <1 x i64> @ashr_v1i64(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
@@ -9402,8 +9402,8 @@ define void @sqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[DST:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
@@ -9437,8 +9437,8 @@ define void @uqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[DST:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
@@ -9472,8 +9472,8 @@ define void @srshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[DST:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
@@ -9507,8 +9507,8 @@ define void @urshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[DST:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
@@ -9542,8 +9542,8 @@ define void @sqshlu_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sani
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[DST:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
@@ -9577,8 +9577,8 @@ define void @sshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[DST:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
@@ -9612,8 +9612,8 @@ define void @ushl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[DST:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
index 8fed5a78d6b79..ef200402fa15b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/neon_vst_float.ll
@@ -37,16 +37,16 @@ target triple = "aarch64--linux-android9001"
 define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v1f64(
 ; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT: unreachable
@@ -61,16 +61,16 @@ define void @st1x2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memo
 define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v1i64(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -85,16 +85,16 @@ define void @st1x2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
 define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v2f64(
 ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -109,16 +109,16 @@ define void @st1x2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memo
 define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x2_v2i64(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -133,17 +133,17 @@ define void @st1x2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
 define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v1f64(
 ; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -158,17 +158,17 @@ define void @st1x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr
 define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v1i64(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -183,17 +183,17 @@ define void @st1x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanit
 define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v2f64(
 ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -208,17 +208,17 @@ define void @st1x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr
 define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x3_v2i64(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -233,18 +233,18 @@ define void @st1x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanit
 define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v1f64(
 ; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
 ; CHECK: 9:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -259,18 +259,18 @@ define void @st1x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x
 define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v1i64(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
 ; CHECK: 9:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -285,18 +285,18 @@ define void @st1x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D,
 define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v2f64(
 ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
 ; CHECK: 9:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -311,18 +311,18 @@ define void @st1x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x
 define void @st1x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st1x4_v2i64(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
 ; CHECK: 9:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -337,16 +337,16 @@ define void @st1x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D,
 define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v16i8(
 ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -361,16 +361,16 @@ define void @st2_v16i8(<16 x i8> %A, <16 x i8> %B, ptr %p) sanitize_memory {
 define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v1f64(
 ; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -385,16 +385,16 @@ define void @st2_v1f64(<1 x double> %A, <1 x double> %B, ptr %p) sanitize_memory
 define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v1i64(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -409,16 +409,16 @@ define void @st2_v1i64(<1 x i64> %A, <1 x i64> %B, ptr %p) sanitize_memory {
 define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2f32(
 ; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -433,16 +433,16 @@ define void @st2_v2f32(<2 x float> %A, <2 x float> %B, ptr %p) sanitize_memory {
 define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2f64(
 ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -457,16 +457,16 @@ define void @st2_v2f64(<2 x double> %A, <2 x double> %B, ptr %p) sanitize_memory
 define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2i32(
 ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -481,16 +481,16 @@ define void @st2_v2i32(<2 x i32> %A, <2 x i32> %B, ptr %p) sanitize_memory {
 define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v2i64(
 ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -505,16 +505,16 @@ define void @st2_v2i64(<2 x i64> %A, <2 x i64> %B, ptr %p) sanitize_memory {
 define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4f16(
 ; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -529,16 +529,16 @@ define void @st2_v4f16(<4 x half> %A, <4 x half> %B, ptr %p) sanitize_memory {
 define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4f32(
 ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -553,16 +553,16 @@ define void @st2_v4f32(<4 x float> %A, <4 x float> %B, ptr %p) sanitize_memory {
 define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4i16(
 ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -577,16 +577,16 @@ define void @st2_v4i16(<4 x i16> %A, <4 x i16> %B, ptr %p) sanitize_memory {
 define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v4i32(
 ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -601,16 +601,16 @@ define void @st2_v4i32(<4 x i32> %A, <4 x i32> %B, ptr %p) sanitize_memory {
 define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v8f16(
 ; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -625,16 +625,16 @@ define void @st2_v8f16(<8 x half> %A, <8 x half> %B, ptr %p) sanitize_memory {
 define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v8i16(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -649,16 +649,16 @@ define void @st2_v8i16(<8 x i16> %A, <8 x i16> %B, ptr %p) sanitize_memory {
 define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st2_v8i8(
 ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576
 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[TMP6]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -673,17 +673,17 @@ define void @st2_v8i8(<8 x i8> %A, <8 x i8> %B, ptr %p) sanitize_memory {
 define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v16i8(
 ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -698,17 +698,17 @@ define void @st3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, ptr %p) sanitiz
 define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v1f64(
 ; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -723,17 +723,17 @@ define void @st3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, ptr %p
 define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v1i64(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], ptr [[TMP7]])
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
@@ -748,17 +748,17 @@ define void @st3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, ptr %p) sanitiz
 define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p) sanitize_memory {
 ; CHECK-LABEL: define void @st3_v2f32(
 ; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]],
<2 x float> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -773,17 +773,17 @@ define void @st3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, ptr %p) s define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v2f64( ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -798,17 +798,17 @@ define void @st3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, ptr %p define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr 
%p) sanitize_memory { ; CHECK-LABEL: define void @st3_v2i32( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -823,17 +823,17 @@ define void @st3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, ptr %p) sanitiz define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v2i64( ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -848,17 +848,17 @@ define void @st3_v2i64(<2 x i64> %A, <2 x i64> %B, 
<2 x i64> %C, ptr %p) sanitiz define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v4f16( ; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -873,17 +873,17 @@ define void @st3_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr %p) sani define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v4f32( ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -898,17 +898,17 @@ define void @st3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, ptr %p) s define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v4i16( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -923,17 +923,17 @@ define void @st3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr %p) sanitiz define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v4i32( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 
[[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -948,17 +948,17 @@ define void @st3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr %p) sanitiz define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v8f16( ; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -973,17 +973,17 @@ define void @st3_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr %p) sani define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v8i16( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -998,17 +998,17 @@ define void @st3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr %p) sanitiz define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st3_v8i8( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] ; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1023,18 +1023,18 @@ define void @st3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, ptr %p) sanitize_me define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v16i8( ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1049,18 +1049,18 @@ define void @st4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, p define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v1f64( ; CHECK-SAME: <1 x double> [[A:%.*]], <1 x double> [[B:%.*]], <1 x double> [[C:%.*]], <1 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1075,18 +1075,18 @@ define void @st4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x d define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v1i64( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]], <1 x i64> [[C:%.*]], <1 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = 
load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1101,18 +1101,18 @@ define void @st4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, p define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v2f32( ; CHECK-SAME: <2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]], <2 x float> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: 
unreachable @@ -1127,18 +1127,18 @@ define void @st4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x floa define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v2f64( ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], <2 x double> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1153,18 +1153,18 @@ define void @st4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x d define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v2i32( ; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]], <2 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr getelementptr 
(i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1179,18 +1179,18 @@ define void @st4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, p define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v2i64( ; CHECK-SAME: <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i64> [[C:%.*]], <2 x i64> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1205,18 +1205,18 @@ define void @st4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, p define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v4f16( ; CHECK-SAME: <4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], <4 x half> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr 
@__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1231,18 +1231,18 @@ define void @st4_v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> % define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v4f32( ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], <4 x float> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call 
void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1257,18 +1257,18 @@ define void @st4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x floa define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v4i16( ; CHECK-SAME: <4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], <4 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1283,18 +1283,18 @@ define void @st4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, p define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v4i32( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], <4 x i32> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = 
load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1309,18 +1309,18 @@ define void @st4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, p define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v8f16( ; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]], <8 x half> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1335,18 +1335,18 @@ define void @st4_v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> % define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v8i16( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], <8 x i16> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] ; CHECK-NEXT: unreachable @@ -1361,18 +1361,18 @@ define void @st4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, p define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p) sanitize_memory { ; CHECK-LABEL: define void @st4_v8i8( ; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], ptr [[P:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 193514046488576 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF0]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] ; CHECK: 9: ; CHECK-NEXT: call void 
 ; CHECK-NEXT:    unreachable
@@ -1384,5 +1384,5 @@ define void @st4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, ptr %p
   ret void
 }
 ;.
-; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 1048575}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
 ;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll
index f3cceb7c075b2..b8e54a700149c 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll
@@ -61,17 +61,17 @@ define i32 @bar() {
 ; array. General purpose registers are saved at positions from 0 to 64, Floating
 ; point and SIMD are saved from 64 to 192, and the remaining from 192.
 ; CHECK-LABEL: @bar
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 8
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 16
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 64
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 80
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 24
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 32
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 96
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 40
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 48
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 56
-; CHECK: store {{.*}} @__msan_va_arg_tls {{.*}} 192
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 8
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 16
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 64
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 80
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 24
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 32
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 96
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 40
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 48
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 56
+; CHECK: store {{.*}} @__msan_va_arg_tls, i64 192
 ; CHECK: store {{.*}} 8, {{.*}} @__msan_va_arg_overflow_size_tls
 
 ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are
@@ -97,6 +97,6 @@ entry:
 }
 
-; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed.
-; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792)
-; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800)
+; If the size of __msan_va_arg_tls changes, the offset in the `getelementptr` must also be changed.
+; CHECK: getelementptr (i8, ptr @__msan_va_arg_tls, i64 792)
+; CHECK-NOT: getelementptr (i8, ptr @__msan_va_arg_tls, i64 800)
 declare i64 @sum(i64 %n, ...)
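All of the CHECK rewrites in these test diffs encode one substitution: a shadow-TLS slot that used to be addressed through an integer round trip is now addressed with a byte-wise getelementptr constant. A minimal standalone IR sketch of the two forms (the function name and the simplified TLS declarations are illustrative, not part of the patch; MSan's real declarations come from its runtime support):

; Simplified stand-ins for MSan's TLS arrays (800 bytes each, matching the
; 792/800 overflow checks above).
@__msan_param_tls = external thread_local(initialexec) global [100 x i64]
@__msan_va_arg_tls = external thread_local(initialexec) global [100 x i64]

define i64 @load_param_shadow_slot1() {
; The pattern matched by the removed CHECK lines computed the byte offset in
; the integer domain:
;   inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr)
; The new pattern computes the same address without leaving the pointer domain:
  %s = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
  ret i64 %s
}

Both constants denote @__msan_param_tls plus 8 bytes; the getelementptr form simply avoids the ptrtoint/add/inttoptr detour, which is easier for constant folding and pointer analyses to see through.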
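The vararg.ll comment above also pins down the AArch64 __msan_va_arg_tls layout that the @bar checks exercise: GP-register shadows occupy bytes 0-64 (the stores at offsets 8-56), FP/SIMD-register shadows occupy bytes 64-192 (offsets 64, 80, 96, spaced 16 bytes apart, which suggests one q-register slot each), and stack-passed shadows start at byte 192. A hedged sketch of one store into each region, reusing the simplified TLS declaration from the sketch above (function and value names are illustrative):

define void @va_tls_layout_sketch(i64 %gp_shadow, i128 %vec_shadow, i64 %stack_shadow) {
  ; GP save area: bytes 0..63 (second GP slot, as in @bar's first store).
  store i64 %gp_shadow, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
  ; FP/SIMD save area: bytes 64..191, one 16-byte slot per register.
  store i128 %vec_shadow, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8
  ; Overflow (stack-passed) area: bytes 192 and up.
  store i64 %stack_shadow, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), align 8
  ret void
}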
diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll index 06a34ac469e8c..d246e969f2522 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/vararg_shadow.ll @@ -39,9 +39,9 @@ define linkonce_odr dso_local void @_Z4testIcEvT_(i8 noundef %arg) sanitize_memo ; CHECK-NEXT: [[_MSPROP:%.*]] = zext i8 [[_MSLD]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP7]] to i32 ; CHECK-NEXT: store i8 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i32 [[_MSPROP]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i32 [[_MSPROP]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i32 [[_MSPROP]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i32 [[_MSPROP]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (i8, i32, ...) @_Z5test2IcEvT_iz(i8 noundef [[TMP7]], i32 noundef 1, i32 noundef [[CONV]]) ; CHECK-NEXT: ret void @@ -80,9 +80,9 @@ define linkonce_odr dso_local void @_Z4testIiEvT_(i32 noundef %arg) sanitize_mem ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP10]], align 4 ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i32 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i32 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i32 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i32 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (i32, i32, ...) 
@_Z5test2IiEvT_iz(i32 noundef [[TMP7]], i32 noundef 1, i32 noundef [[TMP7]]) ; CHECK-NEXT: ret void @@ -122,9 +122,9 @@ define linkonce_odr dso_local void @_Z4testIfEvT_(float noundef %arg) sanitize_m ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP7]] to double ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[TMP11]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[TMP11]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[TMP11]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[TMP11]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (float, i32, ...) @_Z5test2IfEvT_iz(float noundef [[TMP7]], i32 noundef 1, double noundef [[CONV]]) ; CHECK-NEXT: ret void @@ -163,9 +163,9 @@ define linkonce_odr dso_local void @_Z4testIdEvT_(double noundef %arg) sanitize_ ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP10]], align 8 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (double, i32, ...) @_Z5test2IdEvT_iz(double noundef [[TMP7]], i32 noundef 1, double noundef [[TMP7]]) ; CHECK-NEXT: ret void @@ -203,9 +203,9 @@ define linkonce_odr dso_local void @_Z4testIeEvT_(fp128 noundef %arg) sanitize_m ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i128, ptr [[TMP10]], align 16 ; CHECK-NEXT: store i128 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i128 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i128 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i128 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i128 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (fp128, i32, ...) 
@_Z5test2IeEvT_iz(fp128 noundef [[TMP7]], i32 noundef 1, fp128 noundef [[TMP7]]) ; CHECK-NEXT: ret void @@ -243,9 +243,9 @@ define linkonce_odr dso_local void @_Z4testI6IntIntEvT_(i64 %arg.coerce) sanitiz ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (i64, i32, ...) @_Z5test2I6IntIntEvT_iz(i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i32 noundef 1, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]]) ; CHECK-NEXT: ret void @@ -302,9 +302,9 @@ define linkonce_odr dso_local void @_Z4testI10Int64Int64EvT_([2 x i64] %arg.coer ; CHECK-NEXT: [[TMP19:%.*]] = insertvalue [2 x i64] [[TMP18]], i64 [[_MSLD1]], 1 ; CHECK-NEXT: [[DOTFCA_1_INSERT3:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT2]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], 1 ; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void ([2 x i64], i32, ...) 
@_Z5test2I10Int64Int64EvT_iz([2 x i64] [[DOTFCA_1_INSERT3]], i32 noundef 1, [2 x i64] [[DOTFCA_1_INSERT3]]) ; CHECK-NEXT: ret void @@ -368,9 +368,9 @@ define linkonce_odr dso_local void @_Z4testI12DoubleDoubleEvT_([2 x double] alig ; CHECK-NEXT: [[TMP19:%.*]] = insertvalue [2 x i64] [[TMP18]], i64 [[_MSLD1]], 1 ; CHECK-NEXT: [[DOTFCA_1_INSERT3:%.*]] = insertvalue [2 x double] [[DOTFCA_0_INSERT2]], double [[AGG_TMP_SROA_2_0_COPYLOAD]], 1 ; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void ([2 x double], i32, ...) @_Z5test2I12DoubleDoubleEvT_iz([2 x double] alignstack(8) [[DOTFCA_1_INSERT3]], i32 noundef 1, [2 x double] alignstack(8) [[DOTFCA_1_INSERT3]]) ; CHECK-NEXT: ret void @@ -464,9 +464,9 @@ define linkonce_odr dso_local void @_Z4testI7Double4EvT_([4 x double] alignstack ; CHECK-NEXT: [[TMP35:%.*]] = insertvalue [4 x i64] [[TMP34]], i64 [[_MSLD3]], 3 ; CHECK-NEXT: [[DOTFCA_3_INSERT7:%.*]] = insertvalue [4 x double] [[DOTFCA_2_INSERT6]], double [[AGG_TMP_SROA_4_0_COPYLOAD]], 3 ; CHECK-NEXT: store [4 x i64] [[TMP35]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: store [4 x i64] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: store [4 x i64] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: store [4 x i64] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: store [4 x i64] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void ([4 x double], i32, ...) 
@_Z5test2I7Double4EvT_iz([4 x double] alignstack(8) [[DOTFCA_3_INSERT7]], i32 noundef 1, [4 x double] alignstack(8) [[DOTFCA_3_INSERT7]]) ; CHECK-NEXT: ret void @@ -540,9 +540,9 @@ define linkonce_odr dso_local void @_Z4testI11DoubleFloatEvT_([2 x i64] %arg.coe ; CHECK-NEXT: [[TMP19:%.*]] = insertvalue [2 x i64] [[TMP18]], i64 [[_MSLD1]], 1 ; CHECK-NEXT: [[DOTFCA_1_INSERT3:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT2]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], 1 ; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store [2 x i64] [[TMP19]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void ([2 x i64], i32, ...) @_Z5test2I11DoubleFloatEvT_iz([2 x i64] [[DOTFCA_1_INSERT3]], i32 noundef 1, [2 x i64] [[DOTFCA_1_INSERT3]]) ; CHECK-NEXT: ret void @@ -606,9 +606,9 @@ define linkonce_odr dso_local void @_Z4testI11LongDouble2EvT_([2 x fp128] aligns ; CHECK-NEXT: [[TMP19:%.*]] = insertvalue [2 x i128] [[TMP18]], i128 [[_MSLD1]], 1 ; CHECK-NEXT: [[DOTFCA_1_INSERT5:%.*]] = insertvalue [2 x fp128] [[DOTFCA_0_INSERT4]], fp128 [[AGG_TMP_SROA_2_0_COPYLOAD]], 1 ; CHECK-NEXT: store [2 x i128] [[TMP19]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: store [2 x i128] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: store [2 x i128] [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: store [2 x i128] [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: store [2 x i128] [[TMP19]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void ([2 x fp128], i32, ...) 
@_Z5test2I11LongDouble2EvT_iz([2 x fp128] alignstack(16) [[DOTFCA_1_INSERT5]], i32 noundef 1, [2 x fp128] alignstack(16) [[DOTFCA_1_INSERT5]]) ; CHECK-NEXT: ret void @@ -702,9 +702,9 @@ define linkonce_odr dso_local void @_Z4testI11LongDouble4EvT_([4 x fp128] aligns ; CHECK-NEXT: [[TMP35:%.*]] = insertvalue [4 x i128] [[TMP34]], i128 [[_MSLD3]], 3 ; CHECK-NEXT: [[DOTFCA_3_INSERT7:%.*]] = insertvalue [4 x fp128] [[DOTFCA_2_INSERT6]], fp128 [[AGG_TMP_SROA_4_0_COPYLOAD]], 3 ; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void ([4 x fp128], i32, ...) @_Z5test2I11LongDouble4EvT_iz([4 x fp128] alignstack(16) [[DOTFCA_3_INSERT7]], i32 noundef 1, [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT7]]) ; CHECK-NEXT: ret void @@ -759,29 +759,19 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef %t, i32 noundef ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; 
CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -852,29 +842,19 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -937,29 +917,19 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: 
[[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1022,29 +992,19 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1107,29 +1067,19 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(fp128 noundef %t, i32 nound ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: 
[[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1192,29 +1142,19 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr 
i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1277,29 +1217,19 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz([2 x i64] %t.coe ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1362,29 +1292,19 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz([2 x double] a ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 
[[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1447,29 +1367,19 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz([4 x double] alignst ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], 
[[TMP34]] @@ -1532,29 +1442,19 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz([2 x i64] %t.co ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1617,29 +1517,19 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz([2 x fp128] ali ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to 
i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1702,29 +1592,19 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz([4 x fp128] ali ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 32, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 0 ; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 8 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP17]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[TMP19]], 24 -; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[ARGS]], i64 24 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP18]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP26]], 16 -; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP30]], 28 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[ARGS]], i64 28 ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 ; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64 ; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[TMP29]], [[TMP34]] @@ -1838,29 +1718,29 @@ define linkonce_odr dso_local void @_Z4test2I11LongDouble4EvT_([4 x fp128] align ; CHECK-NEXT: [[TMP35:%.*]] = insertvalue [4 x i128] [[TMP34]], i128 [[_MSLD3]], 3 ; CHECK-NEXT: [[DOTFCA_3_INSERT121:%.*]] = insertvalue [4 x fp128] [[DOTFCA_2_INSERT120]], fp128 [[AGG_TMP_SROA_4_0_COPYLOAD]], 3 ; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 448) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 512) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 576) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 768) to ptr), i8 0, i32 32, i1 false) +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 328), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 392), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 456), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 520), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 584), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 648), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 712), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 256), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 320), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 384), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 448), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 512), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 576), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 640), align 8 +; CHECK-NEXT: store [4 x i128] [[TMP35]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 704), align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 768), i8 0, i32 32, i1 false) ; CHECK-NEXT: store i64 1216, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void ([4 x fp128], i32, ...) 
@_Z5test2I11LongDouble4EvT_iz([4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], i32 noundef 20, [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]], [4 x fp128] alignstack(16) [[DOTFCA_3_INSERT121]]) ; CHECK-NEXT: ret void diff --git a/llvm/test/Instrumentation/MemorySanitizer/ARM32/vararg-arm32.ll b/llvm/test/Instrumentation/MemorySanitizer/ARM32/vararg-arm32.ll index e05018c2d5372..cbdae2526eb38 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/ARM32/vararg-arm32.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/ARM32/vararg-arm32.ll @@ -59,12 +59,12 @@ define i32 @bar() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 4), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) 
@foo(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -87,11 +87,11 @@ define i32 @bar2() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -112,205 +112,205 @@ define dso_local i64 @many_args() { ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 112) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 360) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 
8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 328), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 344), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 648), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 656), align 8 +; CHECK-NEXT: store 
i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 752), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 760), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 768), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 776), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 784), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 792), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 88) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 112) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align 
8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 208) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 360) to 
ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), 
i64 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 40), 
align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 328), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 344), align 8 +; CHECK-NEXT: 
store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 648), align 8 +; CHECK-NEXT: store i64 0, ptr 
getelementptr (i8, ptr @__msan_va_arg_tls, i64 656), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 752), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 760), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 768), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 776), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 784), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 792), align 8 ; CHECK-NEXT: store i64 960, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[RET:%.*]] = call i64 (i64, ...) 
@sum(i64 120, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1) diff --git a/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll b/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll index 4d4fc1bdd7bde..a0dcefd498c25 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/LoongArch/vararg-loongarch64.ll @@ -56,12 +56,12 @@ define i32 @bar() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) 
@foo(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -80,11 +80,11 @@ define i32 @bar2() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -104,8 +104,8 @@ declare i64 @sum(i64 %n, ...) define dso_local i64 @many_args() { -;; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed. +;; If the size of __msan_va_arg_tls changes, the offset of the `getelementptr` must also be changed. ; CHECK-LABEL: @many_args -; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) -; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800) +; CHECK: getelementptr (i8, ptr @__msan_va_arg_tls, i64 792) +; CHECK-NOT: getelementptr (i8, ptr @__msan_va_arg_tls, i64 800) ; entry: %ret = call i64 (i64, ...)
@sum(i64 120, diff --git a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll index 9f3f10e51b272..1187531e9a25c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll @@ -54,12 +54,12 @@ define i32 @bar() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 4), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -82,11 +82,11 @@ define i32 @bar2() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -124,8 +124,8 @@ entry: -; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed. +; If the size of __msan_va_arg_tls changes, the offset of the `getelementptr` must also be changed.
; CHECK-LABEL: @many_args -; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) -; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800) +; CHECK: getelementptr (i8, ptr @__msan_va_arg_tls, i64 792) +; CHECK-NOT: getelementptr (i8, ptr @__msan_va_arg_tls, i64 800) declare i64 @sum(i64 %n, ...) ; CHECK: declare void @__msan_maybe_warning_1(i8 signext, i32 signext) diff --git a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll index 41fb975dcf285..a78285a191c8c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll @@ -54,12 +54,12 @@ define i32 @bar() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) 
@foo(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -81,11 +81,11 @@ define i32 @bar2() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -122,6 +122,6 @@ entry: -; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed. +; If the size of __msan_va_arg_tls changes, the offset of the `getelementptr` must also be changed. ; CHECK-LABEL: @many_args -; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) -; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800) +; CHECK: getelementptr (i8, ptr @__msan_va_arg_tls, i64 792) +; CHECK-NOT: getelementptr (i8, ptr @__msan_va_arg_tls, i64 800) declare i64 @sum(i64 %n, ...)
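An aside on the rewritten CHECK patterns in the vararg tests above: the retired inttoptr/ptrtoint constant expressions and the new getelementptr (i8, ptr ..., i64 N) expressions denote the same byte addresses; only the spelling of the offset changes. A minimal standalone LLVM IR sketch follows; @shadow and @poison_last_slot are illustrative names, not from the patch, and @shadow stands in for an MSan TLS array assumed here to be 100 x i64 = 800 bytes, consistent with the 792/800 checks above.

; Illustrative only, not part of the patch: @shadow stands in for the
; MSan TLS arrays such as @__msan_va_arg_tls.
@shadow = external thread_local global [100 x i64]

define void @poison_last_slot(i64 %s) {
  ; The retired spelling computed the address through the integer domain:
  ;   inttoptr (i64 add (i64 ptrtoint (ptr @shadow to i64), i64 792) to ptr)
  ; The new spelling expresses the same 792-byte offset directly on the
  ; pointer, as a byte-wise (i8) getelementptr:
  store i64 %s, ptr getelementptr (i8, ptr @shadow, i64 792), align 8
  ret void
}

Under that size assumption, the last in-bounds 8-byte slot begins at offset 800 - 8 = 792, and an offset of 800 would step one slot past the end of the array, which is why each many_args test accepts offset 792 and rejects 800.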
diff --git a/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mips.ll b/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mips.ll index 4d47b02bb2713..9257622f86a2c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mips.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mips.ll @@ -59,12 +59,12 @@ define i32 @bar() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 4), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -87,11 +87,11 @@ define i32 @bar2() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, i32, ...) 
@foo2(i32 0, i32 1, i64 2, double 3.000000e+00)
@@ -112,205 +112,205 @@ define dso_local i64 @many_args() {
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 112) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 152) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 168) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 184) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 216) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 224) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 232) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 272) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 280) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 288) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 304) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 312) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 336) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 344) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 352) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 360) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 368) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 376) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 384) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 400) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 408) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 416) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 424) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 432) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 440) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 448) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 464) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 472) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 480) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 488) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 496) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 504) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 512) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 528) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 536) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 544) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 552) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 560) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 568) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 576) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 592) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 600) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 608) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 616) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 624) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 632) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 640) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 656) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 664) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 672) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 680) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 688) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 696) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 704) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 720) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 728) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 736) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 744) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 752) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 760) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 768) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 776) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 784) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 792) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 88), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 112), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 152), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 160), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 168), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 184), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 216), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 224), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 232), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 272), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 280), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 288), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 304), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 312), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 320), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 328), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 336), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 344), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 352), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 360), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 368), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 376), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 384), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 392), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 400), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 408), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 416), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 424), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 432), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 440), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 448), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 456), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 464), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 472), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 480), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 488), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 496), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 504), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 512), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 520), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 528), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 536), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 544), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 552), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 560), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 568), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 576), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 584), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 592), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 600), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 608), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 616), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 624), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 632), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 640), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 648), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 656), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 664), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 672), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 680), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 688), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 696), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 704), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 712), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 720), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 728), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 736), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 744), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 752), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 760), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 768), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 776), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 784), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 792), align 8
 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 88) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 112) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 120) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 152) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 160) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 168) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 184) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 208) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 216) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 224) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 232) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 240) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 248) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 256) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 264) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 272) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 280) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 288) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 296) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 304) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 312) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 320) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 328) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 336) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 344) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 352) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 360) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 368) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 376) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 384) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 392) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 400) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 408) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 416) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 424) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 432) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 440) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 448) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 456) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 464) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 472) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 480) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 488) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 496) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 504) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 512) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 520) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 528) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 536) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 544) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 552) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 560) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 568) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 576) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 584) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 592) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 600) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 608) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 616) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 624) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 632) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 640) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 648) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 656) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 664) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 672) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 680) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 688) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 696) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 704) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 712) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 720) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 728) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 736) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 744) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 752) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 760) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 768) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 776) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 784) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 40), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 56), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 72), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 88), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 104), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 112), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 120), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 136), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 144), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 152), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 160), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 168), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 184), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 200), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 208), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 216), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 224), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 232), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 240), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 248), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 256), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 264), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 272), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 280), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 288), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 296), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 304), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 312), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 320), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 328), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 336), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 344), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 352), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 360), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 368), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 376), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 384), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 392), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 400), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 408), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 416), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 424), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 432), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 440), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 448), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 456), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 464), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 472), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 480), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 488), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 496), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 504), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 512), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 520), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 528), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 536), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 544), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 552), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 560), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 568), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 576), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 584), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 592), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 600), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 608), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 616), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 624), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 632), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 640), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 648), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 656), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 664), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 672), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 680), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 688), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 696), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 704), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 712), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 720), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 728), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 736), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 744), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 752), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 760), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 768), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 776), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 784), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 792), align 8
 ; CHECK-NEXT: store i64 960, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[RET:%.*]] = call i64 (i64, ...) @sum(i64 120, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mipsel.ll b/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mipsel.ll
index 98294e7c0383c..690dc2a22bd1b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mipsel.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/Mips32/vararg-mipsel.ll
@@ -59,12 +59,12 @@ define i32 @bar() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
@@ -86,11 +86,11 @@ define i32 @bar2() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00)
@@ -111,205 +111,205 @@ define dso_local i64 @many_args() {
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 112) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 152) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 168) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 184) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 216) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 224) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 232) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 272) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 280) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 288) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 304) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 312) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 336) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 344) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 352) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 360) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 368) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 376) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 384) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 400) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 408) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 416) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 424) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 432) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 440) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 448) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 464) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 472) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 480) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 488) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 496) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 504) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 512) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 528) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 536) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 544) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 552) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 560) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 568) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 576) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 592) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 600) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 608) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 616) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 624) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 632) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 640) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 656) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 664) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 672) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 680) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 688) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 696) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 704) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 720) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 728) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 736) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 744) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 752) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 760) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 768) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 776) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 784) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 792) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 88), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 112), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 152), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 160), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 168), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 184), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 216), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 224), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 232), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 272), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 280), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 288), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 304), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 312), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 320), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 328), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 336), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 344), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 352), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 360), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 368), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 376), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 384), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 392), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 400), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 408), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 416), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 424), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 432), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 440), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 448), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 456), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 464), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 472), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 480), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 488), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 496), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 504), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 512), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 520), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 528), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 536), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 544), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 552), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 560), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 568), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 576), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 584), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 592), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 600), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 608), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 616), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 624), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 632), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 640), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 648), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 656), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 664), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 672), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 680), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 688), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 696), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 704), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 712), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 720), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 728), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 736), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 744), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 752), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 760), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 768), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 776), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 784), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 792), align 8
 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 88) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 112) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 120) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align
8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 208) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 360) to 
ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), 
i64 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 40), 
align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 328), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 344), align 8 +; CHECK-NEXT: 
store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 648), align 8 +; CHECK-NEXT: store i64 0, ptr 
getelementptr (i8, ptr @__msan_va_arg_tls, i64 656), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 752), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 760), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 768), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 776), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 784), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 792), align 8 ; CHECK-NEXT: store i64 960, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[RET:%.*]] = call i64 (i64, ...) 
@sum(i64 120, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1) diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll index 19b07e16fb46f..6dc896f2fc84f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll @@ -60,12 +60,12 @@ define i32 @bar() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 4), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) 
@foo(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -87,8 +87,8 @@ define i32 @bar2() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, <2 x i64> ) @@ -110,7 +110,7 @@ define i32 @bar4() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr @__msan_va_arg_tls, align 8 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 @@ -130,8 +130,8 @@ define i32 @bar5() { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) 
@foo(i32 0, [2 x i128] [i128 1, i128 2]) @@ -156,7 +156,7 @@ define i32 @bar6(ptr %arg) { ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), i8 0, i64 16, i1 false) ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416 @@ -187,13 +187,13 @@ define i32 @bar7(ptr %arg) { ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 32, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8796093022208 ; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), ptr align 8 [[TMP11]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), ptr align 8 [[TMP11]], i64 32, i1 false) ; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP12:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([4 x i64]) align 16 [[ARG]]) @@ -231,6 +231,6 @@ entry: ; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed. ; CHECK-LABEL: @many_args -; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) -; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800) +; CHECK: getelementptr (i8, ptr @__msan_va_arg_tls, i64 792) +; CHECK-NOT: getelementptr (i8, ptr @__msan_va_arg_tls, i64 800) declare i64 @sum(i64 %n, ...) 
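The mechanical change running through all of these expected-output updates is a single constant-expression rewrite: a fixed byte offset into one of the __msan_*_tls arrays is now spelled as a getelementptr over an i8 base instead of an integer round-trip through ptrtoint/inttoptr. A minimal standalone sketch of the two forms, using a hypothetical global @g as a stand-in (not taken from this patch):

  ; Both globals below hold the address 8 bytes past @g.
  @g = external global [100 x i64]

  ; Old form: the offset is computed on integers, losing pointer provenance.
  @p.old = global ptr inttoptr (i64 add (i64 ptrtoint (ptr @g to i64), i64 8) to ptr)

  ; New form: the same address as a byte-wise GEP, which keeps provenance
  ; and is the canonical way to spell a constant byte offset.
  @p.new = global ptr getelementptr (i8, ptr @g, i64 8)

Both expressions denote the same location, so the tests change only in how the offset is written, not in what they verify.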
diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll
index 1fe63850860e8..e3db97cf8ba87 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll
@@ -60,12 +60,12 @@ define i32 @bar() {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
@@ -86,8 +86,8 @@ define i32 @bar2() {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, <2 x i64> )
@@ -109,7 +109,7 @@ define i32 @bar4() {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr @__msan_va_arg_tls, align 8
; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
@@ -129,8 +129,8 @@ define i32 @bar5() {
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i128] [i128 1, i128 2])
@@ -155,7 +155,7 @@ define i32 @bar6(ptr %arg) {
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), i8 0, i64 16, i1 false)
; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416
@@ -186,13 +186,13 @@ define i32 @bar7(ptr %arg) {
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 17592186044416
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 8796093022208
; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i8 0, i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), i8 0, i64 32, i1 false)
; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], -246290604621825
; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 17592186044416
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 8796093022208
; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), ptr align 8 [[TMP11]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), ptr align 8 [[TMP11]], i64 32, i1 false)
; CHECK-NEXT: store i64 40, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
; CHECK-NEXT: [[TMP12:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([4 x i64]) align 16 [[ARG]])
@@ -204,7 +204,6 @@ define i32 @bar7(ptr %arg) {
ret i32 %1
}
-
; UTC_ARGS: --disable

; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are
@@ -230,6 +229,6 @@ entry:

; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed.
; CHECK-LABEL: @many_args
-; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792)
-; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800)
+; CHECK: ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 792)
+; CHECK-NOT: ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 800)
declare i64 @sum(i64 %n, ...)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/kernel-ppcle.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/kernel-ppcle.ll
index 1c74431e96c01..8ba033061defe 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/kernel-ppcle.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/kernel-ppcle.ll
@@ -16,39 +16,33 @@ define void @Store1(ptr %p, i8 %x) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i32 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 8
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP11]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i32 [[TMP10]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 8
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: [[BB12]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: br label %[[BB13]]
-; CHECK: [[BB13]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: br label %[[BB7]]
+; CHECK: [[BB7]]:
; CHECK-NEXT: [[TMP15:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[P]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 1
; CHECK-NEXT: store i8 [[TMP9]], ptr [[TMP16]], align 1
; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB17:.*]], label %[[BB19:.*]], !prof [[PROF1]]
-; CHECK: [[BB17]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB11:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK: [[BB11]]:
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP12]])
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP17]], align 4
-; CHECK-NEXT: br label %[[BB19]]
-; CHECK: [[BB19]]:
+; CHECK-NEXT: br label %[[BB13]]
+; CHECK: [[BB13]]:
; CHECK-NEXT: store i8 [[X]], ptr [[P]], align 1
; CHECK-NEXT: ret void
;
@@ -70,39 +64,33 @@ define void @Store2(ptr %p, i16 %x) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i32 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 8
; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP11]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i32 [[TMP10]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 8
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
-; CHECK: [[BB12]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB13]]
-; CHECK: [[BB13]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB7]]
+; CHECK: [[BB7]]:
; CHECK-NEXT: [[TMP15:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_2(ptr [[P]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 1
; CHECK-NEXT: store i16 [[TMP9]], ptr [[TMP16]], align 2
; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB17:.*]], label %[[BB19:.*]], !prof [[PROF1]]
-; CHECK: [[BB17]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB11:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK: [[BB11]]:
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP12]])
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP17]], align 4
-; CHECK-NEXT: br label %[[BB19]]
-; CHECK: [[BB19]]:
+; CHECK-NEXT: br label %[[BB13]]
+; CHECK: [[BB13]]:
; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 2
; CHECK-NEXT: ret void
;
@@ -124,39 +112,33 @@ define void @Store4(ptr %p, i32 %x) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i32 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 8
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP11]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i32 [[TMP10]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 8
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
-; CHECK: [[BB12]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB13]]
-; CHECK: [[BB13]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB7]]
+; CHECK: [[BB7]]:
; CHECK-NEXT: [[TMP15:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_4(ptr [[P]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 1
; CHECK-NEXT: store i32 [[TMP9]], ptr [[TMP16]], align 4
; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i32 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB17:.*]], label %[[BB19:.*]], !prof [[PROF1]]
-; CHECK: [[BB17]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB11:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK: [[BB11]]:
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP12]])
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP17]], align 4
-; CHECK-NEXT: br label %[[BB19]]
-; CHECK: [[BB19]]:
+; CHECK-NEXT: br label %[[BB13]]
+; CHECK: [[BB13]]:
; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 4
; CHECK-NEXT: ret void
;
@@ -178,41 +160,35 @@ define void @Store8(ptr %p, i64 %x) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i32 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 8
; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP11]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i32 [[TMP10]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 8
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
-; CHECK: [[BB12]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB13]]
-; CHECK: [[BB13]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB7]]
+; CHECK: [[BB7]]:
; CHECK-NEXT: [[TMP15:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_8(ptr [[P]])
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 1
; CHECK-NEXT: store i64 [[TMP9]], ptr [[TMP16]], align 8
; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB17:.*]], label %[[BB20:.*]], !prof [[PROF1]]
-; CHECK: [[BB17]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB11:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK: [[BB11]]:
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP12]])
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP17]], align 8
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP17]], i32 1
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP20]], align 4
-; CHECK-NEXT: br label %[[BB20]]
-; CHECK: [[BB20]]:
+; CHECK-NEXT: br label %[[BB14]]
+; CHECK: [[BB14]]:
; CHECK-NEXT: store i64 [[X]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -234,35 +210,29 @@ define void @Store16(ptr %p, i128 %x) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i32 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 8
; CHECK-NEXT: [[TMP9:%.*]] = load i128, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP11]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i32 [[TMP10]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 8
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
-; CHECK: [[BB12]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB13]]
-; CHECK: [[BB13]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK: [[BB6]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB7]]
+; CHECK: [[BB7]]:
; CHECK-NEXT: [[TMP15:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_n(ptr [[P]], i32 16)
; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 0
; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { ptr, ptr } [[TMP15]], 1
; CHECK-NEXT: store i128 [[TMP9]], ptr [[TMP16]], align 8
; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB17:.*]], label %[[BB22:.*]], !prof [[PROF1]]
-; CHECK: [[BB17]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB11:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB11]]:
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP12]])
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP17]], align 8
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i32 1
@@ -271,8 +241,8 @@ define void @Store16(ptr %p, i128 %x) sanitize_memory {
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP20]], align 4
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[TMP17]], i32 3
; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP21]], align 4
-; CHECK-NEXT: br label %[[BB22]]
-; CHECK: [[BB22]]:
+; CHECK-NEXT: br label %[[BB16]]
+; CHECK: [[BB16]]:
; CHECK-NEXT: store i128 [[X]], ptr [[P]], align 8
; CHECK-NEXT: ret void
;
@@ -294,20 +264,18 @@ define i8 @Load1(ptr %p) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB7]]
-; CHECK: [[BB7]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB5]]
+; CHECK: [[BB5]]:
; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[P]], align 1
; CHECK-NEXT: [[TMP10:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_1(ptr [[P]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 0
@@ -336,20 +304,18 @@ define i16 @Load2(ptr %p) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB7]]
-; CHECK: [[BB7]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB5]]
+; CHECK: [[BB5]]:
; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[P]], align 2
; CHECK-NEXT: [[TMP10:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_2(ptr [[P]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 0
@@ -378,20 +344,18 @@ define i32 @Load4(ptr %p) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB7]]
-; CHECK: [[BB7]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB5]]
+; CHECK: [[BB5]]:
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[P]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_4(ptr [[P]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 0
@@ -420,20 +384,18 @@ define i64 @Load8(ptr %p) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB7]]
-; CHECK: [[BB7]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB5]]
+; CHECK: [[BB5]]:
; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[P]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_8(ptr [[P]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 0
@@ -462,20 +424,18 @@ define i128 @Load16(ptr %p) sanitize_memory {
; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i32
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i32 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i32
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[_MSARG_O]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK: [[BB6]]:
-; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR2]]
-; CHECK-NEXT: br label %[[BB7]]
-; CHECK: [[BB7]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK: [[BB4]]:
+; CHECK-NEXT: call void @__msan_warning(i32 [[TMP3]]) #[[ATTR2]]
+; CHECK-NEXT: br label %[[BB5]]
+; CHECK: [[BB5]]:
; CHECK-NEXT: [[TMP9:%.*]] = load i128, ptr [[P]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = call { ptr, ptr } 
@__msan_metadata_ptr_for_load_n(ptr [[P]], i32 16) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppc.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppc.ll index 29d1fbd053ecb..26aaa1e985e7f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppc.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppc.ll @@ -76,12 +76,12 @@ define i32 @bar() { ; CHECK-LABEL: define i32 @bar() { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 24) to ptr), align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 4) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 24), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 4), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8 ; CHECK-NEXT: store i32 16, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00) @@ -102,9 +102,9 @@ define i32 @bar2() { ; CHECK-LABEL: define i32 @bar2() { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8 -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8 -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8 +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8 ; CHECK-NEXT: store i32 24, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) 
@foo(i32 0, <2 x i64> <i64 1, i64 2>) @@ -125,9 +125,9 @@ define i32 @bar4() { ; CHECK-LABEL: define i32 @bar4() { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8 -; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8 +; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8 ; CHECK-NEXT: store i32 24, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i64] [i64 1, i64 2]) @@ -145,9 +145,9 @@ define i32 @bar5() { ; CHECK-LABEL: define i32 @bar5() { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8 -; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8 -; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8 +; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8 ; CHECK-NEXT: store i32 40, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) 
@foo(i32 0, [2 x i128] [i128 1, i128 2]) @@ -166,15 +166,15 @@ define i32 @bar6(ptr %arg) { ; CHECK-SAME: ptr [[ARG:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 2147483647 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), i8 0, i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i32 8), i8 0, i64 16, i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 2147483647 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), ptr align 8 [[TMP7]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), ptr align 8 [[TMP7]], i64 16, i1 false) ; CHECK-NEXT: store i32 24, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([2 x i64]) align 8 [[ARG]]) @@ -193,15 +193,15 @@ define i32 @bar7(ptr %arg) { ; CHECK-SAME: ptr [[ARG:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 2147483647 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), i8 0, i64 32, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i32 8), i8 0, i64 32, i1 false) ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 2147483647 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), ptr align 8 [[TMP7]], i64 32, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), ptr align 8 [[TMP7]], i64 32, i1 false) ; CHECK-NEXT: store i32 40, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 (i32, ...) 
@foo(i32 0, ptr byval([4 x i64]) align 16 [[ARG]]) @@ -222,205 +222,205 @@ define dso_local i64 @many_args() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 24) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 40) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 48) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 56) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 64) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 72) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 80) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 88) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 96) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 104) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 112) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 120) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 128) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 200) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 208) to ptr), align 8 -; CHECK-NEXT: store i64 
0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 360) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr 
(i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 
ptrtoint (ptr @__msan_param_tls to i32), i32 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 792) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 24) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 40) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 48) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 56) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 64) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 72) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 80) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 88) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 96) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 104) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 112) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr 
@__msan_va_arg_tls to i32), i32 120) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 128) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 200) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 208) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 
ptrtoint (ptr @__msan_va_arg_tls to i32), i32 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 360) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr 
(i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 16), align 8 +; CHECK-NEXT: store i64 0, ptr 
getelementptr (i8, ptr @__msan_param_tls, i32 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 40), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 328), align 8 +; 
CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 344), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr 
@__msan_param_tls, i32 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 648), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 656), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 752), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 760), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 768), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 776), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 784), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 792), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 40), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 152), align 8 +; CHECK-NEXT: 
store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 160), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 168), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 176), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 184), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 192), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 200), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 208), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 216), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 224), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 232), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 240), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 248), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 256), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 264), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 272), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 280), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 288), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 296), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 304), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 312), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 320), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 328), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 336), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 344), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 352), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 360), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 368), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 376), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 384), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 392), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 400), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 408), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 416), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 424), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 432), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 440), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 448), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 456), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 464), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 472), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 480), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 488), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 496), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 504), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 512), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 520), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 528), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 536), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 544), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 552), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 560), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 568), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 576), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 584), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 592), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 600), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 608), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 616), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 624), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 632), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 640), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 648), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 656), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 664), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 672), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 680), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 688), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 696), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 704), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 712), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 720), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 728), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 736), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 744), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 752), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 760), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 768), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 776), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 784), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 792), align 8
 ; CHECK-NEXT: store i32 968, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[RET:%.*]] = call i64 (i64, ...) @sum(i64 120, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppcle.ll b/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppcle.ll
index a4d2e165dd3a8..24f9dc3bd18c9 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppcle.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/PowerPC32/vararg-ppcle.ll
@@ -76,12 +76,12 @@ define i32 @bar() {
 ; CHECK-LABEL: define i32 @bar() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 24) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 4) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 24), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 4), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8
 ; CHECK-NEXT: store i32 16, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
@@ -101,9 +101,9 @@ define i32 @bar2() {
 ; CHECK-LABEL: define i32 @bar2() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8
 ; CHECK-NEXT: store i32 24, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, <2 x i64> <i64 1, i64 2>)
@@ -124,9 +124,9 @@ define i32 @bar4() {
 ; CHECK-LABEL: define i32 @bar4() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8
-; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8
-; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8
+; CHECK-NEXT: store [2 x i64] zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8
 ; CHECK-NEXT: store i32 24, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i64] [i64 1, i64 2])
@@ -144,9 +144,9 @@ define i32 @bar5() {
 ; CHECK-LABEL: define i32 @bar5() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8
-; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8
-; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8
+; CHECK-NEXT: store [2 x i128] zeroinitializer, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8
 ; CHECK-NEXT: store i32 40, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, [2 x i128] [i128 1, i128 2])
@@ -165,15 +165,15 @@ define i32 @bar6(ptr %arg) {
 ; CHECK-SAME: ptr [[ARG:%.*]]) {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i32
 ; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 2147483647
 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), i8 0, i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i32 8), i8 0, i64 16, i1 false)
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG]] to i32
 ; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 2147483647
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), ptr align 8 [[TMP7]], i64 16, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), ptr align 8 [[TMP7]], i64 16, i1 false)
 ; CHECK-NEXT: store i32 24, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([2 x i64]) align 8 [[ARG]])
@@ -192,15 +192,15 @@ define i32 @bar7(ptr %arg) {
 ; CHECK-SAME: ptr [[ARG:%.*]]) {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: store i32 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i32
 ; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 2147483647
 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i32 [[TMP3]] to ptr
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), i8 0, i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i32 8), i8 0, i64 32, i1 false)
 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ARG]] to i32
 ; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 2147483647
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i32 [[TMP6]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), ptr align 8 [[TMP7]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), ptr align 8 [[TMP7]], i64 32, i1 false)
 ; CHECK-NEXT: store i32 40, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP13:%.*]] = call i32 (i32, ...) @foo(i32 0, ptr byval([4 x i64]) align 16 [[ARG]])
@@ -220,205 +220,205 @@ define dso_local i64 @many_args() {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 ptrtoint (ptr @__msan_param_tls to i32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 24) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 40) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 48) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 56) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 64) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 72) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 80) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 88) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 96) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 104) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 112) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 120) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 128) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 136) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 144) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 152) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 160) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 168) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 176) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 184) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 192) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 200) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 208) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 216) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 224) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 232) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 240) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 248) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 256) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 264) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 272) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 280) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 288) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 296) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 304) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 312) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 320) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 328) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 336) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 344) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 352) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 360) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 368) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 376) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 384) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 392) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 400) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 408) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 416) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 424) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 432) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 440) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 448) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 456) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 464) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 472) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 480) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 488) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 496) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 504) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 512) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 520) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 528) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 536) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 544) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 552) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 560) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 568) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 576) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 584) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 592) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 600) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 608) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 616) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 624) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 632) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 640) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 648) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 656) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 664) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 672) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 680) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 688) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 696) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 704) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 712) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 720) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 728) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 736) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 744) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 752) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 760) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 768) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 776) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 784) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_param_tls to i32), i32 792) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 24) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 40) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 48) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 56) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 64) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 72) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 80) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 88) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 96) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 104) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 112) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 120) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 128) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 136) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 144) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 152) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 160) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 168) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 176) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 184) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 192) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 200) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 208) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 216) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 224) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 232) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 240) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 248) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 256) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 264) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 272) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 280) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 288) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 296) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 304) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 312) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 320) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 328) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 336) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 344) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 352) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 360) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 368) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 376) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 384) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 392) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 400) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 408) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 416) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 424) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 432) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 440) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 448) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 456) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 464) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 472) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 480) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 488) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 496) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 504) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 512) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 520) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 528) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 536) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 544) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 552) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 560) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 568) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 576) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 584) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 592) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 600) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 608) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 616) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 624) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 632) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 640) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 648) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 656) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 664) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 672) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 680) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 688) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 696) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 704) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 712) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 720) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 728) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 736) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 744) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 752) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 760) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 768) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 776) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 784) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i32 add (i32 ptrtoint (ptr @__msan_va_arg_tls to i32), i32 792) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 24), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 32), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 40), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 48), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 56), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 64), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 72), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 80), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 88), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 96), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 104), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 112), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 120), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 128), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 136), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 144), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 152), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 160), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 168), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 176), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 184), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 192), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 200), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 208), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 216), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 224), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 232), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 240), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 248), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 256), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 264), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 272), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 280), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 288), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 296), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 304), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 312), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 320), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 328), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 336), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 344), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 352), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 360), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 368), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 376), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 384), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 392), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 400), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 408), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 416), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 424), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 432), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 440), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 448), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 456), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 464), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 472), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 480), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 488), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 496), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 504), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 512), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 520), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 528), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 536), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 544), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 552), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 560), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 568), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 576), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 584), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 592), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 600), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 608), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 616), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 624), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 632), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 640), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 648), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 656), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 664), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 672), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 680), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 688), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 696), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 704), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 712), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 720), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 728), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 736), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 744), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 752), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 760), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 768), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 776), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 784), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i32 792), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 24), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 32), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 40), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 48), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 56), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 64), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 72), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 80), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 88), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 96), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 104), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 112), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 120), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 128), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 136), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 144), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 152), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 160), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 168), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 176), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 184), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 192), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 200), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 208), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 216), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 224), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 232), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 240), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 248), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 256), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 264), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 272), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 280), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 288), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 296), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 304), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 312), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 320), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 328), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 336), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 344), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 352), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 360), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 368), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 376), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 384), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 392), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 400), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 408), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 416), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 424), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 432), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 440), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 448), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 456), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 464), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 472), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 480), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 488), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 496), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 504), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 512), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 520), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 528), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 536), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 544), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 552), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 560), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 568), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 576), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 584), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 592), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 600), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 608), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 616), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 624), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 632), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 640), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 648), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 656), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 664), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 672), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 680), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 688), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 696), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 704), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 712), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 720), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 728), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 736), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 744), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 752), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 760), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 768), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 776), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 784), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i32 792), align 8
 ; CHECK-NEXT: store i32 968, ptr @__msan_va_arg_overflow_size_tls, align 4
 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[RET:%.*]] = call i64 (i64, ...) @sum(i64 120, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/RISCV32/vararg-riscv32.ll b/llvm/test/Instrumentation/MemorySanitizer/RISCV32/vararg-riscv32.ll
index 0c6e75c331012..f707135261e3f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/RISCV32/vararg-riscv32.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/RISCV32/vararg-riscv32.ll
@@ -59,12 +59,12 @@ define i32 @bar() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 4) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 4), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
@@ -87,11 +87,11 @@ define i32 @bar2() {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 (i32, i32, ...) @foo2(i32 0, i32 1, i64 2, double 3.000000e+00)
@@ -112,205 +112,205 @@ define dso_local i64 @many_args() {
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 112) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 152) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 168) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 184) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 216) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 224) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 232) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 272) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 280) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 288) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 304) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 312) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 336) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 344) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 352) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 360) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 368) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 376) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 384) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 400) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 408) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 416) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 424) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 432) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 440) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 448) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 464) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 472) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 480) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 488) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 496) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 504) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 512) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 528) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 536) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 544) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 552) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 560) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 568) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 576) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), align 8
-; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr
@__msan_param_tls to i64), i64 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 
8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 328), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 344), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 648), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 656), align 8 +; CHECK-NEXT: store 
i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 752), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 760), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 768), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 776), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 784), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 792), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 88) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 112) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align 
8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 208) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 360) to 
ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), 
i64 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 40), 
align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 328), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 344), align 8 +; CHECK-NEXT: 
store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 648), align 8 +; CHECK-NEXT: store i64 0, ptr 
getelementptr (i8, ptr @__msan_va_arg_tls, i64 656), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 752), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 760), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 768), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 776), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 784), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 792), align 8 ; CHECK-NEXT: store i64 960, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: [[RET:%.*]] = call i64 (i64, ...) 
@sum(i64 120, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll index 44545685b5121..af8533c18acdc 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll @@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_addsub_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]]) @@ -29,7 +29,7 @@ declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nou define <8 x float> @test_x86_avx_addsub_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_addsub_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]]) @@ -44,8 +44,8 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_blendv_pd_256( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; 
CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double> [[A2:%.*]] to <4 x i64> @@ -72,8 +72,8 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_blendv_ps_256( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x float> [[A2:%.*]] to <8 x i32> @@ -101,7 +101,7 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f define <4 x double> @test_x86_avx_cmp_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_cmp_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer @@ -119,7 +119,7 @@ declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) no define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_cmp_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i32> [[TMP3]], zeroinitializer @@ -135,7 +135,7 @@ define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) #0 define <8 x float> @test_x86_avx_cmp_ps_256_pseudo_op(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_cmp_ps_256_pseudo_op( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i32> [[TMP3]], zeroinitializer @@ -388,7 +388,7 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_dp_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: 
[[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> , <8 x i32> [[TMP3]], <8 x i32> zeroinitializer @@ -414,7 +414,7 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwi define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_hadd_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> @@ -432,7 +432,7 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_hadd_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> @@ -450,7 +450,7 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_hsub_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> @@ -468,7 +468,7 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_hsub_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> @@ 
-509,7 +509,7 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 @@ -535,7 +535,7 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readonly define <4 x double> @test_x86_avx_maskload_pd_256(ptr %a0, <4 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd_256( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 @@ -561,7 +561,7 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 @@ -587,7 +587,7 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 @@ -613,9 +613,9 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 @@ -642,9 +642,9 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, 
<2 x double>) nounwind
 
 define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256(
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -671,9 +671,9 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi
 
 define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskstore_ps(
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -700,9 +700,9 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind
 
 define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256(
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -730,7 +730,7 @@ declare void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>) nounwin
 define <4 x double> @test_x86_avx_max_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_max_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
@@ -746,7 +746,7 @@ declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwi
 define <8 x float> @test_x86_avx_max_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_max_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
@@ -762,7 +762,7 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
 define <4 x double> @test_x86_avx_min_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_min_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
@@ -778,7 +778,7 @@ declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwi
 define <8 x float> @test_x86_avx_min_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_min_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
@@ -836,7 +836,7 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_ptestc_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer
@@ -855,7 +855,7 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
 define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_ptestnzc_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer
@@ -874,7 +874,7 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
 define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_ptestz_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer
@@ -948,7 +948,7 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[A1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[A1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc <2 x i64> [[A1]] to <2 x i1>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <2 x i64> [[TMP1]] to <2 x double>
@@ -974,7 +974,7 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi
 define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[A1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[A1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[A1]] to <4 x i2>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
@@ -1014,7 +1014,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 {
 define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[A1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[A1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i32> [[A1]] to <4 x i2>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <4 x i32> [[TMP1]] to <4 x float>
@@ -1036,7 +1036,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #
 }
 define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -1075,7 +1075,7 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind
 define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[A1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[A1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i32> [[A1]] to <8 x i3>
 ; CHECK-NEXT: [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
@@ -1101,7 +1101,7 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun
 define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestc_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer
@@ -1120,7 +1120,7 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
 define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestc_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer
@@ -1139,7 +1139,7 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
 define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestc_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
@@ -1158,7 +1158,7 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestc_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i32> [[TMP3]], zeroinitializer
@@ -1177,7 +1177,7 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn
 define i32 @test_x86_avx_vtestnzc_pd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestnzc_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer
@@ -1196,7 +1196,7 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn
 define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestnzc_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer
@@ -1215,7 +1215,7 @@ declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind r
 define i32 @test_x86_avx_vtestnzc_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestnzc_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
@@ -1234,7 +1234,7 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon
 define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestnzc_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i32> [[TMP3]], zeroinitializer
@@ -1253,7 +1253,7 @@ declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind rea
 define i32 @test_x86_avx_vtestz_pd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestz_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer
@@ -1272,7 +1272,7 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon
 define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestz_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i64> [[TMP3]], zeroinitializer
@@ -1291,7 +1291,7 @@ declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind rea
 define i32 @test_x86_avx_vtestz_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestz_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
@@ -1310,7 +1310,7 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_vtestz_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i32> [[TMP3]], zeroinitializer
@@ -1351,7 +1351,7 @@ declare void @llvm.x86.avx.vzeroupper() nounwind
 
 define void @movnt_dq(ptr %p, <2 x i64> %a1) nounwind #0 {
 ; CHECK-LABEL: @movnt_dq(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
@@ -1381,7 +1381,7 @@ declare void @llvm.x86.avx.movnt.dq.256(ptr, <4 x i64>) nounwind
 define void @movnt_ps(ptr %p, <8 x float> %a) nounwind #0 {
 ; CHECK-LABEL: @movnt_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -1404,7 +1404,7 @@ declare void @llvm.x86.avx.movnt.ps.256(ptr, <8 x float>) nounwind
 define void @movnt_pd(ptr %p, <4 x double> %a1) nounwind #0 {
 ; add operation forces the execution domain.
 ; CHECK-LABEL: @movnt_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
@@ -1432,7 +1432,7 @@ declare void @llvm.x86.avx.movnt.pd.256(ptr, <4 x double>) nounwind
 define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_pclmulqdq(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <2 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
index 991467e1f98b2..8900085af030d 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
@@ -16,8 +16,8 @@ define <16 x float> @test_mm512_dpph_ps(<16 x float> %__W, <32 x half> %__A, <32
 ; CHECK-LABEL: define <16 x float> @test_mm512_dpph_ps(
 ; CHECK-SAME: <16 x float> [[__W:%.*]], <32 x half> [[__A:%.*]], <32 x half> [[__B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -44,9 +44,9 @@ define <16 x float> @test_mm512_mask_dpph_ps(<16 x float> %__W, i16 zeroext %__U
 ; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpph_ps(
 ; CHECK-SAME: <16 x float> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x half> [[__A:%.*]], <32 x half> [[__B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -84,9 +84,9 @@ define <16 x float> @test_mm512_mask_dpph_ps(<16 x float> %__W, i16 zeroext %__U
 define <16 x float> @test_mm512_maskz_dpph_ps(i16 zeroext %__U, <16 x float> %__W, <32 x half> %__A, <32 x half> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpph_ps(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x float> [[__W:%.*]], <32 x half> [[__A:%.*]], <32 x half> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
@@ -127,8 +127,8 @@ declare <16 x float> @llvm.x86.avx10.vdpphps.512(<16 x float>, <32 x half>, <32
 define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbssd_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -168,10 +168,10 @@ define <16 x i32> @test_mm512_dpbssd_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr
 define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbssds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
@@ -208,9 +208,9 @@ define <16 x i32> @test_mm512_mask_dpbssds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpbssd_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbssd_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP26:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP26:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <64 x i8> [[TMP25]], zeroinitializer
@@ -251,8 +251,8 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbssds.512(<16 x i32>, <64 x i8>, <64 x i8
 define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbsud_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -292,10 +292,10 @@ define <16 x i32> @test_mm512_dpbsud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr
 define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbsuds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
@@ -332,9 +332,9 @@ define <16 x i32> @test_mm512_mask_dpbsuds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpbsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbsud_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
@@ -375,8 +375,8 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbsuds.512(<16 x i32>, <64 x i8>, <64 x i8
 define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpbuud_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -416,10 +416,10 @@ define <16 x i32> @test_mm512_dpbuud_epi32(<16 x i32> %__W, <64 x i8> %__A, ptr
 define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpbuuds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP3]], zeroinitializer
@@ -456,9 +456,9 @@ define <16 x i32> @test_mm512_mask_dpbuuds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpbuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <64 x i8> %__A, <64 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpbuud_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <64 x i8> [[__A:%.*]], <64 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer
@@ -500,9 +500,9 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8
 define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwsud_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -530,9 +530,9 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwsuds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
@@ -557,9 +557,9 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwsud_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
@@ -588,9 +588,9 @@ declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i
 define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwusd_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -618,9 +618,9 @@ define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwusds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
@@ -645,9 +645,9 @@ define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwusd_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
@@ -676,9 +676,9 @@ declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i
 define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwuud_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -706,9 +706,9 @@ define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwuuds_epi32(
 ; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
@@ -733,9 +733,9 @@ define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %_
 define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwuud_epi32(
 ; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
@@ -765,10 +765,10 @@ declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i
 define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) sanitize_memory {
 ; CHECK-LABEL: define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(
 ; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <32 x i16> [[X3:%.*]], i32 [[X4:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i32 [[X4]] to <32 x i1>
@@ -844,7 +844,7 @@ define <8 x float> @avx_dp_ps(<8 x float> %a, <8 x float> %b) sanitize_memory {
 ; CHECK-LABEL: define <8 x float> @avx_dp_ps(
 ; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> , <8 x i32> [[TMP3]], <8 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
index 373eff6a1af60..def7ba3f10770 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
@@ -19,8 +19,8 @@ define <4 x float> @test_mm_dpph_ps(<4 x float> %__W, <8 x half> %__A, <8 x half
 ; CHECK-LABEL: define <4 x float> @test_mm_dpph_ps(
 ; CHECK-SAME: <4 x float> [[__W:%.*]], <8 x half> [[__A:%.*]], <8 x half> [[__B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -47,9 +47,9 @@ define <4 x float> @test_mm_mask_dpph_ps(<4 x float> %__W, i8 zeroext %__U, <8 x
 ; CHECK-LABEL: define <4 x float> @test_mm_mask_dpph_ps(
 ; CHECK-SAME: <4 x float> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x half> [[__A:%.*]], <8 x half> [[__B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -90,9 +90,9 @@ define <4 x float> @test_mm_mask_dpph_ps(<4 x float> %__W, i8 zeroext %__U, <8 x
 define <4 x float> @test_mm_maskz_dpph_ps(i8 zeroext %__U, <4 x float> %__W, <8 x half> %__A, <8 x half> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x float> @test_mm_maskz_dpph_ps(
 ; CHECK-SAME: i8 zeroext [[__U:%.*]], <4 x float> [[__W:%.*]], <8 x half> [[__A:%.*]], <8 x half> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
@@ -134,8 +134,8 @@ define <8 x float> @test_mm256_dpph_ps(<8 x float> %__W, <16 x half> %__A, <16 x
 ; CHECK-LABEL: define <8 x float> @test_mm256_dpph_ps(
 ; CHECK-SAME: <8 x float> [[__W:%.*]], <16 x half> [[__A:%.*]], <16 x half> [[__B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -162,9 +162,9 @@ define <8 x float> @test_mm256_mask_dpph_ps(<8 x float> %__W, i8 zeroext %__U, <
 ; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpph_ps(
 ; CHECK-SAME: <8 x float> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x half> [[__A:%.*]], <16 x half> [[__B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -202,9 +202,9 @@ define <8 x float> @test_mm256_mask_dpph_ps(<8 x float> %__W, i8 zeroext %__U, <
 define <8 x float> @test_mm256_maskz_dpph_ps(i8 zeroext %__U, <8 x float> %__W, <16 x half> %__A, <16 x half> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpph_ps(
 ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x float> [[__W:%.*]], <16 x half> [[__A:%.*]], <16 x half> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
@@ -246,10 +246,10 @@ declare <8 x float> @llvm.x86.avx10.vdpphps.256(<8 x float>, <16 x half>, <16 x
 define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbssd_epi32(
 ; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer
@@ -286,9 +286,9 @@ define <4 x i32> @test_mm_mask_dpbssd_epi32(<4 x i32> %__W, i4 zeroext %__U, <16
 define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbssds_epi32(
 ; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP26:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i8> [[TMP25]], zeroinitializer
@@ -326,10 +326,10 @@ define <4 x i32> @test_mm_maskz_dpbssds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbssds_epi32(
 ; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer
@@ -366,9 +366,9 @@ define <8 x i32> @test_mm256_maskz_dpbssds_epi32(<8 x i32> %__W, i8 zeroext %__U
 define <8 x i32> @test_mm256_mask_dpbssd_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbssd_epi32(
 ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP26:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP24:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP25:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP26:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP24:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i8> [[TMP25]], zeroinitializer
@@ -411,10 +411,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbsud_epi32(
 ; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer
@@ -451,9 +451,9 @@ define <4 x i32> @test_mm_mask_dpbsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16
 define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbsuds_epi32(
 ; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne
<16 x i8> [[TMP3]], zeroinitializer @@ -491,10 +491,10 @@ define <4 x i32> @test_mm_maskz_dpbsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbsuds_epi32( ; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer @@ -531,9 +531,9 @@ define <8 x i32> @test_mm256_maskz_dpbsuds_epi32(<8 x i32> %__W, i8 zeroext %__U define <8 x i32> @test_mm256_mask_dpbsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbsud_epi32( ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer @@ -576,10 +576,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpbuud_epi32( ; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i8> [[TMP2]], zeroinitializer @@ -616,9 +616,9 @@ define <4 x i32> @test_mm_mask_dpbuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <16 define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <16 x i8> %__A, <16 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpbuuds_epi32( ; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <16 x i8> [[__A:%.*]], <16 x i8> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i8> [[TMP3]], zeroinitializer @@ -656,10 +656,10 @@ define <4 x i32> @test_mm_maskz_dpbuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpbuuds_epi32( ; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer @@ -696,9 +696,9 @@ define <8 x i32> @test_mm256_maskz_dpbuuds_epi32(<8 x i32> %__W, i8 zeroext %__U define <8 x i32> @test_mm256_mask_dpbuud_epi32(i8 
zeroext %__U, <8 x i32> %__W, <32 x i8> %__A, <32 x i8> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpbuud_epi32( ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <32 x i8> [[__A:%.*]], <32 x i8> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[TMP3]], zeroinitializer @@ -743,9 +743,9 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwsud_epi32( ; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -770,9 +770,9 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwsuds_epi32( ; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] @@ -799,9 +799,9 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwsuds_epi32( ; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -826,9 +826,9 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwsud_epi32( ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -860,9 +860,9 @@ define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwusd_epi32( ; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -887,9 +887,9 @@ define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwusds_epi32( ; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] @@ -916,9 +916,9 @@ define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwusds_epi32( ; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -943,9 +943,9 @@ define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwusd_epi32( ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { 
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -977,9 +977,9 @@ define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwuud_epi32( ; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -1004,9 +1004,9 @@ define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwuuds_epi32( ; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] @@ -1033,9 +1033,9 @@ define <8 x i32> 
@test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwuuds_epi32( ; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -1060,9 +1060,9 @@ define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwuud_epi32( ; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -1094,10 +1094,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) sanitize_memory { ; CHECK-LABEL: define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128( ; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <8 x i16> [[X3:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[X4]] to <8 x i1> @@ -1169,10 +1169,10 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) sanitize_memory { ; CHECK-LABEL: define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256( ; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <16 x i16> [[X3:%.*]], i16 [[X4:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[X4]] to <16 x i1> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll index 29269ff333771..e447cabc60e9f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll @@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu" define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_packssdw( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32> @@ -40,7 +40,7 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() #0 { define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_packsswb( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16> @@ 
-73,7 +73,7 @@ define <32 x i8> @test_x86_avx2_packsswb_fold() #0 { define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_packuswb( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i16> @@ -106,7 +106,7 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() #0 { define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pavg_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]]) @@ -122,7 +122,7 @@ declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pavg_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) @@ -138,7 +138,7 @@ declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readno define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmadd_wd( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer @@ -187,7 +187,7 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmulh_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] ; 
CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) @@ -203,7 +203,7 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmulhu_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) @@ -219,7 +219,7 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psad_bw( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <4 x i64> @@ -239,7 +239,7 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psll_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -261,7 +261,7 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psll_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -283,7 +283,7 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psll_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -353,7 +353,7 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psra_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -375,7 +375,7 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psra_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -429,7 +429,7 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -451,7 +451,7 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -473,7 +473,7 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> 
[[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -494,7 +494,7 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, ptr %p) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_w_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -576,7 +576,7 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phadd_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> @@ -594,7 +594,7 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phadd_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -612,7 +612,7 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phadd_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -630,7 +630,7 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phsub_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> @@ -648,7 +648,7 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phsub_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -666,7 +666,7 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phsub_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -684,7 +684,7 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer @@ -711,7 +711,7 @@ declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind rea define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw_load_op0( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -749,7 +749,7 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) # define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmul_hr_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr 
@__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) @@ -765,7 +765,7 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pshuf_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[TMP1]], <32 x i8> [[A1:%.*]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP2]], [[TMP3]] @@ -782,7 +782,7 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psign_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]]) @@ -798,7 +798,7 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psign_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]]) @@ -814,7 +814,7 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psign_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) @@ -830,7 +830,7 @@ 
 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_mpsadbw(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i8> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -854,7 +854,7 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
 define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(ptr %ptr, <32 x i8> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_mpsadbw_load_op0(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
@@ -889,7 +889,7 @@ define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(ptr %ptr, <32 x i8> %a1) #0 {
 define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_packusdw(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -921,8 +921,8 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() #0 {
 define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx2_pblendvb(
-; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ashr <32 x i8> [[A2:%.*]], splat (i8 7)
@@ -947,7 +947,7 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_pblendw(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]], <16 x i32>
@@ -963,7 +963,7 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_pblendd_128(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32>
@@ -979,7 +979,7 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_pblendd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32>
@@ -995,7 +995,7 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind
 define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_permd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
@@ -1011,7 +1011,7 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
 define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_permps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1035,7 +1035,7 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado
 define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskload_q(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080
@@ -1060,7 +1060,7 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly
 define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskload_q_256(
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080
@@ -1085,7 +1085,7 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl
 define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskload_d(
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080
@@ -1110,7 +1110,7 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly
 define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskload_d_256(
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080
@@ -1135,9 +1135,9 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl
 define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskstore_q(
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -1163,9 +1163,9 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind
 define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256(
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -1191,9 +1191,9 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind
 define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskstore_d(
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -1219,9 +1219,9 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind
 define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256(
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -1248,7 +1248,7 @@ declare void @llvm.x86.avx2.maskstore.d.256(ptr, <8 x i32>, <8 x i32>) nounwind
 define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psllv_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -1287,7 +1287,7 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
 define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psllv_d_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -1326,7 +1326,7 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read
 define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psllv_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
@@ -1357,7 +1357,7 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
 define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psllv_q_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
@@ -1389,7 +1389,7 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read
 define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psrlv_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -1428,7 +1428,7 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
 define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psrlv_d_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -1467,7 +1467,7 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read
 define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psrlv_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
@@ -1499,7 +1499,7 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
 define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psrlv_q_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
@@ -1532,7 +1532,7 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read
 define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psrav_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
@@ -1563,7 +1563,7 @@ declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
 define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx2_psrav_d_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i32>
@@ -1594,9 +1594,9 @@ declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind read
 define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, ptr %a1, <4 x i32> %idx, <2 x double> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1627,9 +1627,9 @@ declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, ptr,
 define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, ptr %a1, <4 x i32> %idx, <4 x double> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1660,9 +1660,9 @@ declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, ptr,
 define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, ptr %a1, <2 x i64> %idx, <2 x double> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1693,9 +1693,9 @@ declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, ptr,
 define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, ptr %a1, <4 x i64> %idx, <4 x double> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1726,9 +1726,9 @@ declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, ptr,
 define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, ptr %a1, <4 x i32> %idx, <4 x float> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1759,9 +1759,9 @@ declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, ptr,
 define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, ptr %a1, <8 x i32> %idx, <8 x float> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1792,9 +1792,9 @@ declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr,
 define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, ptr %a1, <2 x i64> %idx, <4 x float> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1825,9 +1825,9 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, ptr,
 define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, ptr %a1, <4 x i64> %idx, <4 x float> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1858,9 +1858,9 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, ptr,
 define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, ptr %a1, <4 x i32> %idx, <2 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1891,9 +1891,9 @@ declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, ptr,
 define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, ptr %a1, <4 x i32> %idx, <4 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_q_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1924,9 +1924,9 @@ declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr,
 define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, ptr %a1, <2 x i64> %idx, <2 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1957,9 +1957,9 @@ declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, ptr,
 define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, ptr %a1, <4 x i64> %idx, <4 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_q_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -1990,9 +1990,9 @@ declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr,
 define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, ptr %a1, <4 x i32> %idx, <4 x i32> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -2023,9 +2023,9 @@ declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr,
 define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, ptr %a1, <8 x i32> %idx, <8 x i32> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_d_d_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -2056,9 +2056,9 @@ declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr,
 define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, ptr %a1, <2 x i64> %idx, <4 x i32> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -2089,9 +2089,9 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, ptr,
 define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, ptr %a1, <4 x i64> %idx, <4 x i32> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx2_gather_q_d_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -2122,10 +2122,10 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr,
 define <8 x float> @test_gather_mask(<8 x float> %a0, ptr %a, <8 x i32> %idx, <8 x float> %mask, ptr nocapture %out) #0 {
 ; CHECK-LABEL: @test_gather_mask(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
@@ -2167,10 +2167,10 @@ define <8 x float> @test_gather_mask(<8 x float> %a0, ptr %a, <8 x i32> %idx, <
 define <2 x i64> @test_mask_demanded_bits(<2 x i64> %a0, ptr %a1, <2 x i64> %idx, <2 x i1> %mask) #0 {
 ; CHECK-LABEL: @test_mask_demanded_bits(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
 ; CHECK-NEXT: [[MASK1:%.*]] = sext <2 x i1> [[MASK:%.*]] to <2 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
index 43da02d19693c..17bef29a05220 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll
@@ -19,10 +19,10 @@ target triple = "x86_64-unknown-linux-gnu"
 declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8affineinvqb_128(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -94,10 +94,10 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8>
 declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8)
 define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8affineinvqb_256(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -169,10 +169,10 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8>
 declare <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8>, <64 x i8>, i8)
 define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8affineinvqb_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -244,10 +244,10 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8>
 declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8affineqb_128(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -308,10 +308,10 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
 declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
 define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8affineqb_256(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -372,10 +372,10 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
 declare <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
 define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8affineqb_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -437,7 +437,7 @@ declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>)
 define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_128(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]])
@@ -450,10 +450,10 @@ define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2) #0 {
 define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_128_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -476,9 +476,9 @@ define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16
 define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i16 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_128_maskz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -503,7 +503,7 @@ declare <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8>, <32 x i8>)
 define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]])
@@ -516,10 +516,10 @@ define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2) #0 {
 define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_256_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -542,9 +542,9 @@ define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32
 define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i32 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_256_maskz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1>
@@ -569,7 +569,7 @@ declare <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8>, <64 x i8>)
 define <64 x i8> @test_vgf2p8mulb_512(<64 x i8> %src1, <64 x i8> %src2) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]])
@@ -582,10 +582,10 @@ define <64 x i8> @test_vgf2p8mulb_512(<64 x i8> %src1, <64 x i8> %src2) #0 {
 define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_512_mask(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1>
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
@@ -608,9 +608,9 @@ define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64
 define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i64 %mask) #0 {
 ; CHECK-LABEL: @test_vgf2p8mulb_512_maskz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP1]] to <64 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
index 74cb49b0f602a..25a4a9af6f5a7 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -12,7 +12,7 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) #0 {
 ;
 ; CHECK-LABEL: @unpckbw_test(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
@@ -37,8 +37,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pbroadca
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_gpr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X0:%.*]], i64 0
@@ -92,8 +92,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pbroadcastq
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_gpr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[X0:%.*]], i64 0
@@ -162,8 +162,8 @@ define <16 x float> @test_x86_mask_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x f
 ;
 ; CHECK-LABEL: @test_x86_mask_vbroadcast_ss_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer
@@ -188,7 +188,7 @@ define <16 x float> @test_x86_maskz_vbroadcast_ss_ps_512(<4 x float> %a0, i16 %m
 ;
 ; CHECK-LABEL: @test_x86_maskz_vbroadcast_ss_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer
@@ -227,8 +227,8 @@ define <8 x double> @test_x86_mask_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x d
 ;
 ; CHECK-LABEL: @test_x86_mask_vbroadcast_sd_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer
@@ -253,7 +253,7 @@ define <8 x double> @test_x86_maskz_vbroadcast_sd_pd_512(<2 x double> %a0, i8 %m
 ;
 ; CHECK-LABEL: @test_x86_maskz_vbroadcast_sd_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer
@@ -292,8 +292,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_512(<4 x i32> %x0, <16 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer
@@ -316,7 +316,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_512(<4 x i32> %x0, i16 %
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer
@@ -354,8 +354,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_512(<2 x i64> %x0, <8 x i6
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer
@@ -378,7 +378,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_512(<2 x i64> %x0, i8 %ma
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastq_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer
@@ -416,8 +416,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_movsldup_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
@@ -442,7 +442,7 @@ define <16 x float>@test_int_x86_avx512_maskz_movsldup_512(<16 x float> %x0, i16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_movsldup_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
@@ -481,8 +481,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_movshdup_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
@@ -507,7 +507,7 @@ define <16 x float>@test_int_x86_avx512_maskz_movshdup_512(<16 x float> %x0, i16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_movshdup_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
@@ -546,8 +546,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_movddup_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
@@ -572,7 +572,7 @@ define <8 x double>@test_int_x86_avx512_maskz_movddup_512(<8 x double> %x0, i8 %
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_movddup_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
@@ -611,8 +611,8 @@ define <8 x
double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, <8 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_perm_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> @@ -637,7 +637,7 @@ define <8 x double>@test_int_x86_avx512_maskz_perm_df_512(<8 x double> %x0, i8 % ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> @@ -676,8 +676,8 @@ define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_perm_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> @@ -700,7 +700,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> @@ -722,10 +722,10 @@ define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { ; ; CHECK-LABEL: @test_store1( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -766,10 +766,10 @@ declare void @llvm.x86.avx512.mask.storeu.ps.512(ptr, <16 x float>, i16 ) define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { ; ; CHECK-LABEL: @test_store2( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -810,10 +810,10 @@ declare void @llvm.x86.avx512.mask.storeu.pd.512(ptr, <8 x double>, i8) define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_store_aligned_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -854,10 +854,10 @@ declare void @llvm.x86.avx512.mask.store.ps.512(ptr, <16 x float>, i16 ) define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_store_aligned_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -898,10 +898,10 @@ declare void @llvm.x86.avx512.mask.store.pd.512(ptr, <8 x double>, i8) define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 { ; ; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> @@ -942,10 +942,10 @@ declare void @llvm.x86.avx512.mask.storeu.q.512(ptr, <8 x i64>, i8) define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 { ; ; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> @@ -986,10 +986,10 @@ declare void @llvm.x86.avx512.mask.storeu.d.512(ptr, <16 x i32>, i16) define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 { ; ; CHECK-LABEL: @test_int_x86_avx512_mask_store_q_512( -; 
CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> @@ -1030,10 +1030,10 @@ declare void @llvm.x86.avx512.mask.store.q.512(ptr, <8 x i64>, i8) define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 { ; ; CHECK-LABEL: @test_int_x86_avx512_mask_store_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> @@ -1074,8 +1074,8 @@ declare void @llvm.x86.avx512.mask.store.d.512(ptr, <16 x i32>, i16) define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_load_aligned_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1137,8 +1137,8 @@ declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr, <16 x float>, i16) define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_load_unaligned_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1200,8 +1200,8 @@ declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr, <16 x float>, i16) define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_load_aligned_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1263,8 +1263,8 @@ declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr, <8 x double>, i8) define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_load_unaligned_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1329,8 +1329,8 @@ define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %d ; ; CHECK-LABEL: @test_mask_load_unaligned_d( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -1393,8 +1393,8 @@ define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %dat ; ; CHECK-LABEL: @test_mask_load_unaligned_q( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -1456,8 +1456,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr, <16 x i32>, i16) define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_load_aligned_d( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1519,8 +1519,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr, <8 x i64>, i8) define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_load_aligned_q( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -1596,8 +1596,8 @@ define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> @@ -1622,7 +1622,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> @@ -1661,8 +1661,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <1 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> @@ -1687,7 +1687,7 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermil_ps_512(<16 x float> %x0, i ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> @@ -1726,8 +1726,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, ; ; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> @@ -1750,7 +1750,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_pshuf_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector 
<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> @@ -1772,7 +1772,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_pcmpeq_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] @@ -1795,8 +1795,8 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_pcmpeq_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] @@ -1828,7 +1828,7 @@ declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16) define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) #0 { ; CHECK-LABEL: @test_pcmpeq_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] @@ -1851,8 +1851,8 @@ define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_pcmpeq_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] @@ -1884,7 +1884,7 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8) define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_pcmpgt_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 
8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648) ; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) @@ -1911,8 +1911,8 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_pcmpgt_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648) ; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) @@ -1948,7 +1948,7 @@ declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16) define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) #0 { ; CHECK-LABEL: @test_pcmpgt_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808) ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) @@ -1975,8 +1975,8 @@ define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_pcmpgt_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808) ; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) @@ -2014,7 +2014,7 @@ declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x doub define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_unpckh_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = 
shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> @@ -2029,9 +2029,9 @@ define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> @@ -2057,7 +2057,7 @@ declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x flo define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_unpckh_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> @@ -2072,9 +2072,9 @@ define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> @@ -2100,7 +2100,7 @@ declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x doub define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> 
%x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_unpckl_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> @@ -2115,9 +2115,9 @@ define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> @@ -2143,7 +2143,7 @@ declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x flo define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_unpckl_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> @@ -2158,9 +2158,9 @@ define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; 
CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> @@ -2186,7 +2186,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_punpcklqd_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> @@ -2201,9 +2201,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_punpcklqd_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> @@ -2226,8 +2226,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_punpcklqd_q_512(<8 x i64> %x0, <8 x i ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_punpcklqd_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> @@ -2251,7 +2251,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; 
CHECK-LABEL: @test_int_x86_avx512_punpckhqd_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> @@ -2266,9 +2266,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i6 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhqd_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> @@ -2292,7 +2292,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_punpckhd_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> @@ -2307,9 +2307,9 @@ define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhd_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x 
i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> @@ -2333,7 +2333,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_punpckld_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> @@ -2348,9 +2348,9 @@ define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_punpckld_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> @@ -2387,8 +2387,8 @@ define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_mask_pslli_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -2412,7 +2412,7 @@ define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) #0 ; ; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, 
ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer @@ -2452,8 +2452,8 @@ define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_mask_pslli_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -2477,7 +2477,7 @@ define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer @@ -2517,8 +2517,8 @@ define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_mask_psrli_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -2542,7 +2542,7 @@ define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) #0 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: 
[[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer @@ -2582,8 +2582,8 @@ define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_mask_psrli_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -2607,7 +2607,7 @@ define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer @@ -2647,8 +2647,8 @@ define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_mask_psrai_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -2672,7 +2672,7 @@ define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) #0 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer @@ -2712,8 +2712,8 @@ define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_mask_psrai_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -2737,7 +2737,7 @@ define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer @@ -2764,7 +2764,7 @@ declare void @llvm.x86.avx512.storent.q.512(ptr, <8 x i64>) define void@test_storent_q_512(<8 x i64> %data, ptr %ptr) #0 { ; ; CHECK-LABEL: @test_storent_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2789,7 +2789,7 @@ declare void @llvm.x86.avx512.storent.pd.512(ptr, <8 x double>) define void @test_storent_pd_512(<8 x double> %data, ptr %ptr) #0 { ; ; CHECK-LABEL: @test_storent_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2814,7 +2814,7 @@ declare void @llvm.x86.avx512.storent.ps.512(ptr, <16 x float>) define void @test_storent_ps_512(<16 x float> %data, ptr %ptr) #0 { ; ; CHECK-LABEL: @test_storent_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -2837,7 +2837,7 @@ define void @test_storent_ps_512(<16 x float> %data, ptr %ptr) #0 { define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_xor_epi32( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr 
(i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] @@ -2852,9 +2852,9 @@ define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> % ; ; CHECK-LABEL: @test_mask_xor_epi32( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] @@ -2878,7 +2878,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_or_epi32( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1) ; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1) @@ -2899,9 +2899,9 @@ define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %p ; ; CHECK-LABEL: @test_mask_or_epi32( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1) ; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1) @@ -2931,7 +2931,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_and_epi32( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]] @@ -2950,9 +2950,9 @@ define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> % ; ; CHECK-LABEL: @test_mask_and_epi32( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]] @@ -2980,7 +2980,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) #0 { ; CHECK-LABEL: @test_xor_epi64( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] @@ -2995,9 +2995,9 @@ define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %pass ; ; CHECK-LABEL: @test_mask_xor_epi64( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] @@ -3021,7 +3021,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i6 define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) #0 { ; CHECK-LABEL: @test_or_epi64( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr 
@__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1) ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1) @@ -3042,9 +3042,9 @@ define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passT ; ; CHECK-LABEL: @test_mask_or_epi64( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1) ; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1) @@ -3074,7 +3074,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64 define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) #0 { ; CHECK-LABEL: @test_and_epi64( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]] @@ -3093,9 +3093,9 @@ define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %pass ; ; CHECK-LABEL: @test_mask_and_epi64( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]] @@ -3123,7 +3123,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i6 define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> 
%a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_mask_add_epi32_rr( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]] @@ -3138,9 +3138,9 @@ define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i ; ; CHECK-LABEL: @test_mask_add_epi32_rrk( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]] @@ -3163,8 +3163,8 @@ define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %m ; ; CHECK-LABEL: @test_mask_add_epi32_rrkz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]] @@ -3186,7 +3186,7 @@ define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %m define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; ; CHECK-LABEL: @test_mask_add_epi32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3213,10 +3213,10 @@ define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_add_epi32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -3250,9 +3250,9 @@ define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_add_epi32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -3286,9 +3286,9 @@ define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_add_epi32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3321,12 +3321,12 @@ define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <16 x i32> define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_add_epi32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] @@ -3366,11 +3366,11 @@ define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32 define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_add_epi32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] @@ -3412,7 +3412,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_mask_sub_epi32_rr( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]] 
@@ -3427,9 +3427,9 @@ define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i ; ; CHECK-LABEL: @test_mask_sub_epi32_rrk( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]] @@ -3452,8 +3452,8 @@ define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %m ; ; CHECK-LABEL: @test_mask_sub_epi32_rrkz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]] @@ -3475,7 +3475,7 @@ define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %m define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3502,10 +3502,10 @@ define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -3539,9 +3539,9 @@ define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -3575,9 +3575,9 @@ define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3610,12 +3610,12 @@ define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <16 x i32> define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] @@ -3655,10 +3655,10 @@ define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32 define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask, <16 x i32> %extra_param) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] @@ -3700,7 +3700,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 { ; CHECK-LABEL: @test_mask_add_epi64_rr( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]] @@ -3715,9 +3715,9 @@ define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> ; ; CHECK-LABEL: @test_mask_add_epi64_rrk( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; 
CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]] @@ -3740,8 +3740,8 @@ define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) ; ; CHECK-LABEL: @test_mask_add_epi64_rrkz( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]] @@ -3763,7 +3763,7 @@ define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { ; ; CHECK-LABEL: @test_mask_add_epi64_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3790,10 +3790,10 @@ define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_add_epi64_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -3827,9 +3827,9 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %p define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_add_epi64_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -3863,9 +3863,9 @@ define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_add_epi64_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3898,12 +3898,12 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b, <8 x i64> %e define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_add_epi64_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] @@ -3943,11 +3943,11 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> % define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 
%mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_add_epi64_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] @@ -3989,7 +3989,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i6 define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 { ; CHECK-LABEL: @test_mask_sub_epi64_rr( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] @@ -4004,9 +4004,9 @@ define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> ; ; CHECK-LABEL: @test_mask_sub_epi64_rrk( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] @@ -4029,8 +4029,8 @@ define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) ; ; CHECK-LABEL: @test_mask_sub_epi64_rrkz( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: 
[[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] @@ -4052,7 +4052,7 @@ define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi64_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -4079,10 +4079,10 @@ define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi64_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -4116,9 +4116,9 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %p define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi64_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -4152,9 +4152,9 @@ define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 { ; ; CHECK-LABEL: 
@test_mask_sub_epi64_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -4187,12 +4187,12 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b, <8 x i64> %e define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi64_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] @@ -4232,11 +4232,11 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> % define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 { ; ; CHECK-LABEL: @test_mask_sub_epi64_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load 
<8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] @@ -4278,7 +4278,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i6 define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_mask_mullo_epi32_rr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] @@ -4293,9 +4293,9 @@ define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, < ; ; CHECK-LABEL: @test_mask_mullo_epi32_rrk_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] @@ -4318,8 +4318,8 @@ define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, ; ; CHECK-LABEL: @test_mask_mullo_epi32_rrkz_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] @@ -4341,7 +4341,7 @@ define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, define <16 
@@ -4341,7 +4341,7 @@ define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
 define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 {
 ;
 ; CHECK-LABEL: @test_mask_mullo_epi32_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -4368,10 +4368,10 @@ define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 {
 define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_mullo_epi32_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -4405,9 +4405,9 @@ define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16
 define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_mullo_epi32_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -4441,9 +4441,9 @@ define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i16
 define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 {
 ;
 ; CHECK-LABEL: @test_mask_mullo_epi32_rmb_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -4476,12 +4476,12 @@ define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b, <16
 define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 {
 ;
 ; CHECK-LABEL: @test_mask_mullo_epi32_rmbk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
@@ -4521,11 +4521,11 @@ define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <16
 define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask, <16 x i32> %extra_param, <16 x i32> %extra_param2) #0 {
 ;
 ; CHECK-LABEL: @test_mask_mullo_epi32_rmbkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
@@ -4570,7 +4570,7 @@ declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>
 define <16 x float>@test_int_x86_avx512_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_shuf_f32x4(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
@@ -4585,9 +4585,9 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f32x4(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
@@ -4613,7 +4613,7 @@ declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>
 define <8 x double>@test_int_x86_avx512_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_shuf_f64x2(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
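The offsets in the rm/rmk/rmb variants above follow from how the parameter shadows are packed: each argument's shadow occupies the next slot, sized to the argument and rounded up to 8 bytes. For @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) that yields the byte ranges below; they are inferred from the loads in the hunks, not spelled out anywhere in the test:

@__msan_param_tls = external thread_local global [100 x i64]

; %a        <16 x i32> shadow        -> bytes [0, 64)
; %ptr_b    ptr, shadow is i64       -> bytes [64, 72)
; %passThru <16 x i32> shadow        -> bytes [72, 136)
; %mask     i16 shadow, 8-byte slot  -> bytes [136, 144)
define i16 @load_mask_shadow() sanitize_memory {
  %sm = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
  ret i16 %sm
}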
@@ -4628,9 +4628,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x d
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f64x2(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
@@ -4655,8 +4655,8 @@ define <8 x double>@test_int_x86_avx512_maskz_shuf_f64x2(<8 x double> %x0, <8 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_f64x2(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
@@ -4681,7 +4681,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32,
 define <16 x i32>@test_int_x86_avx512_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_shuf_i32x4(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32>
@@ -4696,9 +4696,9 @@ define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32>
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i32x4(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32>
@@ -4722,7 +4722,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8
 define <8 x i64>@test_int_x86_avx512_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_shuf_i64x2(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32>
@@ -4737,9 +4737,9 @@ define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i64x2(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32>
@@ -4763,7 +4763,7 @@ declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double
 define <8 x double>@test_int_x86_avx512_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_shuf_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
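For the shuf_* intrinsics the shadow is propagated by the very operation under test: the [[_MSPROP]] lines apply the same shufflevector to the operand shadows, so each result lane carries exactly the shadow of the source lane it was taken from. A reduced sketch, with an arbitrarily chosen mask for illustration:

define <8 x i64> @shuffle_shadow(<8 x i64> %s0, <8 x i64> %s1) {
  ; Same lane permutation as the value-level shuffle would use.
  %sprop = shufflevector <8 x i64> %s0, <8 x i64> %s1, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i64> %sprop
}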
@@ -4778,9 +4778,9 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
@@ -4805,8 +4805,8 @@ define <8 x double>@test_int_x86_avx512_maskz_shuf_pd_512(<8 x double> %x0, <8 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
@@ -4831,7 +4831,7 @@ declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float
 define <16 x float>@test_int_x86_avx512_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_shuf_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
@@ -4846,9 +4846,9 @@ define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
@@ -4874,7 +4874,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16
 define <16 x i32>@test_int_x86_avx512_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pmaxs_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -4889,9 +4889,9 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -4915,7 +4915,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i
 define <8 x i64>@test_int_x86_avx512_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pmaxs_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
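The pmaxs/pmaxu tests show the other common propagation rule: once the intrinsic is modelled as llvm.smax/llvm.umax, MSan conservatively ORs the operand shadows, so a result lane is clean only when both input lanes were fully initialized. Sketch:

define <16 x i32> @max_shadow(<16 x i32> %sa, <16 x i32> %sb) {
  ; Conservative: any poisoned bit in either operand poisons the result lane.
  %sprop = or <16 x i32> %sa, %sb
  ret <16 x i32> %sprop
}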
@@ -4930,9 +4930,9 @@ define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -4956,7 +4956,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16
 define <16 x i32>@test_int_x86_avx512_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pmaxu_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -4971,9 +4971,9 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -4997,7 +4997,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i
 define <8 x i64>@test_int_x86_avx512_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pmaxu_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -5012,9 +5012,9 @@ define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -5038,7 +5038,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16
 define <16 x i32>@test_int_x86_avx512_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pmins_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -5053,9 +5053,9 @@ define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
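Note the shadow type of the k-mask argument in the masked variants: an i16 for 16-lane operations and an i8 for 8-lane ones, i.e. the same width as the mask itself. Before it can gate per-lane selects it is typically turned into one i1 per lane; a sketch of that step (the bitcast is an assumption about the surrounding instrumentation, which these hunks do not show):

define <16 x i1> @mask_shadow_bits(i16 %smask) {
  %bits = bitcast i16 %smask to <16 x i1>   ; one shadow bit per lane
  ret <16 x i1> %bits
}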
@@ -5079,7 +5079,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i
 define <8 x i64>@test_int_x86_avx512_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pmins_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -5094,9 +5094,9 @@ define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -5120,7 +5120,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16
 define <16 x i32>@test_int_x86_avx512_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pminu_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -5135,9 +5135,9 @@ define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -5161,7 +5161,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i
 define <8 x i64>@test_int_x86_avx512_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_pminu_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -5176,9 +5176,9 @@ define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -5201,10 +5201,10 @@ define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x
 ;
 ; CHECK-LABEL: @test_mm_mask_move_ss(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
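The `and i8 ..., 0` pair in the move_ss/move_sd tests comes from MSan's rule for bitwise AND, S(a & b) = (Sa & Sb) | (Va & Sb) | (Sa & Vb): the intrinsic lowering ANDs the mask with a constant, the constant's shadow Sb is zero, and the instrumentation emits the folded terms literally. A sketch for `and i8 %u, 1` (names are illustrative):

define i8 @and_shadow(i8 %vu, i8 %su) {
  %t1 = and i8 %su, 0   ; Sa & Sb, with Sb == 0 for the constant
  %t2 = and i8 %vu, 0   ; Va & Sb
  %t3 = and i8 %su, 1   ; Sa & Vb -- the only term that can survive
  %t4 = or i8 %t1, %t2
  %s = or i8 %t4, %t3
  ret i8 %s
}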
@@ -5248,8 +5248,8 @@ define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4
 ; CHECK-LABEL: @test_mm_maskz_move_ss(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
@@ -5288,10 +5288,10 @@ define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2
 ;
 ; CHECK-LABEL: @test_mm_mask_move_sd(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
 ; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
@@ -5334,8 +5334,8 @@ define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <
 ; CHECK-LABEL: @test_mm_maskz_move_sd(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
@@ -5394,8 +5394,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i3
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
@@ -5420,7 +5420,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_512(<16 x i8> %x0, i16 %x2
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
@@ -5462,8 +5462,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64>
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
@@ -5488,7 +5488,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_512(<16 x i8> %x0, i8 %x2)
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
@@ -5530,8 +5530,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64>
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxd_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
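In the pmovzx/pmovsx tests the value operation is a shufflevector whose second operand is poison, and the shadow mirrors it with splat(-1) in that position: the shadow of poison is all-ones, so any lane that would be taken from it is treated as fully uninitialized. Sketch, with the mask picking the low half as the widening intrinsics do:

define <8 x i8> @widen_shadow(<16 x i8> %s0) {
  %sprop = shufflevector <16 x i8> %s0, <16 x i8> splat (i8 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %sprop
}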
@@ -5556,7 +5556,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_512(<8 x i32> %x0, i8 %x2)
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxd_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
@@ -5598,8 +5598,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
@@ -5624,7 +5624,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_512(<16 x i16> %x0, i16 %x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
@@ -5666,8 +5666,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64>
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
@@ -5692,7 +5692,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_512(<8 x i16> %x0, i8 %x2)
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
@@ -5734,8 +5734,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i3
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
@@ -5760,7 +5760,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_512(<16 x i8> %x0, i16 %x2
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
@@ -5802,8 +5802,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64>
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
@@ -5828,7 +5828,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_512(<16 x i8> %x0, i8 %x2)
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
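One detail worth noticing across all of these hunks: every shadow load is `align 8`, even for 64-byte <16 x i32> and <8 x i64> shadows, because the parameter TLS block itself only guarantees 8-byte alignment, and the offset-GEP rewrite does not change that. Sketch:

@__msan_param_tls = external thread_local global [100 x i64]

define <8 x i64> @underaligned_shadow_load() sanitize_memory {
  ; 64-byte value, but only the block's own 8-byte alignment is claimed.
  %s = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
  ret <8 x i64> %s
}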
@@ -5870,8 +5870,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64>
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxd_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
@@ -5896,7 +5896,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_512(<8 x i32> %x0, i8 %x2)
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxd_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
@@ -5938,8 +5938,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
@@ -5964,7 +5964,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_512(<16 x i16> %x0, i16 %x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
@@ -6006,8 +6006,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64>
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
@@ -6032,7 +6032,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_512(<8 x i16> %x0, i8 %x2)
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
@@ -6058,7 +6058,7 @@ declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>)
 define <16 x i32>@test_int_x86_avx512_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
@@ -6076,9 +6076,9 @@ define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512(<16 x i32> %x0, <16 x i32
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
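The prolv/prorv tests use the propagation rule for variable shifts and rotates: the data shadow is rotated like the data, and on top of that, if any bit of a lane's rotate amount is poisoned the whole result lane must be poisoned. The icmp ne/sext pair above builds that all-or-nothing component; a sketch of just this step (the OR with the rotated data shadow happens outside the lines shown):

define <16 x i32> @rotate_amount_shadow(<16 x i32> %samt) {
  %nz = icmp ne <16 x i32> %samt, zeroinitializer  ; lane has any poisoned amount bit?
  %smear = sext <16 x i1> %nz to <16 x i32>        ; smear to a full lane of ones
  ret <16 x i32> %smear
}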
[[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -6136,7 +6136,7 @@ declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>) define <8 x i64>@test_int_x86_avx512_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -6154,9 +6154,9 @@ define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512(<8 x i64> %x0, <8 x i64> % ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -6184,8 +6184,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512(<8 x i64> %x0, <8 x i64> ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -6214,7 +6214,7 @@ declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>) define <16 x i32>@test_int_x86_avx512_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -6232,9 +6232,9 @@ define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -6262,8 +6262,8 @@ define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512(<16 x i32> %x0, <16 x i3 ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -6292,7 +6292,7 @@ declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>) define <8 x i64>@test_int_x86_avx512_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -6310,9 +6310,9 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> % ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, 
ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -6340,8 +6340,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512(<8 x i64> %x0, <8 x i64> ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -6371,8 +6371,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_prol_d_512(<1 ; ; CHECK-LABEL: @test_int_x86_avx512_prol_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -6427,8 +6427,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_prol_q_512(<8 x ; ; CHECK-LABEL: @test_int_x86_avx512_prol_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -6483,8 
+6483,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_pror_d_512(<1 ; ; CHECK-LABEL: @test_int_x86_avx512_pror_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -6539,8 +6539,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_pror_q_512(<8 x ; ; CHECK-LABEL: @test_int_x86_avx512_pror_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -6595,8 +6595,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psrl_qi_512 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -6647,8 +6647,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psrl_di_ ; ; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; 
CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -6699,8 +6699,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psra_di_ ; ; CHECK-LABEL: @test_int_x86_avx512_mask_psra_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -6751,8 +6751,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psra_qi_512 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_psra_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -6803,8 +6803,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psll_di_ ; ; CHECK-LABEL: @test_int_x86_avx512_mask_psll_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -6855,8 +6855,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psll_qi_512 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_psll_qi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = 
load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -6904,7 +6904,7 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psll_qi_512 define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psll_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -6925,9 +6925,9 @@ define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <1 ; ; CHECK-LABEL: @test_x86_avx512_mask_psll_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -6956,8 +6956,8 @@ define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i ; ; CHECK-LABEL: @test_x86_avx512_maskz_psll_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -6987,7 +6987,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32 define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psll_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -7008,9 +7008,9 @@ define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x ; ; CHECK-LABEL: @test_x86_avx512_mask_psll_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -7039,8 +7039,8 @@ define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psll_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -7070,7 +7070,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrl_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -7091,9 +7091,9 @@ define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <1 ; ; CHECK-LABEL: @test_x86_avx512_mask_psrl_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -7122,8 +7122,8 @@ define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -7153,7 +7153,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32 define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrl_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -7174,9 +7174,9 @@ define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x ; ; CHECK-LABEL: @test_x86_avx512_mask_psrl_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -7205,8 +7205,8 @@ define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -7236,7 +7236,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psra_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -7257,9 +7257,9 @@ define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <1 ; ; CHECK-LABEL: @test_x86_avx512_mask_psra_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -7288,8 +7288,8 @@ define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i ; ; CHECK-LABEL: @test_x86_avx512_maskz_psra_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -7319,7 +7319,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32 define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psra_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, 
align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -7340,9 +7340,9 @@ define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x ; ; CHECK-LABEL: @test_x86_avx512_mask_psra_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -7371,8 +7371,8 @@ define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psra_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -7402,7 +7402,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psllv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -7420,9 +7420,9 @@ define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_mask_psllv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -7448,8 +7448,8 @@ define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -7476,7 +7476,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psllv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -7494,9 +7494,9 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 ; ; CHECK-LABEL: @test_x86_avx512_mask_psllv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -7522,8 +7522,8 @@ define <8 x i64> 
@test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -7551,7 +7551,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrav_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -7569,9 +7569,9 @@ define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_mask_psrav_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -7597,8 +7597,8 @@ define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], 
zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -7625,7 +7625,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrav_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -7643,9 +7643,9 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 ; ; CHECK-LABEL: @test_x86_avx512_mask_psrav_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -7671,8 +7671,8 @@ define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -7699,7 +7699,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrlv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> 
[[TMP3]] to <16 x i32> @@ -7717,9 +7717,9 @@ define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -7745,8 +7745,8 @@ define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -7773,7 +7773,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrlv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -7791,9 +7791,9 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 ; ; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -7819,8 +7819,8 @@ define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; ; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -7847,7 +7847,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, ptr %ptr) #0 { ; ; CHECK-LABEL: @test_x86_avx512_psrlv_q_memop( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -7893,8 +7893,8 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x d ; ; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> ; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double> @@ -7934,8 +7934,8 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: 
call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double>
@@ -7998,8 +7998,8 @@ define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1,
;
; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrk(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -8025,7 +8025,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask)
;
; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae_rrkz(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -8048,7 +8048,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) #0
;
; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrkz(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -8071,7 +8071,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) #0 {
; CHECK-LABEL: @test_valign_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
@@ -8086,10 +8086,10 @@ define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) #0 {
define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) #0 {
;
; CHECK-LABEL: @test_mask_valign_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32>
@@ -8113,9 +8113,9 @@ declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32,
define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
;
; CHECK-LABEL: @test_maskz_valign_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> [[A:%.*]], <16 x i32>
@@ -8141,7 +8141,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
@@ -8166,9 +8166,9 @@ define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0,
;
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
@@ -8203,8 +8203,8 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
@@ -8239,7 +8239,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
@@ -8264,9 +8264,9 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0,
;
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
@@ -8302,8 +8302,8 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
@@ -8338,8 +8338,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
;
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> )
@@ -8390,7 +8390,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
; CHECK-LABEL: @test_mask_mul_epi32_rr(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
@@ -8421,9 +8421,9 @@ define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
;
; CHECK-LABEL: @test_mask_mul_epi32_rrk(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
@@ -8462,8 +8462,8 @@ define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mas
;
; CHECK-LABEL: @test_mask_mul_epi32_rrkz(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
@@ -8501,7 +8501,7 @@ define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mas
define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -8544,10 +8544,10 @@ define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -8597,9 +8597,9 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %
define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -8649,9 +8649,9 @@ define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask)
define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -8703,12 +8703,12 @@ define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %
define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]]
@@ -8767,11 +8767,11 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask, <8 x i64> %extra_param) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rmbk_buildvector(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP35:%.*]], !prof [[PROF1]]
@@ -8848,11 +8848,11 @@ define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b
define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF1]]
@@ -8911,10 +8911,10 @@ define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask,
define <8 x i64> @test_mask_mul_epi32_rmbkz_buildvector(<16 x i32> %a, ptr %ptr_b, i8 %mask, <8 x i64> %extra_param) #0 {
;
; CHECK-LABEL: @test_mask_mul_epi32_rmbkz_buildvector(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP34:%.*]], !prof [[PROF1]]
@@ -8993,7 +8993,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x
define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
; CHECK-LABEL: @test_mask_mul_epu32_rr(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
@@ -9024,9 +9024,9 @@ define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
;
; CHECK-LABEL: @test_mask_mul_epu32_rrk(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
@@ -9065,8 +9065,8 @@ define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mas
;
; CHECK-LABEL: @test_mask_mul_epu32_rrkz(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
@@ -9104,7 +9104,7 @@ define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mas
define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
;
; CHECK-LABEL: @test_mask_mul_epu32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -9147,10 +9147,10 @@ define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
;
; CHECK-LABEL: @test_mask_mul_epu32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -9200,9 +9200,9 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %
define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
;
; CHECK-LABEL: @test_mask_mul_epu32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -9252,9 +9252,9 @@ define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask)
define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 {
;
; CHECK-LABEL: @test_mask_mul_epu32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -9306,12 +9306,12 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %
define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 {
;
; CHECK-LABEL: @test_mask_mul_epu32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]]
@@ -9370,11 +9370,11 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, <8 x i64> %extra_param, <8 x i64> %extra_param2) #0 {
;
; CHECK-LABEL: @test_mask_mul_epu32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP34:%.*]], label [[TMP35:%.*]], !prof [[PROF1]]
@@ -9435,8 +9435,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) #0 {
;
; CHECK-LABEL: @test_mask_vextractf32x4(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32>
@@ -9465,8 +9465,8 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <
define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) #0 {
;
; CHECK-LABEL: @test_mask_vextracti64x4(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32>
@@ -9494,7 +9494,7 @@ define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) #0 {
;
; CHECK-LABEL: @test_maskz_vextracti32x4(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[A]], <4 x i32>
@@ -9536,7 +9536,7 @@ declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x fl
define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_insertf32x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32>
@@ -9553,10 +9553,10 @@ define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x f
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_mask_insertf32x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32>
@@ -9582,9 +9582,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <
define <16 x float>@test_int_x86_avx512_maskz_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, i16 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf32x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32>
@@ -9610,7 +9610,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>,
define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_inserti32x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32>
@@ -9627,10 +9627,10 @@ define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32>
define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_mask_inserti32x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32>
@@ -9654,9 +9654,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x
define <16 x i32>@test_int_x86_avx512_maskz_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, i16 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti32x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32>
@@ -9681,7 +9681,7 @@ declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x do
define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_insertf64x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32>
@@ -9698,10 +9698,10 @@ define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x d
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_mask_insertf64x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 160), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32>
@@ -9727,9 +9727,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <
define <8 x double>@test_int_x86_avx512_maskz_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, i8 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf64x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32>
@@ -9755,7 +9755,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i3
define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_inserti64x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32>
@@ -9772,10 +9772,10 @@ define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_mask_inserti64x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 160), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32>
@@ -9799,9 +9799,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
define <8 x i64>@test_int_x86_avx512_maskz_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, i8 %x4) #0 {
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti64x4_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32>
@@ -9850,8 +9850,8 @@ declare <8 x i64> @llvm.x86.avx512.movntdqa(ptr) nounwind readonly
define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, <8 x i16> %extra_param) #0 {
; CHECK-LABEL: @test_cmp_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP77:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP77:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
@@ -9971,9 +9971,9 @@ define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask,
;
; CHECK-LABEL: @test_mask_cmp_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP146:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP146:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
@@ -10162,8 +10162,8 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) no
define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, <8 x i16> %extra_param) #0 {
; CHECK-LABEL: @test_ucmp_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP69:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP69:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
@@ -10275,9 +10275,9 @@ define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask
;
; CHECK-LABEL: @test_mask_ucmp_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP138:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP138:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
@@ -10458,8 +10458,8 @@ declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) n
define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i8> %extra_param) #0 {
; CHECK-LABEL: @test_cmp_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP77:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP77:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
@@ -10579,9 +10579,9 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8
;
; CHECK-LABEL: @test_mask_cmp_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP146:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP146:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
@@ -10770,8 +10770,8 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwi
define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i8> %extra_param) #0 {
; CHECK-LABEL: @test_ucmp_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP69:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP69:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
@@ -10883,9 +10883,9 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8
;
; CHECK-LABEL: @test_mask_ucmp_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP138:%.*]] = load <8 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP138:%.*]] = load <8 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
@@ -11069,8 +11069,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0,
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X0:%.*]], <4 x float> [[X0]], <16 x i32>
@@ -11116,8 +11116,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(ptr %x0ptr,
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512_load(
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -11169,8 +11169,8 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32>
@@ -11195,7 +11195,7 @@ define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcastf64x4_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32>
@@ -11219,8 +11219,8 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(ptr %x0ptr,
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512_load(
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -11259,8 +11259,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_broadcas
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> [[X0]], <16 x i32>
@@ -11306,8 +11306,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(ptr %x0ptr, <
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512_load(
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -11357,8 +11357,8 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32>
@@ -11381,7 +11381,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8
;
; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcasti64x4_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32>
@@ -11404,8 +11404,8 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(ptr %x0ptr, <8
;
; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512_load(
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -11457,8 +11457,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32>
;
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <16 x i32> [[X0:%.*]], splat (i32 -2147483648)
; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> splat (i32 -1), <16 x i32> [[TMP1]]
@@ -11500,8 +11500,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x
;
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <8 x i64> [[X0:%.*]], splat (i64 -9223372036854775808)
; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> splat (i64 -1), <8 x i64> [[TMP1]]
@@ -11526,8 +11526,8 @@ define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) #0 {
;
; CHECK-LABEL: @test_vptestmq(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[A0:%.*]], [[TMP2]]
@@ -11585,8 +11585,8 @@ define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) #0 {
;
; CHECK-LABEL: @test_vptestmd(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[A0:%.*]], [[TMP2]]
@@ -11646,8 +11646,8 @@ define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16
;
; CHECK-LABEL: @test_int_x86_avx512_ptestnm_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[X0:%.*]], [[TMP2]]
@@ -11706,8 +11706,8 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2
;
; CHECK-LABEL: @test_int_x86_avx512_ptestnm_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[X0:%.*]], [[TMP2]]
@@ -11765,7 +11765,7 @@ define i16 @test_kand(i16 %a0, i16 %a1) #0 {
;
; CHECK-LABEL: @test_kand(
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
@@ -11802,7 +11802,7 @@ define i16 @test_kandn(i16 %a0, i16 %a1) #0 {
;
; CHECK-LABEL: @test_kandn(
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
@@ -11862,7 +11862,7 @@ define i16 @test_kor(i16 %a0, i16 %a1) #0 {
;
; CHECK-LABEL: @test_kor(
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
@@ -11904,7 +11904,7 @@ define i16 @test_kxnor(i16 %a0, i16 %a1) #0 {
;
; CHECK-LABEL: @test_kxnor(
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align
8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> @@ -11937,7 +11937,7 @@ define i16 @test_kxor(i16 %a0, i16 %a1) #0 { ; ; CHECK-LABEL: @test_kxor( ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> @@ -11966,9 +11966,9 @@ define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D ; CHECK-LABEL: @test_kortestz( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> @@ -12043,9 +12043,9 @@ define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D ; CHECK-LABEL: @test_kortestc( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> @@ -12118,7 +12118,7 @@ entry: define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { ; CHECK-LABEL: @test_cmpps( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -12143,7 +12143,7 @@ declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { ; CHECK-LABEL: @test_cmppd( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -12168,7 +12168,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_mul_epi32_rr( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> @@ -12199,9 +12199,9 @@ define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %pa ; ; CHECK-LABEL: @test_mul_epi32_rrk( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> @@ -12242,8 +12242,8 @@ define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) # ; ; CHECK-LABEL: @test_mul_epi32_rrkz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> @@ -12283,7 +12283,7 @@ define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) # define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; ; CHECK-LABEL: @test_mul_epi32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -12326,10 +12326,10 @@ define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mul_epi32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -12381,9 +12381,9 @@ define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passT define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mul_epi32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -12435,8 +12435,8 @@ define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra_param) #0 { ; ; CHECK-LABEL: @test_mul_epi32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: 
[[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -12488,11 +12488,11 @@ define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask, <8 x i64> %extra_param) #0 { ; ; CHECK-LABEL: @test_mul_epi32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP33:%.*]], !prof [[PROF1]] @@ -12553,10 +12553,10 @@ define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, <8 x i64> %extra_param) #0 { ; ; CHECK-LABEL: @test_mul_epi32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP32:%.*]], !prof [[PROF1]] @@ -12619,7 +12619,7 @@ declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_mul_epu32_rr( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] 
= load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> @@ -12650,9 +12650,9 @@ define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %pa ; ; CHECK-LABEL: @test_mul_epu32_rrk( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> @@ -12693,8 +12693,8 @@ define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) # ; ; CHECK-LABEL: @test_mul_epu32_rrkz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> @@ -12734,7 +12734,7 @@ define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) # define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { ; ; CHECK-LABEL: @test_mul_epu32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -12777,10 +12777,10 @@ define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mul_epu32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -12832,9 +12832,9 @@ define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passT define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mul_epu32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -12886,8 +12886,8 @@ define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra_param) #0 { ; ; CHECK-LABEL: @test_mul_epu32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -12939,11 +12939,11 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b, <8 x i64> %extra define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask, <8 x i64> %extra_param) #0 { ; ; CHECK-LABEL: @test_mul_epu32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] 
= load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP33:%.*]], !prof [[PROF1]] @@ -13004,10 +13004,10 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %pass define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask, <8 x i64> %extra_param) #0 { ; ; CHECK-LABEL: @test_mul_epu32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP32:%.*]], !prof [[PROF1]] @@ -13070,7 +13070,7 @@ declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b) ; ; CHECK-LABEL: @test_x86_avx512_mm_cvtu32_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP1]] to i64 @@ -13187,7 +13187,7 @@ declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64 define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -13211,9 +13211,9 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -13247,8 +13247,8 @@ define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, < ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -13282,7 +13282,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) @@ -13297,9 +13297,9 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) @@ -13322,8 +13322,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> 
%x0, <8 x i6 ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) @@ -13347,7 +13347,7 @@ declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i3 define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -13371,9 +13371,9 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -13407,8 +13407,8 @@ define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, < ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -13442,7 +13442,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) @@ -13457,9 +13457,9 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) @@ -13482,8 +13482,8 @@ define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) @@ -13507,8 +13507,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -13535,9 +13535,9 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -13574,9 +13574,9 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -13612,8 +13612,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 
8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -13640,9 +13640,9 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -13679,9 +13679,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -13716,10 +13716,10 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32> define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: 
[[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -13753,10 +13753,10 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) #0 { ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -13800,8 +13800,8 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP8]] to <8 x i3> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> @@ -13828,9 +13828,9 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, 
i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> @@ -13868,8 +13868,8 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP8]] to <16 x i4> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> @@ -13896,9 +13896,9 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i4> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> @@ -13936,8 +13936,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, < define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X3:%.*]], <8 x i64> [[TMP3]]) @@ -13960,9 +13960,9 @@ define <8 x 
i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
@@ -13994,10 +13994,10 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32
 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -14041,11 +14041,11 @@ declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x do
 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3, <8 x double> %extra_param) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[X0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
@@ -14099,10 +14099,10 @@ declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x
 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
@@ -14139,10 +14139,10 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>,
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[X0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[X0]] to <8 x i3>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X4:%.*]], <8 x i64> [[TMP3]])
@@ -14173,8 +14173,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>
 define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4>
@@ -14197,10 +14197,10 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32
 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4>
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X4:%.*]], <16 x i32> [[TMP3]])
@@ -14234,7 +14234,7 @@ declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>
 define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vsubps_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14258,7 +14258,7 @@ define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
 define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vsubps_rd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14282,7 +14282,7 @@ define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
 define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vsubps_ru(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14306,7 +14306,7 @@ define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
 define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vsubps_rz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14330,7 +14330,7 @@ define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
 define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vmulps_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14354,7 +14354,7 @@ define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
 define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vmulps_rd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14378,7 +14378,7 @@ define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
 define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vmulps_ru(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14402,7 +14402,7 @@ define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
 define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
 ; CHECK-LABEL: @test_vmulps_rz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -14428,8 +14428,8 @@ define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16
 ;
 ; CHECK-LABEL: @test_vmulps_mask_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14463,8 +14463,8 @@ define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16
 ;
 ; CHECK-LABEL: @test_vmulps_mask_rd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14498,8 +14498,8 @@ define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16
 ;
 ; CHECK-LABEL: @test_vmulps_mask_ru(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14533,8 +14533,8 @@ define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16
 ;
 ; CHECK-LABEL: @test_vmulps_mask_rz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14569,9 +14569,9 @@ define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float>
 ;
 ; CHECK-LABEL: @test_vmulps_mask_passthru_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -14606,9 +14606,9 @@ define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float>
 ;
 ; CHECK-LABEL: @test_vmulps_mask_passthru_rd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -14643,9 +14643,9 @@ define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float>
 ;
 ; CHECK-LABEL: @test_vmulps_mask_passthru_ru(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -14680,9 +14680,9 @@ define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float>
 ;
 ; CHECK-LABEL: @test_vmulps_mask_passthru_rz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -14718,8 +14718,8 @@ define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8
 ;
 ; CHECK-LABEL: @test_vmulpd_mask_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14753,8 +14753,8 @@ define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8
 ;
 ; CHECK-LABEL: @test_vmulpd_mask_rd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14788,8 +14788,8 @@ define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8
 ;
 ; CHECK-LABEL: @test_vmulpd_mask_ru(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14823,8 +14823,8 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8
 ;
 ; CHECK-LABEL: @test_vmulpd_mask_rz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14858,8 +14858,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14891,8 +14891,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14924,8 +14924,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14958,8 +14958,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -14993,8 +14993,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -15027,9 +15027,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15062,9 +15062,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15097,9 +15097,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15133,9 +15133,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15170,9 +15170,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_mask_add_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15206,7 +15206,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15228,7 +15228,7 @@ define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x floa
 define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15250,7 +15250,7 @@ define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x floa
 define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15273,7 +15273,7 @@ define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x floa
 define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15296,7 +15296,7 @@ define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x floa
 define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_add_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15321,9 +15321,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15356,9 +15356,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15391,9 +15391,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15427,9 +15427,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15464,9 +15464,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15499,7 +15499,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15521,7 +15521,7 @@ define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x floa
 define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15543,7 +15543,7 @@ define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x floa
 define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15566,7 +15566,7 @@ define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x floa
 define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15589,7 +15589,7 @@ define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x floa
 define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_sub_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15613,8 +15613,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -15646,8 +15646,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -15679,8 +15679,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -15713,8 +15713,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -15748,8 +15748,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -15782,9 +15782,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15817,9 +15817,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15852,9 +15852,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15888,9 +15888,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x
 ;
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15925,9 +15925,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16
 ;
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -15961,7 +15961,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -15983,7 +15983,7 @@ define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x floa
 define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -16005,7 +16005,7 @@ define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x floa
 define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -16028,7 +16028,7 @@ define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x floa
 define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -16051,7 +16051,7 @@ define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x floa
 define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -16075,9 +16075,9 @@ declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>
 define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_store_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -16107,7 +16107,7 @@ define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) #0 {
 ;
 ; CHECK-LABEL: @test_compress_store_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
@@ -16129,9 +16129,9 @@ define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) #0 {
 define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_store_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -16161,7 +16161,7 @@ define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) #0 {
 ;
 ; CHECK-LABEL: @test_compress_store_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
@@ -16183,9 +16183,9 @@ define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) #0 {
 define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_store_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -16215,7 +16215,7 @@ define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) #0 {
 ;
 ; CHECK-LABEL: @test_compress_store_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
@@ -16237,9 +16237,9 @@ define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) #0 {
 define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_store_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -16269,7 +16269,7 @@ define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) #0 {
 ;
 ; CHECK-LABEL: @test_compress_store_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
@@ -16291,9 +16291,9 @@ define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) #0 {
 define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -16321,7 +16321,7 @@ define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data,
 define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
 ;
CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> @@ -16353,7 +16353,7 @@ define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) #0 ; ; CHECK-LABEL: @test_expand_load_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 @@ -16377,7 +16377,7 @@ define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) #0 define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 { ; CHECK-LABEL: @test_zero_mask_expand_load_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 @@ -16400,9 +16400,9 @@ define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> % define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_expand_load_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -16430,7 +16430,7 @@ define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data, define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask) #0 { ; ; CHECK-LABEL: @test_maskz_expand_load_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> @@ -16462,7 +16462,7 @@ define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) #0 ; ; CHECK-LABEL: @test_expand_load_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, 
ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 @@ -16485,9 +16485,9 @@ define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) #0 define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 { ; ; CHECK-LABEL: @test_mask_expand_load_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> @@ -16515,7 +16515,7 @@ define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %ma define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask) #0 { ; ; CHECK-LABEL: @test_maskz_expand_load_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> @@ -16547,7 +16547,7 @@ define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) #0 { ; ; CHECK-LABEL: @test_expand_load_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 @@ -16570,9 +16570,9 @@ define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) #0 { define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 { ; ; CHECK-LABEL: @test_mask_expand_load_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> @@ -16600,7 +16600,7 @@ define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16 define <16 x i32> 
@test_maskz_expand_load_d_512(ptr %addr, i16 %mask) #0 { ; ; CHECK-LABEL: @test_maskz_expand_load_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> @@ -16632,7 +16632,7 @@ define <16 x i32> @test_expand_load_d_512(ptr %addr, <16 x i32> %data) #0 { ; ; CHECK-LABEL: @test_expand_load_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 @@ -16656,8 +16656,8 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f ; ; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16682,8 +16682,8 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 ; ; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16708,9 +16708,9 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl ; ; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16736,9 +16736,9 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 ; ; CHECK-LABEL: @test_mm512_mask_min_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16763,7 +16763,7 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_min_round_ps_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16778,7 +16778,7 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_min_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16795,8 +16795,8 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f ; ; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16821,8 +16821,8 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 ; ; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16847,9 +16847,9 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl ; ; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16875,9 +16875,9 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 ; ; CHECK-LABEL: @test_mm512_mask_max_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16902,7 +16902,7 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_max_round_ps_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16917,7 +16917,7 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_max_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -16945,8 +16945,8 @@ define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passt ; ; CHECK-LABEL: @test_mask_sqrt_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> @@ -16969,7 +16969,7 @@ define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { ; ; CHECK-LABEL: @test_maskz_sqrt_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> @@ -17009,8 +17009,8 @@ define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> ; ; CHECK-LABEL: @test_mask_sqrt_round_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr 
@__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -17040,7 +17040,7 @@ define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) # ; ; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -17082,8 +17082,8 @@ define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passt ; ; CHECK-LABEL: @test_mask_sqrt_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> @@ -17106,7 +17106,7 @@ define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { ; ; CHECK-LABEL: @test_maskz_sqrt_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> @@ -17146,8 +17146,8 @@ define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> ; ; CHECK-LABEL: @test_mask_sqrt_round_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, 
i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -17177,7 +17177,7 @@ define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) ; ; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -17209,7 +17209,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32>@test_int_x86_avx512_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -17227,9 +17227,9 @@ define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512_old(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -17255,8 +17255,8 @@ define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512_old(<16 x i32> %x0, <16 ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp 
ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -17283,7 +17283,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i define <8 x i64>@test_int_x86_avx512_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -17301,9 +17301,9 @@ define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512_old(<8 x i64> %x0, <8 x i6 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -17329,8 +17329,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512_old(<8 x i64> %x0, <8 x i ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -17357,7 +17357,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32>@test_int_x86_avx512_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -17375,9 +17375,9 @@ define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512_old(<16 x i32> %x0, <16 x ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -17403,8 +17403,8 @@ define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512_old(<16 x i32> %x0, <16 ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -17431,7 +17431,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i define <8 x i64>@test_int_x86_avx512_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -17449,9 +17449,9 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512_old(<8 x i64> %x0, <8 x i6 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x 
i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -17477,8 +17477,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512_old(<8 x i64> %x0, <8 x i ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512_old( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -17506,8 +17506,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_prol_d_5 ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prol_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -17558,8 +17558,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_prol_q_512( ; ; CHECK-LABEL: @test_int_x86_avx512_mask_prol_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -17610,8 +17610,8 @@ define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pror_d_5 ; ; CHECK-LABEL: 
@test_int_x86_avx512_mask_pror_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -17662,8 +17662,8 @@ define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pror_q_512( ; ; CHECK-LABEL: @test_int_x86_avx512_mask_pror_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -17714,9 +17714,9 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 @@ -17812,9 +17812,9 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa ; ; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 @@ -17910,9 +17910,9 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 @@ -17983,9 +17983,9 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo ; ; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 @@ -18055,9 +18055,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d ; ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load 
<2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 @@ -18153,9 +18153,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo ; ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 @@ -18249,9 +18249,9 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_par ; ; CHECK-LABEL: @fmadd_ss_mask_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] @@ -18352,9 +18352,9 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_pa ; ; CHECK-LABEL: @fmadd_ss_maskz_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] @@ -18454,9 +18454,9 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_pa ; ; CHECK-LABEL: @fmadd_sd_mask_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] @@ -18545,9 +18545,9 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_p ; ; CHECK-LABEL: @fmadd_sd_maskz_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] @@ -18636,10 +18636,10 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double> define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { ; ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]]
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
@@ -18743,10 +18743,10 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <
 define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]]
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
@@ -18851,9 +18851,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]]
@@ -18961,9 +18961,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]]
@@ -19068,11 +19068,11 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4, <4 x float> %extra_param) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
@@ -19122,11 +19122,11 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x
 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4, <4 x float> %extra_param) #0 {
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
@@ -19176,10 +19176,10 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f
 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4, <4 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
@@ -19240,8 +19240,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
@@ -19264,7 +19264,7 @@ define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
@@ -19289,8 +19289,8 @@ define <16 x float> @test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float>
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
@@ -19328,8 +19328,8 @@ define <16 x float> @test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float>
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
@@ -19364,9 +19364,9 @@ define <16 x float> @test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16
 define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -19394,7 +19394,7 @@ define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double>
 define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -19420,7 +19420,7 @@ define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #
 define <8 x double> @test_compress_pd_512(<8 x double> %data, <8 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19445,9 +19445,9 @@ declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <
 define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -19475,7 +19475,7 @@ define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float>
 define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -19501,7 +19501,7 @@ define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask)
 define <16 x float> @test_compress_ps_512(<16 x float> %data, <16 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19526,9 +19526,9 @@ declare <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <
 define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -19556,7 +19556,7 @@ define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru,
 define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -19582,7 +19582,7 @@ define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 {
 define <8 x i64> @test_compress_q_512(<8 x i64> %data, <8 x i64> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19607,9 +19607,9 @@ declare <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64
 define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -19637,7 +19637,7 @@ define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passth
 define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -19663,7 +19663,7 @@ define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 {
 define <16 x i32> @test_compress_d_512(<16 x i32> %data, <16 x i32> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19688,7 +19688,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x
 define <8 x double> @test_expand_pd_512(<8 x double> %data, <8 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19711,9 +19711,9 @@ define <8 x double> @test_expand_pd_512(<8 x double> %data, <8 x double> %extra_
 define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -19741,7 +19741,7 @@ define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %p
 define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -19769,7 +19769,7 @@ declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8
 define <16 x float> @test_expand_ps_512(<16 x float> %data, <16 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19792,9 +19792,9 @@ define <16 x float> @test_expand_ps_512(<16 x float> %data, <16 x float> %extra_
 define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -19822,7 +19822,7 @@ define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %p
 define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -19850,7 +19850,7 @@ declare <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16
 define <8 x i64> @test_expand_q_512(<8 x i64> %data, <8 x i64> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19873,9 +19873,9 @@ define <8 x i64> @test_expand_q_512(<8 x i64> %data, <8 x i64> %extra_param) #0
 define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_expand_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -19903,7 +19903,7 @@ define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i
 define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_expand_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -19931,7 +19931,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64>
 define <16 x i32> @test_expand_d_512(<16 x i32> %data, <16 x i32> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -19954,9 +19954,9 @@ define <16 x i32> @test_expand_d_512(<16 x i32> %data, <16 x i32> %extra_param)
 define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_mask_expand_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -19984,7 +19984,7 @@ define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru
 define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 {
 ;
 ; CHECK-LABEL: @test_maskz_expand_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -20014,10 +20014,10 @@ define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float>
 ; CHECK-LABEL: @test_cmp_512(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP0]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
index d8f204f32cfd1..cc022e93bb7c0 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
@@ -46,9 +46,9 @@ target triple = "x86_64-unknown-linux-gnu"
 define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -76,7 +76,7 @@ define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double>
 define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -103,7 +103,7 @@ define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0
 define <8 x double> @test_compress_pd_512(<8 x double> %data, <8 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -125,9 +125,9 @@ define <8 x double> @test_compress_pd_512(<8 x double> %data, <8 x double> %extr
 define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mask_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -155,7 +155,7 @@ define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float>
 define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -182,7 +182,7 @@ define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #
 define <16 x float> @test_compress_ps_512(<16 x float> %data, <16 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -204,9 +204,9 @@ define <16 x float> @test_compress_ps_512(<16 x float> %data, <16 x float> %extr
 define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -234,7 +234,7 @@ define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru,
 define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -261,7 +261,7 @@ define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 {
 define <8 x i64> @test_compress_q_512(<8 x i64> %data, <8 x i64> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -283,9 +283,9 @@ define <8 x i64> @test_compress_q_512(<8 x i64> %data, <8 x i64> %extra_param) #
 define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mask_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -313,7 +313,7 @@ define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passth
 define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -340,7 +340,7 @@ define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 {
 define <16 x i32> @test_compress_d_512(<16 x i32> %data, <16 x i32> %extra_param) #0 {
 ; CHECK-LABEL: @test_compress_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -363,7 +363,7 @@ define <16 x i32> @test_compress_d_512(<16 x i32> %data, <16 x i32> %extra_param
 define <8 x double> @test_expand_pd_512(<8 x double> %data, <8 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -385,9 +385,9 @@ define <8 x double> @test_expand_pd_512(<8 x double> %data, <8 x double> %extra_
 define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -415,7 +415,7 @@ define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %p
 define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -442,7 +442,7 @@ define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 {
 define <16 x float> @test_expand_ps_512(<16 x float> %data, <16 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -464,9 +464,9 @@ define <16 x float> @test_expand_ps_512(<16 x float> %data, <16 x float> %extra_
 define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mask_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -494,7 +494,7 @@ define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %p
 define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -521,7 +521,7 @@ define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0
 define <8 x i64> @test_expand_q_512(<8 x i64> %data, <8 x i64> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_q_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -543,9 +543,9 @@ define <8 x i64> @test_expand_q_512(<8 x i64> %data, <8 x i64> %extra_param) #0
 define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_expand_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
@@ -573,7 +573,7 @@ define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i
 define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_expand_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -600,7 +600,7 @@ define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 {
 define <16 x i32> @test_expand_d_512(<16 x i32> %data, <16 x i32> %extra_param) #0 {
 ; CHECK-LABEL: @test_expand_d_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -622,9 +622,9 @@ define <16 x i32> @test_expand_d_512(<16 x i32> %data, <16 x i32> %extra_param)
 define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mask_expand_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
@@ -652,7 +652,7 @@ define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru
 define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_expand_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -713,8 +713,8 @@ declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double
 define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b, <2 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_rndscale_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -740,9 +740,9 @@ define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b, <2 x dou
 define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 {
 ; CHECK-LABEL: @test_rndscale_sd_mask(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -769,10 +769,10 @@ define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2
 define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x double> %c, i8 %mask) #0 {
 ; CHECK-LABEL: @test_rndscale_sd_mask_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -812,8 +812,8 @@ define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x
 define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) #0 {
 ; CHECK-LABEL: @test_rndscale_sd_maskz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -840,8 +840,8 @@ declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>,
 define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b, <4 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_rndscale_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -866,9 +866,9 @@ define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b, <4 x float>
 define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr, <4 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_rndscale_ss_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
@@ -906,9 +906,9 @@ define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr, <4 x float>
 define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 {
 ; CHECK-LABEL: @test_rndscale_ss_mask(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -936,8 +936,8 @@ define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x f
 define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) #0 {
 ; CHECK-LABEL: @test_rndscale_ss_maskz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1024,8 +1024,8 @@ define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 {
 define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_sqrt_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
@@ -1050,7 +1050,7 @@ define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passt
 define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_sqrt_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]])
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
@@ -1094,8 +1094,8 @@ define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 {
 define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_sqrt_round_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -1127,7 +1127,7 @@ define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double>
 define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_sqrt_round_pd_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -1171,8 +1171,8 @@ define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 {
 define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mask_sqrt_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]])
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
@@ -1197,7 +1197,7 @@ define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passt
 define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_sqrt_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]])
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
@@ -1241,8 +1241,8 @@ define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 {
 define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mask_sqrt_round_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -1274,7 +1274,7 @@ define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float>
 define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_sqrt_round_ps_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -1385,9 +1385,9 @@ declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x
 define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_sqrt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1467,9 +1467,9 @@ declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <
 define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_sqrt_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr),
align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -1870,9 +1870,9 @@ declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, ptr %dst) #0 { ; CHECK-LABEL: @test_x86_vcvtps2ph_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = sext <16 x i1> [[TMP6]] to <16 x i16> @@ -1929,7 +1929,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { ; CHECK-LABEL: @test_cmpps( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -1955,7 +1955,7 @@ declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { ; CHECK-LABEL: @test_cmppd( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -1983,7 +1983,7 @@ declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i3 define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: @test_vmaxpd( ; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], zeroinitializer @@ -1999,7 +1999,7 @@ declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32 define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: @test_vminpd( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], zeroinitializer @@ -2014,8 +2014,8 @@ declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32 define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 { ; CHECK-LABEL: @test_mask_store_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP1]], 0 @@ -2060,7 +2060,7 @@ declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32 define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vsubps_rn( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2083,7 +2083,7 @@ define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vsubps_rd( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2106,7 +2106,7 @@ define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x 
float> %a1) #0 { define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vsubps_ru( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2129,7 +2129,7 @@ define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vsubps_rz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2152,7 +2152,7 @@ define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vmulps_rn( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2175,7 +2175,7 @@ define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vmulps_rd( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2198,7 +2198,7 @@ define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vmulps_ru( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2221,7 +2221,7 @@ define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { define <16 x float> @test_vmulps_rz(<16 x 
float> %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: @test_vmulps_rz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -2244,8 +2244,8 @@ define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_rn( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2279,8 +2279,8 @@ define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_rd( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2314,8 +2314,8 @@ define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_ru( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2349,8 +2349,8 @@ define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 define <16 x 
float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_rz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2384,9 +2384,9 @@ define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_passthru_rn( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2421,9 +2421,9 @@ define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_passthru_rd( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2458,9 +2458,9 @@ define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x 
float> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_passthru_ru( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2495,9 +2495,9 @@ define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_vmulps_mask_passthru_rz( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2532,8 +2532,8 @@ define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_vmulpd_mask_rn( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2567,8 +2567,8 @@ define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_vmulpd_mask_rd( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr 
@__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2602,8 +2602,8 @@ define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_vmulpd_mask_ru( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2637,8 +2637,8 @@ define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_vmulpd_mask_rz( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2672,8 +2672,8 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp 
ne i512 [[TMP4]], 0 @@ -2707,8 +2707,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2742,8 +2742,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2777,8 +2777,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2812,8 +2812,8 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -2847,9 +2847,9 @@ define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2884,9 +2884,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2921,9 +2921,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2958,9 +2958,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2995,9 +2995,9 @@ define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_add_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -3032,7 +3032,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3055,7 +3055,7 @@ define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3078,7 +3078,7 @@ define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3101,7 +3101,7 @@ define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3124,7 +3124,7 @@ define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_add_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3148,9 +3148,9 @@ declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32 define <16 x float> 
@test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -3185,9 +3185,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -3222,9 +3222,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -3259,9 +3259,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -3296,9 +3296,9 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -3333,7 +3333,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3356,7 +3356,7 @@ define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x 
float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3379,7 +3379,7 @@ define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3402,7 +3402,7 @@ define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3425,7 +3425,7 @@ define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x floa define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_sub_round_ps_current( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -3448,8 +3448,8 @@ define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x flo define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -3483,8 +3483,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16
 define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -3518,8 +3518,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16
 define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -3553,8 +3553,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16
 define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -3588,8 +3588,8 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16
 define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -3623,9 +3623,9 @@ define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -3660,9 +3660,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x
 define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -3697,9 +3697,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x
 define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -3734,9 +3734,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x
 define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -3771,9 +3771,9 @@ define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x
 define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_div_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -3808,7 +3808,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -3831,7 +3831,7 @@ define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x floa
 define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -3854,7 +3854,7 @@ define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x floa
 define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -3877,7 +3877,7 @@ define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x floa
 define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -3900,7 +3900,7 @@ define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x floa
 define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_div_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -3924,8 +3924,8 @@ declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32
 define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -3951,8 +3951,8 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f
 define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -3978,9 +3978,9 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4007,9 +4007,9 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl
 define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_min_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4036,7 +4036,7 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_min_round_ps_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4051,7 +4051,7 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float>
 define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_min_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4067,8 +4067,8 @@ declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32
 define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4094,8 +4094,8 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f
 define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4121,9 +4121,9 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4150,9 +4150,9 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl
 define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_mask_max_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4179,7 +4179,7 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16
 define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_max_round_ps_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4194,7 +4194,7 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float>
 define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
 ; CHECK-LABEL: @test_mm512_max_round_ps_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
@@ -4212,9 +4212,9 @@ declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_ss_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4242,9 +4242,9 @@ define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x f
 define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_ss_rd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4272,9 +4272,9 @@ define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x f
 define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_ss_ru(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4302,9 +4302,9 @@ define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x f
 define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_ss_rz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4332,9 +4332,9 @@ define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x f
 define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_ss_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4362,8 +4362,8 @@ define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <
 define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_add_ss_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4388,7 +4388,7 @@ define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %m
 define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_add_ss_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -4410,11 +4410,11 @@ define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) #0 {
 define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask, <4 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_mask_add_ss_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -4465,10 +4465,10 @@ define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <
 define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, ptr %a1, i8 %mask, <4 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_maskz_add_ss_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
@@ -4519,9 +4519,9 @@ declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_sd_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4549,9 +4549,9 @@ define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2
 define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_sd_rd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4579,9 +4579,9 @@ define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2
 define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_sd_ru(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4609,9 +4609,9 @@ define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2
 define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_sd_rz(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4639,9 +4639,9 @@ define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2
 define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_add_sd_current(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4669,8 +4669,8 @@ define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1
 define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_add_sd_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4695,7 +4695,7 @@ define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8
 define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_add_sd_rn(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -4717,11 +4717,11 @@ define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) #0 {
 define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask, <2 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_mask_add_sd_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -4766,10 +4766,10 @@ define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1,
 define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, ptr %a1, i8 %mask, <2 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_maskz_add_sd_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
@@ -4814,9 +4814,9 @@ declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>,
 define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_max_ss_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4844,8 +4844,8 @@ define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x
 define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_max_ss_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4870,7 +4870,7 @@ define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %
 define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_max_ss_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -4893,9 +4893,9 @@ define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) #0 {
 define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_max_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -4923,8 +4923,8 @@ define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x floa
 define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_max_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4949,7 +4949,7 @@ define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask
 define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_max_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -4971,11 +4971,11 @@ define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask, <4 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_mask_max_ss_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -5026,10 +5026,10 @@ define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x floa
 define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, ptr %a1, i8 %mask, <4 x float> %extra_param) #0 {
 ; CHECK-LABEL: @test_maskz_max_ss_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
@@ -5079,9 +5079,9 @@ declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x doubl
 define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_max_sd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -5109,8 +5109,8 @@ define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2
 define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_max_sd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5135,7 +5135,7 @@ define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i
 define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_max_sd_sae(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -5158,9 +5158,9 @@ define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) #0 {
 define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
 ; CHECK-LABEL: @test_mask_max_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -5188,8 +5188,8 @@ define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x d
 define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
 ; CHECK-LABEL: @test_maskz_max_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5214,7 +5214,7 @@ define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %m
 define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_max_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -5236,11 +5236,11 @@ define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask, <2 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_mask_max_sd_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
@@ -5285,10 +5285,10 @@ define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x do
 define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %mask, <2 x double> %extra_param) #0 {
 ; CHECK-LABEL: @test_maskz_max_sd_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
@@ -5331,7 +5331,7 @@ define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %ma
 define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) #0 {
 ; CHECK-LABEL: @test_x86_avx512_cvtsi2ss32(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -5353,7 +5353,7 @@ declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind
 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) #0 {
 ; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
@@ -5373,7 +5373,7 @@ define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
 define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
 ; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss_mem(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -5405,7 +5405,7 @@ define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr
 define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) #0 {
 ; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
@@ -5425,7 +5425,7 @@ define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) #0 {
 define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
 ; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss_mem(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to
ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -5460,9 +5460,9 @@ declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -5495,10 +5495,10 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -5544,8 +5544,8 @@ declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x 
i64> [[TMP1]] to <8 x double> @@ -5570,9 +5570,9 @@ define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double> @@ -5613,8 +5613,8 @@ declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32> define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> @@ -5639,9 +5639,9 @@ define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x 
i4> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> @@ -5682,8 +5682,8 @@ declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X3:%.*]], <8 x i64> [[TMP3]]) @@ -5705,9 +5705,9 @@ define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> % define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i3> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]]) @@ -5738,10 +5738,10 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -5784,12 +5784,12 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3, <8 x double> %extra_param, <8 x double> %extra_param2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] @@ -5842,10 +5842,10 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float> @@ -5880,10 +5880,10 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 
%x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i64> [[X0]] to <8 x i3> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X4:%.*]], <8 x i64> [[TMP3]]) @@ -5914,8 +5914,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> @@ -5937,10 +5937,10 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[X0]] to <16 x i4> ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X4:%.*]], <16 x i32> [[TMP3]]) @@ -5973,9 +5973,9 @@ declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x doub define <8 x 
double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -6021,9 +6021,9 @@ declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x flo define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -6070,8 +6070,8 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[TMP1]], <16 x i8> [[TMP2]], i8 -1) ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> zeroinitializer, [[TMP4]] @@ -6106,8 +6106,8 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64>, i8) 
define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6143,8 +6143,8 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[TMP1]], <16 x i8> [[TMP2]], i8 -1) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i8> zeroinitializer, [[TMP4]] @@ -6179,8 +6179,8 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6216,8 +6216,8 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8) define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[TMP1]], <16 x i8> [[TMP2]], i8 -1) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i8> zeroinitializer, [[TMP4]] @@ -6252,8 +6252,8 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6289,8 +6289,8 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[TMP1]], <8 x i16> [[TMP2]], i8 -1) ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i16> zeroinitializer, [[TMP8]] @@ -6325,8 +6325,8 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6362,8 +6362,8 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { ; 
CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[TMP1]], <8 x i16> [[TMP2]], i8 -1) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i16> zeroinitializer, [[TMP11]] @@ -6398,8 +6398,8 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6435,8 +6435,8 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[TMP1]], <8 x i16> [[TMP2]], i8 -1) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i16> zeroinitializer, [[TMP11]] @@ -6471,8 +6471,8 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6519,8 +6519,8 @@ define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) # define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> @@ -6544,7 +6544,7 @@ define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> % define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> @@ -6570,8 +6570,8 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6607,7 +6607,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmovs_qd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x i32> [[TMP2]] @@ -6621,9 +6621,9 @@ define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_512( -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> @@ -6644,7 +6644,7 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovs_qd_512( -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP3:%.*]] to <8 x i1> @@ -6669,8 +6669,8 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6706,7 +6706,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmovus_qd_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x 
i32> [[TMP2]] @@ -6720,9 +6720,9 @@ define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_512( -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4:%.*]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> @@ -6743,7 +6743,7 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovus_qd_512( -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP3:%.*]] to <8 x i1> @@ -6768,8 +6768,8 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64>, i8) define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 @@ -6805,8 +6805,8 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> 
@llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[TMP1]], <16 x i8> [[TMP2]], i16 -1) ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i8> zeroinitializer, [[TMP8]] @@ -6841,8 +6841,8 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32>, i16) define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 @@ -6878,8 +6878,8 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[TMP1]], <16 x i8> [[TMP2]], i16 -1) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i8> zeroinitializer, [[TMP11]] @@ -6914,8 +6914,8 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32>, i16) define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 @@ -6951,8 +6951,8 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16 define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 
add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[TMP1]], <16 x i8> [[TMP2]], i16 -1) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i8> zeroinitializer, [[TMP11]] @@ -6987,8 +6987,8 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32>, i16) define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 @@ -7024,8 +7024,8 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16 define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[TMP1]], <16 x i16> [[TMP2]], i16 -1) ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i16> zeroinitializer, [[TMP8]] @@ -7060,8 +7060,8 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32>, i16) define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 @@ -7097,8 +7097,8 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i1 define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[TMP1]], <16 x i16> [[TMP2]], i16 -1) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i16> zeroinitializer, [[TMP11]] @@ -7133,8 +7133,8 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32>, i16) define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 @@ -7170,8 +7170,8 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[TMP1]], <16 x i16> [[TMP2]], i16 -1) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i16> zeroinitializer, [[TMP11]] @@ -7206,8 +7206,8 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32>, i16) define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
@@ -7243,8 +7243,8 @@ declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32
define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
@@ -7283,8 +7283,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2dq_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7325,8 +7325,8 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>
define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7367,8 +7367,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2udq_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7408,9 +7408,9 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2dq_512(
-; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
@@ -7443,8 +7443,8 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double
define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -7485,8 +7485,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2udq_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7527,8 +7527,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2dq_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7569,8 +7569,8 @@ declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32
define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
@@ -7609,8 +7609,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2udq_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7651,8 +7651,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2dq_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7693,8 +7693,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2udq_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -7735,7 +7735,7 @@ declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4
define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
; CHECK-LABEL: @test_getexp_ss(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -7758,9 +7758,9 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
; CHECK-LABEL: @test_mask_getexp_ss(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -7807,8 +7807,8 @@ define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_maskz_getexp_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
; CHECK-LABEL: @test_maskz_getexp_ss(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -7835,7 +7835,7 @@ declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>,
define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_getexp_sd(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -7858,9 +7858,9 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
; CHECK-LABEL: @test_mask_getexp_sd(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -7907,8 +7907,8 @@ define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_maskz_getexp_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
; CHECK-LABEL: @test_maskz_getexp_sd(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -7935,8 +7935,8 @@ declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32
define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -7961,8 +7961,8 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8
define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd_all(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -8053,8 +8053,8 @@ declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -8080,8 +8080,8 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %
define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss_all(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -8166,8 +8166,8 @@ declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8
define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -8208,8 +8208,8 @@ declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16
define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -8250,9 +8250,9 @@ declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_sd(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -8334,9 +8334,9 @@ declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -8413,9 +8413,9 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo
define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, ptr %x1p, <4 x float> %extra_param) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
@@ -8455,7 +8455,7 @@ declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
@@ -8479,9 +8479,9 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x
define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_mask(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
@@ -8517,8 +8517,8 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0,
define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_maskz(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X1]] to <8 x i3>
; CHECK-NEXT: [[X0:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
@@ -8555,7 +8555,7 @@ declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
@@ -8579,9 +8579,9 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_mask(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
@@ -8617,8 +8617,8 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0,
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_maskz(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[X1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[X1]] to <16 x i4>
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
@@ -8668,8 +8668,8 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x fl
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> )
@@ -8697,7 +8697,7 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[X0:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> )
@@ -8726,9 +8726,9 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x flo
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ss2sd_round(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -8775,9 +8775,9 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x doubl
define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_sd2ss_round(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -8824,8 +8824,8 @@ declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x
define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -8851,9 +8851,9 @@ define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -8889,9 +8889,9 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x
define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -8929,8 +8929,8 @@ declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64
define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -8956,9 +8956,9 @@ define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -8994,9 +8994,9 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64
define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -9032,7 +9032,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_comi_sd_eq_sae(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9055,7 +9055,7 @@ define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq_sae(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9078,7 +9078,7 @@ define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_comi_sd_eq(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9101,7 +9101,7 @@ define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9124,7 +9124,7 @@ define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_comi_sd_lt_sae(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9147,7 +9147,7 @@ define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt_sae(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9170,7 +9170,7 @@ define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_comi_sd_lt(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9193,7 +9193,7 @@ define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9218,7 +9218,7 @@ declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) #0 {
; CHECK-LABEL: @test_x86_avx512_ucomi_ss_lt(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -9245,7 +9245,7 @@ declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) #0 {
; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -9268,9 +9268,9 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i6
define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -9305,8 +9305,8 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8
define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -9342,7 +9342,7 @@ declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) #0 {
; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -9356,9 +9356,9 @@ define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1
define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -9382,8 +9382,8 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64
define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
@@ -9409,7 +9409,7 @@ declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) #0 {
; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -9432,9 +9432,9 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i
define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -9469,8 +9469,8 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1
define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -9506,7 +9506,7 @@ declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) #0 {
; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -9520,9 +9520,9 @@ define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -9546,8 +9546,8 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
@@ -9573,9 +9573,9 @@ declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x do
define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -9635,9 +9635,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, ptr %x2ptr) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -9677,9 +9677,9 @@ declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x d
define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_pd_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
@@ -9742,9 +9742,9 @@ declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ss(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -9807,9 +9807,9 @@ declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ss(
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -9872,9 +9872,9 @@ declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x f
define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr
@__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -9934,9 +9934,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, < define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, ptr %x2ptr) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -9976,9 +9976,9 @@ declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ps_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -10041,9 +10041,9 @@ declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, 
ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -10106,9 +10106,9 @@ declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x doubl define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -10174,9 +10174,9 @@ declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0 define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 @@ -10287,9 +10287,9 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x d define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 @@ -10512,9 +10512,9 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x flo define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr nocapture readonly %1, float %2, float %3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_load0( -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 @@ -10562,9 +10562,9 @@ define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 @@ -10675,9 +10675,9 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x 
i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 @@ -10788,10 +10788,10 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x fl define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_param, <4 x float> %extra_param2) #0 { ; CHECK-LABEL: @fmadd_ss_mask_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] @@ -10896,10 +10896,10 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_par define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_param, <4 x float> %extra_param2) #0 { ; CHECK-LABEL: @fmadd_ss_maskz_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] @@ -11003,10 +11003,10 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c, <4 x float> %extra_pa define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_param, <2 x double> %extra_param2) #0 { ; CHECK-LABEL: @fmadd_sd_mask_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] @@ -11099,10 +11099,10 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c, <2 x double> %extra_pa define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c, <2x double> %extra_param, <2x double> %extra_param2) #0 { ; CHECK-LABEL: @fmadd_sd_maskz_memfold( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] @@ -11193,10 +11193,10 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c, <2x double> %extra_pa define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), 
align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]] ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 @@ -11321,10 +11321,10 @@ define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]] ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 @@ -11450,9 +11450,9 @@ define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x fl define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]] ; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]] @@ -11584,9 +11584,9 @@ define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x 
float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_ss( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]] ; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]] @@ -11717,11 +11717,11 @@ define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x f define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4, <4 x float> %extra_param) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] @@ -11777,11 +11777,11 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4, <4 x float> %extra_param) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] @@ -11838,10 +11838,10 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4, <4 x float> %extra_param) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] @@ -11891,7 +11891,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psll_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -11910,9 +11910,9 @@ define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psll_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, 
ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -11941,8 +11941,8 @@ define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1 define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psll_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -11974,7 +11974,7 @@ declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind r define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psll_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -11993,9 +11993,9 @@ define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psll_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -12024,8 +12024,8 @@ define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psll_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr 
@__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -12070,8 +12070,8 @@ define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) #0 { define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_pslli_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -12095,7 +12095,7 @@ define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> % define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer @@ -12135,8 +12135,8 @@ define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) #0 { define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_pslli_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -12160,7 +12160,7 @@ define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 
x i64> %a0, <8 x i64> %pas define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer @@ -12187,7 +12187,7 @@ declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psra_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -12206,9 +12206,9 @@ define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psra_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -12237,8 +12237,8 @@ define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psra_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> 
[[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -12270,7 +12270,7 @@ declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind rea define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psra_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -12289,9 +12289,9 @@ define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psra_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -12320,8 +12320,8 @@ define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1 define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psra_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -12367,8 +12367,8 @@ define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) #0 { define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrai_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -12392,7 +12392,7 @@ define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %pas define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer @@ -12432,8 +12432,8 @@ define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) #0 { define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrai_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -12457,7 +12457,7 @@ define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> % define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer @@ -12485,7 +12485,7 @@ declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readno define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrl_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -12504,9 +12504,9 @@ define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrl_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -12535,8 +12535,8 @@ define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1 define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -12568,7 +12568,7 @@ declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind r define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrl_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -12587,9 +12587,9 @@ define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrl_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -12618,8 +12618,8 @@ define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -12664,8 +12664,8 @@ define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) #0 { define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrli_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer @@ -12689,7 +12689,7 @@ define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> % define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or 
<16 x i32> [[TMP3]], zeroinitializer @@ -12729,8 +12729,8 @@ define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) #0 { define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrli_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer @@ -12754,7 +12754,7 @@ define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %pas define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer @@ -12780,7 +12780,7 @@ declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psllv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -12817,9 +12817,9 @@ define <16 x i32> @test_x86_avx512_psllv_d_512_const() #0 { define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psllv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, 
ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -12846,8 +12846,8 @@ define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> % define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -12876,7 +12876,7 @@ declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psllv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -12913,9 +12913,9 @@ define <8 x i64> @test_x86_avx512_psllv_q_512_const() #0 { define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psllv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -12942,8 +12942,8 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -12972,7 +12972,7 @@ declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind re define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrav_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -12989,9 +12989,9 @@ define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) # define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrav_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -13018,8 +13018,8 @@ define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> % define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: 
[[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -13048,7 +13048,7 @@ declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrav_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -13065,9 +13065,9 @@ define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrav_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -13094,8 +13094,8 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -13124,7 +13124,7 @@ declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind re define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrlv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -13161,9 +13161,9 @@ define <16 x i32> @test_x86_avx512_psrlv_d_512_const() #0 { define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -13190,8 +13190,8 @@ define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> % define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -13220,7 +13220,7 @@ declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrlv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> @@ -13257,9 +13257,9 @@ define <8 x i64> @test_x86_avx512_psrlv_q_512_const() #0 { define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q_512( ; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> @@ -13286,8 +13286,8 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> @@ -13414,13 +13414,13 @@ define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x ; CHECK-LABEL: @bad_mask_transition( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load 
<16 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 384), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 392), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 320), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 @@ -13487,9 +13487,9 @@ define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 ; CHECK-LABEL: @bad_mask_transition_2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 320), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll index 7bd35182d5c90..dbef575b30cc4 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll @@ -28,7 +28,7 @@ declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) define i32 @test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_kunpck_wd( ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[X0:%.*]] to <32 x i1> @@ -54,7 +54,7 @@ declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) define i64 @test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_kunpck_qd( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[X0:%.*]] to <64 x i1> @@ -80,8 +80,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) define { <64 x i8>, <64 x i8>, 
<64 x i8> } @test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcast_b_gpr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <64 x i8> splat (i8 -1), i8 [[TMP1]], i64 0 ; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <64 x i8> poison, i8 [[X0:%.*]], i64 0 @@ -134,8 +134,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcast_w_gpr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <32 x i16> splat (i16 -1), i16 [[TMP1]], i64 0 ; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <32 x i16> poison, i16 [[X0:%.*]], i64 0 @@ -187,10 +187,10 @@ declare void @llvm.x86.avx512.mask.storeu.b.512(ptr, <64 x i8>, i64) define void @test_int_x86_avx512_mask_storeu_b_512(ptr %ptr1, ptr %ptr2, <64 x i8> %x1, i64 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_b_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[X2:%.*]] to <64 x i1> @@ -230,10 +230,10 @@ declare void @llvm.x86.avx512.mask.storeu.w.512(ptr, <32 x i16>, i32) define void @test_int_x86_avx512_mask_storeu_w_512(ptr %ptr1, ptr %ptr2, <32 x i16> %x1, i32 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_w_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[X2:%.*]] to <32 x i1> @@ -274,8 +274,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(ptr, <32 x i16>, i32) define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_loadu_w_512(ptr %ptr, ptr %ptr2, <32 x i16> %x1, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_loadu_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -343,8 +343,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(ptr, <64 x i8>, i64) define { <64 x i8>, <64 x i8>, <64 x i8> } @test_int_x86_avx512_mask_loadu_b_512(ptr %ptr, ptr %ptr2, <64 x i8> %x1, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_loadu_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -533,7 +533,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, < define <64 x i8> @test_int_x86_avx512_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_palignr_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call 
void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i32> @@ -547,10 +547,10 @@ define <64 x i8> @test_int_x86_avx512_palignr_512(<64 x i8> %x0, <64 x i8> %x1, define <64 x i8> @test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_palignr_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i32> ; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <64 x i8> [[X1:%.*]], <64 x i8> [[X0:%.*]], <64 x i32> @@ -571,9 +571,9 @@ define <64 x i8> @test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> define <64 x i8> @test_int_x86_avx512_maskz_palignr_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x4) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_palignr_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i32> ; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <64 x i8> [[X1:%.*]], <64 x i8> [[X0:%.*]], <64 x i32> @@ -610,8 +610,8 @@ define <32 x i16> @test_int_x86_avx512_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 define <32 x i16> @test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pshufh_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i16> 
[[X0:%.*]], <32 x i16> [[X0]], <32 x i32> @@ -633,7 +633,7 @@ define <32 x i16> @test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1 define <32 x i16> @test_int_x86_avx512_maskz_pshufh_w_512(<32 x i16> %x0, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pshufh_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[X0:%.*]], <32 x i16> [[X0]], <32 x i32> @@ -670,8 +670,8 @@ define <32 x i16> @test_int_x86_avx512_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 define <32 x i16> @test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pshufl_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i16> [[X0:%.*]], <32 x i16> [[X0]], <32 x i32> @@ -693,7 +693,7 @@ define <32 x i16> @test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1 define <32 x i16> @test_int_x86_avx512_maskz_pshufl_w_512(<32 x i16> %x0, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pshufl_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP1]], <32 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[X0:%.*]], <32 x i16> [[X0]], <32 x i32> @@ -715,7 +715,7 @@ define <32 x i16> @test_int_x86_avx512_maskz_pshufl_w_512(<32 x i16> %x0, i32 %x define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) nounwind #0 { ; CHECK-LABEL: @test_pcmpeq_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <64 x i8> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] @@ -737,8 +737,8 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) nounwind #0 { define i64 @test_mask_pcmpeq_b(<64 x i8> %a, 
<64 x i8> %b, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_pcmpeq_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <64 x i8> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP5:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] @@ -770,7 +770,7 @@ declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64) define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) nounwind #0 { ; CHECK-LABEL: @test_pcmpeq_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <32 x i16> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] @@ -792,8 +792,8 @@ define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) nounwind #0 { define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_pcmpeq_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <32 x i16> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] @@ -825,7 +825,7 @@ declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32) define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) nounwind #0 { ; CHECK-LABEL: @test_pcmpgt_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <64 x i8> [[A:%.*]], splat (i8 -128) ; CHECK-NEXT: [[TMP4:%.*]] = xor <64 x i8> [[TMP1]], splat (i8 -1) @@ -851,8 +851,8 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) nounwind #0 { define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_pcmpgt_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <64 x i8> [[A:%.*]], splat (i8 -128) ; CHECK-NEXT: [[TMP5:%.*]] = xor <64 x i8> [[TMP1]], splat (i8 -1) @@ -888,7 +888,7 @@ declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64) define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) nounwind #0 { ; CHECK-LABEL: @test_pcmpgt_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor <32 x i16> [[A:%.*]], splat (i16 -32768) ; CHECK-NEXT: [[TMP4:%.*]] = xor <32 x i16> [[TMP1]], splat (i16 -1) @@ -914,8 +914,8 @@ define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) nounwind #0 { define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_pcmpgt_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <32 x i16> [[A:%.*]], splat (i16 -32768) ; CHECK-NEXT: [[TMP5:%.*]] = xor <32 x i16> [[TMP1]], splat (i16 -1) @@ -953,7 +953,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 define <64 x i8> @test_int_x86_avx512_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_punpckhb_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i32> @@ -967,9 +967,9 @@ define <64 x i8> @test_int_x86_avx512_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x define <64 x i8> @test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhb_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: 
[[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i32> @@ -993,7 +993,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 define <64 x i8> @test_int_x86_avx512_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_punpcklb_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i32> @@ -1007,9 +1007,9 @@ define <64 x i8> @test_int_x86_avx512_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x define <64 x i8> @test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_punpcklb_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <64 x i8> [[TMP1]], <64 x i8> [[TMP2]], <64 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i32> @@ -1033,7 +1033,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, define <32 x i16> @test_int_x86_avx512_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_punpckhw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i32> @@ -1047,9 +1047,9 @@ define <32 x i16> @test_int_x86_avx512_punpckhw_d_512(<32 x i16> %x0, <32 x i16> define <32 x i16> @test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i32> @@ -1073,7 +1073,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, define <32 x i16> @test_int_x86_avx512_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_punpcklw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i32> @@ -1087,9 +1087,9 @@ define <32 x i16> @test_int_x86_avx512_punpcklw_d_512(<32 x i16> %x0, <32 x i16> define <32 x i16> @test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_punpcklw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, 
i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <32 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i32>
@@ -1113,7 +1113,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8> @test_int_x86_avx512_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pmaxs_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1127,9 +1127,9 @@ define <64 x i8> @test_int_x86_avx512_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1,
define <64 x i8> @test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.smax.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1153,7 +1153,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_int_x86_avx512_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pmaxs_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1167,9 +1167,9 @@ define <32 x i16> @test_int_x86_avx512_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x
define <32 x i16> @test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.smax.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1193,7 +1193,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8> @test_int_x86_avx512_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pmaxu_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1207,9 +1207,9 @@ define <64 x i8> @test_int_x86_avx512_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1,
define <64 x i8> @test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.umax.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1233,7 +1233,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_int_x86_avx512_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pmaxu_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1247,9 +1247,9 @@ define <32 x i16> @test_int_x86_avx512_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x
define <32 x i16> @test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.umax.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1273,7 +1273,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8> @test_int_x86_avx512_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pmins_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1287,9 +1287,9 @@ define <64 x i8> @test_int_x86_avx512_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1,
define <64 x i8> @test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.smin.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1313,7 +1313,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_int_x86_avx512_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pmins_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1327,9 +1327,9 @@ define <32 x i16> @test_int_x86_avx512_pmins_w_512(<32 x i16> %x0, <32 x i16> %x
define <32 x i16> @test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.smin.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1353,7 +1353,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8> @test_int_x86_avx512_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pminu_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1367,9 +1367,9 @@ define <64 x i8> @test_int_x86_avx512_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1,
define <64 x i8> @test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.umin.v64i8(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]])
@@ -1393,7 +1393,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_int_x86_avx512_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pminu_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1407,9 +1407,9 @@ define <32 x i16> @test_int_x86_avx512_pminu_w_512(<32 x i16> %x0, <32 x i16> %x
define <32 x i16> @test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.umin.v32i16(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -1448,8 +1448,8 @@ define <32 x i16> @test_int_x86_avx512_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %
define <32 x i16> @test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> splat (i8 -1), <32 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[X0:%.*]], <32 x i8> poison, <32 x i32>
@@ -1473,7 +1473,7 @@ define <32 x i16> @test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i
define <32 x i16> @test_int_x86_avx512_maskz_pmovzxb_w_512(<32 x i8> %x0, i32 %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> splat (i8 -1), <32 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[X0:%.*]], <32 x i8> poison, <32 x i32>
@@ -1514,8 +1514,8 @@ define <32 x i16> @test_int_x86_avx512_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %
define <32 x i16> @test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> splat (i8 -1), <32 x i32>
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[X0:%.*]], <32 x i8> poison, <32 x i32>
@@ -1539,7 +1539,7 @@ define <32 x i16> @test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i
define <32 x i16> @test_int_x86_avx512_maskz_pmovsxb_w_512(<32 x i8> %x0, i32 %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> splat (i8 -1), <32 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <32 x i8> [[X0:%.*]], <32 x i8> poison, <32 x i32>
@@ -1565,7 +1565,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16>, <8 x i16>, <32 x
define <32 x i16> @test_int_x86_avx512_psrl_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_psrl_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1585,9 +1585,9 @@ define <32 x i16> @test_int_x86_avx512_psrl_w_512(<32 x i16> %x0, <8 x i16> %x1,
define <32 x i16> @test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
@@ -1615,8 +1615,8 @@ define <32 x i16> @test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
define <32 x i16> @test_int_x86_avx512_maskz_psrl_w_512(<32 x i16> %x0, <8 x i16> %x1, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_psrl_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
@@ -1646,8 +1646,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i32, <32 x i16>
define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_wi_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> [[TMP1]], i32 3)
; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP4]], zeroinitializer
@@ -1697,7 +1697,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16>, <8 x i16>, <32 x
define <32 x i16> @test_int_x86_avx512_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_psra_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1717,9 +1717,9 @@ define <32 x i16> @test_int_x86_avx512_psra_w_512(<32 x i16> %x0, <8 x i16> %x1,
define <32 x i16> @test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_psra_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
@@ -1747,8 +1747,8 @@ define <32 x i16> @test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16>
define <32 x i16> @test_int_x86_avx512_maskz_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_psra_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
@@ -1778,8 +1778,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i32, <32 x i16>
define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_psra_wi_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> [[TMP1]], i32 3)
; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP4]], zeroinitializer
@@ -1829,7 +1829,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x
define <32 x i16> @test_int_x86_avx512_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_psll_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -1849,9 +1849,9 @@ define <32 x i16> @test_int_x86_avx512_psll_w_512(<32 x i16> %x0, <8 x i16> %x1,
define <32 x i16> @test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_psll_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
@@ -1879,8 +1879,8 @@ define <32 x i16> @test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16>
define <32 x i16> @test_int_x86_avx512_maskz_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_maskz_psll_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
@@ -1910,8 +1910,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i32, <32 x i16>
define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_psll_wi_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> [[TMP1]], i32 3)
; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP4]], zeroinitializer
@@ -1961,7 +1961,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8> @test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_pshuf_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP4]]
@@ -1976,9 +1976,9 @@ define <64 x i8> @test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1,
define <64 x i8> @test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 {
; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]])
; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP13]]
@@ -2035,7 +2035,7 @@ define <32 x i16> @test_int_x86_avx512_cvtmask2w_512(i32 %x0) nounwind #0 {
define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rr_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
@@ -2053,9 +2053,9 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) no
define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rrk_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
@@ -2081,8 +2081,8 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rrkz_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
@@ -2107,7 +2107,7 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -2137,10 +2137,10 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) nounw
define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -2177,9 +2177,9 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32
define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -2216,7 +2216,7 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32
define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rmb_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -2252,10 +2252,10 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) noun
define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rmbk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -2298,9 +2298,9 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32
define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi32_rmbkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -2346,7 +2346,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <3
define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi16_rr_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16>
@@ -2364,9 +2364,9 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nou
define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi16_rrk_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16>
@@ -2392,8 +2392,8 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6
define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi16_rrkz_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16>
@@ -2418,7 +2418,7 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi16_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -2448,10 +2448,10 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwi
define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi16_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -2488,9 +2488,9 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x
define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packs_epi16_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -2531,7 +2531,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64
define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rr_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
@@ -2549,9 +2549,9 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) n
define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rrk_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
@@ -2577,8 +2577,8 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rrkz_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
@@ -2603,7 +2603,7 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -2633,10 +2633,10 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) noun
define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -2673,9 +2673,9 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32
define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -2712,7 +2712,7 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i3
define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rmb_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -2748,10 +2748,10 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) nou
define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rmbk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -2794,9 +2794,9 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <3
define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi32_rmbkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -2842,7 +2842,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <3
define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi16_rr_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16>
@@ -2860,9 +2860,9 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) no
define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi16_rrk_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16>
@@ -2888,8 +2888,8 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi16_rrkz_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP16:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16>
@@ -2914,7 +2914,7 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b,
define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi16_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -2944,10 +2944,10 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounw
define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi16_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -2984,9 +2984,9 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64
define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_packus_epi16_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -3026,7 +3026,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64
define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind #0 {
; CHECK-LABEL: @test_cmp_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <64 x i8> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
@@ -3142,8 +3142,8 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind #0 {
define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_cmp_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = xor <64 x i8> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
@@ -3329,7 +3329,7 @@ declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) noun
define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind #0 {
; CHECK-LABEL: @test_ucmp_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <64 x i8> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
@@ -3437,8 +3437,8 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind #0 {
define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_x86_avx512_ucmp_b_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = xor <64 x i8> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
@@ -3616,7 +3616,7 @@ declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nou
define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind #0 {
; CHECK-LABEL: @test_cmp_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <32 x i16> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
@@ -3732,8 +3732,8 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind #0 {
define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_cmp_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = xor <32 x i16> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
@@ -3919,7 +3919,7 @@ declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) no
define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind #0 {
; CHECK-LABEL: @test_ucmp_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = xor <32 x i16> [[A0:%.*]], [[A1:%.*]]
; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
@@ -4027,8 +4027,8 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind #0 {
define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounwind #0 {
; CHECK-LABEL: @test_mask_ucmp_w_512(
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to
i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = xor <32 x i16> [[A0:%.*]], [[A1:%.*]] ; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] @@ -4209,7 +4209,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i define <64 x i8> @mm512_avg_epu8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 { ; CHECK-LABEL: @mm512_avg_epu8( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) @@ -4223,9 +4223,9 @@ define <64 x i8> @mm512_avg_epu8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i6 define <64 x i8> @mm512_mask_avg_epu8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) nounwind #0 { ; CHECK-LABEL: @mm512_mask_avg_epu8( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) @@ -4249,7 +4249,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 define <32 x i16> @mm512_avg_epu16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @mm512_avg_epu16( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4263,9 +4263,9 @@ define <32 x i16> @mm512_avg_epu16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x define <32 x i16> @mm512_mask_avg_epu16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @mm512_mask_avg_epu16( ; CHECK-NEXT: [[TMP1:%.*]] = 
load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4304,8 +4304,8 @@ define <32 x i16> @test_int_x86_avx512_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1 define <32 x i16> @test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <32 x i16> [[X0:%.*]], splat (i16 -32768) ; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> splat (i16 -1), <32 x i16> [[TMP1]] @@ -4346,8 +4346,8 @@ define <64 x i8> @test_int_x86_avx512_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1) n define <64 x i8> @test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <64 x i8> [[X0:%.*]], splat (i8 -128) ; CHECK-NEXT: [[TMP13:%.*]] = select <64 x i1> [[TMP12]], <64 x i8> splat (i8 -1), <64 x i8> [[TMP1]] @@ -4373,8 +4373,8 @@ declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64) define i64 @test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_ptestm_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) 
to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]] @@ -4432,8 +4432,8 @@ declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32) define i32 @test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_ptestm_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]] @@ -4491,8 +4491,8 @@ declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2) define i64 @test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_ptestnm_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = and <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[X0:%.*]], [[TMP2]] @@ -4550,8 +4550,8 @@ declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2) define i32 @test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_ptestnm_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = and <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[X0:%.*]], [[TMP2]] @@ -4655,7 +4655,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x 
i16>, <32 x i16>, <3 define <32 x i16> @test_int_x86_avx512_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmulhu_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4669,9 +4669,9 @@ define <32 x i16> @test_int_x86_avx512_pmulhu_w_512(<32 x i16> %x0, <32 x i16> % define <32 x i16> @test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4695,7 +4695,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 define <32 x i16> @test_int_x86_avx512_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmulh_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4709,9 +4709,9 @@ define <32 x i16> @test_int_x86_avx512_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x define <32 x i16> @test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = 
load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4735,7 +4735,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, define <32 x i16> @test_int_x86_avx512_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmulhr_sw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pmul.hr.sw.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4749,9 +4749,9 @@ define <32 x i16> @test_int_x86_avx512_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> define <32 x i16> @test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pmul.hr.sw.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4775,7 +4775,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <3 define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmaddubs_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: 
[[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer @@ -4801,9 +4801,9 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> % define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaddubs_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer @@ -4839,7 +4839,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <1 define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmaddw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer @@ -4865,9 +4865,9 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> % define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaddw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer @@ -4903,7 +4903,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, define <32 x i16> 
@test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4917,9 +4917,9 @@ define <32 x i16> @test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> define <32 x i16> @test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4941,8 +4941,8 @@ define <32 x i16> @test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x define <32 x i16> @test_int_x86_avx512_maskz_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -4965,8 +4965,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16 define <32 x i16> @test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: 
[[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5> @@ -4988,10 +4988,10 @@ define <32 x i16> @test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i define <32 x i16> @test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5> ; CHECK-NEXT: [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]]) @@ -5022,10 +5022,10 @@ declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i1 define <32 x i16> @test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5> ; CHECK-NEXT: [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]]) @@ -5057,8 +5057,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16 define <32 x i16> @test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_hi_512( ; 
CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[X1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[X1]] to <32 x i5> ; CHECK-NEXT: [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X3:%.*]], <32 x i16> [[TMP2]]) @@ -5080,9 +5080,9 @@ define <32 x i16> @test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i define <32 x i16> @test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = trunc <32 x i16> [[TMP3]] to <32 x i5> ; CHECK-NEXT: [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1:%.*]], <32 x i16> [[TMP2]]) @@ -5114,9 +5114,9 @@ declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) nounwind #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_dbpsadbw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -5188,7 +5188,7 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } 
@test_int_x86_avx512_mask_dbpsadbw define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu16_rr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) @@ -5202,9 +5202,9 @@ define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) nou define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu16_rrk_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) @@ -5226,8 +5226,8 @@ define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu16_rrkz_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) @@ -5248,7 +5248,7 @@ define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu16_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -5274,10 +5274,10 @@ define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwi define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu16_rmk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -5310,9 +5310,9 @@ define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu16_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -5348,7 +5348,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <3 define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu16_rr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) @@ -5362,9 +5362,9 @@ define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) nou define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu16_rrk_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: 
[[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) @@ -5386,8 +5386,8 @@ define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu16_rrkz_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]]) @@ -5408,7 +5408,7 @@ define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu16_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -5434,10 +5434,10 @@ define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwi define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu16_rmk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -5470,9 +5470,9 @@ define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i32 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu16_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -5508,7 +5508,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <3 define <64 x i8> @test_mask_adds_epu8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu8_rr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]]) @@ -5522,9 +5522,9 @@ define <64 x i8> @test_mask_adds_epu8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwin define <64 x i8> @test_mask_adds_epu8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu8_rrk_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]]) @@ -5546,8 +5546,8 @@ define <64 x i8> @test_mask_adds_epu8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x define <64 x i8> @test_mask_adds_epu8_rrkz_512(<64 x i8> %a, <64 x i8> 
%b, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu8_rrkz_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]]) @@ -5568,7 +5568,7 @@ define <64 x i8> @test_mask_adds_epu8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 % define <64 x i8> @test_mask_adds_epu8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu8_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -5594,10 +5594,10 @@ define <64 x i8> @test_mask_adds_epu8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind define <64 x i8> @test_mask_adds_epu8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu8_rmk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -5630,9 +5630,9 @@ define <64 x i8> @test_mask_adds_epu8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8 define <64 x i8> @test_mask_adds_epu8_rmkz_512(<64 x i8> %a, ptr %ptr_b, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_adds_epu8_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call 
void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -5668,7 +5668,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.paddus.b.512(<64 x i8>, <64 x i8>, <64 x define <64 x i8> @test_mask_subs_epu8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu8_rr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]]) @@ -5682,9 +5682,9 @@ define <64 x i8> @test_mask_subs_epu8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwin define <64 x i8> @test_mask_subs_epu8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu8_rrk_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]]) @@ -5706,8 +5706,8 @@ define <64 x i8> @test_mask_subs_epu8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x define <64 x i8> @test_mask_subs_epu8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu8_rrkz_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]]) @@ -5728,7 +5728,7 @@ define <64 x i8> @test_mask_subs_epu8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 % define <64 x i8> @test_mask_subs_epu8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind #0 { ; CHECK-LABEL: @test_mask_subs_epu8_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -5754,10 +5754,10 @@ define <64 x i8> @test_mask_subs_epu8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind
 
 define <64 x i8> @test_mask_subs_epu8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epu8_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -5790,9 +5790,9 @@ define <64 x i8> @test_mask_subs_epu8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8
 
 define <64 x i8> @test_mask_subs_epu8_rmkz_512(<64 x i8> %a, ptr %ptr_b, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epu8_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -5828,7 +5828,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8>, <64 x i8>, <64 x
 define <32 x i16> @test_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 {
 ; CHECK-LABEL: @test_adds_epi16_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -5842,9 +5842,9 @@ define <32 x i16> @test_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind
 define <32 x i16> @test_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_adds_epi16_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -5868,8 +5868,8 @@ define <32 x i16> @test_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i
 define <32 x i16> @test_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_adds_epi16_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -5892,7 +5892,7 @@ define <32 x i16> @test_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %m
 
 define <32 x i16> @test_adds_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 {
 ; CHECK-LABEL: @test_adds_epi16_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -5918,10 +5918,10 @@ define <32 x i16> @test_adds_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0
 
 define <32 x i16> @test_adds_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_adds_epi16_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -5956,9 +5956,9 @@ define <32 x i16> @test_adds_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16>
 
 define <32 x i16> @test_adds_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_adds_epi16_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -5996,7 +5996,7 @@ declare <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16>, <32 x i16>)
 define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi16_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6010,9 +6010,9 @@ define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nou
 define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi16_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6034,8 +6034,8 @@ define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
 define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi16_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6056,7 +6056,7 @@ define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
 
 define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi16_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -6082,10 +6082,10 @@ define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwi
 
 define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi16_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -6118,9 +6118,9 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x
 
 define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi16_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -6156,7 +6156,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32
 define <32 x i16> @test_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 {
 ; CHECK-LABEL: @test_subs_epi16_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6170,9 +6170,9 @@ define <32 x i16> @test_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind
 define <32 x i16> @test_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_subs_epi16_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6196,8 +6196,8 @@ define <32 x i16> @test_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i
 define <32 x i16> @test_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_subs_epi16_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6220,7 +6220,7 @@ define <32 x i16> @test_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %m
 
 define <32 x i16> @test_subs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 {
 ; CHECK-LABEL: @test_subs_epi16_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -6246,10 +6246,10 @@ define <32 x i16> @test_subs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0
 
 define <32 x i16> @test_subs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_subs_epi16_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -6284,9 +6284,9 @@ define <32 x i16> @test_subs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16>
 
 define <32 x i16> @test_subs_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_subs_epi16_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -6324,7 +6324,7 @@ declare <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16>, <32 x i16>)
 define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi16_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6338,9 +6338,9 @@ define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) nou
 define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi16_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6362,8 +6362,8 @@ define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
 define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi16_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> [[A:%.*]], <32 x i16> [[B:%.*]])
@@ -6384,7 +6384,7 @@ define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
 
 define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi16_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -6410,10 +6410,10 @@ define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) nounwi
 
 define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi16_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -6446,9 +6446,9 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <32 x
 
 define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i32 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi16_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -6484,7 +6484,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32
 define <64 x i8> @test_mask_adds_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi8_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]])
@@ -6498,9 +6498,9 @@ define <64 x i8> @test_mask_adds_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwin
 define <64 x i8> @test_mask_adds_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi8_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]])
@@ -6522,8 +6522,8 @@ define <64 x i8> @test_mask_adds_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x
 define <64 x i8> @test_mask_adds_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi8_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]])
@@ -6544,7 +6544,7 @@ define <64 x i8> @test_mask_adds_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %
 
 define <64 x i8> @test_mask_adds_epi8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi8_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -6570,10 +6570,10 @@ define <64 x i8> @test_mask_adds_epi8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind
 
 define <64 x i8> @test_mask_adds_epi8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi8_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -6606,9 +6606,9 @@ define <64 x i8> @test_mask_adds_epi8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8
 
 define <64 x i8> @test_mask_adds_epi8_rmkz_512(<64 x i8> %a, ptr %ptr_b, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_adds_epi8_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -6644,7 +6644,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8>, <64 x i8>, <64 x
 define <64 x i8> @test_mask_subs_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi8_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]])
@@ -6658,9 +6658,9 @@ define <64 x i8> @test_mask_subs_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) nounwin
 define <64 x i8> @test_mask_subs_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi8_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]])
@@ -6682,8 +6682,8 @@ define <64 x i8> @test_mask_subs_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x
 define <64 x i8> @test_mask_subs_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi8_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> [[A:%.*]], <64 x i8> [[B:%.*]])
@@ -6704,7 +6704,7 @@ define <64 x i8> @test_mask_subs_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %
 
 define <64 x i8> @test_mask_subs_epi8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi8_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -6730,10 +6730,10 @@ define <64 x i8> @test_mask_subs_epi8_rm_512(<64 x i8> %a, ptr %ptr_b) nounwind
 
 define <64 x i8> @test_mask_subs_epi8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi8_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -6766,9 +6766,9 @@ define <64 x i8> @test_mask_subs_epi8_rmk_512(<64 x i8> %a, ptr %ptr_b, <64 x i8
 
 define <64 x i8> @test_mask_subs_epi8_rmkz_512(<64 x i8> %a, ptr %ptr_b, i64 %mask) nounwind #0 {
 ; CHECK-LABEL: @test_mask_subs_epi8_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -6806,7 +6806,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x
 define <32 x i16> @test_int_x86_avx512_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_psrlv32hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6820,9 +6820,9 @@ define <32 x i16> @test_int_x86_avx512_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1,
 define <32 x i16> @test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_psrlv32hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6844,8 +6844,8 @@ define <32 x i16> @test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16>
 define <32 x i16> @test_int_x86_avx512_maskz_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_psrlv32hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6869,7 +6869,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16>, <32 x i16>, <32
 define <32 x i16> @test_int_x86_avx512_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_psrav32_hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6883,9 +6883,9 @@ define <32 x i16> @test_int_x86_avx512_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1
 define <32 x i16> @test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_psrav32_hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6907,8 +6907,8 @@ define <32 x i16> @test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16
 define <32 x i16> @test_int_x86_avx512_maskz_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_psrav32_hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6932,7 +6932,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16>, <32 x i16>, <32 x
 define <32 x i16> @test_int_x86_avx512_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_psllv32hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6946,9 +6946,9 @@ define <32 x i16> @test_int_x86_avx512_psllv32hi(<32 x i16> %x0, <32 x i16> %x1,
 define <32 x i16> @test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_psllv32hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -6970,8 +6970,8 @@ define <32 x i16> @test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16>
 define <32 x i16> @test_int_x86_avx512_maskz_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_psllv32hi(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]])
@@ -7008,8 +7008,8 @@ define <32 x i8> @test_int_x86_avx512_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1)
 define <32 x i8> @test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_wb_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8>
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i8>
@@ -7031,7 +7031,7 @@ define <32 x i8> @test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8>
 define <32 x i8> @test_int_x86_avx512_maskz_pmov_wb_512(<32 x i16> %x0, i32 %x2) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_wb_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8>
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i8>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
index 8bf6d5acc21ba..481751b25eda1 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll
@@ -24,7 +24,7 @@ define i32 @test_int_x86_avx512_kadd_d(<32 x i16> %A, <32 x i16> %B) nounwind #0
 ; CHECK-LABEL: @test_int_x86_avx512_kadd_d(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = xor <32 x i16> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i16> [[TMP0]], zeroinitializer
@@ -74,7 +74,7 @@ define i32 @test_int_x86_avx512_kadd_q(<64 x i8> %A, <64 x i8> %B) nounwind #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_kadd_q(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = xor <64 x i8> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT: [[TMP3:%.*]] = or <64 x i8> [[TMP0]], zeroinitializer
@@ -123,7 +123,7 @@ declare <64 x i1> @llvm.x86.avx512.kadd.q(<64 x i1>, <64 x i1>)
 define i32 @test_x86_avx512_ktestc_d(<32 x i16> %A, <32 x i16> %B) #0 {
 ; CHECK-LABEL: @test_x86_avx512_ktestc_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = xor <32 x i16> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
@@ -165,7 +165,7 @@ declare i32 @llvm.x86.avx512.ktestc.d(<32 x i1>, <32 x i1>) nounwind readnone
 define i32 @test_x86_avx512_ktestz_d(<32 x i16> %A, <32 x i16> %B) #0 {
 ; CHECK-LABEL: @test_x86_avx512_ktestz_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = xor <32 x i16> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
@@ -207,7 +207,7 @@ declare i32 @llvm.x86.avx512.ktestz.d(<32 x i1>, <32 x i1>) nounwind readnone
 define i32 @test_x86_avx512_ktestc_q(<64 x i8> %A, <64 x i8> %B) #0 {
 ; CHECK-LABEL: @test_x86_avx512_ktestc_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = xor <64 x i8> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
@@ -249,7 +249,7 @@ declare i32 @llvm.x86.avx512.ktestc.q(<64 x i1>, <64 x i1>) nounwind readnone
 define i32 @test_x86_avx512_ktestz_q(<64 x i8> %A, <64 x i8> %B) #0 {
 ; CHECK-LABEL: @test_x86_avx512_ktestz_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = xor <64 x i8> [[A:%.*]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
@@ -291,7 +291,7 @@ declare i32 @llvm.x86.avx512.ktestz.q(<64 x i1>, <64 x i1>) nounwind readnone
 define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
@@ -309,9 +309,9 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0
 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
@@ -339,8 +339,8 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
 define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[A:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
@@ -367,7 +367,7 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
 
 define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -397,10 +397,10 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 {
 
 define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -439,9 +439,9 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32
 
 define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rmkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -480,7 +480,7 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32
 
 define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rmb_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -516,10 +516,10 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 {
 
 define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rmbk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -564,9 +564,9 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32
 
 define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi32_rmbkz_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
@@ -614,7 +614,7 @@ declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
 define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi16_rr_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16>
@@ -632,9 +632,9 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) #0
 define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi16_rrk_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16>
@@ -662,8 +662,8 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6
 define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi16_rrkz_512(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[A:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16>
@@ -690,7 +690,7 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
 
 define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi16_rm_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -720,10 +720,10 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) #0 {
 
 define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) #0 {
 ; CHECK-LABEL: @test_mask_packs_epi16_rmk_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <64
x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -762,9 +762,9 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 %mask) #0 { ; CHECK-LABEL: @test_mask_packs_epi16_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -807,7 +807,7 @@ declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> @@ -825,9 +825,9 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) # define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rrk_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> @@ -855,8 +855,8 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> 
%a, <16 x i32> %b, define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rrkz_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> @@ -883,7 +883,7 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -913,10 +913,10 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rmk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -955,9 +955,9 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <32 define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr 
(i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -996,7 +996,7 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i3 define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rmb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -1032,10 +1032,10 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <32 x i16> %passThru, i32 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rmbk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -1080,9 +1080,9 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <3 define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i32 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi32_rmbkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -1130,7 +1130,7 @@ declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) #0 { ; CHECK-LABEL: @test_mask_packus_epi16_rr_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = sext <32 x i1> [[TMP3]] to <32 x i16> @@ -1148,9 +1148,9 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) #0 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi16_rrk_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> @@ -1178,8 +1178,8 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, < define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi16_rrkz_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = sext <32 x i1> [[TMP4]] to <32 x i16> @@ -1206,7 +1206,7 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) #0 { ; CHECK-LABEL: @test_mask_packus_epi16_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -1236,10 +1236,10 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, ptr %ptr_b) #0 { define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 x i8> %passThru, i64 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi16_rmk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] @@ -1278,9 +1278,9 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, ptr %ptr_b, <64 define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, ptr %ptr_b, i64 %mask) #0 { ; CHECK-LABEL: @test_mask_packus_epi16_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] @@ -1321,8 +1321,8 @@ declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) define <32 x i16>@test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5> @@ -1344,10 +1344,10 @@ define <32 x i16>@test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i1 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5> ; CHECK-NEXT: [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]]) @@ -1378,10 +1378,10 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_hi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[X0:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = trunc <32 x i16> [[X0]] to <32 x i5> ; CHECK-NEXT: [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X4:%.*]], <32 x i16> [[TMP2]]) @@ -1415,8 +1415,8 @@ declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <3 define <32 x i16>@test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[X1:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[X1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[X1]] to <32 x i5> ; CHECK-NEXT: [[TMP100:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X3:%.*]], <32 x i16> [[TMP2]]) @@ -1438,9 +1438,9 @@ define <32 x i16>@test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i1 define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = trunc <32 x i16> [[TMP3]] to <32 x i5> ; CHECK-NEXT: [[TMP101:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> [[X1:%.*]], <32 x i16> [[TMP2]]) @@ -1474,7 +1474,7 @@ declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>) define <64 x i8> @test_int_x86_avx512_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pavg_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) @@ -1488,9 +1488,9 @@ define <64 x i8> @test_int_x86_avx512_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, < define <64 x i8> @test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) @@ -1516,7 +1516,7 @@ declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>) define <32 x i16> @test_int_x86_avx512_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pavg_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; 
CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1530,9 +1530,9 @@ define <32 x i16> @test_int_x86_avx512_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1 define <32 x i16> @test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pavg_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1558,7 +1558,7 @@ declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pshuf_b_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP3]] @@ -1573,9 +1573,9 @@ define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) # define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %mask) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pshuf_b_512_mask( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) ; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP10]] @@ -1600,8 +1600,8 @@ define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> % define <64 x i8>@test_int_x86_avx512_pshuf_b_512_maskz(<64 x i8> %x0, <64 x i8> %x1, i64 %mask) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pshuf_b_512_maskz( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP9]] @@ -1628,7 +1628,7 @@ declare <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16>, <32 x i16>) define <32 x i16> @test_int_x86_avx512_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmulhu_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1642,9 +1642,9 @@ define <32 x i16> @test_int_x86_avx512_pmulhu_w_512(<32 x i16> %x0, <32 x i16> % define <32 x i16> @test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhu_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1670,7 +1670,7 @@ declare <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16>, <32 x i16>) define <32 x i16> @test_int_x86_avx512_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmulh_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 
8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1684,9 +1684,9 @@ define <32 x i16> @test_int_x86_avx512_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x define <32 x i16> @test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulh_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1712,7 +1712,7 @@ declare <32 x i16> @llvm.x86.avx512.pmul.hr.sw.512(<32 x i16>, <32 x i16>) define <32 x i16> @test_int_x86_avx512_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmulhr_sw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pmul.hr.sw.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1726,9 +1726,9 @@ define <32 x i16> @test_int_x86_avx512_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> define <32 x i16> @test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmulhr_sw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, 
ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.pmul.hr.sw.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -1765,8 +1765,8 @@ define <32 x i8>@test_int_x86_avx512_pmov_wb_512(<32 x i16> %x0) #0 { define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_wb_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i8> @@ -1790,7 +1790,7 @@ define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> define <32 x i8>@test_int_x86_avx512_maskz_pmov_wb_512(<32 x i16> %x0, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_wb_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[X0:%.*]] to <32 x i8> @@ -1816,8 +1816,8 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(ptr %ptr, <32 x i16>, i32) define void @test_int_x86_avx512_mask_pmov_wb_mem_512(ptr %ptr, <32 x i16> %x1, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_wb_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 @@ -1853,7 +1853,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) define <32 x i8>@test_int_x86_avx512_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmovs_wb_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = select <32 x i1> splat (i1 true), <32 x i8> [[TMP3]], <32 x i8> [[TMP2]] @@ -1867,9 +1867,9 @@ define <32 x i8>@test_int_x86_avx512_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1) define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_wb_512( -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[X2:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP4:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8> @@ -1890,7 +1890,7 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> define <32 x i8>@test_int_x86_avx512_maskz_pmovs_wb_512(<32 x i16> %x0, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovs_wb_512( -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[X2:%.*]] to <32 x i1> @@ -1915,8 +1915,8 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(ptr %ptr, <32 x i16>, i32) define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(ptr %ptr, <32 x i16> %x1, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_wb_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 @@ -1952,7 +1952,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32 define <32 x i8>@test_int_x86_avx512_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmovus_wb_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8> ; CHECK-NEXT: [[TMP4:%.*]] = select <32 x i1> splat (i1 true), <32 x i8> [[TMP3]], <32 x i8> [[TMP2]] @@ -1966,9 +1966,9 @@ define <32 x i8>@test_int_x86_avx512_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1 define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_wb_512( -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[X2:%.*]] to <32 x i1> ; CHECK-NEXT: [[TMP4:%.*]] = trunc <32 x i16> [[TMP1]] to <32 x i8> @@ -1989,7 +1989,7 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8 define <32 x i8>@test_int_x86_avx512_maskz_pmovus_wb_512(<32 x i16> %x0, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovus_wb_512( -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[X2:%.*]] to <32 x i1> @@ -2014,8 +2014,8 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(ptr %ptr, <32 x i16>, i32) define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(ptr %ptr, <32 x i16> %x1, i32 %x2) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_wb_mem_512( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 @@ -2051,7 +2051,7 @@ declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmaddubs_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer 
@@ -2077,9 +2077,9 @@ define <32 x i16> @test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> % define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaddubs_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <64 x i8> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <64 x i8> [[TMP2]], zeroinitializer @@ -2117,7 +2117,7 @@ declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_pmaddw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer @@ -2143,9 +2143,9 @@ define <16 x i32> @test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> % define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_pmaddw_d_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer @@ -2183,9 +2183,9 @@ declare <32 x i16> @llvm.x86.avx512.dbpsadbw.512(<64 x i8>, <64 x i8>, i32) define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) 
#0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_dbpsadbw_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <64 x i8> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 @@ -2293,7 +2293,7 @@ define <32 x i16> @test_x86_avx512_psrlv_w_512_const() optsize #0 { define <32 x i16>@test_int_x86_avx512_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_psrlv32hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2307,9 +2307,9 @@ define <32 x i16>@test_int_x86_avx512_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1) define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_psrlv32hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2333,8 +2333,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> define <32 x i16>@test_int_x86_avx512_maskz_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_psrlv32hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2360,7 +2360,7 @@ declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) define <32 x i16>@test_int_x86_avx512_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_psrav32_hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2374,9 +2374,9 @@ define <32 x i16>@test_int_x86_avx512_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1) define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_psrav32_hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2400,8 +2400,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> define <32 x i16>@test_int_x86_avx512_maskz_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_psrav32_hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; 
CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2436,7 +2436,7 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 define <32 x i16>@test_int_x86_avx512_psllv32hi(<32 x i16> %x0, <32 x i16> %x1) #0 { ; CHECK-LABEL: @test_int_x86_avx512_psllv32hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2450,9 +2450,9 @@ define <32 x i16>@test_int_x86_avx512_psllv32hi(<32 x i16> %x0, <32 x i16> %x1) define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_psllv32hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2476,8 +2476,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> define <32 x i16>@test_int_x86_avx512_maskz_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_psllv32hi( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2503,7 +2503,7 @@ declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) define <32 x i16>@test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1) 
#0 { ; CHECK-LABEL: @test_int_x86_avx512_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2517,9 +2517,9 @@ define <32 x i16>@test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2543,8 +2543,8 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x define <32 x i16>@test_int_x86_avx512_maskz_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) #0 { ; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_hi_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) @@ -2568,7 +2568,7 @@ define <32 x i16>@test_int_x86_avx512_maskz_permvar_hi_512(<32 x i16> %x0, <32 x define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psll_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 
; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -2587,9 +2587,9 @@ define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) #0 define <32 x i16> @test_x86_avx512_mask_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psll_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -2618,8 +2618,8 @@ define <32 x i16> @test_x86_avx512_mask_psll_w_512(<32 x i16> %a0, <8 x i16> %a1 define <32 x i16> @test_x86_avx512_maskz_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psll_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -2676,8 +2676,8 @@ define <32 x i16> @test_x86_avx512_pslli_w_512(<32 x i16> %a0) #0 { define <32 x i16> @test_x86_avx512_mask_pslli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_pslli_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP4]], zeroinitializer @@ -2701,7 +2701,7 @@ define <32 x i16> @test_x86_avx512_mask_pslli_w_512(<32 x i16> %a0, <32 x i16> % define <32 x i16> 
@test_x86_avx512_maskz_pslli_w_512(<32 x i16> %a0, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_pslli_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP3]], zeroinitializer @@ -2728,7 +2728,7 @@ declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readno define <32 x i16> @test_x86_avx512_psra_w_512(<32 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psra_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -2747,9 +2747,9 @@ define <32 x i16> @test_x86_avx512_psra_w_512(<32 x i16> %a0, <8 x i16> %a1) #0 define <32 x i16> @test_x86_avx512_mask_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psra_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -2778,8 +2778,8 @@ define <32 x i16> @test_x86_avx512_mask_psra_w_512(<32 x i16> %a0, <8 x i16> %a1 define <32 x i16> @test_x86_avx512_maskz_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psra_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = 
trunc i128 [[TMP4]] to i64 @@ -2824,8 +2824,8 @@ define <32 x i16> @test_x86_avx512_psrai_w_512(<32 x i16> %a0) #0 { define <32 x i16> @test_x86_avx512_mask_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrai_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP4]], zeroinitializer @@ -2849,7 +2849,7 @@ define <32 x i16> @test_x86_avx512_mask_psrai_w_512(<32 x i16> %a0, <32 x i16> % define <32 x i16> @test_x86_avx512_maskz_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrai_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP3]], zeroinitializer @@ -2876,7 +2876,7 @@ declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readno define <32 x i16> @test_x86_avx512_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx512_psrl_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -2895,9 +2895,9 @@ define <32 x i16> @test_x86_avx512_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1) #0 define <32 x i16> @test_x86_avx512_mask_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrl_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 @@ -2926,8 +2926,8 @@ define <32 x i16> @test_x86_avx512_mask_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1 define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrl_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 @@ -2957,7 +2957,7 @@ declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind r define <32 x i16> @test_x86_avx512_psrl_w_512_load(<32 x i16> %a0, ptr %p) #0 { ; CHECK-LABEL: @test_x86_avx512_psrl_w_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -3003,8 +3003,8 @@ define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) #0 { define <32 x i16> @test_x86_avx512_mask_psrli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_mask_psrli_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP5:%.*]] = or <32 x i16> [[TMP4]], zeroinitializer @@ -3028,7 +3028,7 @@ define <32 x i16> @test_x86_avx512_mask_psrli_w_512(<32 x i16> %a0, <32 x i16> % define <32 x i16> @test_x86_avx512_maskz_psrli_w_512(<32 x i16> %a0, i32 %mask) #0 { ; CHECK-LABEL: @test_x86_avx512_maskz_psrli_w_512( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i16> 
@llvm.x86.avx512.psrli.w.512(<32 x i16> [[TMP1]], i32 7) ; CHECK-NEXT: [[TMP4:%.*]] = or <32 x i16> [[TMP3]], zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-intrinsics.ll index 69d49008e1b78..a79e293f54034 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-intrinsics.ll @@ -22,7 +22,7 @@ define <32 x half> @test_int_x86_avx512fp16_add_ph_512(<32 x half> %x1, <32 x ha ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_add_ph_512( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], zeroinitializer @@ -37,9 +37,9 @@ define <32 x half> @test_int_x86_avx512fp16_add_ph_512(<32 x half> %x1, <32 x ha define <32 x half> @test_int_x86_avx512fp16_mask_add_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_mask_add_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> @@ -67,10 +67,10 @@ define <32 x half> @test_int_x86_avx512fp16_mask_add_ph_512(<32 x half> %src, <3 define <32 x half> @test_int_x86_avx512fp16_maskz_add_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_maskz_add_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -123,10 +123,10 @@ define <32 x half> @test_int_x86_avx512fp16_maskz_add_ph_512(<32 x half> %src, < define <32 x half> @test_int_x86_avx512fp16_add_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_add_ph_512_round( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], <32 x half> [[SRC:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -156,7 +156,7 @@ define <32 x half> @test_int_x86_avx512fp16_sub_ph_512(<32 x half> %x1, <32 x ha ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_sub_ph_512( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], zeroinitializer @@ -171,9 +171,9 @@ define <32 x half> @test_int_x86_avx512fp16_sub_ph_512(<32 x half> %x1, <32 x ha define <32 x half> @test_int_x86_avx512fp16_mask_sub_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_mask_sub_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> @@ -201,10 +201,10 @@ define <32 x half> @test_int_x86_avx512fp16_mask_sub_ph_512(<32 x half> %src, <3 define <32 x half> @test_int_x86_avx512fp16_maskz_sub_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_maskz_sub_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -257,10 +257,10 @@ define <32 x half> @test_int_x86_avx512fp16_maskz_sub_ph_512(<32 x half> %src, < define <32 x half> @test_int_x86_avx512fp16_sub_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_sub_ph_512_round( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], <32 x half> [[SRC:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -290,7 +290,7 @@ define <32 x half> @test_int_x86_avx512fp16_mul_ph_512(<32 x half> %x1, <32 x ha ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_mul_ph_512( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], zeroinitializer @@ -305,9 +305,9 @@ define <32 x half> @test_int_x86_avx512fp16_mul_ph_512(<32 x half> %x1, <32 x ha define <32 x half> @test_int_x86_avx512fp16_mask_mul_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_mask_mul_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> @@ -335,10 +335,10 @@ define <32 x half> @test_int_x86_avx512fp16_mask_mul_ph_512(<32 x half> %src, <3 define <32 x half> @test_int_x86_avx512fp16_maskz_mul_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_maskz_mul_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load 
<32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -391,10 +391,10 @@ define <32 x half> @test_int_x86_avx512fp16_maskz_mul_ph_512(<32 x half> %src, < define <32 x half> @test_int_x86_avx512fp16_mul_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_mul_ph_512_round( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], <32 x half> [[SRC:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -424,7 +424,7 @@ define <32 x half> @test_int_x86_avx512fp16_div_ph_512(<32 x half> %x1, <32 x ha ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_div_ph_512( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], zeroinitializer @@ -439,9 +439,9 @@ define <32 x half> @test_int_x86_avx512fp16_div_ph_512(<32 x half> %x1, <32 x ha define <32 x half> @test_int_x86_avx512fp16_mask_div_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_mask_div_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x 
i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> @@ -469,10 +469,10 @@ define <32 x half> @test_int_x86_avx512fp16_mask_div_ph_512(<32 x half> %src, <3 define <32 x half> @test_int_x86_avx512fp16_maskz_div_ph_512(<32 x half> %src, <32 x half> %x1, <32 x half> %x2, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_maskz_div_ph_512( ; CHECK-SAME: <32 x half> [[SRC:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -525,10 +525,10 @@ define <32 x half> @test_int_x86_avx512fp16_maskz_div_ph_512(<32 x half> %src, < define <32 x half> @test_int_x86_avx512fp16_div_ph_512_round(<32 x half> %x1, <32 x half> %x2, <32 x half> %src, i32 %msk, ptr %ptr) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_div_ph_512_round( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], <32 x half> [[SRC:%.*]], i32 [[MSK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -558,7 +558,7 @@ define <32 x half> @test_min_ph(<32 x half> %x1, <32 x half> %x2) #0 { ; CHECK-LABEL: define <32 x half> @test_min_ph( ; CHECK-SAME: <32 x 
half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[_MSPROP]] to <32 x i1> @@ -583,7 +583,7 @@ define <32 x half> @test_int_x86_avx512fp16_min_ph_512_sae(<32 x half> %x1, <32 ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_min_ph_512_sae( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], zeroinitializer @@ -598,9 +598,9 @@ define <32 x half> @test_int_x86_avx512fp16_min_ph_512_sae(<32 x half> %x1, <32 define <32 x half> @test_int_x86_avx512fp16_maskz_min_ph_512_sae(<32 x half> %x1, <32 x half> %x2, i32 %msk) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_maskz_min_ph_512_sae( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -629,7 +629,7 @@ define <32 x half> @test_max_ph(<32 x half> %x1, <32 x half> %x2) #0 { ; CHECK-LABEL: define <32 x half> @test_max_ph( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <32 x i16> [[_MSPROP]] to <32 x i1> @@ -654,7 +654,7 @@ define <32 x half> @test_int_x86_avx512fp16_max_ph_512_sae(<32 x half> %x1, <32 ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_max_ph_512_sae( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], zeroinitializer @@ -669,9 +669,9 @@ define <32 x half> @test_int_x86_avx512fp16_max_ph_512_sae(<32 x half> %x1, <32 define <32 x half> @test_int_x86_avx512fp16_maskz_max_ph_512_sae(<32 x half> %x1, <32 x half> %x2, i32 %msk) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512fp16_maskz_max_ph_512_sae( ; CHECK-SAME: <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[MSK:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[MSK]] to <32 x i1> @@ -700,8 +700,8 @@ define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd(<8 x half> %x0, <8 x do ; CHECK-LABEL: define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -727,8 +727,8 @@ define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_sae(<8 x half> %x0, <8 ; CHECK-LABEL: define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_sae( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -754,7 +754,7 @@ define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_nomask(<8 x half> %x0, 
; CHECK-LABEL: define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_nomask( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x double> [[X1:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -778,8 +778,8 @@ define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_load(ptr %px0, <8 x dou ; CHECK-LABEL: define <8 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_load( ; CHECK-SAME: ptr [[PX0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -819,8 +819,8 @@ define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph(<8 x double> %x0, <8 x ha ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph( ; CHECK-SAME: <8 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -846,8 +846,8 @@ define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_r(<8 x double> %x0, <8 x ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_r( ; CHECK-SAME: <8 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 ; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -873,8 +873,8 @@ define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_load(ptr %px0, <8 x half>
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_load(
 ; CHECK-SAME: ptr [[PX0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -914,9 +914,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round(<8 x half> %x0,
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x float> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -945,9 +945,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_r(<8 x half> %x0
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_r(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x float> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -976,8 +976,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_nomask(<8 x half
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_nomask(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x float> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1004,8 +1004,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_z(<8 x half> %x0
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_ss2sh_round_z(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1033,9 +1033,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round(<8 x half> %x0,
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <2 x double> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1064,9 +1064,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_r(<8 x half> %x0
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_r(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <2 x double> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1095,8 +1095,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_nomask(<8 x half
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_nomask(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <2 x double> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1123,8 +1123,8 @@ define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_z(<8 x half> %x0
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_cvt_sd2sh_round_z(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <2 x double> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1152,9 +1152,9 @@ define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round(<4 x float> %x0
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <8 x half> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1183,9 +1183,9 @@ define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_r(<4 x float> %
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_r(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <8 x half> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1214,8 +1214,8 @@ define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_nomask(<4 x flo
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_nomask(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <8 x half> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1242,8 +1242,8 @@ define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_z(<4 x float> %
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512fp16_mask_cvt_sh2ss_round_z(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1271,9 +1271,9 @@ define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round(<2 x double> %
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1302,9 +1302,9 @@ define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_r(<2 x double>
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_r(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -1333,8 +1333,8 @@ define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_nomask(<2 x do
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_nomask(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1361,8 +1361,8 @@ define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_z(<2 x double>
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512fp16_mask_cvt_sh2sd_round_z(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1404,8 +1404,8 @@ define <16 x float> @test_int_x86_avx512_mask_cvt_ph2psx_512(<16 x half> %x0, <1
 ; CHECK-LABEL: define <16 x float> @test_int_x86_avx512_mask_cvt_ph2psx_512(
 ; CHECK-SAME: <16 x half> [[X0:%.*]], <16 x float> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -1431,7 +1431,7 @@ define <16 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_512(<16 x half> %x0, i
 ; CHECK-LABEL: define <16 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_512(
 ; CHECK-SAME: <16 x half> [[X0:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1468,8 +1468,8 @@ define <16 x float> @test_int_x86_avx512_mask_cvt_ph2psx_512r(<16 x half> %x0, <
 ; CHECK-LABEL: define <16 x float> @test_int_x86_avx512_mask_cvt_ph2psx_512r(
 ; CHECK-SAME: <16 x half> [[X0:%.*]], <16 x float> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -1495,7 +1495,7 @@ define <16 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_512r(<16 x half> %x0,
 ; CHECK-LABEL: define <16 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_512r(
 ; CHECK-SAME: <16 x half> [[X0:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1534,8 +1534,8 @@ define <16 x half> @test_int_x86_avx512_mask_cvt_ps2phx_512(<16 x float> %x0, <1
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_mask_cvt_ps2phx_512(
 ; CHECK-SAME: <16 x float> [[X0:%.*]], <16 x half> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -1561,7 +1561,7 @@ define <16 x half> @test_int_x86_avx512_maskz_cvt_ps2phx_512(<16 x float> %x0, i
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_maskz_cvt_ps2phx_512(
 ; CHECK-SAME: <16 x float> [[X0:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
@@ -1584,8 +1584,8 @@ define <16 x half> @test_int_x86_avx512_mask_cvt_ps2phx_512r(<16 x float> %x0, <
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_mask_cvt_ps2phx_512r(
 ; CHECK-SAME: <16 x float> [[X0:%.*]], <16 x half> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
@@ -1622,3 +1622,6 @@ define <16 x half> @test_int_x86_avx512_mask_cvt_ps2phx_512r(<16 x float> %x0, <
 }
 
 attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-vl-intrinsics.ll
index e67e5e73134e9..c0ba3d599807f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-arith-vl-intrinsics.ll
@@ -32,7 +32,7 @@ define <16 x half> @test_int_x86_avx512fp16_add_ph_256(<16 x half> %x1, <16 x ha
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_add_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fadd <16 x half> [[X1]], [[X2]]
@@ -46,11 +46,11 @@ define <16 x half> @test_int_x86_avx512fp16_add_ph_256(<16 x half> %x1, <16 x ha
 define <16 x half> @test_int_x86_avx512fp16_mask_add_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_mask_add_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], <16 x half> [[SRC:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -103,9 +103,9 @@ define <16 x half> @test_int_x86_avx512fp16_mask_add_ph_256(<16 x half> %x1, <16
 define <16 x half> @test_int_x86_avx512fp16_maskz_add_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_maskz_add_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -131,7 +131,7 @@ define <8 x half> @test_int_x86_avx512fp16_add_ph_128(<8 x half> %x1, <8 x half>
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_add_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fadd <8 x half> [[X1]], [[X2]]
@@ -145,11 +145,11 @@ define <8 x half> @test_int_x86_avx512fp16_add_ph_128(<8 x half> %x1, <8 x half>
 define <8 x half> @test_int_x86_avx512fp16_mask_add_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_add_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -202,9 +202,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_add_ph_128(<8 x half> %x1, <8 x
 define <8 x half> @test_int_x86_avx512fp16_maskz_add_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_maskz_add_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -230,7 +230,7 @@ define <16 x half> @test_int_x86_avx512fp16_sub_ph_256(<16 x half> %x1, <16 x ha
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_sub_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fsub <16 x half> [[X1]], [[X2]]
@@ -244,11 +244,11 @@ define <16 x half> @test_int_x86_avx512fp16_sub_ph_256(<16 x half> %x1, <16 x ha
 define <16 x half> @test_int_x86_avx512fp16_mask_sub_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_mask_sub_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], <16 x half> [[SRC:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -301,9 +301,9 @@ define <16 x half> @test_int_x86_avx512fp16_mask_sub_ph_256(<16 x half> %x1, <16
 define <16 x half> @test_int_x86_avx512fp16_maskz_sub_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_maskz_sub_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -329,7 +329,7 @@ define <8 x half> @test_int_x86_avx512fp16_sub_ph_128(<8 x half> %x1, <8 x half>
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_sub_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fsub <8 x half> [[X1]], [[X2]]
@@ -343,11 +343,11 @@ define <8 x half> @test_int_x86_avx512fp16_sub_ph_128(<8 x half> %x1, <8 x half>
 define <8 x half> @test_int_x86_avx512fp16_mask_sub_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_sub_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -400,9 +400,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_sub_ph_128(<8 x half> %x1, <8 x
 define <8 x half> @test_int_x86_avx512fp16_maskz_sub_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_maskz_sub_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -428,7 +428,7 @@ define <16 x half> @test_int_x86_avx512fp16_mul_ph_256(<16 x half> %x1, <16 x ha
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_mul_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fmul <16 x half> [[X1]], [[X2]]
@@ -442,11 +442,11 @@ define <16 x half> @test_int_x86_avx512fp16_mul_ph_256(<16 x half> %x1, <16 x ha
 define <16 x half> @test_int_x86_avx512fp16_mask_mul_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_mask_mul_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], <16 x half> [[SRC:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -499,9 +499,9 @@ define <16 x half> @test_int_x86_avx512fp16_mask_mul_ph_256(<16 x half> %x1, <16
 define <16 x half> @test_int_x86_avx512fp16_maskz_mul_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_maskz_mul_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -527,7 +527,7 @@ define <8 x half> @test_int_x86_avx512fp16_mul_ph_128(<8 x half> %x1, <8 x half>
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mul_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fmul <8 x half> [[X1]], [[X2]]
@@ -541,11 +541,11 @@ define <8 x half> @test_int_x86_avx512fp16_mul_ph_128(<8 x half> %x1, <8 x half>
 define <8 x half> @test_int_x86_avx512fp16_mask_mul_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_mul_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -598,9 +598,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_mul_ph_128(<8 x half> %x1, <8 x
 define <8 x half> @test_int_x86_avx512fp16_maskz_mul_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_maskz_mul_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -626,7 +626,7 @@ define <16 x half> @test_int_x86_avx512fp16_div_ph_256(<16 x half> %x1, <16 x ha
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_div_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fdiv <16 x half> [[X1]], [[X2]]
@@ -641,7 +641,7 @@ define <16 x half> @test_int_x86_avx512fp16_div_ph_256_fast(<16 x half> %x1, <16
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_div_ph_256_fast(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fdiv fast <16 x half> [[X1]], [[X2]]
@@ -655,11 +655,11 @@ define <16 x half> @test_int_x86_avx512fp16_div_ph_256_fast(<16 x half> %x1, <16
 define <16 x half> @test_int_x86_avx512fp16_mask_div_ph_256(<16 x half> %x1, <16 x half> %x2, <16 x half> %src, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_mask_div_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], <16 x half> [[SRC:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -712,9 +712,9 @@ define <16 x half> @test_int_x86_avx512fp16_mask_div_ph_256(<16 x half> %x1, <16
 define <16 x half> @test_int_x86_avx512fp16_maskz_div_ph_256(<16 x half> %x1, <16 x half> %x2, i16 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <16 x half> @test_int_x86_avx512fp16_maskz_div_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]], i16 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
@@ -740,7 +740,7 @@ define <8 x half> @test_int_x86_avx512fp16_div_ph_128(<8 x half> %x1, <8 x half>
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_div_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fdiv <8 x half> [[X1]], [[X2]]
@@ -755,7 +755,7 @@ define <8 x half> @test_int_x86_avx512fp16_div_ph_128_fast(<8 x half> %x1, <8 x
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_div_ph_128_fast(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = fdiv fast <8 x half> [[X1]], [[X2]]
@@ -769,11 +769,11 @@ define <8 x half> @test_int_x86_avx512fp16_div_ph_128_fast(<8 x half> %x1, <8 x
 define <8 x half> @test_int_x86_avx512fp16_mask_div_ph_128(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_div_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -826,9 +826,9 @@ define <8 x half> @test_int_x86_avx512fp16_mask_div_ph_128(<8 x half> %x1, <8 x
 define <8 x half> @test_int_x86_avx512fp16_maskz_div_ph_128(<8 x half> %x1, <8 x half> %x2, i8 %mask, ptr %ptr) #0 {
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_maskz_div_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
 ; CHECK-NEXT: [[MSK:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -854,7 +854,7 @@ define <16 x half> @test_min_ph_256(<16 x half> %x1, <16 x half> %x2) #0 {
 ; CHECK-LABEL: define <16 x half> @test_min_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[_MSPROP]] to <16 x i1>
@@ -879,7 +879,7 @@ define <16 x half> @test_max_ph_256(<16 x half> %x1, <16 x half> %x2) #0 {
 ; CHECK-LABEL: define <16 x half> @test_max_ph_256(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[_MSPROP]] to <16 x i1>
@@ -904,7 +904,7 @@ define <8 x half> @test_min_ph_128(<8 x half> %x1, <8 x half> %x2) #0 {
 ; CHECK-LABEL: define <8 x half> @test_min_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i1>
@@ -929,7 +929,7 @@ define <8 x half> @test_max_ph_128(<8 x half> %x1, <8 x half> %x2) #0 {
 ; CHECK-LABEL: define <8 x half> @test_max_ph_128(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i16> [[_MSPROP]] to <8 x i1>
@@ -957,7 +957,7 @@ define <8 x half> @test_max_ph_128_2(<8 x half> %x1, <8 x half> %x2) #0 {
 ; CHECK-LABEL: define <8 x half> @test_max_ph_128_2(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.max.ph.128(<8 x half> [[X1]], <8 x half> [[X2]])
@@ -972,7 +972,7 @@ define <16 x half> @test_max_ph_256_2(<16 x half> %x1, <16 x half> %x2) #0 {
 ; CHECK-LABEL: define <16 x half> @test_max_ph_256_2(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES0:%.*]] = call <16 x half> @llvm.x86.avx512fp16.max.ph.256(<16 x half> [[X1]], <16 x half> [[X2]])
@@ -990,7 +990,7 @@ define <8 x half> @test_min_ph_128_2(<8 x half> %x1, <8 x half> %x2) #0 {
 ; CHECK-LABEL: define <8 x half> @test_min_ph_128_2(
 ; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES0:%.*]] = call <8 x half> @llvm.x86.avx512fp16.min.ph.128(<8 x half> [[X1]], <8 x half> [[X2]])
@@ -1005,7 +1005,7 @@ define <16 x half> @test_min_ph_256_2(<16 x half> %x1, <16 x half> %x2) #0 {
 ; CHECK-LABEL: define <16 x half> @test_min_ph_256_2(
 ; CHECK-SAME: <16 x half> [[X1:%.*]], <16 x half> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES0:%.*]] = call <16 x half> @llvm.x86.avx512fp16.min.ph.256(<16 x half> [[X1]], <16 x half> [[X2]])
@@ -1022,8 +1022,8 @@ define <4 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_256(<8 x half> %x0, <4
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_256(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x double> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1049,7 +1049,7 @@ define <4 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_256_nomask(<8 x half> %
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_256_nomask(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x double> [[X1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1075,8 +1075,8 @@ define <2 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_128(<8 x half> %x0, <2
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_128(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <2 x double> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1102,7 +1102,7 @@ define <2 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_128_nomask(<8 x half> %
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vcvt_ph2pd_128_nomask(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <2 x double> [[X1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1128,8 +1128,8 @@ define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_256(<4 x double> %x0, <8
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -1155,8 +1155,8 @@ define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_256_load(ptr %px0, <8 x h
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_256_load(
 ; CHECK-SAME: ptr [[PX0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -1196,8 +1196,8 @@ define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_128(<2 x double> %x0, <8
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1223,8 +1223,8 @@ define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_128_load(ptr %px0, <8 x h
 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_vcvt_pd2ph_128_load(
 ; CHECK-SAME: ptr [[PX0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -1278,8 +1278,8 @@ define <4 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_128(<8 x half> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_128(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1305,7 +1305,7 @@ define <4 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_128(<8 x half> %x0, i8 %x
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_128(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1344,8 +1344,8 @@ define <8 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_256(<8 x half> %x0, <8 x i
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_256(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1371,7 +1371,7 @@ define <8 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_256(<8 x half> %x0, i8 %x
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_cvt_ph2udq_256(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1410,8 +1410,8 @@ define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_128(<8 x half> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_128(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1437,7 +1437,7 @@ define <4 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_128(<8 x half> %x0, i8 %x
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_128(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1476,8 +1476,8 @@ define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_256(<8 x half> %x0, <8 x i
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_256(
 ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]]
{ ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1503,7 +1503,7 @@ define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_256(<8 x half> %x0, i8 %x ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2dq_256( ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -1542,8 +1542,8 @@ define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_128(<8 x half> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_128( ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1569,7 +1569,7 @@ define <4 x i32> @test_int_x86_avx512_maskz_cvtt_ph2udq_128(<8 x half> %x0, i8 % ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_cvtt_ph2udq_128( ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -1608,8 +1608,8 @@ define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_256(<8 x half> %x0, <8 x ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_256( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1635,7 +1635,7 @@ define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2udq_256(<8 x half> %x0, i8 % ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_cvtt_ph2udq_256( ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -1674,8 +1674,8 @@ define <4 x float> @test_int_x86_avx512_mask_cvt_ph2psx_128(<8 x half> %x0, <4 x ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_ph2psx_128( ; CHECK-SAME: <8 x half> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1701,7 +1701,7 @@ define <4 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_128(<8 x half> %x0, i8 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_128( ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -1740,8 +1740,8 @@ define <8 x float> @test_int_x86_avx512_mask_cvt_ph2psx_256(<8 x half> %x0, <8 x ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_cvt_ph2psx_256( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1767,7 +1767,7 @@ define <8 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_256(<8 x half> %x0, i8 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_cvt_ph2psx_256( ; CHECK-SAME: <8 x half> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -1792,8 +1792,8 @@ define <8 x half> @test_int_x86_avx512_mask_cvt_ps2phx_128(<4 x float> %x0, <8 x ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_ps2phx_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1849,8 +1849,8 @@ define <8 x half> @test_int_x86_avx512_mask_cvt_ps2phx_256(<8 x float> %x0, <8 x ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_ps2phx_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -1876,7 +1876,7 @@ define <8 x half> @test_int_x86_avx512_maskz_cvt_ps2phx_256(<8 x float> %x0, i8 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_maskz_cvt_ps2phx_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to 
ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 @@ -1896,3 +1896,6 @@ define <8 x half> @test_int_x86_avx512_maskz_cvt_ps2phx_256(<8 x float> %x0, i8 } attributes #0 = { sanitize_memory } +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index 8723b1005f8fc..e5d1af3841f10 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -61,7 +61,7 @@ define i32 @test_x86_avx512fp16_ucomi_sh_lt(<8 x half> %a0, <8 x half> %a1) #0 { ; CHECK-LABEL: define i32 @test_x86_avx512fp16_ucomi_sh_lt( ; CHECK-SAME: <8 x half> [[A0:%.*]], <8 x half> [[A1:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -100,7 +100,7 @@ define <32 x half> @test_sqrt_ph_512_fast(<32 x half> %a0, <32 x half> %a1) #0 { ; CHECK-LABEL: define <32 x half> @test_sqrt_ph_512_fast( ; CHECK-SAME: <32 x half> [[A0:%.*]], <32 x half> [[A1:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> [[A0]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP2]], [[TMP1]] @@ -145,8 +145,8 @@ define <32 x half> @test_mask_sqrt_ph_512(<32 x half> %a0, <32 x half> %passthru ; CHECK-LABEL: define <32 x half> @test_mask_sqrt_ph_512( ; CHECK-SAME: <32 x half> [[A0:%.*]], <32 x half> [[PASSTHRU:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x half> @llvm.sqrt.v32f16(<32 x half> [[A0]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> @@ -172,7 +172,7 @@ define <32 x half> @test_maskz_sqrt_ph_512(<32 x half> %a0, i32 %mask) #0 { ; CHECK-LABEL: define <32 x half> @test_maskz_sqrt_ph_512( ; CHECK-SAME: <32 x half> [[A0:%.*]], i32 [[MASK:%.*]]) 
#[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call <32 x half> @llvm.sqrt.v32f16(<32 x half> [[A0]]) ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> @@ -219,8 +219,8 @@ define <32 x half> @test_mask_sqrt_round_ph_512(<32 x half> %a0, <32 x half> %pa ; CHECK-LABEL: define <32 x half> @test_mask_sqrt_round_ph_512( ; CHECK-SAME: <32 x half> [[A0:%.*]], <32 x half> [[PASSTHRU:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -253,7 +253,7 @@ define <32 x half> @test_maskz_sqrt_round_ph_512(<32 x half> %a0, i32 %mask) #0 ; CHECK-LABEL: define <32 x half> @test_maskz_sqrt_round_ph_512( ; CHECK-SAME: <32 x half> [[A0:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 @@ -287,9 +287,9 @@ define <8 x half> @test_sqrt_sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2, ; CHECK-LABEL: define <8 x half> @test_sqrt_sh( ; CHECK-SAME: <8 x half> [[A0:%.*]], <8 x half> [[A1:%.*]], <8 x half> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -318,7 +318,7 @@ define half @test_sqrt_sh2(half %a0, half %a1) #0 { ; CHECK-LABEL: define half 
@test_sqrt_sh2( ; CHECK-SAME: half [[A0:%.*]], half [[A1:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call fast half @llvm.sqrt.f16(half [[A0]]) ; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP2]], [[TMP1]] @@ -350,9 +350,9 @@ define <8 x half> @test_sqrt_sh_r(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2 ; CHECK-LABEL: define <8 x half> @test_sqrt_sh_r( ; CHECK-SAME: <8 x half> [[A0:%.*]], <8 x half> [[A1:%.*]], <8 x half> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -381,8 +381,8 @@ define <8 x half> @test_sqrt_sh_nomask(<8 x half> %a0, <8 x half> %a1, <8 x half ; CHECK-LABEL: define <8 x half> @test_sqrt_sh_nomask( ; CHECK-SAME: <8 x half> [[A0:%.*]], <8 x half> [[A1:%.*]], <8 x half> [[A2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -409,8 +409,8 @@ define <8 x half> @test_sqrt_sh_z(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2 ; CHECK-LABEL: define <8 x half> @test_sqrt_sh_z( ; CHECK-SAME: <8 x half> [[A0:%.*]], <8 x half> [[A1:%.*]], <8 x half> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, 
ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -455,7 +455,7 @@ define <8 x half> @test_rsqrt_sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) ; CHECK-LABEL: define <8 x half> @test_rsqrt_sh( ; CHECK-SAME: <8 x half> [[A0:%.*]], <8 x half> [[A1:%.*]], <8 x half> [[A2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -481,7 +481,7 @@ define <8 x half> @test_rsqrt_sh(<8 x half> %a0, <8 x half> %a1, <8 x half> %a2) define <8 x half> @test_rsqrt_sh_load(<8 x half> %a0, ptr %a1ptr) #0 { ; CHECK-LABEL: define <8 x half> @test_rsqrt_sh_load( ; CHECK-SAME: <8 x half> [[A0:%.*]], ptr [[A1PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -514,7 +514,7 @@ define <8 x half> @test_rsqrt_sh_maskz(<8 x half> %a0, i8 %mask) #0 { ; CHECK-LABEL: define <8 x half> @test_rsqrt_sh_maskz( ; CHECK-SAME: <8 x half> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -540,9 +540,9 @@ define <8 x half> @test_rsqrt_sh_mask(<8 x half> %a0, <8 x half> %b0, <8 x half> ; CHECK-LABEL: define <8 x half> @test_rsqrt_sh_mask( ; CHECK-SAME: <8 x half> [[A0:%.*]], <8 x half> [[B0:%.*]], <8 x half> [[C0:%.*]], i8 [[MASK:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 
[[TMP5]], 0 @@ -675,9 +675,9 @@ declare <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half>, <32 x half define <32 x half> @test_rcp_ph_512(<32 x half> %a0, <32 x half> %a1, i32 %mask) #0 { ; CHECK-LABEL: define <32 x half> @test_rcp_ph_512( ; CHECK-SAME: <32 x half> [[A0:%.*]], <32 x half> [[A1:%.*]], i32 [[MASK:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[MASK]] to <32 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer @@ -725,7 +725,7 @@ define <8 x half> @test_rcp_sh(<8 x half> %a0) #0 { define <8 x half> @test_rcp_sh_load(<8 x half> %a0, ptr %a1ptr) #0 { ; CHECK-LABEL: define <8 x half> @test_rcp_sh_load( ; CHECK-SAME: <8 x half> [[A0:%.*]], ptr [[A1PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -764,8 +764,8 @@ define <32 x half>@test_int_x86_avx512_mask_reduce_ph_512(<32 x half> %x0, <32 x ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512_mask_reduce_ph_512( ; CHECK-SAME: <32 x half> [[X0:%.*]], <32 x half> [[X2:%.*]], i32 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -807,9 +807,9 @@ define <8 x half>@test_int_x86_avx512_mask_reduce_sh(<8 x half> %x0, <8 x half> ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_reduce_sh( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]], i8 [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = 
load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -838,8 +838,8 @@ define <8 x half>@test_int_x86_avx512_mask_reduce_sh_nomask(<8 x half> %x0, <8 x ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_reduce_sh_nomask( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -867,9 +867,9 @@ declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, define <32 x half>@test_int_x86_avx512_mask_rndscale_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512_mask_rndscale_ph_512( ; CHECK-SAME: <32 x half> [[X0:%.*]], <32 x half> [[X2:%.*]], i32 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[X3]] to <32 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer @@ -903,9 +903,9 @@ define <8 x half>@test_int_x86_avx512_mask_rndscale_sh(<8 x half> %x0, <8 x half ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_rndscale_sh( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]], i8 [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -934,8 +934,8 @@ define <8 x half>@test_int_x86_avx512_mask_rndscale_sh_nomask(<8 x half> %x0, <8 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_rndscale_sh_nomask( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -964,8 +964,8 @@ define <32 x half>@test_int_x86_avx512_mask_getexp_ph_512(<32 x half> %x0, <32 x ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512_mask_getexp_ph_512( ; CHECK-SAME: <32 x half> [[X0:%.*]], <32 x half> [[X1:%.*]], i32 [[X2:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -1004,9 +1004,9 @@ define <8 x half>@test_int_x86_avx512_mask_getexp_sh(<8 x half> %x0, <8 x half> ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_getexp_sh( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]], i8 [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -1035,8 +1035,8 @@ 
define <8 x half>@test_int_x86_avx512_mask_getexp_sh_nomask(<8 x half> %x0, <8 x ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_getexp_sh_nomask( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1062,7 +1062,7 @@ define <8 x half>@test_int_x86_avx512_mask_getexp_sh_nomask(<8 x half> %x0, <8 x define <8 x half>@test_int_x86_avx512_mask_getexp_sh_load(<8 x half> %x0, ptr %x1ptr) #0 { ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_getexp_sh_load( ; CHECK-SAME: <8 x half> [[X0:%.*]], ptr [[X1PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -1097,8 +1097,8 @@ define <32 x half>@test_int_x86_avx512_mask_getmant_ph_512(<32 x half> %x0, <32 ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512_mask_getmant_ph_512( ; CHECK-SAME: <32 x half> [[X0:%.*]], <32 x half> [[X2:%.*]], i32 [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 @@ -1140,9 +1140,9 @@ define <8 x half>@test_int_x86_avx512_mask_getmant_sh(<8 x half> %x0, <8 x half> ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_getmant_sh( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]], i8 [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -1171,8 +1171,8 @@ define <8 x half>@test_int_x86_avx512_mask_getmant_sh_nomask(<8 x half> %x0, <8 ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_getmant_sh_nomask( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1199,8 +1199,8 @@ define <8 x half>@test_int_x86_avx512_mask_getmant_sh_z(<8 x half> %x0, <8 x hal ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_getmant_sh_z( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1227,10 +1227,10 @@ declare <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half>, <32 x h define <32 x half>@test_int_x86_avx512_mask_scalef_ph_512(<32 x half> %x0, <32 x half> %x1, <32 x half> %x2, i32 %x3) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512_mask_scalef_ph_512( ; CHECK-SAME: <32 x half> [[X0:%.*]], <32 x half> [[X1:%.*]], <32 x half> [[X2:%.*]], i32 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; 
CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> ; CHECK-NEXT: [[MASK:%.*]] = bitcast i32 [[X3]] to <32 x i1> @@ -1278,9 +1278,9 @@ define <8 x half>@test_int_x86_avx512_mask_scalef_sh(<8 x half> %x0, <8 x half> ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_scalef_sh( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]], i8 [[X4:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 @@ -1309,8 +1309,8 @@ define <8 x half>@test_int_x86_avx512_mask_scalef_sh_nomask(<8 x half> %x0, <8 x ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_scalef_sh_nomask( ; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], <8 x half> [[X3:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -1336,7 +1336,7 @@ define <8 x half>@test_int_x86_avx512_mask_scalef_sh_nomask(<8 x half> %x0, <8 x define <8 x half>@test_int_x86_avx512_mask_scalef_sh_load(<8 x half> %x0, ptr %x1ptr) #0 { ; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_scalef_sh_load( ; CHECK-SAME: <8 x half> [[X0:%.*]], ptr [[X1PTR:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -1370,11 +1370,11 @@ declare <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half>, <8 x half> define <8 x half> @test_int_x86_avx512fp16_mask_add_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 { ; CHECK-LABEL: define <8 x half> 
@test_int_x86_avx512fp16_mask_add_sh(
; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1456,11 +1456,11 @@ declare <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half>, <8 x half>
define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_sub_sh(
; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1542,11 +1542,11 @@ declare <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half>, <8 x half>
define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_mul_sh(
; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1628,11 +1628,11 @@ declare <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half>, <8 x half>
define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_div_sh(
; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1714,11 +1714,11 @@ declare <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half>, <8 x half>
define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_min_sh(
; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1800,11 +1800,11 @@ declare <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half>, <8 x half>
define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(<8 x half> %x1, <8 x half> %x2, <8 x half> %src, i8 %mask, ptr %ptr) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512fp16_mask_max_sh(
; CHECK-SAME: <8 x half> [[X1:%.*]], <8 x half> [[X2:%.*]], <8 x half> [[SRC:%.*]], i8 [[MASK:%.*]], ptr [[PTR:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
@@ -1887,8 +1887,8 @@ define i8 @test_int_x86_avx512_mask_cmp_sh(<8 x half> %x0, <8 x half> %x1, i8 %x
; CHECK-LABEL: define i8 @test_int_x86_avx512_mask_cmp_sh(
; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X3:%.*]], i32 [[X4:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -1915,8 +1915,8 @@ define i8 @test_int_x86_avx512_mask_cmp_sh_all(<8 x half> %x0, <8 x half> %x1, i
; CHECK-LABEL: define i8 @test_int_x86_avx512_mask_cmp_sh_all(
; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X3:%.*]], i32 [[X4:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -2001,9 +2001,9 @@ declare <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32>, i32)
define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512(<16 x i32> %x0, <16 x half> %x1, i16 %x2) #0 {
; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512(
; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x half> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[MASK:%.*]] = bitcast i16 [[X2]] to <16 x i1>
@@ -2035,9 +2035,9 @@ define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512(<16 x i32> %x0, <16 x
define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_r(<16 x i32> %x0, <16 x half> %x1, i16 %x2) #0 {
; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_r(
; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x half> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[MASK:%.*]] = bitcast i16 [[X2]] to <16 x i1>
@@ -2089,7 +2089,7 @@ define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_nomask(<16 x i32> %x0
define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_z(<16 x i32> %x0, i16 %x2) #0 {
; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_mask_cvt_dq2ph_512_z(
; CHECK-SAME: <16 x i32> [[X0:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -2137,9 +2137,9 @@ declare <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32>, i32)
define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_r(<16 x i32> %x0, <16 x half> %x1, i16 %x2) #0 {
; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_r(
; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x half> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
; CHECK-NEXT: [[MASK:%.*]] = bitcast i16 [[X2]] to <16 x i1>
@@ -2191,7 +2191,7 @@ define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_nomask(<16 x i32> %x
define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_z(<16 x i32> %x0, i16 %x2) #0 {
; CHECK-LABEL: define <16 x half> @test_int_x86_avx512_mask_cvt_udq2ph_512_z(
; CHECK-SAME: <16 x i32> [[X0:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
@@ -2240,8 +2240,8 @@ define <16 x i32> @test_int_x86_avx512_mask_cvt_ph2dq_512(<16 x half> %x0, <16 x
; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_mask_cvt_ph2dq_512(
; CHECK-SAME: <16 x half> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -2283,8 +2283,8 @@ define <16 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_512(<16 x half> %x0, <16
; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_mask_cvt_ph2udq_512(
; CHECK-SAME: <16 x half> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -2326,8 +2326,8 @@ define <16 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_512(<16 x half> %x0, <16
; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_mask_cvtt_ph2dq_512(
; CHECK-SAME: <16 x half> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -2369,8 +2369,8 @@ define <16 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_512(<16 x half> %x0, <16
; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_mask_cvtt_ph2udq_512(
; CHECK-SAME: <16 x half> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -2411,9 +2411,9 @@ declare <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64>, i32)
define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512(<8 x i64> %x0, <8 x half> %x1, i8 %x2) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512(
; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
; CHECK-NEXT: [[MASK:%.*]] = bitcast i8 [[X2]] to <8 x i1>
@@ -2445,9 +2445,9 @@ define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512(<8 x i64> %x0, <8 x ha
define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_r(<8 x i64> %x0, <8 x half> %x1, i8 %x2) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_r(
; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
; CHECK-NEXT: [[MASK:%.*]] = bitcast i8 [[X2]] to <8 x i1>
@@ -2499,7 +2499,7 @@ define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_nomask(<8 x i64> %x0,
define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_z(<8 x i64> %x0, i8 %x2) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_qq2ph_512_z(
; CHECK-SAME: <8 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -2533,9 +2533,9 @@ declare <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64>, i32)
define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512(<8 x i64> %x0, <8 x half> %x1, i8 %x2) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512(
; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
; CHECK-NEXT: [[MASK:%.*]] = bitcast i8 [[X2]] to <8 x i1>
@@ -2567,9 +2567,9 @@ define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512(<8 x i64> %x0, <8 x h
define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_r(<8 x i64> %x0, <8 x half> %x1, i8 %x2) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_r(
; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x half> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
; CHECK-NEXT: [[MASK:%.*]] = bitcast i8 [[X2]] to <8 x i1>
@@ -2621,7 +2621,7 @@ define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_nomask(<8 x i64> %x0,
define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_z(<8 x i64> %x0, i8 %x2) #0 {
; CHECK-LABEL: define <8 x half> @test_int_x86_avx512_mask_cvt_uqq2ph_512_z(
; CHECK-SAME: <8 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
@@ -2656,8 +2656,8 @@ define <8 x i64> @test_int_x86_avx512_mask_cvt_ph2qq_512(<8 x half> %x0, <8 x i6
; CHECK-LABEL: define <8 x i64> @test_int_x86_avx512_mask_cvt_ph2qq_512(
; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -2699,8 +2699,8 @@ define <8 x i64> @test_int_x86_avx512_mask_cvt_ph2uqq_512(<8 x half> %x0, <8 x i
; CHECK-LABEL: define <8 x i64> @test_int_x86_avx512_mask_cvt_ph2uqq_512(
; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -2742,8 +2742,8 @@ define <8 x i64> @test_int_x86_avx512_mask_cvtt_ph2uqq_512(<8 x half> %x0, <8 x
; CHECK-LABEL: define <8 x i64> @test_int_x86_avx512_mask_cvtt_ph2uqq_512(
; CHECK-SAME: <8 x half> [[X0:%.*]], <8 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -3051,7 +3051,7 @@ define <8 x half> @test_x86_avx512fp16_vcvtsi2sh(<8 x half> %arg0, i32 %arg1) #0
; CHECK-LABEL: define <8 x half> @test_x86_avx512fp16_vcvtsi2sh(
; CHECK-SAME: <8 x half> [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -3089,7 +3089,7 @@ define <8 x half> @test_x86_avx512fp16_vcvtsi642sh(<8 x half> %arg0, i64 %arg1)
; CHECK-LABEL: define <8 x half> @test_x86_avx512fp16_vcvtsi642sh(
; CHECK-SAME: <8 x half> [[ARG0:%.*]], i64 [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -3127,7 +3127,7 @@ define <8 x half> @test_x86_avx512fp16_vcvtusi2sh(<8 x half> %arg0, i32 %arg1) #
; CHECK-LABEL: define <8 x half> @test_x86_avx512fp16_vcvtusi2sh(
; CHECK-SAME: <8 x half> [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -3165,7 +3165,7 @@ define <8 x half> @test_x86_avx512fp16_vcvtusi642sh(<8 x half> %arg0, i64 %arg1)
; CHECK-LABEL: define <8 x half> @test_x86_avx512fp16_vcvtusi642sh(
; CHECK-SAME: <8 x half> [[ARG0:%.*]], i64 [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
index d598142fe8dbf..f20d368e9abbc 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -71,9 +71,9 @@ define <2 x double> @test_mask_compress_pd_128(<2 x double> %data, <2 x double>
;
; CHECK-LABEL: define <2 x double> @test_mask_compress_pd_128(
; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -106,7 +106,7 @@ define <2 x double> @test_maskz_compress_pd_128(<2 x double> %data, i8 %mask) #0
;
; CHECK-LABEL: define <2 x double> @test_maskz_compress_pd_128(
; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -137,7 +137,7 @@ define <2 x double> @test_compress_pd_128(<2 x double> %data, <2 x double> %data
; CHECK-LABEL: define <2 x double> @test_compress_pd_128(
; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -161,9 +161,9 @@ define <4 x float> @test_mask_compress_ps_128(<4 x float> %data, <4 x float> %pa
;
; CHECK-LABEL: define <4 x float> @test_mask_compress_ps_128(
; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -196,7 +196,7 @@ define <4 x float> @test_maskz_compress_ps_128(<4 x float> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <4 x float> @test_maskz_compress_ps_128(
; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -227,7 +227,7 @@ define <4 x float> @test_compress_ps_128(<4 x float> %data, <4 x float> %data2)
; CHECK-LABEL: define <4 x float> @test_compress_ps_128(
; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -251,9 +251,9 @@ define <2 x i64> @test_mask_compress_q_128(<2 x i64> %data, <2 x i64> %passthru,
;
; CHECK-LABEL: define <2 x i64> @test_mask_compress_q_128(
; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -286,7 +286,7 @@ define <2 x i64> @test_maskz_compress_q_128(<2 x i64> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <2 x i64> @test_maskz_compress_q_128(
; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -317,7 +317,7 @@ define <2 x i64> @test_compress_q_128(<2 x i64> %data, <2 x i64> %data2) #0 {
; CHECK-LABEL: define <2 x i64> @test_compress_q_128(
; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -341,9 +341,9 @@ define <4 x i32> @test_mask_compress_d_128(<4 x i32> %data, <4 x i32> %passthru,
;
; CHECK-LABEL: define <4 x i32> @test_mask_compress_d_128(
; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -376,7 +376,7 @@ define <4 x i32> @test_maskz_compress_d_128(<4 x i32> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <4 x i32> @test_maskz_compress_d_128(
; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -407,7 +407,7 @@ define <4 x i32> @test_compress_d_128(<4 x i32> %data, <4 x i32> %data2) #0 {
; CHECK-LABEL: define <4 x i32> @test_compress_d_128(
; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -431,7 +431,7 @@ define <2 x double> @test_expand_pd_128(<2 x double> %data, <2 x double> %data2)
; CHECK-LABEL: define <2 x double> @test_expand_pd_128(
; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -455,9 +455,9 @@ define <2 x double> @test_mask_expand_pd_128(<2 x double> %data, <2 x double> %p
;
; CHECK-LABEL: define <2 x double> @test_mask_expand_pd_128(
; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -490,7 +490,7 @@ define <2 x double> @test_maskz_expand_pd_128(<2 x double> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <2 x double> @test_maskz_expand_pd_128(
; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -521,7 +521,7 @@ define <4 x float> @test_expand_ps_128(<4 x float> %data, <4 x float> %data2) #0
; CHECK-LABEL: define <4 x float> @test_expand_ps_128(
; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -545,9 +545,9 @@ define <4 x float> @test_mask_expand_ps_128(<4 x float> %data, <4 x float> %pass
;
; CHECK-LABEL: define <4 x float> @test_mask_expand_ps_128(
; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -580,7 +580,7 @@ define <4 x float> @test_maskz_expand_ps_128(<4 x float> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <4 x float> @test_maskz_expand_ps_128(
; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -611,7 +611,7 @@ define <2 x i64> @test_expand_q_128(<2 x i64> %data, <2 x i64> %data2) #0 {
; CHECK-LABEL: define <2 x i64> @test_expand_q_128(
; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -635,9 +635,9 @@ define <2 x i64> @test_mask_expand_q_128(<2 x i64> %data, <2 x i64> %passthru, i
;
; CHECK-LABEL: define <2 x i64> @test_mask_expand_q_128(
; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -670,7 +670,7 @@ define <2 x i64> @test_maskz_expand_q_128(<2 x i64> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <2 x i64> @test_maskz_expand_q_128(
; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -701,7 +701,7 @@ define <4 x i32> @test_expand_d_128(<4 x i32> %data, <4 x i32> %data2) #0 {
; CHECK-LABEL: define <4 x i32> @test_expand_d_128(
; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -725,9 +725,9 @@ define <4 x i32> @test_mask_expand_d_128(<4 x i32> %data, <4 x i32> %passthru, i
;
; CHECK-LABEL: define <4 x i32> @test_mask_expand_d_128(
; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -760,7 +760,7 @@ define <4 x i32> @test_maskz_expand_d_128(<4 x i32> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <4 x i32> @test_maskz_expand_d_128(
; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -791,9 +791,9 @@ define <4 x double> @test_mask_compress_pd_256(<4 x double> %data, <4 x double>
;
; CHECK-LABEL: define <4 x double> @test_mask_compress_pd_256(
; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -826,7 +826,7 @@ define <4 x double> @test_maskz_compress_pd_256(<4 x double> %data, i8 %mask) #0
;
; CHECK-LABEL: define <4 x double> @test_maskz_compress_pd_256(
; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -857,7 +857,7 @@ define <4 x double> @test_compress_pd_256(<4 x double> %data, <4 x double> %data
; CHECK-LABEL: define <4 x double> @test_compress_pd_256(
; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -881,9 +881,9 @@ define <8 x float> @test_mask_compress_ps_256(<8 x float> %data, <8 x float> %pa
;
; CHECK-LABEL: define <8 x float> @test_mask_compress_ps_256(
; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -913,7 +913,7 @@ define <8 x float> @test_maskz_compress_ps_256(<8 x float> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <8 x float> @test_maskz_compress_ps_256(
; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -941,7 +941,7 @@ define <8 x float> @test_compress_ps_256(<8 x float> %data, <8 x float> %data2)
; CHECK-LABEL: define <8 x float> @test_compress_ps_256(
; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -965,9 +965,9 @@ define <4 x i64> @test_mask_compress_q_256(<4 x i64> %data, <4 x i64> %passthru,
;
; CHECK-LABEL: define <4 x i64> @test_mask_compress_q_256(
; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -1000,7 +1000,7 @@ define <4 x i64> @test_maskz_compress_q_256(<4 x i64> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <4 x i64> @test_maskz_compress_q_256(
; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -1031,7 +1031,7 @@ define <4 x i64> @test_compress_q_256(<4 x i64> %data, <4 x i64> %data2) #0 {
; CHECK-LABEL: define <4 x i64> @test_compress_q_256(
; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1055,9 +1055,9 @@ define <8 x i32> @test_mask_compress_d_256(<8 x i32> %data, <8 x i32> %passthru,
;
; CHECK-LABEL: define <8 x i32> @test_mask_compress_d_256(
; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -1087,7 +1087,7 @@ define <8 x i32> @test_maskz_compress_d_256(<8 x i32> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <8 x i32> @test_maskz_compress_d_256(
; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -1115,7 +1115,7 @@ define <8 x i32> @test_compress_d_256(<8 x i32> %data, <8 x i32> %data2) #0 {
; CHECK-LABEL: define <8 x i32> @test_compress_d_256(
; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1139,7 +1139,7 @@ define <4 x double> @test_expand_pd_256(<4 x double> %data, <4 x double> %data2)
; CHECK-LABEL: define <4 x double> @test_expand_pd_256(
; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1163,9 +1163,9 @@ define <4 x double> @test_mask_expand_pd_256(<4 x double> %data, <4 x double> %p
;
; CHECK-LABEL: define <4 x double> @test_mask_expand_pd_256(
; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -1198,7 +1198,7 @@ define <4 x double> @test_maskz_expand_pd_256(<4 x double> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <4 x double> @test_maskz_expand_pd_256(
; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -1229,7 +1229,7 @@ define <8 x float> @test_expand_ps_256(<8 x float> %data, <8 x float> %data2) #0
; CHECK-LABEL: define <8 x float> @test_expand_ps_256(
; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1253,9 +1253,9 @@ define <8 x float> @test_mask_expand_ps_256(<8 x float> %data, <8 x float> %pass
;
; CHECK-LABEL: define <8 x float> @test_mask_expand_ps_256(
; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -1285,7 +1285,7 @@ define <8 x float> @test_maskz_expand_ps_256(<8 x float> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <8 x float> @test_maskz_expand_ps_256(
; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -1313,7 +1313,7 @@ define <4 x i64> @test_expand_q_256(<4 x i64> %data, <4 x i64> %data2) #0 {
; CHECK-LABEL: define <4 x i64> @test_expand_q_256(
; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1337,9 +1337,9 @@ define <4 x i64> @test_mask_expand_q_256(<4 x i64> %data, <4 x i64> %passthru, i
;
; CHECK-LABEL: define <4 x i64> @test_mask_expand_q_256(
; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -1372,7 +1372,7 @@ define <4 x i64> @test_maskz_expand_q_256(<4 x i64> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <4 x i64> @test_maskz_expand_q_256(
; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -1403,7 +1403,7 @@ define <8 x i32> @test_expand_d_256(<8 x i32> %data, <8 x i32> %data2) #0 {
; CHECK-LABEL: define <8 x i32> @test_expand_d_256(
; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1427,9 +1427,9 @@ define <8 x i32> @test_mask_expand_d_256(<8 x i32> %data, <8 x i32> %passthru, i
;
; CHECK-LABEL: define <8 x i32> @test_mask_expand_d_256(
; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -1459,7 +1459,7 @@ define <8 x i32> @test_maskz_expand_d_256(<8 x i32> %data, i8 %mask) #0 {
;
; CHECK-LABEL: define <8 x i32> @test_maskz_expand_d_256(
; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
@@ -1487,7 +1487,7 @@ define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) #0 {
; CHECK-LABEL: define i8 @test_cmpps_256(
; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1514,7 +1514,7 @@ define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: define i8 @test_cmpps_128(
; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1543,7 +1543,7 @@ define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) #0 {
; CHECK-LABEL: define i8 @test_cmppd_256(
; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -1572,7 +1572,7 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) #0 {
; CHECK-LABEL: define i8 @test_cmppd_128(
; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -1602,8 +1602,8 @@ define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1
; CHECK-LABEL: define <8 x float> @test_mm512_maskz_max_ps_256(
; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
@@ -1630,9 +1630,9 @@ define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1,
; CHECK-LABEL: define <8 x float> @test_mm512_mask_max_ps_256(
; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr),
align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) @@ -1659,7 +1659,7 @@ define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 % ; CHECK-LABEL: define <8 x float> @test_mm512_max_ps_256( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) @@ -1676,8 +1676,8 @@ define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1 ; CHECK-LABEL: define <4 x float> @test_mm512_maskz_max_ps_128( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]]) @@ -1707,9 +1707,9 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, ; CHECK-LABEL: define <4 x float> @test_mm512_mask_max_ps_128( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> 
[[TMP7]], [[TMP10]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]]) @@ -1739,7 +1739,7 @@ define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 % ; CHECK-LABEL: define <4 x float> @test_mm512_max_ps_128( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]]) @@ -1756,8 +1756,8 @@ define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1 ; CHECK-LABEL: define <8 x float> @test_mm512_maskz_min_ps_256( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) @@ -1784,9 +1784,9 @@ define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, ; CHECK-LABEL: define <8 x float> @test_mm512_mask_min_ps_256( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) @@ -1813,7 +1813,7 @@ define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 % ; CHECK-LABEL: define <8 x float> @test_mm512_min_ps_256( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP3:%.*]] = load <8 
x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]]) @@ -1830,9 +1830,9 @@ define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1 ; CHECK-LABEL: define <4 x float> @test_mm512_maskz_min_ps_128( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]]) @@ -1865,10 +1865,10 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, ; CHECK-LABEL: define <4 x float> @test_mm512_mask_min_ps_128( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], [[TMP12]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]]) @@ -1901,7 +1901,7 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 % ; CHECK-LABEL: define <4 x float> @test_mm512_min_ps_128( ; CHECK-SAME: <4 x float> [[A0:%.*]], 
<4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]]) @@ -1962,8 +1962,8 @@ define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> % ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermi2var_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i2> ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X1]], <4 x i32> [[TMP5]]) @@ -1987,9 +1987,9 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermi2var_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X1]], <4 x i32> [[TMP6]]) @@ -2024,8 +2024,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermt2var_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = 
load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i2> @@ -2049,10 +2049,10 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i ; ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP6]]) @@ -2088,10 +2088,10 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x ; ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP9]]) @@ -2129,8 +2129,8 @@ define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> % ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpermi2var_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i3> ; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X1]], <8 x i32> [[TMP5]]) @@ -2154,9 +2154,9 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermi2var_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i3> ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X1]], <8 x i32> [[TMP6]]) @@ -2188,8 +2188,8 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_ask_vpermt2var_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i3> @@ -2213,10 +2213,10 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i ; ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to 
ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i3> ; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP6]]) @@ -2249,10 +2249,10 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x ; ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP14:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i3> ; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP9]]) @@ -2287,8 +2287,8 @@ define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_vpermi2var_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP6]] to <2 x i1> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to <2 x double> @@ -2315,9 +2315,9 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x 
double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP15:%.*]] = trunc <2 x i64> [[TMP13]] to <2 x i1> ; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to <2 x double> @@ -2362,8 +2362,8 @@ define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_vpermi2var_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i2> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to <4 x double> @@ -2390,9 +2390,9 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i2> ; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to <4 x double> @@ -2437,8 +2437,8 @@ define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_vpermi2var_ps_128( ; CHECK-SAME: <4 x 
float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i2> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to <4 x float> @@ -2465,9 +2465,9 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, < ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i2> ; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to <4 x float> @@ -2510,10 +2510,10 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> % ; ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128_cast( ; CHECK-SAME: <4 x float> [[X0:%.*]], <2 x i64> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP11]] to <4 x i32> ; CHECK-NEXT: [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32> @@ -2561,8 +2561,8 @@ define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i ; CHECK-LABEL: define <8 x float> 
@test_int_x86_avx512_vpermi2var_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[TMP6]] to <8 x i3> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to <8 x float> @@ -2589,9 +2589,9 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, < ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP15:%.*]] = trunc <8 x i32> [[TMP13]] to <8 x i3> ; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to <8 x float> @@ -2633,8 +2633,8 @@ define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> % ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermi2var_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP8]] to <2 x i1> ; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X1]], <2 x i64> [[TMP5]]) @@ -2658,9 +2658,9 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermi2var_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; 
CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X1]], <2 x i64> [[TMP6]]) @@ -2695,8 +2695,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 { ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermt2var_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP8]] to <2 x i1> @@ -2720,10 +2720,10 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i ; ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermt2var_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP6]]) @@ -2759,10 +2759,10 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x ; ; 
CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP14:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP9]]) @@ -2800,8 +2800,8 @@ define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> % ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermi2var_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP8]] to <4 x i2> ; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X1]], <4 x i64> [[TMP5]]) @@ -2825,9 +2825,9 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermi2var_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP5:%.*]] = 
call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X1]], <4 x i64> [[TMP6]]) @@ -2862,8 +2862,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 { ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermt2var_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP8]] to <4 x i2> @@ -2887,10 +2887,10 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i ; ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermt2var_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i2> ; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP6]]) @@ -2926,10 +2926,10 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x ; ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr 
(i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i2>
 ; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP9]])
@@ -2967,8 +2967,8 @@ define <2 x double>@test_int_x86_avx512_scalef_pd_128(<2 x double> %x0, <2 x dou
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_scalef_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -2996,9 +2996,9 @@ define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_scalef_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -3029,8 +3029,8 @@ define <4 x double>@test_int_x86_avx512_scalef_pd_256(<4 x double> %x0, <4 x dou
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_scalef_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -3058,9 +3058,9 @@ define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_scalef_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -3091,8 +3091,8 @@ define <4 x float>@test_int_x86_avx512_scalef_ps_128(<4 x float> %x0, <4 x float
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_scalef_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -3120,9 +3120,9 @@ define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_scalef_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -3153,8 +3153,8 @@ define <8 x float>@test_int_x86_avx512_scalef_ps_256(<8 x float> %x0, <8 x float
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_scalef_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -3182,9 +3182,9 @@ define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_scalef_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -3216,8 +3216,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -3273,8 +3273,8 @@ define void @test_int_x86_avx512_mask_pmov_qb_mem_128(ptr %ptr, <2 x i64> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -3312,8 +3312,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -3369,8 +3369,8 @@ define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(ptr %ptr, <2 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -3408,8 +3408,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -3465,8 +3465,8 @@ define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(ptr %ptr, <2 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -3504,8 +3504,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -3561,8 +3561,8 @@ define void @test_int_x86_avx512_mask_pmov_qb_mem_256(ptr %ptr, <4 x i64> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -3600,8 +3600,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -3657,8 +3657,8 @@ define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(ptr %ptr, <4 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -3696,8 +3696,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -3753,8 +3753,8 @@ define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(ptr %ptr, <4 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -3792,8 +3792,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -3849,8 +3849,8 @@ define void @test_int_x86_avx512_mask_pmov_qw_mem_128(ptr %ptr, <2 x i64> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -3888,8 +3888,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -3945,8 +3945,8 @@ define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(ptr %ptr, <2 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -3984,8 +3984,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4041,8 +4041,8 @@ define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(ptr %ptr, <2 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -4080,8 +4080,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -4137,8 +4137,8 @@ define void @test_int_x86_avx512_mask_pmov_qw_mem_256(ptr %ptr, <4 x i64> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -4176,8 +4176,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -4233,8 +4233,8 @@ define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(ptr %ptr, <4 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -4272,8 +4272,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -4329,8 +4329,8 @@ define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(ptr %ptr, <4 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4425,8 +4425,8 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_128(ptr %ptr, <2 x i64> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -4464,8 +4464,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4521,8 +4521,8 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(ptr %ptr, <2 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -4560,8 +4560,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -4617,8 +4617,8 @@ define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(ptr %ptr, <2 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -4668,8 +4668,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
@@ -4698,7 +4698,7 @@ define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) #0
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmov_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
@@ -4729,8 +4729,8 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_256(ptr %ptr, <4 x i64> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -4767,7 +4767,7 @@ define <4 x i32>@test_int_x86_avx512_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1)
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovs_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -4792,8 +4792,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -4820,7 +4820,7 @@ define <4 x i32>@test_int_x86_avx512_maskz_pmovs_qd_256(<4 x i64> %x0, i8 %x2) #
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovs_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -4846,8 +4846,8 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(ptr %ptr, <4 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -4884,7 +4884,7 @@ define <4 x i32>@test_int_x86_avx512_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1)
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovus_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -4909,8 +4909,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -4937,7 +4937,7 @@ define <4 x i32>@test_int_x86_avx512_maskz_pmovus_qd_256(<4 x i64> %x0, i8 %x2)
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovus_qd_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -4963,8 +4963,8 @@ define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(ptr %ptr, <4 x i64> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
@@ -5002,8 +5002,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5059,8 +5059,8 @@ define void @test_int_x86_avx512_mask_pmov_db_mem_128(ptr %ptr, <4 x i32> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -5098,8 +5098,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5155,8 +5155,8 @@ define void @test_int_x86_avx512_mask_pmovs_db_mem_128(ptr %ptr, <4 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -5194,8 +5194,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5251,8 +5251,8 @@ define void @test_int_x86_avx512_mask_pmovus_db_mem_128(ptr %ptr, <4 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -5290,8 +5290,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -5347,8 +5347,8 @@ define void @test_int_x86_avx512_mask_pmov_db_mem_256(ptr %ptr, <8 x i32> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -5386,8 +5386,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -5443,8 +5443,8 @@ define void @test_int_x86_avx512_mask_pmovs_db_mem_256(ptr %ptr, <8 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -5482,8 +5482,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -5539,8 +5539,8 @@ define void @test_int_x86_avx512_mask_pmovus_db_mem_256(ptr %ptr, <8 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -5578,8 +5578,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5635,8 +5635,8 @@ define void @test_int_x86_avx512_mask_pmov_dw_mem_128(ptr %ptr, <4 x i32> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -5674,8 +5674,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5731,8 +5731,8 @@ define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(ptr %ptr, <4 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -5770,8 +5770,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -5827,8 +5827,8 @@ define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(ptr %ptr, <4 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(
 ; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -5866,8 +5866,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -5923,8 +5923,8 @@ define void @test_int_x86_avx512_mask_pmov_dw_mem_256(ptr %ptr, <8 x i32> %x1, i
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -5962,8 +5962,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -6019,8 +6019,8 @@ define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(ptr %ptr, <8 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -6058,8 +6058,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
@@ -6115,8 +6115,8 @@ define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(ptr %ptr, <8 x i32> %x1,
 ; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(
 ; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
@@ -6154,8 +6154,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -6195,7 +6195,7 @@ define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2dq_128_zext(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -6222,8 +6222,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, <
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128_zext(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
@@ -6253,7 +6253,7 @@ define <4 x float>@test_int_x86_avx512_cvt_pd2ps(<2 x double> %x0, <4 x float> %
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+;
CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6278,8 +6278,8 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x flo ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6305,7 +6305,7 @@ define <4 x float>@test_int_x86_avx512_cvt_pd2ps_zext(<2 x double> %x0, <4 x flo ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6332,8 +6332,8 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6363,7 +6363,7 @@ define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; 
CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6388,8 +6388,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6415,7 +6415,7 @@ define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6442,8 +6442,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6473,7 +6473,7 @@ define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 ; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 @@ -6498,8 +6498,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -6527,7 +6527,7 @@ define <4 x i32>@test_int_x86_avx512_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2dq_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6552,8 +6552,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i3 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6581,7 +6581,7 @@ define <8 x i32>@test_int_x86_avx512_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2dq_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 @@ -6606,8 +6606,8 @@ define <8 x 
i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i3 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -6635,7 +6635,7 @@ define <4 x i32>@test_int_x86_avx512_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> % ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2udq_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6660,8 +6660,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2udq_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6689,7 +6689,7 @@ define <8 x i32>@test_int_x86_avx512_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> % ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2udq_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 @@ -6714,8 +6714,8 @@ define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i ; CHECK-LABEL: define <8 x i32> 
@test_int_x86_avx512_mask_cvt_ps2udq_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -6743,7 +6743,7 @@ define <4 x i32>@test_int_x86_avx512_ask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_ask_cvtt_pd2dq_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6768,8 +6768,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6795,7 +6795,7 @@ define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2dq_128_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6822,8 +6822,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 
[[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6853,7 +6853,7 @@ define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6878,8 +6878,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6905,7 +6905,7 @@ define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -6932,8 +6932,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128_zext( ; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -6963,7 +6963,7 @@ define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 @@ -6988,8 +6988,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -7017,7 +7017,7 @@ define <4 x i32>@test_int_x86_avx512_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_ps2udq_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -7042,8 +7042,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) 
to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -7071,7 +7071,7 @@ define <8 x i32>@test_int_x86_avx512_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvtt_ps2udq_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 @@ -7096,8 +7096,8 @@ define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -7125,9 +7125,9 @@ define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, < ; ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_rndscale_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[X3]] to i2 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i2 [[TMP4]] to <2 x i1> @@ -7162,9 +7162,9 @@ define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, < ; ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_rndscale_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[X3]] to i4 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> @@ -7199,9 +7199,9 @@ define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 ; ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_rndscale_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[X3]] to i4 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> @@ -7236,9 +7236,9 @@ define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 ; ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_rndscale_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer @@ -7273,8 +7273,8 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_getmant_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast 
<2 x i64> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -7330,8 +7330,8 @@ define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_getmant_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -7374,8 +7374,8 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_getmant_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -7418,8 +7418,8 @@ define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_getmant_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -7461,8 +7461,8 @@ define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pternlog_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: 
[[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP7]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -7490,9 +7490,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pternlog_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0 @@ -7535,9 +7535,9 @@ define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i3 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pternlog_d_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP8]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0 @@ -7579,8 +7579,8 @@ define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_pternlog_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP7]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -7608,9 +7608,9 @@ define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_pternlog_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0 @@ -7650,9 +7650,9 @@ define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i3 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_pternlog_d_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i32> [[TMP8]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0 @@ -7691,8 +7691,8 @@ define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_pternlog_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP7]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 @@ -7720,9 +7720,9 @@ define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_pternlog_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0 @@ -7763,9 +7763,9 @@ define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i6 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_pternlog_q_128( ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP8]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0 @@ -7807,8 +7807,8 @@ define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_pternlog_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to 
ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP7]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 @@ -7836,9 +7836,9 @@ define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_pternlog_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0 @@ -7879,9 +7879,9 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_pternlog_q_256( ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i64> [[TMP8]] to i256 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0 @@ -7922,8 +7922,8 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s ; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_128( ; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <8 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
@@ -7980,8 +7980,8 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
 ; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_256(
 ; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
 ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
@@ -8046,7 +8046,7 @@ define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrkz(
 ; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -8071,9 +8071,9 @@ define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %
 ;
 ; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrk(
 ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
@@ -8113,7 +8113,7 @@ define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrkz(
 ; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4
@@ -8139,9 +8139,9 @@ define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %
 ;
 ; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrk(
 ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
@@ -8185,7 +8185,7 @@ define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrkz(
 ; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
@@ -8210,9 +8210,9 @@ define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %ma
 ;
 ; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrk(
 ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
@@ -8252,7 +8252,7 @@ define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrkz(
 ; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4
@@ -8278,9 +8278,9 @@ define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %ma
 ;
 ; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrk(
 ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
@@ -8324,7 +8324,7 @@ define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrkz(
 ; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4
@@ -8350,9 +8350,9 @@ define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i
 ;
 ; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrk(
 ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
@@ -8393,7 +8393,7 @@ define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrkz(
 ; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i2
@@ -8419,9 +8419,9 @@ define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i
 ;
 ; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrk(
 ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i2
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i2 [[TMP4]] to <2 x i1>
@@ -8465,7 +8465,7 @@ define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrkz(
 ; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i4
@@ -8491,9 +8491,9 @@ define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8
 ;
 ; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrk(
 ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i4
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
@@ -8534,7 +8534,7 @@ define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 {
 ;
 ; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrkz(
 ; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i8 [[MASK]] to i2
@@ -8560,9 +8560,9 @@ define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8
 ;
 ; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrk(
 ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[MASK]] to i2
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i2 [[TMP4]] to <2 x i1>
@@ -8592,7 +8592,7 @@ define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i6
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_permvar_df_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
@@ -8617,9 +8617,9 @@ define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_permvar_df_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
@@ -8659,8 +8659,8 @@ define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_permvar_df_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
@@ -8700,7 +8700,7 @@ define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_permvar_di_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP2]]
 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
@@ -8716,9 +8716,9 @@ define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_permvar_di_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP9]]
 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
@@ -8747,8 +8747,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i6
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_permvar_di_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP8]], [[TMP9]]
 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
@@ -8779,9 +8779,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_fixupimm_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -8846,9 +8846,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0,
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_fixupimm_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -8898,9 +8898,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_fixupimm_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -8965,9 +8965,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0,
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_fixupimm_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -9032,9 +9032,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_fixupimm_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -9099,9 +9099,9 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_fixupimm_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
@@ -9166,9 +9166,9 @@ define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_fixupimm_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -9233,9 +9233,9 @@ define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_fixupimm_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
@@ -9297,7 +9297,7 @@ define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psra_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -9318,10 +9318,10 @@ define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psra_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i128 [[TMP6]] to i64
@@ -9358,9 +9358,9 @@ define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psra_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i128 [[TMP16]] to i64
@@ -9399,7 +9399,7 @@ define <4 x i64> @test_x86_avx512_psra_q_256(<4 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psra_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
@@ -9420,10 +9420,10 @@ define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1,
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psra_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 88), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i128 [[TMP6]] to i64
@@ -9460,9 +9460,9 @@ define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1,
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psra_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 88), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
 ; CHECK-NEXT: [[TMP17:%.*]] = trunc i128 [[TMP16]] to i64
@@ -9516,9 +9516,9 @@ define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %pas
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrai_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP5]], i32 7)
 ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
@@ -9550,8 +9550,8 @@ define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask, i8
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP9]], i32 7)
 ; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
@@ -9600,9 +9600,9 @@ define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %pas
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrai_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP5]], i32 7)
 ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
@@ -9634,8 +9634,8 @@ define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask, i8
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP9]], i32 7)
 ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
@@ -9668,7 +9668,7 @@ define <2 x i64> @test_x86_avx512_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psrav_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
@@ -9687,10 +9687,10 @@ define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrav_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = sext <2 x i1> [[TMP6]] to <2 x i64>
@@ -9725,9 +9725,9 @@ define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1
 ; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(
 ; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP14:%.*]] = sext <2 x i1> [[TMP13]] to <2 x i64>
@@ -9763,7 +9763,7 @@ define <4 x i64> @test_x86_avx512_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psrav_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
@@ -9782,10 +9782,10 @@ define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1,
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrav_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i64>
@@ -9820,9 +9820,9 @@ define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1
 ; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(
 ; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
 ; CHECK-NEXT: [[TMP14:%.*]] = sext <4 x i1> [[TMP13]] to <4 x i64>
@@ -9858,8 +9858,8 @@ define <8 x float> @test_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x flo
 ; CHECK-LABEL: define <8 x float> @test_vfmadd256_ps(
 ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
@@ -9876,9 +9876,9 @@ define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8
 ; CHECK-LABEL: define <8 x float> @test_mask_vfmadd256_ps(
 ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
@@ -9906,8 +9906,8 @@ define <4 x float> @test_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 ; CHECK-LABEL: define <4 x float> @test_vfmadd128_ps(
 ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
@@ -9924,9 +9924,9 @@ define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps(
 ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
@@ -9957,8 +9957,8 @@ define <4 x double> @test_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x dou
 ; CHECK-LABEL: define <4 x double> @test_fmadd256_pd(
 ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
@@ -9975,9 +9975,9 @@ define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4
 ; CHECK-LABEL: define <4 x double> @test_mask_fmadd256_pd(
 ; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
@@ -10008,8 +10008,8 @@ define <2 x double> @test_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x dou
 ; CHECK-LABEL: define <2 x double> @test_fmadd128_pd(
 ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
@@ -10026,9 +10026,9 @@ define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2
 ; CHECK-LABEL: define <2 x double> @test_mask_fmadd128_pd(
 ; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
@@ -10060,9 +10060,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmadd_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
@@ -10094,9 +10094,9 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmadd_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP9]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
@@ -10127,9 +10127,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmadd_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
@@ -10161,9 +10161,9 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmadd_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP9]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
@@ -10194,9 +10194,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
@@ -10228,9 +10228,9 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
@@ -10261,9 +10261,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmadd_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
@@ -10292,9 +10292,9 @@ define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmadd_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
 ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
@@ -10321,10 +10321,10 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2
 ;
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsub_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
@@ -10358,10 +10358,10 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4
 ;
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsub_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]]
 ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
@@ -10395,10 +10395,10 @@
define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x ; ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]] @@ -10432,10 +10432,10 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x ; ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsub_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]] @@ -10465,9 +10465,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x define <8 x float> @test_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: define <8 x float> @test_vfnmadd256_ps( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]] @@ -10486,10 +10486,10 @@ define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 ; ; CHECK-LABEL: define <8 x float> @test_mask_vfnmadd256_ps( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]] @@ -10519,9 +10519,9 @@ define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 define <4 x float> @test_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-LABEL: define <4 x float> @test_vfnmadd128_ps( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]] @@ -10540,10 +10540,10 @@ define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 ; ; CHECK-LABEL: define <4 x float> @test_mask_vfnmadd128_ps( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]] @@ -10576,9 +10576,9 @@ define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 define <4 x double> @test_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: define <4 x double> @test_vfnmadd256_pd( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]] @@ -10597,10 +10597,10 @@ define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, ; ; CHECK-LABEL: define <4 x double> @test_mask_vfnmadd256_pd( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]] @@ -10633,9 +10633,9 @@ define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, define <2 x double> @test_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: define <2 x double> @test_vfnmadd128_pd( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]] @@ -10654,10 +10654,10 @@ define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, ; ; CHECK-LABEL: define <2 x double> @test_mask_vfnmadd128_pd( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]] @@ -10690,8 +10690,8 @@ define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, define <8 x float> @test_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 { ; CHECK-LABEL: define <8 x float> @test_vfnmsub256_ps( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]] @@ -10714,10 +10714,10 @@ define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 ; ; CHECK-LABEL: define <8 x float> @test_mask_vfnmsub256_ps( ; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 
add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]] @@ -10750,8 +10750,8 @@ define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 define <4 x float> @test_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-LABEL: define <4 x float> @test_vfnmsub128_ps( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]] @@ -10774,10 +10774,10 @@ define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 ; ; CHECK-LABEL: define <4 x float> @test_mask_vfnmsub128_ps( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]] @@ -10813,8 +10813,8 @@ define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 define <4 x double> @test_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 { ; CHECK-LABEL: define <4 x double> @test_vfnmsub256_pd( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), 
align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]] @@ -10837,10 +10837,10 @@ define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, ; ; CHECK-LABEL: define <4 x double> @test_mask_vfnmsub256_pd( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]] @@ -10876,8 +10876,8 @@ define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, define <2 x double> @test_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: define <2 x double> @test_vfnmsub128_pd( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]] @@ -10900,10 +10900,10 @@ define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, ; ; CHECK-LABEL: define <2 x double> @test_mask_vfnmsub128_pd( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: 
[[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]] @@ -10941,9 +10941,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, < ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X0]] @@ -10981,9 +10981,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, < ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X0]] @@ -11021,9 +11021,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; 
CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X0]] @@ -11061,9 +11061,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X0]] @@ -11097,8 +11097,8 @@ define <8 x float> @test_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x flo ; CHECK-LABEL: define <8 x float> @test_fmaddsub256_ps( ; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP7]] @@ -11125,9 +11125,9 @@ define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 ; CHECK-LABEL: define <8 x float> @test_mask_fmaddsub256_ps( ; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]] @@ -11165,8 +11165,8 @@ define <4 x float> @test_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x flo ; CHECK-LABEL: define <4 x float> @test_fmaddsub128_ps( ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP7]] @@ -11193,9 +11193,9 @@ define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 ; CHECK-LABEL: define <4 x float> @test_mask_fmaddsub128_ps( ; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]] @@ -11236,8 +11236,8 @@ define <4 x double> @test_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 ; CHECK-LABEL: define <4 x double> @test_vfmaddsub256_pd( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP7]] @@ -11264,9 +11264,9 @@ define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a ; CHECK-LABEL: define <4 x double> @test_mask_vfmaddsub256_pd( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]] @@ -11307,8 +11307,8 @@ define <2 x double> @test_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 ; CHECK-LABEL: define <2 x double> @test_vfmaddsub128_pd( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP7]] @@ -11335,9 +11335,9 @@ define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a ; CHECK-LABEL: define <2 x double> @test_mask_vfmaddsub128_pd( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), 
align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]] @@ -11379,9 +11379,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]] @@ -11423,9 +11423,9 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP12]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]] @@ -11466,9 +11466,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]] @@ -11510,9 +11510,9 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP12]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]] @@ -11553,9 +11553,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, < ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]] @@ -11597,9 +11597,9 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, < ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> 
[[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP12]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]] @@ -11640,9 +11640,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, < ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]] @@ -11681,9 +11681,9 @@ define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, < ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP12]] ; CHECK-NEXT: 
[[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]] @@ -11721,9 +11721,9 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]] @@ -11765,9 +11765,9 @@ define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]] @@ -11809,9 +11809,9 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, < ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: 
[[TMP13:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]] @@ -11853,9 +11853,9 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, < ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]] @@ -11893,10 +11893,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmk( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -11939,10 +11939,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1 ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmka( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] 
= load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -11985,9 +11985,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1 ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkz( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -12015,9 +12015,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkza( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -12045,10 +12045,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmb( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: 
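;
; NOTE: every rewrite in these checks swaps two spellings of the same address:
; byte offset N into the MSan parameter-shadow TLS block. A minimal standalone
; sketch of the equivalence for N = 32 (the [100 x i64] type given to
; @__msan_param_tls below is an assumption for illustration; the checks depend
; only on the symbol and the byte offset):
;
@__msan_param_tls = external thread_local global [100 x i64]

define <8 x i32> @shadow_load_old_spelling() {
  ; old form: round-trip the pointer through integers, offset with an add constant expression
  %s = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
  ret <8 x i32> %s
}

define <8 x i32> @shadow_load_new_spelling() {
  ; new form: the same 32-byte displacement as a direct i8 getelementptr
  %s = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
  ret <8 x i32> %s
}
;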
[[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -12103,10 +12103,10 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1 ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmba( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -12161,9 +12161,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1 ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbz( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -12203,9 +12203,9 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a ; ; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbza( ; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -12245,10 +12245,10 @@ define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> % ; ; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmk( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -12291,9 +12291,9 @@ define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> ; ; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmkz( ; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -12321,10 +12321,10 @@ define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> % ; ; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmk( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: 
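;
; NOTE: the [[_MSCMP]]/br pair that follows is MSan's guard on the pointer
; operand: the i64 shadow loaded for the ptr argument is compared with zero,
; and a nonzero (uninitialized) shadow branches to the report path before the
; memory operand is read; the !prof weights mark that path cold. A standalone
; sketch of the shape (function body and label names here are illustrative,
; not taken from the generated checks):
;
declare void @__msan_warning_noreturn()

define void @pointer_shadow_guard(i64 %ptr_shadow) {
entry:
  %mscmp = icmp ne i64 %ptr_shadow, 0
  br i1 %mscmp, label %report, label %ok

report:                                   ; cold path: shadow was nonzero
  call void @__msan_warning_noreturn()
  unreachable

ok:                                       ; shadow clean, continue as normal
  ret void
}
;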
[[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -12367,9 +12367,9 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> ; ; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmkz( ; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll index 216096199fd06..5e937485ff282 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll @@ -16,8 +16,8 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> @@ -49,11 +49,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; 
CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] @@ -141,8 +141,8 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> @@ -174,11 +174,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -270,8 +270,8 @@ declare <8 x i32> 
@llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <32 x i8> @@ -303,11 +303,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -395,8 +395,8 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> @@ -428,11 +428,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -524,8 +524,8 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> @@ -557,11 +557,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -649,8 +649,8 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> @@ -682,11 +682,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, 
i64 56), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -779,8 +779,8 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> @@ -812,11 +812,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -904,8 +904,8 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> @@ -937,11 +937,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll index 26b1306e03894..1d3046804b74f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll @@ -15,8 +15,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr 
@__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer @@ -44,11 +44,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP40:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] @@ -131,8 +131,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer @@ -160,11 +160,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP33:%.*]] = load <16 x 
i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -253,8 +253,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8> define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpbusds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP24]], zeroinitializer @@ -282,11 +282,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpbusds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP40:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: 
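;
; NOTE: the offsets in these mask variants fall out of laying each argument's
; shadow back-to-back in @__msan_param_tls, every slot rounded up to 8-byte
; alignment. Worked through for the 256-bit signature here,
;   (<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4, i8 %x3):
;   %x0  -> offset   0  (<8 x i32> shadow, 32 bytes)
;   %x1  -> offset  32  (<32 x i8> shadow, 32 bytes)
;   %x2p -> offset  64  (i64 shadow of the pointer, 8 bytes)
;   %x4  -> offset  72  (<32 x i8> shadow, 32 bytes)
;   %x3  -> offset 104  (i8 shadow, padded to an 8-byte slot)
; which is exactly the 0/32/64/72/104 series the checks load above and below.
;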
[[TMP40:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -369,8 +369,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8> define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpbusds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP24]], zeroinitializer @@ -398,11 +398,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP33:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -491,8 +491,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 
8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> @@ -524,11 +524,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -619,8 +619,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> @@ -652,11 +652,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { ; 
CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -753,8 +753,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32> define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpdpwssds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> @@ -786,11 +786,11 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <8 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = 
load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -881,8 +881,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32> define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpdpwssds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -926,11 +926,11 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) sanitize_memory { ; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <4 x i32> [[X4:%.*]], i8 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll index 
f6410c6799a55..5c99f8a3a1fb6 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll @@ -16,8 +16,8 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, < define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> @@ -49,11 +49,11 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] @@ -141,8 +141,8 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: 
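;
; NOTE: the 512-bit variants in this file walk the same layout with 64-byte
; vector slots: for
;   (<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3)
; the running 8-byte-aligned offsets are 0, 64, 128, 136 and 200, matching the
; constants in the surrounding checks; the wider i16 mask shadow still occupies
; a single 8-byte slot. A standalone sketch of that last load in the new
; spelling (same assumed external declaration as the sketch further up):
;
@__msan_param_tls = external thread_local global [100 x i64]

define i16 @mask_shadow_load_sketch() {
  ; i16 shadow of %x3 lives at byte offset 200
  %s = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8
  ret i16 %s
}
;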
[[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <64 x i8> @@ -174,11 +174,11 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <16 x i32> % define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -266,8 +266,8 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, < define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> @@ -299,11 +299,11 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x 
i32> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -391,8 +391,8 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> @@ -424,11 +424,11 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> % define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll index 6d4ce6dec5198..236ff45c6cd08 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll @@ -15,8 +15,8 @@ declare <16 x i32> @llvm.x86.avx512.vpdpbusd.512(<16 x i32>, <64 x i8>, <64 x i8 define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer @@ -44,11 +44,11 @@ define <16 x i32> @test_int_x86_avx512_ask_vpdpbusd_512(<16 x i32> %x0, <64 x i8 define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <64 x i8> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, 
ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1:![0-9]+]] @@ -131,8 +131,8 @@ declare <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32>, <64 x i8>, <64 x i define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, <64 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpbusds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <64 x i8> [[TMP24]], zeroinitializer @@ -160,11 +160,11 @@ define <16 x i32>@test_int_x86_avx512_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <64 x i8> %x1, ptr %x2p, <64 x i8> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <64 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <64 x i8> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP33:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP40:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -247,8 +247,8 @@ declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_vpdpwssd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 
x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> @@ -280,11 +280,11 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] @@ -375,8 +375,8 @@ declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_int_x86_avx512_ask_vpdpwssds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> @@ -408,11 +408,11 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x 
i3 define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) sanitize_memory { ; CHECK-LABEL: define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512( ; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i32> [[X4:%.*]], i16 [[X3:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll index 1de2a54486e58..0344fbd5ee2a9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll @@ -15,8 +15,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer @@ -46,8 +46,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: 
[[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer @@ -77,8 +77,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8> define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <32 x i8> %x1, <32 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpbusds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP4]], zeroinitializer @@ -108,8 +108,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8> define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <16 x i8> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpbusds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP23:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP4]], zeroinitializer @@ -139,8 +139,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpwssd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x 
i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> @@ -174,8 +174,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpwssd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> @@ -209,8 +209,8 @@ declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32> define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx_vpdpwssds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> @@ -244,8 +244,8 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32> define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx_vpdpwssds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll index 66cbebee80dc3..707b46bb8686e 100644 --- 
a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll @@ -26,8 +26,8 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsud_128( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -44,8 +44,8 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsud_256( ; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -62,8 +62,8 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -80,8 +80,8 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256( ; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 
add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -98,8 +98,8 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusd_128( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -116,8 +116,8 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusd_256( ; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -134,8 +134,8 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusds_128( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: 
[[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -152,8 +152,8 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusds_256( ; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -170,8 +170,8 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuud_128( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -188,8 +188,8 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuud_256( ; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] @@ -206,8 +206,8 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: 
[[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] @@ -224,8 +224,8 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256( ; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll index d91abeac6a816..4a7050790007b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint8-intrinsics.ll @@ -15,10 +15,10 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbssd.128(<4 x i32>, <16 x i8>, <16 x i8>) define <4 x i32>@test_int_x86_avx2_vpdpbssd_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbssd_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1:![0-9]+]] @@ -78,10 +78,10 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbssds.128(<4 x i32>, <16 x i8>, <16 x i8>) define <4 x i32>@test_int_x86_avx2_vpdpbssds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> 
@test_int_x86_avx2_vpdpbssds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -141,10 +141,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbssd.256(<8 x i32>, <32 x i8>, <32 x i8>) define <8 x i32>@test_int_x86_avx2_vpdpbssd_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssd_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -204,10 +204,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbssds.256(<8 x i32>, <32 x i8>, <32 x i8>) define <8 x i32>@test_int_x86_avx2_vpdpbssds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbssds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP32:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -267,10 +267,10 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbsud.128(<4 x i32>, <16 x i8>, <16 x i8>) define <4 x i32>@test_int_x86_avx2_vpdpbsud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsud_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -330,10 +330,10 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbsuds.128(<4 x i32>, <16 x i8>, <16 x i8>) define <4 x i32>@test_int_x86_avx2_vpdpbsuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbsuds_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -393,10 +393,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbsud.256(<8 x i32>, <32 x i8>, <32 x i8>) define <8 x i32>@test_int_x86_avx2_vpdpbsud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsud_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { -; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -456,10 +456,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbsuds.256(<8 x i32>, <32 x i8>, <32 x i8>) define <8 x i32>@test_int_x86_avx2_vpdpbsuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbsuds_256( ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] @@ -519,10 +519,10 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbuud.128(<4 x i32>, <16 x i8>, <16 x i8>) define <4 x i32>@test_int_x86_avx2_vpdpbuud_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuud_128( ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
@@ -582,10 +582,10 @@ declare <4 x i32> @llvm.x86.avx2.vpdpbuuds.128(<4 x i32>, <16 x i8>, <16 x i8>)
 define <4 x i32>@test_int_x86_avx2_vpdpbuuds_128(<4 x i32> %x0, <16 x i8> %x1, ptr %x2p, <16 x i8> %x4) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpbuuds_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <16 x i8> [[X4:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
@@ -645,10 +645,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbuud_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuud_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
@@ -708,10 +708,10 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 define <8 x i32>@test_int_x86_avx2_vpdpbuuds_256(<8 x i32> %x0, <32 x i8> %x1, ptr %x2p, <32 x i8> %x4) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpbuuds_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <32 x i8> [[X1:%.*]], ptr [[X2P:%.*]], <32 x i8> [[X4:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/f16c-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/f16c-intrinsics.ll
index e663a7bfeef24..cd2ccaf32e946 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/f16c-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/f16c-intrinsics.ll
@@ -47,7 +47,7 @@ define void @test_x86_vcvtps2ph_256_m(ptr nocapture %d, <8 x float> %a) nounwind
 ; CHECK-LABEL: define void @test_x86_vcvtps2ph_256_m(
 ; CHECK-SAME: ptr captures(none) [[D:%.*]], <8 x float> [[A:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP17:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP17:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32> [[TMP17]], zeroinitializer
@@ -76,7 +76,7 @@ define void @test_x86_vcvtps2ph_128_m(ptr nocapture %d, <4 x float> %a) nounwind
 ; CHECK-LABEL: define void @test_x86_vcvtps2ph_128_m(
 ; CHECK-SAME: ptr captures(none) [[D:%.*]], <4 x float> [[A:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i32> [[TMP9]], zeroinitializer
@@ -109,7 +109,7 @@ define void @test_x86_vcvtps2ph_128_m2(ptr nocapture %hf4x16, <4 x float> %f4X86
 ; CHECK-LABEL: define void @test_x86_vcvtps2ph_128_m2(
 ; CHECK-SAME: ptr captures(none) [[HF4X16:%.*]], <4 x float> [[F4X86:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
@@ -145,7 +145,7 @@ define void @test_x86_vcvtps2ph_128_m3(ptr nocapture %hf4x16, <4 x float> %f4X86
 ; CHECK-LABEL: define void @test_x86_vcvtps2ph_128_m3(
 ; CHECK-SAME: ptr captures(none) [[HF4X16:%.*]], <4 x float> [[F4X86:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer
@@ -178,3 +178,6 @@ entry:
 }
 
 attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
index 3d98f60a8242a..d62fd7e8d1a89 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll
@@ -22,7 +22,7 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test1(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
@@ -68,7 +68,7 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test88(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -108,7 +108,7 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test87(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -148,7 +148,7 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test86(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -188,7 +188,7 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test85(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -228,7 +228,7 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test84(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -268,7 +268,7 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test83(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -308,7 +308,7 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test82(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -348,7 +348,7 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test81(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -388,7 +388,7 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test80(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -428,7 +428,7 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test79(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -468,7 +468,7 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test78(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -508,7 +508,7 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test77(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -548,7 +548,7 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test76(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP16]] to <4 x i16>
@@ -596,7 +596,7 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test75(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP16]] to <2 x i32>
@@ -644,7 +644,7 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test74(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP16]] to <4 x i16>
@@ -1049,7 +1049,7 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
@@ -1094,7 +1094,7 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
@@ -1139,7 +1139,7 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
@@ -1178,7 +1178,7 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
@@ -1223,7 +1223,7 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
@@ -1268,7 +1268,7 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
@@ -1307,7 +1307,7 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
@@ -1352,7 +1352,7 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
@@ -1396,7 +1396,7 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test56(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -1436,7 +1436,7 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test55(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -1476,7 +1476,7 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test54(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -1516,7 +1516,7 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test53(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -1556,7 +1556,7 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test52(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -1594,7 +1594,7 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test51(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -1634,7 +1634,7 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test50(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -1674,7 +1674,7 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test49(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP13:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP13:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16>
@@ -1732,7 +1732,7 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test48(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -1772,7 +1772,7 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test47(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -1812,7 +1812,7 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test46(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -1852,7 +1852,7 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test45(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -1891,7 +1891,7 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
@@ -1926,7 +1926,7 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test43(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -1966,7 +1966,7 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test42(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2006,7 +2006,7 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test41(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -2046,7 +2046,7 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test40(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2086,7 +2086,7 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test39(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -2126,7 +2126,7 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test38(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2166,7 +2166,7 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test37(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -2207,7 +2207,7 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
@@ -2240,7 +2240,7 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test35(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32>
@@ -2280,7 +2280,7 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test34(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2320,7 +2320,7 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test33(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -2360,7 +2360,7 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test32(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP12:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
@@ -2399,7 +2399,7 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test31(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2439,7 +2439,7 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test30(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -2479,7 +2479,7 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test29(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2519,7 +2519,7 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test28(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -2559,7 +2559,7 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test27(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2599,7 +2599,7 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test26(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
@@ -2639,7 +2639,7 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
 ; CHECK-LABEL: define void @test25(
 ; CHECK-SAME: ptr [[P:%.*]], <1 x i64> [[A:%.*]]) #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
@@ -2702,9 +2702,9 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0
 ; CHECK-LABEL: define void @test23(
 ; CHECK-SAME: <1 x i64> [[D:%.*]], <1 x i64> [[N:%.*]], ptr [[P:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[N]] to <8 x i8>
@@ -2744,7 +2744,7 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test22(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP8]] to <4 x i16>
@@ -2850,7 +2850,7 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test20(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP5]] to <2 x i32>
@@ -2975,7 +2975,7 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[A]], i32 0
@@ -3112,7 +3112,7 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test12(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
@@ -3152,7 +3152,7 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test11(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
@@ -3192,7 +3192,7 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test10(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
@@ -3232,7 +3232,7 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test9(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
@@ -3273,7 +3273,7 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test8(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
@@ -3313,7 +3313,7 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test7(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP17:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
@@ -3371,7 +3371,7 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test6(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
@@ -3417,7 +3417,7 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test5(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
@@ -3463,7 +3463,7 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test4(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
@@ -3509,7 +3509,7 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test3(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16>
@@ -3555,7 +3555,7 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test2(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32>
@@ -3603,7 +3603,7 @@ define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @test89(
 ; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
@@ -3647,7 +3647,7 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
 ; CHECK-SAME: <1 x i64> [[A_COERCE:%.*]], i32 [[D:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP7]], 0
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse-intrinsics-x86.ll
index 9d7763a6ef589..46e814806f383 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse-intrinsics-x86.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_cmp_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer
@@ -25,7 +25,7 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
 define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_cmp_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -45,7 +45,7 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
 define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comieq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -64,7 +64,7 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comige_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -83,7 +83,7 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comigt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -102,7 +102,7 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comile_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -121,7 +121,7 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comilt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -140,7 +140,7 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comineq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -162,9 +162,9 @@ define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) #0 {
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT: unreachable
 ; CHECK: 4:
 ; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.sse.cvtss2si(<4 x float> [[A0:%.*]])
@@ -183,9 +183,9 @@ define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) #0 {
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
 ; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
 ; CHECK: 4:
 ; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.sse.cvttss2si(<4 x float> [[A0:%.*]])
@@ -209,9 +209,9 @@ define void @test_x86_sse_ldmxcsr(ptr %a0) #0 {
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_LDMXCSR]], 0
 ; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
 ; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
 ; CHECK: 6:
 ; CHECK-NEXT: call void @llvm.x86.sse.ldmxcsr(ptr [[A0]])
@@ -227,7 +227,7 @@ declare void @llvm.x86.sse.ldmxcsr(ptr) nounwind
 define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_max_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]])
@@ -243,7 +243,7 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
 define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_max_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32>
@@ -260,7 +260,7 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
 define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_min_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]])
@@ -276,7 +276,7 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
 define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_min_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32>
@@ -296,9 +296,9 @@ define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) #0 {
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
 ; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
 ; CHECK: 4:
 ; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> [[A0:%.*]])
@@ -377,9 +377,9 @@ define void @test_x86_sse_stmxcsr(ptr %a0) #0 {
 ; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
 ; CHECK-NEXT: store i32 0, ptr [[TMP4]], align 4
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
 ; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]]
 ; CHECK-NEXT: unreachable
 ; CHECK: 6:
 ; CHECK-NEXT: call void @llvm.x86.sse.stmxcsr(ptr [[A0]])
@@ -394,7 +394,7 @@ declare void @llvm.x86.sse.stmxcsr(ptr) nounwind
 define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomieq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -413,7 +413,7 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomige_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -432,7 +432,7 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomigt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -451,7 +451,7 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomile_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -470,7 +470,7 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomilt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
@@ -489,7 +489,7 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomineq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
index 7048050180792..fc7b01b034f33 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll
@@ -15,7 +15,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_cmp_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer
@@ -33,7 +33,7 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
 define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_cmp_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -53,7 +53,7 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
 define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comieq_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
@@ -72,7 +72,7 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
 define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comige_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr
getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -91,7 +91,7 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_comigt_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -110,7 +110,7 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_comile_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -129,7 +129,7 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_comilt_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -148,7 +148,7 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_comineq_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -340,7 +340,7 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 @@ -363,7 +363,7 @@ declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, ptr %p1) #0 { ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -397,7 +397,7 @@ define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, ptr %p1) #0 { define <4 x float> @test_x86_sse2_cvtsd2ss_load_optsize(<4 x float> %a0, ptr %p1) optsize #0 { ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss_load_optsize( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -542,7 +542,7 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_max_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]]) @@ -558,7 +558,7 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_max_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP3]], <2 x i32> @@ -575,7 +575,7 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_min_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]]) @@ -591,7 +591,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_min_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP3]], <2 x i32> @@ -629,7 +629,7 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_packssdw_128( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> @@ -662,7 +662,7 @@ define <8 x i16> @test_x86_sse2_packssdw_128_fold() #0 { define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_packsswb_128( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> @@ -695,7 +695,7 @@ define <16 x i8> @test_x86_sse2_packsswb_128_fold() #0 { define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_packuswb_128( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> @@ -728,7 +728,7 @@ define <16 x i8> @test_x86_sse2_packuswb_128_fold() #0 { define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_pavg_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> [[A0:%.*]], <16 x i8> [[A1:%.*]]) @@ -744,7 +744,7 @@ declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_pavg_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) @@ -760,7 +760,7 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_pmadd_wd( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer @@ -809,7 +809,7 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_pmulh_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) @@ -825,7 +825,7 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_pmulhu_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) @@ -841,7 +841,7 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psad_bw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = 
load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> @@ -861,7 +861,7 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psll_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -883,7 +883,7 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psll_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -905,7 +905,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psll_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -975,7 +975,7 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psra_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -997,7 +997,7 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psra_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -1051,7 +1051,7 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psrl_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -1073,7 +1073,7 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psrl_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -1095,7 +1095,7 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_psrl_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 @@ -1116,7 +1116,7 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, ptr %p) #0 { ; CHECK-LABEL: @test_x86_sse2_psrl_w_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -1198,7 +1198,7 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_ucomieq_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -1217,7 +1217,7 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_ucomige_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -1236,7 +1236,7 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_ucomigt_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -1255,7 +1255,7 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_ucomile_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -1274,7 +1274,7 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_ucomilt_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 @@ -1293,7 +1293,7 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse2_ucomineq_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll index 1fcab72d571ea..618dde9b3dac6 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse41-intrinsics-x86.ll @@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu" define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_sse41_blendvpd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[A2:%.*]] to <2 x i64> @@ -34,8 +34,8 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_sse41_blendvps( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[A2:%.*]] to <4 x i32> @@ -63,7 +63,7 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_dppd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> , <2 x i64> [[TMP3]], <2 x i64> zeroinitializer @@ -84,7 +84,7 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_dpps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> , <4 x i32> [[TMP3]], <4 x i32> zeroinitializer @@ -105,7 +105,7 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_insertps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -131,7 +131,7 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_mpsadbw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 @@ -155,7 +155,7 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(ptr %ptr, <16 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_mpsadbw_load_op0( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] @@ -190,7 +190,7 @@ define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(ptr %ptr, <16 x i8> %a1) #0 { define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_packusdw( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> @@ -222,8 +222,8 @@ define <8 x i16> @test_x86_sse41_packusdw_fold() #0 { define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) #0 { ; CHECK-LABEL: @test_x86_sse41_pblendvb( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to 
ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = ashr <16 x i8> [[A2:%.*]], splat (i8 7) @@ -262,7 +262,7 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_ptestc( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer @@ -281,7 +281,7 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_ptestnzc( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer @@ -300,7 +300,7 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_ptestz( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[TMP3]], zeroinitializer @@ -347,7 +347,7 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_round_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 7) @@ -362,7 +362,7 @@ declare <2 x double> 
@llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, ptr %a1) #0 {
; CHECK-LABEL: @test_x86_sse41_round_sd_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
@@ -389,7 +389,7 @@ define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, ptr %a1) #0
define <4 x float> @test_x86_sse41_round_ss_load(<4 x float> %a0, ptr %a1) #0 {
; CHECK-LABEL: @test_x86_sse41_round_ss_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll
index 9a7f4b985293c..bd9661295a210 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll
@@ -26,8 +26,8 @@ entry:
ret i64 %ret
}
-; If the size of __msan_va_arg_tls changes the second argument of `add` must also be changed.
+; If the size of __msan_va_arg_tls changes, the second argument of `getelementptr` must also be changed.
; CHECK-LABEL: @many_args
-; CHECK: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792)
-; CHECK-NOT: i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 800)
+; CHECK: getelementptr (i8, ptr @__msan_va_arg_tls, i64 792)
+; CHECK-NOT: getelementptr (i8, ptr @__msan_va_arg_tls, i64 800)
declare i64 @sum(i64 %n, ...)
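The CHECK-line churn in these MemorySanitizer tests reduces to a single spelling change: the shadow-TLS address is now written as a constant `getelementptr` over `i8` instead of a `ptrtoint`/`add`/`inttoptr` round trip, so every hunk rewrites FileCheck patterns and nothing about run-time behavior. A minimal LLVM IR sketch of the equivalence; the @tls global (and its 800-byte size) and the @addr_old/@addr_new functions are illustrative stand-ins for @__msan_param_tls, not part of the patch:

; hypothetical stand-in for @__msan_param_tls (800 bytes is an assumption)
@tls = external thread_local global [800 x i8]

define ptr @addr_old() {
  ; pre-patch spelling: round-trip through integers to add a byte offset
  %base = ptrtoint ptr @tls to i64
  %sum = add i64 %base, 16
  %p = inttoptr i64 %sum to ptr
  ret ptr %p
}

define ptr @addr_new() {
  ; post-patch spelling: the same 16-byte offset as a byte-wise GEP
  %p = getelementptr i8, ptr @tls, i64 16
  ret ptr %p
}

Both functions compute the same address, 16 bytes past the TLS base; only the notation differs, which is why the hunks in these files touch CHECK patterns and nothing else.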
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll
index b61cb6aebb3ea..bec2ba9ea62f9 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll
@@ -16,15 +16,15 @@ entry:
ret i32 %call
}
-; CHECK: store i32 0, {{.*}} @__msan_param_tls {{.*}} i64 8
-; CHECK: store i32 0, {{.*}} @__msan_param_tls {{.*}} i64 16
-; CHECK: store i32 0, {{.*}} @__msan_param_tls {{.*}} i64 24
-; CHECK: store i32 0, {{.*}} @__msan_va_arg_tls {{.*}} i64 8
-; CHECK-ORIGIN: store i32 0, {{.*}} @__msan_va_arg_origin_tls {{.*}} i64 8
-; CHECK: store i32 0, {{.*}} @__msan_va_arg_tls {{.*}} i64 16
-; CHECK-ORIGIN: store i32 0, {{.*}} @__msan_va_arg_origin_tls {{.*}} i64 16
-; CHECK: store i32 0, {{.*}} @__msan_va_arg_tls {{.*}} i64 24
-; CHECK-ORIGIN: store i32 0, {{.*}} @__msan_va_arg_origin_tls {{.*}} i64 24
+; CHECK: store i32 0, {{.*}} @__msan_param_tls, i64 8
+; CHECK: store i32 0, {{.*}} @__msan_param_tls, i64 16
+; CHECK: store i32 0, {{.*}} @__msan_param_tls, i64 24
+; CHECK: store i32 0, {{.*}} @__msan_va_arg_tls, i64 8
+; CHECK-ORIGIN: store i32 0, {{.*}} @__msan_va_arg_origin_tls, i64 8
+; CHECK: store i32 0, {{.*}} @__msan_va_arg_tls, i64 16
+; CHECK-ORIGIN: store i32 0, {{.*}} @__msan_va_arg_origin_tls, i64 16
+; CHECK: store i32 0, {{.*}} @__msan_va_arg_tls, i64 24
+; CHECK-ORIGIN: store i32 0, {{.*}} @__msan_va_arg_origin_tls, i64 24
define dso_local i32 @sum(i32 %n, ...) local_unnamed_addr #0 {
entry:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll
index 4bc14daaca427..c549c165ee966 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_shadow.ll
@@ -39,9 +39,9 @@ define linkonce_odr dso_local void @_Z4testIcEvT_(i8 noundef signext %arg) sanit
; CHECK-NEXT: [[_MSPROP:%.*]] = sext i8 [[_MSLD]] to i32
; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP7]] to i32
; CHECK-NEXT: store i8 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 [[_MSPROP]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i32 [[_MSPROP]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 [[_MSPROP]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i32 [[_MSPROP]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (i8, i32, ...)
@_Z5test2IcEvT_iz(i8 noundef signext [[TMP7]], i32 noundef 1, i32 noundef [[CONV]]) ; CHECK-NEXT: ret void @@ -80,9 +80,9 @@ define linkonce_odr dso_local void @_Z4testIiEvT_(i32 noundef %arg) sanitize_mem ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP10]], align 4 ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i32 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i32 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i32 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i32 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (i32, i32, ...) @_Z5test2IiEvT_iz(i32 noundef [[TMP7]], i32 noundef 1, i32 noundef [[TMP7]]) ; CHECK-NEXT: ret void @@ -122,9 +122,9 @@ define linkonce_odr dso_local void @_Z4testIfEvT_(float noundef %arg) sanitize_m ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP7]] to double ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[TMP11]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[TMP11]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[TMP11]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[TMP11]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (float, i32, ...) @_Z5test2IfEvT_iz(float noundef [[TMP7]], i32 noundef 1, double noundef [[CONV]]) ; CHECK-NEXT: ret void @@ -163,9 +163,9 @@ define linkonce_odr dso_local void @_Z4testIdEvT_(double noundef %arg) sanitize_ ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP10]], align 8 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (double, i32, ...) 
@_Z5test2IdEvT_iz(double noundef [[TMP7]], i32 noundef 1, double noundef [[TMP7]]) ; CHECK-NEXT: ret void @@ -203,9 +203,9 @@ define linkonce_odr dso_local void @_Z4testIeEvT_(x86_fp80 noundef %arg) sanitiz ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i80, ptr [[TMP10]], align 16 ; CHECK-NEXT: store i80 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i80 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i80 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i80 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i80 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), align 8 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (x86_fp80, i32, ...) @_Z5test2IeEvT_iz(x86_fp80 noundef [[TMP7]], i32 noundef 1, x86_fp80 noundef [[TMP7]]) ; CHECK-NEXT: ret void @@ -243,9 +243,9 @@ define linkonce_odr dso_local void @_Z4testI6IntIntEvT_(i64 %arg.coerce) sanitiz ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (i64, i32, ...) 
@_Z5test2I6IntIntEvT_iz(i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i32 noundef 1, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]])
; CHECK-NEXT: ret void
@@ -264,7 +264,7 @@ define linkonce_odr dso_local void @_Z4testI10Int64Int64EvT_(i64 %arg.coerce0, i
; CHECK-SAME: i64 [[ARG_COERCE0:%.*]], i64 [[ARG_COERCE1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[ARG:%.*]] = alloca [[STRUCT_INT64INT64:%.*]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64
@@ -295,12 +295,12 @@ define linkonce_odr dso_local void @_Z4testI10Int64Int64EvT_(i64 %arg.coerce0, i
; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP17]], align 8
; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8
; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (i64, i64, i32, ...) @_Z5test2I10Int64Int64EvT_iz(i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i32 noundef 1, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]])
; CHECK-NEXT: ret void
@@ -322,7 +322,7 @@ define linkonce_odr dso_local void @_Z4testI12DoubleDoubleEvT_(double %arg.coerc
; CHECK-SAME: double [[ARG_COERCE0:%.*]], double [[ARG_COERCE1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[ARG:%.*]] = alloca [[STRUCT_DOUBLEDOUBLE:%.*]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64
@@ -353,12 +353,12 @@ define linkonce_odr dso_local void @_Z4testI12DoubleDoubleEvT_(double %arg.coerc
; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP17]], align 8
; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8
; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (double, double, i32, ...) @_Z5test2I12DoubleDoubleEvT_iz(double [[AGG_TMP_SROA_0_0_COPYLOAD]], double [[AGG_TMP_SROA_2_0_COPYLOAD]], i32 noundef 1, double [[AGG_TMP_SROA_0_0_COPYLOAD]], double [[AGG_TMP_SROA_2_0_COPYLOAD]])
; CHECK-NEXT: ret void
@@ -390,15 +390,15 @@ define linkonce_odr dso_local void @_Z4testI7Double4EvT_(ptr noundef byval(%stru
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 32, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP8]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP8]], i64 32, i1 false)
; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080
; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), ptr align 8 [[TMP11]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), ptr align 8 [[TMP11]], i64 32, i1 false)
; CHECK-NEXT: store i64 32, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (ptr, i32, ...) @_Z5test2I7Double4EvT_iz(ptr noundef nonnull byval([[STRUCT_DOUBLE4]]) align 8 [[ARG]], i32 noundef 1, ptr noundef nonnull byval([[STRUCT_DOUBLE4]]) align 8 [[ARG]])
; CHECK-NEXT: ret void
@@ -416,7 +416,7 @@ define linkonce_odr dso_local void @_Z4testI11DoubleFloatEvT_(double %arg.coerce
; CHECK-SAME: double [[ARG_COERCE0:%.*]], float [[ARG_COERCE1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[ARG:%.*]] = alloca [[STRUCT_DOUBLEFLOAT:%.*]], align 8
; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64
@@ -447,12 +447,12 @@ define linkonce_odr dso_local void @_Z4testI11DoubleFloatEvT_(double %arg.coerce
; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
; CHECK-NEXT: [[_MSLD1:%.*]] = load i32, ptr [[TMP17]], align 8
; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i32 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: store i32 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: store i32 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i32 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8
+; CHECK-NEXT: store i32 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8
; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (double, float, i32, ...) @_Z5test2I11DoubleFloatEvT_iz(double [[AGG_TMP_SROA_0_0_COPYLOAD]], float [[AGG_TMP_SROA_2_0_COPYLOAD]], i32 noundef 1, double [[AGG_TMP_SROA_0_0_COPYLOAD]], float [[AGG_TMP_SROA_2_0_COPYLOAD]])
; CHECK-NEXT: ret void
@@ -484,15 +484,15 @@ define linkonce_odr dso_local void @_Z4testI11LongDouble2EvT_(ptr noundef byval(
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 32, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP8]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP8]], i64 32, i1 false)
; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080
; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), ptr align 8 [[TMP11]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), ptr align 8 [[TMP11]], i64 32, i1 false)
; CHECK-NEXT: store i64 32, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (ptr, i32, ...) @_Z5test2I11LongDouble2EvT_iz(ptr noundef nonnull byval([[STRUCT_LONGDOUBLE2]]) align 16 [[ARG]], i32 noundef 1, ptr noundef nonnull byval([[STRUCT_LONGDOUBLE2]]) align 16 [[ARG]])
; CHECK-NEXT: ret void
@@ -518,15 +518,15 @@ define linkonce_odr dso_local void @_Z4testI11LongDouble4EvT_(ptr noundef byval(
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 64, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), ptr align 8 [[TMP8]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 72), ptr align 8 [[TMP8]], i64 64, i1 false)
; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080
; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), ptr align 8 [[TMP11]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), ptr align 8 [[TMP11]], i64 64, i1 false)
; CHECK-NEXT: store i64 64, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (ptr, i32, ...) @_Z5test2I11LongDouble4EvT_iz(ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], i32 noundef 1, ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]])
; CHECK-NEXT: ret void
@@ -561,17 +561,13 @@ define linkonce_odr dso_local void @_Z5test2IcEvT_iz(i8 noundef signext %t, i32
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -624,17 +620,13 @@ define linkonce_odr dso_local void @_Z5test2IiEvT_iz(i32 noundef %t, i32 noundef
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -679,17 +671,13 @@ define linkonce_odr dso_local void @_Z5test2IfEvT_iz(float noundef %t, i32 nound
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -734,17 +722,13 @@ define linkonce_odr dso_local void @_Z5test2IdEvT_iz(double noundef %t, i32 noun
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -789,17 +773,13 @@ define linkonce_odr dso_local void @_Z5test2IeEvT_iz(x86_fp80 noundef %t, i32 no
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -844,17 +824,13 @@ define linkonce_odr dso_local void @_Z5test2I6IntIntEvT_iz(i64 %t.coerce, i32 no
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -899,17 +875,13 @@ define linkonce_odr dso_local void @_Z5test2I10Int64Int64EvT_iz(i64 %t.coerce0,
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -954,17 +926,13 @@ define linkonce_odr dso_local void @_Z5test2I12DoubleDoubleEvT_iz(double %t.coer
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -1009,17 +977,13 @@ define linkonce_odr dso_local void @_Z5test2I7Double4EvT_iz(ptr noundef byval(%s
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -1064,17 +1028,13 @@ define linkonce_odr dso_local void @_Z5test2I11DoubleFloatEvT_iz(double %t.coerc
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -1119,17 +1079,13 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble2EvT_iz(ptr noundef byv
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -1174,17 +1130,13 @@ define linkonce_odr dso_local void @_Z5test2I11LongDouble4EvT_iz(ptr noundef byv
; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 24, i1 false)
; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]])
-; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080
; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP16]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[ARGS]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 8
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8
; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
@@ -1222,88 +1174,88 @@ define linkonce_odr dso_local void @_Z4test3I11LongDouble4EvT_(ptr noundef byval
; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 64, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), ptr align 8 [[TMP8]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 72), ptr align 8 [[TMP8]], i64 64, i1 false)
; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080
; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), ptr align 8 [[TMP11]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 136), ptr align 8 [[TMP11]], i64 64, i1 false)
; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), ptr align 8 [[TMP14]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 200), ptr align 8 [[TMP14]], i64 64, i1 false)
; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), ptr align 8 [[TMP17]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 264), ptr align 8 [[TMP17]], i64 64, i1 false)
; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 87960930222080
; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), ptr align 8 [[TMP20]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 328), ptr align 8 [[TMP20]], i64 64, i1 false)
; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), ptr align 8 [[TMP23]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 392), ptr align 8 [[TMP23]], i64 64, i1 false)
; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], 87960930222080
; CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), ptr align 8 [[TMP26]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 456), ptr align 8 [[TMP26]], i64 64, i1 false)
; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080
; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), ptr align 8 [[TMP29]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 520), ptr align 8 [[TMP29]], i64 64, i1 false)
; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), ptr align 8 [[TMP32]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 584), ptr align 8 [[TMP32]], i64 64, i1 false)
; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP34:%.*]] = xor i64 [[TMP33]], 87960930222080
; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), ptr align 8 [[TMP35]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 648), ptr align 8 [[TMP35]], i64 64, i1 false)
; CHECK-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP37:%.*]] = xor i64 [[TMP36]], 87960930222080
; CHECK-NEXT: [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), ptr align 8 [[TMP38]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 712), ptr align 8 [[TMP38]], i64 64, i1 false)
; CHECK-NEXT: [[TMP39:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP40:%.*]] = xor i64 [[TMP39]], 87960930222080
; CHECK-NEXT: [[TMP41:%.*]] = inttoptr i64 [[TMP40]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), ptr align 8 [[TMP41]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), ptr align 8 [[TMP41]], i64 64, i1 false)
; CHECK-NEXT: [[TMP42:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP43:%.*]] = xor i64 [[TMP42]], 87960930222080
; CHECK-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 240) to ptr), ptr align 8 [[TMP44]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 240), ptr align 8 [[TMP44]], i64 64, i1 false)
; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP46:%.*]] = xor i64 [[TMP45]], 87960930222080
; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 304) to ptr), ptr align 8 [[TMP47]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 304), ptr align 8 [[TMP47]], i64 64, i1 false)
; CHECK-NEXT: [[TMP48:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP49:%.*]] = xor i64 [[TMP48]], 87960930222080
; CHECK-NEXT: [[TMP50:%.*]] = inttoptr i64 [[TMP49]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 368) to ptr), ptr align 8 [[TMP50]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 368), ptr align 8 [[TMP50]], i64 64, i1 false)
; CHECK-NEXT: [[TMP51:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP52:%.*]] = xor i64 [[TMP51]], 87960930222080
; CHECK-NEXT: [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 432) to ptr), ptr align 8 [[TMP53]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 432), ptr align 8 [[TMP53]], i64 64, i1 false)
; CHECK-NEXT: [[TMP54:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP55:%.*]] = xor i64 [[TMP54]], 87960930222080
; CHECK-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 496) to ptr), ptr align 8 [[TMP56]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 496), ptr align 8 [[TMP56]], i64 64, i1 false)
; CHECK-NEXT: [[TMP57:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP58:%.*]] = xor i64 [[TMP57]], 87960930222080
; CHECK-NEXT: [[TMP59:%.*]] = inttoptr i64 [[TMP58]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 560) to ptr), ptr align 8 [[TMP59]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 560), ptr align 8 [[TMP59]], i64 64, i1 false)
; CHECK-NEXT: [[TMP60:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP61:%.*]] = xor i64 [[TMP60]], 87960930222080
; CHECK-NEXT: [[TMP62:%.*]] = inttoptr i64 [[TMP61]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 624) to ptr), ptr align 8 [[TMP62]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 624), ptr align 8 [[TMP62]], i64 64, i1 false)
; CHECK-NEXT: [[TMP63:%.*]] = ptrtoint ptr [[ARG]] to i64
; CHECK-NEXT: [[TMP64:%.*]] = xor i64 [[TMP63]], 87960930222080
; CHECK-NEXT: [[TMP65:%.*]] = inttoptr i64 [[TMP64]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 688) to ptr), ptr align 8 [[TMP65]], i64 64, i1 false)
-; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 752) to ptr), i8 0, i32 48, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 688), ptr align 8 [[TMP65]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 752), i8 0, i32 48, i1 false)
; CHECK-NEXT: store i64 1280, ptr @__msan_va_arg_overflow_size_tls, align 8
; CHECK-NEXT: call void (ptr, i32, ...) @_Z5test2I11LongDouble4EvT_iz(ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], i32 noundef 20, ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]])
; CHECK-NEXT: ret void
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
index 429829ef39ab9..8a9cf6081d7dd 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
@@ -14,7 +14,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64(<2 x i64> %x0, <2 x i64> %x1) #0 {
; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP1]], <2 x i64> , <2 x i64> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> , <2 x i64> [[X1]])
@@ -42,9 +42,9 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) #0 {
define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 {
; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer
@@ -74,9 +74,9 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x
define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 {
; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(
; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer
@@ -107,7 +107,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) #0 {
; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP1]], <4 x i64> , <4 x i64> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> , <4 x i64> [[X1]])
@@ -135,9 +135,9 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) #0 {
define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) #0 {
; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(
; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i64> [[M]], splat (i64 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], zeroinitializer
@@ -168,7 +168,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) #0 {
; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64(
; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> , <8 x i64> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> , <8 x i64> [[X1]])
@@ -196,9 +196,9 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) #0 {
define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) #0 {
; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(
; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i64> [[M]], splat (i64 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer
@@ -233,7 +233,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 {
; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP1]], <4 x i32> , <4 x i32> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> , <4 x i32> [[X1]])
@@ -261,9 +261,9 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 {
define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) #0 {
; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(
; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[M]], splat (i32 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[TMP4]], zeroinitializer
@@ -294,7 +294,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) #0 {
; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> , <8 x i32> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> , <8 x i32> [[X1]])
@@ -322,9 +322,9 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) #0 {
define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) #0 {
; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(
; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i32> [[M]], splat (i32 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[TMP4]], zeroinitializer
@@ -355,7 +355,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) #0 {
; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32(
; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> , <16 x i32> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> , <16 x i32> [[X1]])
@@ -383,9 +383,9 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) #0 {
define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) #0 {
; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(
; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i32> [[M]], splat (i32 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP4]], zeroinitializer
@@ -420,7 +420,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16(<8 x i16> %x0, <8 x i16> %x1) #0 {
; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16(
; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP1]], <8 x i16> , <8 x i16> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> , <8 x i16> [[X1]])
@@ -448,9 +448,9 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) #0 {
define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) #0 {
; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(
; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <8 x i16> [[M]], splat (i16 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i16> [[TMP4]], zeroinitializer
@@ -481,7 +481,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) #0 {
; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16(
; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP1]], <16 x i16> , <16 x i16> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> , <16 x i16> [[X1]])
@@ -509,9 +509,9 @@ define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) #0 {
define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %m) #0 {
; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(
; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]], <16 x i16> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i16> [[M]], splat (i16 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i16> [[TMP4]], zeroinitializer
@@ -542,7 +542,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) #0 {
; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16(
; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> , <32 x i16> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> , <32 x i16> [[X1]])
@@ -570,9 +570,9 @@ define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) #0 {
define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %m) #0 {
; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(
; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i16> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <32 x i16> [[M]], splat (i16 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i16> [[TMP4]], zeroinitializer
@@ -607,7 +607,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8(<16 x i8> %x0, <16 x i8> %x1) #0 {
; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8(
; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP1]], <16 x i8> , <16 x i8> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> , <16 x i8> [[X1]])
@@ -635,9 +635,9 @@ define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) #0 {
define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %m) #0 {
; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(
; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <16 x i8> [[M]], splat (i8 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i8> [[TMP4]], zeroinitializer
@@ -668,7 +668,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) #0 {
; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8(
; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP1]], <32 x i8> , <32 x i8> [[TMP2]])
; CHECK-NEXT: [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> , <32 x i8> [[X1]])
@@ -696,9 +696,9 @@ define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) #0 {
define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %m) #0 {
; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(
; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[M:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
; CHECK-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
; CHECK-NEXT: call void @llvm.donothing()
; CHECK-NEXT: [[TMP1:%.*]] = xor <32 x i8> [[M]], splat (i8 -1)
; CHECK-NEXT: [[TMP5:%.*]] = and <32 x i8> [[TMP4]], zeroinitializer
@@ -729,7 +729,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) #0 {
; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8(
; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to
i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP1]], <64 x i8> , <64 x i8> [[TMP2]]) ; CHECK-NEXT: [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> , <64 x i8> [[X1]]) @@ -757,9 +757,9 @@ define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) #0 { define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %m) #0 { ; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits( ; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[M:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = xor <64 x i8> [[M]], splat (i8 -1) ; CHECK-NEXT: [[TMP5:%.*]] = and <64 x i8> [[TMP4]], zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll index ddebe3ee20038..399c0fec78ab9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll @@ -10,7 +10,7 @@ define [2 x i32] @InsertValue(i32 %x, i32 %y) sanitize_memory { ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[TMP0]], 0 ; CHECK-NEXT: [[A:%.*]] = insertvalue [2 x i32] undef, i32 [[X]], 0 @@ -24,8 +24,8 @@ define [2 x i32] @InsertValue(i32 %x, i32 %y) sanitize_memory { ; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]] ; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CHECK-ORIGIN-NEXT: call void @llvm.donothing() ; CHECK-ORIGIN-NEXT: [[TMP4:%.*]] = insertvalue [2 x i32] [i32 -1, i32 -1], i32 [[TMP0]], 0 ; CHECK-ORIGIN-NEXT: [[TMP5:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -50,7 +50,7 @@ define [2 x double] 
@InsertValueDouble(double %x, double %y) sanitize_memory { ; CHECK-SAME: double [[X:%.*]], double [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[TMP0]], 0 ; CHECK-NEXT: [[A:%.*]] = insertvalue [2 x double] undef, double [[X]], 0 @@ -64,8 +64,8 @@ define [2 x double] @InsertValueDouble(double %x, double %y) sanitize_memory { ; CHECK-ORIGIN-NEXT: [[ENTRY:.*:]] ; CHECK-ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CHECK-ORIGIN-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CHECK-ORIGIN-NEXT: call void @llvm.donothing() ; CHECK-ORIGIN-NEXT: [[TMP4:%.*]] = insertvalue [2 x i64] [i64 -1, i64 -1], i64 [[TMP0]], 0 ; CHECK-ORIGIN-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP0]], 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll index f0f67fc8f1210..46bec2956c73c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll @@ -19,7 +19,7 @@ define i32 @Test_bzhi_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-LABEL: define i32 @Test_bzhi_32( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -40,7 +40,7 @@ define i64 @Test_bzhi_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-LABEL: define i64 @Test_bzhi_64( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 @@ -62,7 +62,7 @@ define i32 @Test_bextr_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-LABEL: define i32 @Test_bextr_32( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -83,7 +83,7 @@ define i64 @Test_bextr_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-LABEL: define i64 @Test_bextr_64( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 @@ -105,7 +105,7 @@ define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-LABEL: define i32 @Test_pdep_32( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -126,7 +126,7 @@ define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-LABEL: define i64 @Test_pdep_64( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 @@ -147,7 +147,7 @@ define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory { ; CHECK-LABEL: define i32 @Test_pext_32( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -168,7 +168,7 @@ define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory { ; CHECK-LABEL: define i64 @Test_pext_64( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0 diff --git a/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll b/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll index e06576e2fead6..0acdf71361000 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll @@ -7,7 +7,7 @@ 
target triple = "x86_64-unknown-linux-gnu" %struct.S = type { i64, i64, i64, [8 x i8] } -; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 {{.*}} add {{.*}} ptrtoint {{.*}} @__msan_param_tls {{.*}} i64 8) {{.*}}, ptr align 8 {{.*}}, i64 32, i1 false) +; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), ptr align 8 {{.*}}, i64 32, i1 false) define void @Caller() sanitize_memory { entry: diff --git a/llvm/test/Instrumentation/MemorySanitizer/byval.ll b/llvm/test/Instrumentation/MemorySanitizer/byval.ll index 69970896a0527..9f6a7cb189547 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/byval.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/byval.ll @@ -16,8 +16,8 @@ define i128 @ByValArgument(i32, ptr byval(i128) %p) sanitize_memory { ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), i64 16, i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[X:%.*]] = load i128, ptr [[P]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 @@ -66,8 +66,8 @@ define void @ByValForward(i32, ptr byval(i128) %p) sanitize_memory { ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), i64 16, i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @Fn(ptr [[P]]) @@ -107,8 +107,8 @@ define void @ByValForwardByVal(i32, ptr byval(i128) %p) sanitize_memory { ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 16, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 16, i1 false) +; CHECK-NEXT: call void 
@llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP5]], ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), i64 16, i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 @@ -165,8 +165,8 @@ define i8 @ByValArgument8(i32, ptr byval(i8) %p) sanitize_memory { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), i64 4, i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[X:%.*]] = load i8, ptr [[P]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64 @@ -218,8 +218,8 @@ define void @ByValForward8(i32, ptr byval(i8) %p) sanitize_memory { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), i64 4, i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @Fn8(ptr [[P]]) @@ -261,8 +261,8 @@ define void @ByValForwardByVal8(i32, ptr byval(i8) %p) sanitize_memory { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416 ; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 1, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP3]], ptr align 1 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 1, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP6]], ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), i64 4, i1 false) ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 diff --git a/llvm/test/Instrumentation/MemorySanitizer/expand-experimental-reductions.ll b/llvm/test/Instrumentation/MemorySanitizer/expand-experimental-reductions.ll index 0696ac92e59b8..582d75330f6d6 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/expand-experimental-reductions.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/expand-experimental-reductions.ll @@ -134,7 +134,7 @@ define float @fadd_f32_accum(float %accum, <4 x float> %vec) #0 { ; CHECK-SAME: float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP2]] @@ -152,7 +152,7 @@ define float @fadd_f32_strict(float %param, <4 x float> %vec) #0 { ; CHECK-SAME: float [[PARAM:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP3]], [[TMP1]] @@ -170,7 +170,7 @@ define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) #0 { ; CHECK-SAME: float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP2]] @@ -205,7 +205,7 @@ define float @fmul_f32_accum(float %accum, <4 x float> %vec) #0 { ; CHECK-SAME: float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP2]] @@ -223,7 +223,7 @@ define float @fmul_f32_strict(float %param, <4 x float> %vec) #0 { ; CHECK-SAME: float [[PARAM:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP0:%.*]] = 
load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP0]]) ; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP3]], [[TMP1]] @@ -241,7 +241,7 @@ define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) #0 { ; CHECK-SAME: float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP2]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/funnel_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/funnel_shift.ll index 5ea407b3fda7a..a96046b9ed62d 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/funnel_shift.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/funnel_shift.ll @@ -7,17 +7,17 @@ target triple = "x86_64-unknown-linux-gnu" define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) sanitize_memory { ; CHECK-LABEL: @var_funnel_i64( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = 
load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP3]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 @@ -51,17 +51,17 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) sanitize_memory { ; CHECK-LABEL: @var_funnel_i32( ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP3]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i32 @@ -95,17 +95,17 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) sanitize_memory { ; CHECK-LABEL: @var_funnel_i16( ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i16>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i16 [[TMP3]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i16 @@ -139,17 +139,17 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) sanitize_memory { ; CHECK-LABEL: @var_funnel_i8( ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP11:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: [[TMP12:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i8 [[TMP3]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i8 @@ -183,13 +183,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %c64, <2 x i64> %c128, <4 x i64> %c256, <8 x i64> %c512) sanitize_memory { ; CHECK-LABEL: @var_rotate_i64( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 [[TMP2]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i64 @@ -223,13 +223,13 @@ define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %c32, <4 x i32> %c128, <8 x i32> %c256, <16 x i32> %c512) sanitize_memory { ; CHECK-LABEL: @var_rotate_i32( ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i32 @@ -263,13 +263,13 @@ define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %c16, <8 x i16> %c128, <16 x i16> %c256, <32 x i16> %c512) sanitize_memory { ; CHECK-LABEL: @var_rotate_i16( ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = icmp 
ne i16 [[TMP2]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i16 @@ -303,13 +303,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %c8, <16 x i8> %c128, <32 x i8> %c256, <64 x i8> %c512) sanitize_memory { ; CHECK-LABEL: @var_rotate_i8( ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <64 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i8 [[TMP2]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i8 diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll index cbc556f8a8ee2..0d94357d169f0 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll @@ -13,7 +13,7 @@ target triple = "i386-unknown-linux-gnu" define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_addsub_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] @@ -30,7 +30,7 @@ declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nou define <8 x float> @test_x86_avx_addsub_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { 
 ; CHECK-LABEL: @test_x86_avx_addsub_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
@@ -46,8 +46,8 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
 define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_blendv_pd_256(
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -75,8 +75,8 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
 define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_blendv_ps_256(
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -105,7 +105,7 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
 define <4 x double> @test_x86_avx_cmp_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_cmp_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
@@ -124,7 +124,7 @@ declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) no
 define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_cmp_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
@@ -141,7 +141,7 @@ define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) #0
 define <8 x float> @test_x86_avx_cmp_ps_256_pseudo_op(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_cmp_ps_256_pseudo_op(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP99:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
@@ -400,7 +400,7 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
 define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_dp_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
@@ -427,7 +427,7 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwi
 define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_hadd_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32>
@@ -446,7 +446,7 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
 define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_hadd_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
@@ -465,7 +465,7 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
 define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_hsub_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32>
@@ -484,7 +484,7 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
 define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_hsub_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32>
@@ -527,7 +527,7 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly
 define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskload_pd(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -554,7 +554,7 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readonly
 define <4 x double> @test_x86_avx_maskload_pd_256(ptr %a0, <4 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskload_pd_256(
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -581,7 +581,7 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read
 define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskload_ps(
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -608,7 +608,7 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly
 define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskload_ps_256(
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -635,9 +635,9 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado
 define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskstore_pd(
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -665,9 +665,9 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind
 define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256(
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -695,9 +695,9 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi
 define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskstore_ps(
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -725,9 +725,9 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind
 define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256(
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64
@@ -756,7 +756,7 @@ declare void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>) nounwin
 define <4 x double> @test_x86_avx_max_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_max_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
@@ -773,7 +773,7 @@ declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwi
 define <8 x float> @test_x86_avx_max_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_max_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
@@ -790,7 +790,7 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
 define <4 x double> @test_x86_avx_min_pd_256(<4 x double> %a0, <4 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_min_pd_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
@@ -807,7 +807,7 @@ declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwi
 define <8 x float> @test_x86_avx_min_ps_256(<8 x float> %a0, <8 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_avx_min_ps_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+;
CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -868,7 +868,7 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_ptestc_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] @@ -888,7 +888,7 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_ptestnzc_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] @@ -908,7 +908,7 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_ptestz_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] @@ -987,7 +987,7 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[A1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[A1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[A1]] to <2 x i1> @@ -1014,7 +1014,7 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[A1:%.*]] = load <4 x 
i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[A1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[A1]] to <4 x i2> @@ -1056,7 +1056,7 @@ define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) #0 { define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[A1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[A1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[A1]] to <4 x i2> @@ -1079,7 +1079,7 @@ define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) # } define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, ptr %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1119,7 +1119,7 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vpermilvar_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[A1:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[A1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i32> [[A1]] to <8 x i3> @@ -1146,7 +1146,7 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestc_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] @@ -1166,7 +1166,7 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestc_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load 
<4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] @@ -1186,7 +1186,7 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestc_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] @@ -1206,7 +1206,7 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestc_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -1226,7 +1226,7 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn define i32 @test_x86_avx_vtestnzc_pd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestnzc_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] @@ -1246,7 +1246,7 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestnzc_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] @@ -1266,7 +1266,7 @@ declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind r define i32 
@test_x86_avx_vtestnzc_ps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestnzc_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] @@ -1286,7 +1286,7 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestnzc_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -1306,7 +1306,7 @@ declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind rea define i32 @test_x86_avx_vtestz_pd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestz_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] @@ -1326,7 +1326,7 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestz_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]] @@ -1346,7 +1346,7 @@ declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind rea define i32 @test_x86_avx_vtestz_ps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestz_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], 
[[TMP2]] @@ -1366,7 +1366,7 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_avx_vtestz_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -1410,7 +1410,7 @@ declare void @llvm.x86.avx.vzeroupper() nounwind define void @movnt_dq(ptr %p, <2 x i64> %a1) nounwind #0 { ; CHECK-LABEL: @movnt_dq( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1442,7 +1442,7 @@ define void @movnt_ps(ptr %p, <8 x float> %a) nounwind #0 { ; CHECK-LABEL: @movnt_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] @@ -1465,7 +1465,7 @@ declare void @llvm.x86.avx.movnt.ps.256(ptr, <8 x float>) nounwind define void @movnt_pd(ptr %p, <4 x double> %a1) nounwind #0 { ; add operation forces the execution domain. 
; CHECK-LABEL: @movnt_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1494,7 +1494,7 @@ declare void @llvm.x86.avx.movnt.pd.256(ptr, <4 x double>) nounwind define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_pclmulqdq( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll index cd79bcb2233fe..6471e09fc467a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll @@ -7,7 +7,7 @@ target triple = "i386-unknown-linux-gnu" define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_packssdw( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer @@ -42,7 +42,7 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() #0 { define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_packsswb( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer @@ -77,7 +77,7 @@ define <32 x i8> @test_x86_avx2_packsswb_fold() #0 { define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_packuswb( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr 
@__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer @@ -112,7 +112,7 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() #0 { define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pavg_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] @@ -129,7 +129,7 @@ declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pavg_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] @@ -146,7 +146,7 @@ declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readno define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmadd_wd( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer @@ -197,7 +197,7 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmulh_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] @@ -214,7 +214,7 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmulhu_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] @@ -231,7 +231,7 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psad_bw( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] @@ -252,7 +252,7 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psll_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 @@ -275,7 +275,7 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psll_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 @@ -298,7 +298,7 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psll_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 @@ -372,7 +372,7 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psra_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 @@ -395,7 +395,7 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psra_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 @@ -452,7 +452,7 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 @@ -475,7 +475,7 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 @@ -498,7 +498,7 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 @@ -520,7 +520,7 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, ptr %p) #0 { ; CHECK-LABEL: @test_x86_avx2_psrl_w_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -606,7 +606,7 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phadd_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> @@ -625,7 +625,7 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phadd_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -644,7 +644,7 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phadd_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -663,7 +663,7 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phsub_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> @@ -682,7 +682,7 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> 
@test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phsub_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -701,7 +701,7 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_phsub_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -720,7 +720,7 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer @@ -748,7 +748,7 @@ declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind rea define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmadd_ub_sw_load_op0( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -787,7 +787,7 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) # define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pmul_hr_sw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: 
call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] @@ -804,7 +804,7 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pshuf_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[TMP1]], <32 x i8> [[A1:%.*]]) @@ -822,7 +822,7 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psign_b( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] @@ -839,7 +839,7 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psign_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -856,7 +856,7 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psign_w( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] @@ -873,7 +873,7 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_mpsadbw( ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i8> [[TMP1]] to i256 @@ -898,7 +898,7 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(ptr %ptr, <32 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_mpsadbw_load_op0( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -934,7 +934,7 @@ define <16 x i16> @test_x86_avx2_mpsadbw_load_op0(ptr %ptr, <32 x i8> %a1) #0 { define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_packusdw( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer @@ -968,8 +968,8 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() #0 { define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_pblendvb( -; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -995,7 +995,7 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pblendw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <16 x i32> @@ -1012,7 +1012,7 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> 
%a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pblendd_128( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> @@ -1029,7 +1029,7 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_pblendd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> @@ -1046,7 +1046,7 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_permd( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] @@ -1063,7 +1063,7 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_permps( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 @@ -1088,7 +1088,7 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1114,7 +1114,7 @@ declare <2 x i64> 
@llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1140,7 +1140,7 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1166,7 +1166,7 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1192,9 +1192,9 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1221,9 +1221,9 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1250,9 +1250,9 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1279,9 +1279,9 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 @@ -1309,7 +1309,7 @@ declare void @llvm.x86.avx2.maskstore.d.256(ptr, <8 x i32>, <8 x i32>) nounwind define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psllv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer @@ -1350,7 +1350,7 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psllv_d_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer @@ -1391,7 +1391,7 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psllv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer @@ -1424,7 +1424,7 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psllv_q_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer @@ -1458,7 +1458,7 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrlv_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer @@ -1499,7 +1499,7 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrlv_d_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer @@ -1540,7 +1540,7 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrlv_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, 
ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer @@ -1574,7 +1574,7 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrlv_q_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer @@ -1609,7 +1609,7 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrav_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer @@ -1642,7 +1642,7 @@ declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_psrav_d_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i32> [[TMP2]], zeroinitializer @@ -1675,9 +1675,9 @@ declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind read define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, ptr %a1, <4 x i32> %idx, <2 x double> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 @@ -1709,9 +1709,9 @@ declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, ptr, define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, ptr %a1, <4 x i32> %idx, <4 x double> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 @@ -1743,9 +1743,9 @@ declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, ptr, define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, ptr %a1, <2 x i64> %idx, <2 x double> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_pd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 @@ -1777,9 +1777,9 @@ declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, ptr, define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, ptr %a1, <4 x i64> %idx, <4 x double> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_pd_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 @@ -1811,9 +1811,9 @@ declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, ptr, define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, ptr %a1, <4 x i32> %idx, <4 x float> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 @@ -1845,9 +1845,9 @@ declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, ptr, define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, ptr %a1, <8 x i32> %idx, <8 x float> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 @@ -1879,9 +1879,9 @@ declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr, define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, ptr %a1, <2 x i64> %idx, <4 x float> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_ps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x 
i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 @@ -1913,9 +1913,9 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, ptr, define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, ptr %a1, <4 x i64> %idx, <4 x float> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_ps_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 @@ -1947,9 +1947,9 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, ptr, define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, ptr %a1, <4 x i32> %idx, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 @@ -1981,9 +1981,9 @@ declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, ptr, define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, ptr %a1, <4 x i32> %idx, <4 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_q_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; 
CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 @@ -2015,9 +2015,9 @@ declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr, define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, ptr %a1, <2 x i64> %idx, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_q( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 @@ -2049,9 +2049,9 @@ declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, ptr, define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, ptr %a1, <4 x i64> %idx, <4 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_q_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 @@ -2083,9 +2083,9 @@ declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr, define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, ptr %a1, <4 x i32> 
%idx, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 @@ -2117,9 +2117,9 @@ declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr, define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, ptr %a1, <8 x i32> %idx, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_d_d_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 @@ -2151,9 +2151,9 @@ declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr, define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, ptr %a1, <2 x i64> %idx, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_d( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 @@ -2185,9 
+2185,9 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, ptr, define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, ptr %a1, <4 x i64> %idx, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx2_gather_q_d_256( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 @@ -2219,10 +2219,10 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr, define <8 x float> @test_gather_mask(<8 x float> %a0, ptr %a, <8 x i32> %idx, <8 x float> %mask, ptr nocapture %out) #0 { ; CHECK-LABEL: @test_gather_mask( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 @@ -2265,10 +2265,10 @@ define <8 x float> @test_gather_mask(<8 x float> %a0, ptr %a, <8 x i32> %idx, < define <2 x i64> @test_mask_demanded_bits(<2 x i64> %a0, ptr %a1, <2 x i64> %idx, <2 x i1> %mask) #0 { ; CHECK-LABEL: @test_mask_demanded_bits( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
index 8052b5e345265..1b7e3d780a32e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll
@@ -22,7 +22,7 @@ define i64 @test1(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test1(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -69,7 +69,7 @@ define i64 @test88(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test88(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -110,7 +110,7 @@ define i64 @test87(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test87(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -151,7 +151,7 @@ define i64 @test86(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test86(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -192,7 +192,7 @@ define i64 @test85(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test85(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load
<1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -233,7 +233,7 @@ define i64 @test84(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test84( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -274,7 +274,7 @@ define i64 @test83(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test83( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -315,7 +315,7 @@ define i64 @test82(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test82( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -356,7 +356,7 @@ define i64 @test81(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test81( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -397,7 +397,7 @@ define i64 @test80(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test80( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = 
load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -438,7 +438,7 @@ define i64 @test79(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test79( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -479,7 +479,7 @@ define i64 @test78(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test78( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -520,7 +520,7 @@ define i64 @test77(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test77( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -561,7 +561,7 @@ define i64 @test76(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test76( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -610,7 +610,7 @@ define i64 @test75(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test75( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -659,7 +659,7 @@ define i64 @test74(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test74( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: 
[[TMP16:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP17:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1076,7 +1076,7 @@ define i64 @test65(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> @@ -1122,7 +1122,7 @@ define i64 @test64(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> @@ -1168,7 +1168,7 @@ define i64 @test63(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 @@ -1208,7 +1208,7 @@ define i64 @test62(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> @@ -1254,7 +1254,7 @@ define i64 @test61(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 
8 +; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> @@ -1300,7 +1300,7 @@ define i64 @test60(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP7]], i32 0 @@ -1340,7 +1340,7 @@ define i64 @test59(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <2 x i32> @@ -1386,7 +1386,7 @@ define i64 @test58(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP9]] to <4 x i16> @@ -1431,7 +1431,7 @@ define i64 @test56(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test56( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1472,7 +1472,7 @@ define i64 @test55(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test55( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1513,7 +1513,7 @@ define i64 @test54(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test54( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1554,7 +1554,7 @@ define i64 @test53(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test53( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1595,7 +1595,7 @@ define i64 @test52(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test52( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1634,7 +1634,7 @@ define i64 @test51(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test51( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1675,7 +1675,7 @@ define i64 @test50(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test50( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1716,7 +1716,7 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test49( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: 
entry: -; CHECK-NEXT: [[TMP13:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP15:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1775,7 +1775,7 @@ define i64 @test48(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test48( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1816,7 +1816,7 @@ define i64 @test47(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test47( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1857,7 +1857,7 @@ define i64 @test46(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test46( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1898,7 +1898,7 @@ define i64 @test45(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-LABEL: define i64 @test45( ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -1938,7 +1938,7 @@ define i64 @test44(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
@@ -1974,7 +1974,7 @@ define i64 @test43(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test43(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2015,7 +2015,7 @@ define i64 @test42(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test42(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2056,7 +2056,7 @@ define i64 @test41(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test41(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2097,7 +2097,7 @@ define i64 @test40(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test40(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2138,7 +2138,7 @@ define i64 @test39(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test39(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2179,7 +2179,7 @@ define i64 @test38(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test38(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2220,7 +2220,7 @@ define i64 @test37(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test37(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2262,7 +2262,7 @@ define i64 @test36(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0
@@ -2296,7 +2296,7 @@ define i64 @test35(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test35(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2337,7 +2337,7 @@ define i64 @test34(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test34(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2378,7 +2378,7 @@ define i64 @test33(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test33(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2419,7 +2419,7 @@ define i64 @test32(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test32(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP12:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2459,7 +2459,7 @@ define i64 @test31(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test31(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2500,7 +2500,7 @@ define i64 @test30(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test30(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2541,7 +2541,7 @@ define i64 @test29(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test29(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2582,7 +2582,7 @@ define i64 @test28(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test28(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2623,7 +2623,7 @@ define i64 @test27(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test27(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2664,7 +2664,7 @@ define i64 @test26(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test26(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2705,7 +2705,7 @@ define void @test25(ptr %p, <1 x i64> %a) nounwind optsize ssp #0 {
 ; CHECK-LABEL: define void @test25(
 ; CHECK-SAME: ptr [[P:%.*]], <1 x i64> [[A:%.*]]) #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2770,9 +2770,9 @@ define void @test23(<1 x i64> %d, <1 x i64> %n, ptr %p) nounwind optsize ssp #0
 ; CHECK-LABEL: define void @test23(
 ; CHECK-SAME: <1 x i64> [[D:%.*]], <1 x i64> [[N:%.*]], ptr [[P:%.*]]) #[[ATTR3]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
@@ -2813,7 +2813,7 @@ define i64 @test22(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test22(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -2922,7 +2922,7 @@ define i64 @test20(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test20(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3051,7 +3051,7 @@ define i64 @test16(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0
@@ -3192,7 +3192,7 @@ define i64 @test12(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test12(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3233,7 +3233,7 @@ define i64 @test11(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test11(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3274,7 +3274,7 @@ define i64 @test10(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test10(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3315,7 +3315,7 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test9(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3357,7 +3357,7 @@ define i64 @test8(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test8(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3398,7 +3398,7 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test7(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3457,7 +3457,7 @@ define i64 @test6(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test6(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3504,7 +3504,7 @@ define i64 @test5(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test5(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3551,7 +3551,7 @@ define i64 @test4(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test4(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3598,7 +3598,7 @@ define i64 @test3(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test3(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3645,7 +3645,7 @@ define i64 @test2(<1 x i64> %a, <1 x i64> %b) #0 {
 ; CHECK-LABEL: define i64 @test2(
 ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -3694,7 +3694,7 @@ define <4 x float> @test89(<4 x float> %a, <1 x i64> %b) nounwind #0 {
 ; CHECK-LABEL: define <4 x float> @test89(
 ; CHECK-SAME: <4 x float> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
@@ -3740,7 +3740,7 @@ define <1 x i64> @test_mm_insert_pi16(<1 x i64> %a.coerce, i32 %d) nounwind #0 {
 ; CHECK-SAME: <1 x i64> [[A_COERCE:%.*]], i32 [[D:%.*]]) #[[ATTR4]] {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_i386intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_i386intrinsics.ll
index 017bbcf4f3689..e37894192276a 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_i386intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_i386intrinsics.ll
@@ -13,7 +13,7 @@ define void @StoreIntrinsic(ptr %p, <4 x float> %x) nounwind uwtable sanitize_me
 ; CHECK-LABEL: define void @StoreIntrinsic(
 ; CHECK-SAME: ptr [[P:%.*]], <4 x float> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2147483649
@@ -25,8 +25,8 @@ define void @StoreIntrinsic(ptr %p, <4 x float> %x) nounwind uwtable sanitize_me
 ; ORIGINS-LABEL: define void @StoreIntrinsic(
 ; ORIGINS-SAME: ptr [[P:%.*]], <4 x float> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ORIGINS-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
-; ORIGINS-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGINS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; ORIGINS-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGINS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4
 ; ORIGINS-NEXT: call void @llvm.donothing()
 ; ORIGINS-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
 ; ORIGINS-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483649
@@ -107,7 +107,7 @@ define <8 x i16> @Pmulhuw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable saniti
 ; CHECK-LABEL: define <8 x i16> @Pmulhuw128(
 ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
@@ -119,8 +119,8 @@ define <8 x i16> @Pmulhuw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable saniti
 ; ORIGINS-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR0]] {
 ; ORIGINS-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; ORIGINS-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
-; ORIGINS-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; ORIGINS-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; ORIGINS-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; ORIGINS-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4
 ; ORIGINS-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGINS-NEXT: call void @llvm.donothing()
 ; ORIGINS-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP3]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse-intrinsics-i386.ll
index ffad6fb5a2b68..6b7f813336f78 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse-intrinsics-i386.ll
@@ -7,7 +7,7 @@ target triple = "i386-unknown-linux-gnu"
 define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_cmp_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -26,7 +26,7 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
 define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_cmp_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -47,7 +47,7 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
 define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comieq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -67,7 +67,7 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comige_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -87,7 +87,7 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comigt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -107,7 +107,7 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comile_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -127,7 +127,7 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comilt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -147,7 +147,7 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_comineq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -238,7 +238,7 @@ declare void @llvm.x86.sse.ldmxcsr(ptr) nounwind
 define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_max_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -255,7 +255,7 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
 define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_max_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -273,7 +273,7 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
 define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_min_ps(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -290,7 +290,7 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
 define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_min_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -415,7 +415,7 @@ declare void @llvm.x86.sse.stmxcsr(ptr) nounwind
 define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomieq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -435,7 +435,7 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomige_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -455,7 +455,7 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomigt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -475,7 +475,7 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomile_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -495,7 +495,7 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomilt_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
@@ -515,7 +515,7 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse_ucomineq_ss(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
index 3a37eafd78ecb..806eac09c695e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll
@@ -15,7 +15,7 @@ target triple = "i386-unknown-linux-gnu"
 define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_cmp_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -34,7 +34,7 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
 define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_cmp_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -55,7 +55,7 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
 define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comieq_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -75,7 +75,7 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
 define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comige_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -95,7 +95,7 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
 define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comigt_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -115,7 +115,7 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
 define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comile_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -135,7 +135,7 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
 define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comilt_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -155,7 +155,7 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
 define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_comineq_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -356,7 +356,7 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
 define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -380,7 +380,7 @@ declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind
 define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, ptr %p1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -415,7 +415,7 @@ define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, ptr %p1) #0 {
 define <4 x float> @test_x86_sse2_cvtsd2ss_load_optsize(<4 x float> %a0, ptr %p1) optsize #0 {
 ; CHECK-LABEL: @test_x86_sse2_cvtsd2ss_load_optsize(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -566,7 +566,7 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
 define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_max_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -583,7 +583,7 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
 define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_max_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -601,7 +601,7 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
 define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_min_pd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -618,7 +618,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
 define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_min_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -658,7 +658,7 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
 define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_packssdw_128(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer
@@ -693,7 +693,7 @@ define <8 x i16> @test_x86_sse2_packssdw_128_fold() #0 {
 define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_packsswb_128(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
@@ -728,7 +728,7 @@ define <16 x i8> @test_x86_sse2_packsswb_128_fold() #0 {
 define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_packuswb_128(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
@@ -763,7 +763,7 @@ define <16 x i8> @test_x86_sse2_packuswb_128_fold() #0 {
 define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_pavg_b(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
@@ -780,7 +780,7 @@ declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
 define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_pavg_w(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
@@ -797,7 +797,7 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_pmadd_wd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer
@@ -848,7 +848,7 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
 define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_pmulh_w(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
@@ -865,7 +865,7 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_pmulhu_w(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]]
@@ -882,7 +882,7 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
 define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psad_bw(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]]
@@ -903,7 +903,7 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
 define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psll_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -926,7 +926,7 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
 define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psll_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -949,7 +949,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
 define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psll_w(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
@@ -1023,7 +1023,7 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
 define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psra_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -1046,7 +1046,7 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
 define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psra_w(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
@@ -1103,7 +1103,7 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
 define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psrl_d(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
@@ -1126,7 +1126,7 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
 define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psrl_q(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
@@ -1149,7 +1149,7 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
 define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psrl_w(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
@@ -1171,7 +1171,7 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
 define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, ptr %p) #0 {
 ; CHECK-LABEL: @test_x86_sse2_psrl_w_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
@@ -1257,7 +1257,7 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
 define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_ucomieq_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -1277,7 +1277,7 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
 define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_ucomige_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -1297,7 +1297,7 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
 define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_ucomigt_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -1317,7 +1317,7 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
 define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_ucomile_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -1337,7 +1337,7 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
 define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_ucomilt_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
@@ -1357,7 +1357,7 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
 define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) #0 {
 ; CHECK-LABEL: @test_x86_sse2_ucomineq_sd(
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse41-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse41-intrinsics-i386.ll
index e51c53375d2b5..24f22bd56a64f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse41-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse41-intrinsics-i386.ll
@@ -6,8 +6,8 @@ target triple = "i386-unknown-linux-gnu"
 define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
 ; CHECK-LABEL: @test_x86_sse41_blendvpd(
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void
@llvm.donothing() @@ -35,8 +35,8 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_sse41_blendvps( -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP15:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -65,7 +65,7 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_dppd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] @@ -87,7 +87,7 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_dpps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] @@ -109,7 +109,7 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_insertps( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 @@ -136,7 +136,7 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_mpsadbw( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to i128 @@ -161,7 +161,7 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(ptr %ptr, <16 x i8> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_mpsadbw_load_op0( ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 @@ -197,7 +197,7 @@ define <8 x i16> @test_x86_sse41_mpsadbw_load_op0(ptr %ptr, <16 x i8> %a1) #0 { define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_packusdw( ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer @@ -231,8 +231,8 @@ define <8 x i16> @test_x86_sse41_packusdw_fold() #0 { define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) #0 { ; CHECK-LABEL: @test_x86_sse41_pblendvb( -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -273,7 +273,7 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_ptestc( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] @@ -293,7 +293,7 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone define i32 
@test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_ptestnzc( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] @@ -313,7 +313,7 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_ptestz( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]] @@ -363,7 +363,7 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_round_sd( ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> @@ -379,7 +379,7 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, ptr %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_round_sd_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() @@ -407,7 +407,7 @@ define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, ptr %a1) #0 define <4 x float> @test_x86_sse41_round_ss_load(<4 x float> %a0, ptr %a1) #0 { ; CHECK-LABEL: @test_x86_sse41_round_ss_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll 
b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll index 7bc9cf3b8c582..436a3b31221d8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll @@ -13,205 +13,205 @@ define dso_local i64 @many_args() { ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 112) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: 
store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 208) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 360) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 
add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 144), align 8 +; 
CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 328), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 344), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 648), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 656), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 752), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 760), align 8 +; CHECK-NEXT: store 
i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 768), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 776), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 784), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 792), align 8 ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_tls, align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 56) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 88) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 112) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 120) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 152) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 168) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 184) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 200) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_va_arg_tls to i64), i64 208) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 216) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 224) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 232) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 240) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 248) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 264) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 272) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 280) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 288) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 296) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 304) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 312) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 328) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 336) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 344) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 352) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 360) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 368) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 376) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 384) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 392) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 400) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 408) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 416) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 424) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 432) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_va_arg_tls to i64), i64 440) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 448) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 456) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 464) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 472) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 480) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 488) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 496) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 504) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 512) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 520) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 528) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 536) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 544) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 552) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 560) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 568) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 576) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 584) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 592) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 600) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 608) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 616) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 624) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 632) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 640) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 648) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 656) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 664) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 672) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 680) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 688) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 696) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 704) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 712) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 720) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 728) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 736) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 744) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 752) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 760) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 768) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 776) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 784) to ptr), align 8 -; CHECK-NEXT: store i64 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 792) to ptr), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 40), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 56), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 72), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 80), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 88), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 96), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 104), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 112), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 120), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 136), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr 
@__msan_va_arg_tls, i64 144), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 152), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 160), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 168), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 184), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 200), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 208), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 216), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 224), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 232), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 240), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 248), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 256), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 264), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 272), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 280), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 288), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 296), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 304), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 312), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 320), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 328), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 336), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 344), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 352), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 360), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 368), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 376), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 384), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 392), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 400), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 408), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 416), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 424), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 432), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 440), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, 
i64 448), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 456), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 464), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 472), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 480), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 488), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 496), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 504), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 512), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 520), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 528), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 536), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 544), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 552), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 560), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 568), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 576), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 584), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 592), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 600), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 608), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 616), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 624), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 632), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 640), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 648), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 656), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 664), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 672), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 680), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 688), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 696), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 704), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 712), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 720), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 728), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 736), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 744), align 8 +; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 752), align 8 +; 
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 760), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 768), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 776), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 784), align 8
+; CHECK-NEXT: store i64 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 792), align 8
 ; CHECK-NEXT: store i64 1040, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[RET:%.*]] = call i64 (i64, ...) @sum(i64 120, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll
index 27459397b34a3..cc2d94c5f867b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll
@@ -18,12 +18,12 @@ define dso_local i32 @test(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
 ; CHECK-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i32, ...) @sum(i32 3, i32 [[A]], i32 [[B]], i32 [[C]])
@@ -37,12 +37,12 @@ define dso_local i32 @test(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
 ; ORIGIN-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN-NEXT: call void @llvm.donothing()
 ; ORIGIN-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; ORIGIN-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGIN-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; ORIGIN-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; ORIGIN-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGIN-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; ORIGIN-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; ORIGIN-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8
-; ORIGIN-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; ORIGIN-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
+; ORIGIN-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
+; ORIGIN-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
 ; ORIGIN-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; ORIGIN-NEXT: [[CALL:%.*]] = tail call i32 (i32, ...) @sum(i32 3, i32 [[A]], i32 [[B]], i32 [[C]])
@@ -58,12 +58,12 @@ define dso_local i32 @test(i32 %a, i32 %b, i32 %c) local_unnamed_addr {
 ; ORIGIN2-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN2-NEXT: call void @llvm.donothing()
 ; ORIGIN2-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; ORIGIN2-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGIN2-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; ORIGIN2-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; ORIGIN2-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGIN2-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; ORIGIN2-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; ORIGIN2-NEXT: store i32 0, ptr @__msan_va_arg_tls, align 8
-; ORIGIN2-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
-; ORIGIN2-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
+; ORIGIN2-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
+; ORIGIN2-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
 ; ORIGIN2-NEXT: store i64 24, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN2-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
 ; ORIGIN2-NEXT: [[CALL:%.*]] = tail call i32 (i32, ...) @sum(i32 3, i32 [[A]], i32 [[B]], i32 [[C]])
@@ -446,12 +446,12 @@ define dso_local i80 @test_i80(i80 %a, i80 %b, i80 %c) local_unnamed_addr {
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT: store i80 0, ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
+; CHECK-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8
 ; CHECK-NEXT: store i64 48, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: store i80 0, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[CALL:%.*]] = tail call i80 (i32, ...) @sum_i80(i32 3, i80 [[A]], i80 [[B]], i80 [[C]])
@@ -465,12 +465,12 @@ define dso_local i80 @test_i80(i80 %a, i80 %b, i80 %c) local_unnamed_addr {
 ; ORIGIN-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN-NEXT: call void @llvm.donothing()
 ; ORIGIN-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; ORIGIN-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGIN-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; ORIGIN-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; ORIGIN-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGIN-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; ORIGIN-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; ORIGIN-NEXT: store i80 0, ptr @__msan_va_arg_tls, align 8
-; ORIGIN-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
-; ORIGIN-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8
+; ORIGIN-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
+; ORIGIN-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8
 ; ORIGIN-NEXT: store i64 48, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN-NEXT: store i80 0, ptr @__msan_retval_tls, align 8
 ; ORIGIN-NEXT: [[CALL:%.*]] = tail call i80 (i32, ...) @sum_i80(i32 3, i80 [[A]], i80 [[B]], i80 [[C]])
@@ -486,12 +486,12 @@ define dso_local i80 @test_i80(i80 %a, i80 %b, i80 %c) local_unnamed_addr {
 ; ORIGIN2-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN2-NEXT: call void @llvm.donothing()
 ; ORIGIN2-NEXT: store i32 0, ptr @__msan_param_tls, align 8
-; ORIGIN2-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGIN2-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; ORIGIN2-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; ORIGIN2-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGIN2-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; ORIGIN2-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; ORIGIN2-NEXT: store i80 0, ptr @__msan_va_arg_tls, align 8
-; ORIGIN2-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8
-; ORIGIN2-NEXT: store i80 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8
+; ORIGIN2-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8
+; ORIGIN2-NEXT: store i80 0, ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8
 ; ORIGIN2-NEXT: store i64 48, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; ORIGIN2-NEXT: store i80 0, ptr @__msan_retval_tls, align 8
 ; ORIGIN2-NEXT: [[CALL:%.*]] = tail call i80 (i32, ...) @sum_i80(i32 3, i80 [[A]], i80 [[B]], i80 [[C]])
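The mechanical rewrite running through all of these check updates swaps the legacy integer round trip for a byte-addressed getelementptr; both spellings denote the same address. A minimal standalone sketch of the two forms written as instructions rather than constant expressions (the %tls argument is an illustrative stand-in for the MSan TLS globals, not code from this patch):

define i64 @old_spelling(ptr %tls) {
  ; pre-patch addressing, spelled out: ptr -> int, add byte offset, int -> ptr
  %base = ptrtoint ptr %tls to i64
  %addr.int = add i64 %base, 8
  %addr = inttoptr i64 %addr.int to ptr
  %v = load i64, ptr %addr, align 8
  ret i64 %v
}

define i64 @new_spelling(ptr %tls) {
  ; post-patch addressing: a byte-based getelementptr over i8, same address
  %addr = getelementptr i8, ptr %tls, i64 8
  %v = load i64, ptr %addr, align 8
  ret i64 %v
}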
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_shadow.ll
index 74a62762fc184..681b331fcb137 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_shadow.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_shadow.ll
@@ -40,8 +40,8 @@ define linkonce_odr dso_local void @_Z4testIcEvT_(i8 noundef signext %arg) sanit
 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext i8 [[_MSLD]] to i32
 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP7]] to i32
 ; CHECK-NEXT: store i8 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 [[_MSPROP]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 [[_MSPROP]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_va_arg_tls, align 8
 ; CHECK-NEXT: store i64 8, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (i8, i32, ...) @_Z5test2IcEvT_iz(i8 noundef signext [[TMP7]], i32 noundef 1, i32 noundef [[CONV]])
@@ -82,8 +82,8 @@ define linkonce_odr dso_local void @_Z4testIiEvT_(i32 noundef %arg) sanitize_mem
 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP10]], align 4
 ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_va_arg_tls, align 8
 ; CHECK-NEXT: store i64 8, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (i32, i32, ...) @_Z5test2IiEvT_iz(i32 noundef [[TMP7]], i32 noundef 1, i32 noundef [[TMP7]])
@@ -125,8 +125,8 @@ define linkonce_odr dso_local void @_Z4testIfEvT_(float noundef %arg) sanitize_m
 ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[_MSLD]] to i64
 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP7]] to double
 ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 [[TMP11]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 [[TMP11]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: store i64 [[TMP11]], ptr @__msan_va_arg_tls, align 8
 ; CHECK-NEXT: store i64 8, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (float, i32, ...) @_Z5test2IfEvT_iz(float noundef [[TMP7]], i32 noundef 1, double noundef [[CONV]])
@@ -167,8 +167,8 @@ define linkonce_odr dso_local void @_Z4testIdEvT_(double noundef %arg) sanitize_
 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP10]], align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_va_arg_tls, align 8
 ; CHECK-NEXT: store i64 8, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (double, i32, ...) @_Z5test2IdEvT_iz(double noundef [[TMP7]], i32 noundef 1, double noundef [[TMP7]])
@@ -208,8 +208,8 @@ define linkonce_odr dso_local void @_Z4testIeEvT_(x86_fp80 noundef %arg) sanitiz
 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT: [[_MSLD:%.*]] = load i80, ptr [[TMP10]], align 16
 ; CHECK-NEXT: store i80 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i80 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i80 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
 ; CHECK-NEXT: store i80 [[_MSLD]], ptr @__msan_va_arg_tls, align 8
 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (x86_fp80, i32, ...) @_Z5test2IeEvT_iz(x86_fp80 noundef [[TMP7]], i32 noundef 1, x86_fp80 noundef [[TMP7]])
@@ -249,8 +249,8 @@ define linkonce_odr dso_local void @_Z4testI6IntIntEvT_(i64 %arg.coerce) sanitiz
 ; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_va_arg_tls, align 8
 ; CHECK-NEXT: store i64 8, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (i64, i32, ...) @_Z5test2I6IntIntEvT_iz(i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i32 noundef 1, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]])
@@ -271,7 +271,7 @@ define linkonce_odr dso_local void @_Z4testI10Int64Int64EvT_(i64 %arg.coerce0, i
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[ARG:%.*]] = alloca [[STRUCT_INT64INT64:%.*]], align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64
@@ -302,12 +302,12 @@ define linkonce_odr dso_local void @_Z4testI10Int64Int64EvT_(i64 %arg.coerce0, i
 ; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
 ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP17]], align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (i64, i64, i32, ...) @_Z5test2I10Int64Int64EvT_iz(i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i32 noundef 1, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]])
 ; CHECK-NEXT: ret void
@@ -330,7 +330,7 @@ define linkonce_odr dso_local void @_Z4testI12DoubleDoubleEvT_(double %arg.coerc
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[ARG:%.*]] = alloca [[STRUCT_DOUBLEDOUBLE:%.*]], align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64
@@ -361,12 +361,12 @@ define linkonce_odr dso_local void @_Z4testI12DoubleDoubleEvT_(double %arg.coerc
 ; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
 ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP17]], align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i64 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store i64 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (double, double, i32, ...) @_Z5test2I12DoubleDoubleEvT_iz(double [[AGG_TMP_SROA_0_0_COPYLOAD]], double [[AGG_TMP_SROA_2_0_COPYLOAD]], i32 noundef 1, double [[AGG_TMP_SROA_0_0_COPYLOAD]], double [[AGG_TMP_SROA_2_0_COPYLOAD]])
 ; CHECK-NEXT: ret void
@@ -399,11 +399,11 @@ define linkonce_odr dso_local void @_Z4testI7Double4EvT_(ptr noundef byval(%stru
 ; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483649
 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 32, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], -2147483649
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP8]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP8]], i64 32, i1 false)
 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], -2147483649
 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
@@ -426,7 +426,7 @@ define linkonce_odr dso_local void @_Z4testI11DoubleFloatEvT_(double %arg.coerce
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[ARG:%.*]] = alloca [[STRUCT_DOUBLEFLOAT:%.*]], align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[ARG]] to i64
@@ -457,12 +457,12 @@ define linkonce_odr dso_local void @_Z4testI11DoubleFloatEvT_(double %arg.coerce
 ; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
 ; CHECK-NEXT: [[_MSLD1:%.*]] = load i32, ptr [[TMP17]], align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_param_tls, align 8
-; CHECK-NEXT: store i32 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: store i32 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i32 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT: store i32 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: store i64 [[_MSLD]], ptr @__msan_va_arg_tls, align 8
-; CHECK-NEXT: store i32 [[_MSLD1]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store i32 [[_MSLD1]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8
 ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (double, float, i32, ...) @_Z5test2I11DoubleFloatEvT_iz(double [[AGG_TMP_SROA_0_0_COPYLOAD]], float [[AGG_TMP_SROA_2_0_COPYLOAD]], i32 noundef 1, double [[AGG_TMP_SROA_0_0_COPYLOAD]], float [[AGG_TMP_SROA_2_0_COPYLOAD]])
 ; CHECK-NEXT: ret void
@@ -495,11 +495,11 @@ define linkonce_odr dso_local void @_Z4testI11LongDouble2EvT_(ptr noundef byval(
 ; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483649
 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 32, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], -2147483649
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP8]], i64 32, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP8]], i64 32, i1 false)
 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], -2147483649
 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
@@ -530,11 +530,11 @@ define linkonce_odr dso_local void @_Z4testI11LongDouble4EvT_(ptr noundef byval(
 ; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483649
 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 64, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], -2147483649
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), ptr align 8 [[TMP8]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 72), ptr align 8 [[TMP8]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], -2147483649
 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
@@ -1103,51 +1103,51 @@ define linkonce_odr dso_local void @_Z4test3I11LongDouble4EvT_(ptr noundef byval
 ; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483649
 ; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 @__msan_param_tls, ptr align 8 [[TMP5]], i64 64, i1 false)
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[TMP6]], -2147483649
 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), ptr align 8 [[TMP8]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 72), ptr align 8 [[TMP8]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -2147483649
 ; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), ptr align 8 [[TMP11]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 136), ptr align 8 [[TMP11]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], -2147483649
 ; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), ptr align 8 [[TMP14]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 200), ptr align 8 [[TMP14]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP15]], -2147483649
 ; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 264) to ptr), ptr align 8 [[TMP17]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 264), ptr align 8 [[TMP17]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP18]], -2147483649
 ; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 328) to ptr), ptr align 8 [[TMP20]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 328), ptr align 8 [[TMP20]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP22:%.*]] = and i64 [[TMP21]], -2147483649
 ; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 392) to ptr), ptr align 8 [[TMP23]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 392), ptr align 8 [[TMP23]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], -2147483649
 ; CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 456) to ptr), ptr align 8 [[TMP26]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 456), ptr align 8 [[TMP26]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP27]], -2147483649
 ; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 520) to ptr), ptr align 8 [[TMP29]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 520), ptr align 8 [[TMP29]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP31:%.*]] = and i64 [[TMP30]], -2147483649
 ; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 584) to ptr), ptr align 8 [[TMP32]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 584), ptr align 8 [[TMP32]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP34:%.*]] = and i64 [[TMP33]], -2147483649
 ; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 648) to ptr), ptr align 8 [[TMP35]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 648), ptr align 8 [[TMP35]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP36:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], -2147483649
 ; CHECK-NEXT: [[TMP38:%.*]] = inttoptr i64 [[TMP37]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 712) to ptr), ptr align 8 [[TMP38]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 712), ptr align 8 [[TMP38]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP41:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP42:%.*]] = and i64 [[TMP41]], -2147483649
 ; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr
@@ -1155,47 +1155,47 @@ define linkonce_odr dso_local void @_Z4test3I11LongDouble4EvT_(ptr noundef byval
 ; CHECK-NEXT: [[TMP44:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP45:%.*]] = and i64 [[TMP44]], -2147483649
 ; CHECK-NEXT: [[TMP46:%.*]] = inttoptr i64 [[TMP45]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 64) to ptr), ptr align 8 [[TMP46]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 64), ptr align 8 [[TMP46]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP48:%.*]] = and i64 [[TMP47]], -2147483649
 ; CHECK-NEXT: [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 128) to ptr), ptr align 8 [[TMP49]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 128), ptr align 8 [[TMP49]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP50:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP51:%.*]] = and i64 [[TMP50]], -2147483649
 ; CHECK-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 192) to ptr), ptr align 8 [[TMP52]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 192), ptr align 8 [[TMP52]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP54:%.*]] = and i64 [[TMP53]], -2147483649
 ; CHECK-NEXT: [[TMP55:%.*]] = inttoptr i64 [[TMP54]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 256) to ptr), ptr align 8 [[TMP55]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 256), ptr align 8 [[TMP55]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP56:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP57:%.*]] = and i64 [[TMP56]], -2147483649
 ; CHECK-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 320) to ptr), ptr align 8 [[TMP58]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 320), ptr align 8 [[TMP58]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP59:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP60:%.*]] = and i64 [[TMP59]], -2147483649
 ; CHECK-NEXT: [[TMP61:%.*]] = inttoptr i64 [[TMP60]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 384) to ptr), ptr align 8 [[TMP61]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 384), ptr align 8 [[TMP61]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP62:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP63:%.*]] = and i64 [[TMP62]], -2147483649
 ; CHECK-NEXT: [[TMP64:%.*]] = inttoptr i64 [[TMP63]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 448) to ptr), ptr align 8 [[TMP64]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 448), ptr align 8 [[TMP64]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP65:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP66:%.*]] = and i64 [[TMP65]], -2147483649
 ; CHECK-NEXT: [[TMP67:%.*]] = inttoptr i64 [[TMP66]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 512) to ptr), ptr align 8 [[TMP67]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 512), ptr align 8 [[TMP67]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP69:%.*]] = and i64 [[TMP68]], -2147483649
 ; CHECK-NEXT: [[TMP70:%.*]] = inttoptr i64 [[TMP69]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 576) to ptr), ptr align 8 [[TMP70]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 576), ptr align 8 [[TMP70]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP71:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP72:%.*]] = and i64 [[TMP71]], -2147483649
 ; CHECK-NEXT: [[TMP73:%.*]] = inttoptr i64 [[TMP72]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 640) to ptr), ptr align 8 [[TMP73]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 640), ptr align 8 [[TMP73]], i64 64, i1 false)
 ; CHECK-NEXT: [[TMP74:%.*]] = ptrtoint ptr [[ARG]] to i64
 ; CHECK-NEXT: [[TMP75:%.*]] = and i64 [[TMP74]], -2147483649
 ; CHECK-NEXT: [[TMP76:%.*]] = inttoptr i64 [[TMP75]] to ptr
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 704) to ptr), ptr align 8 [[TMP76]], i64 64, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 704), ptr align 8 [[TMP76]], i64 64, i1 false)
 ; CHECK-NEXT: store i64 1280, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT: call void (ptr, i32, ...) @_Z5test2I11LongDouble4EvT_iz(ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], i32 noundef 20, ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]], ptr noundef nonnull byval([[STRUCT_LONGDOUBLE4]]) align 16 [[ARG]])
 ; CHECK-NEXT: ret void
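Besides the TLS slot addressing, the checks around here also exercise MSan's application-to-shadow mapping, which is a pure bit trick on the pointer value: the x86-64 Linux tests XOR the address with 87960930222080 (0x500000000000), while the i386 tests mask with -2147483649 (clearing bit 31). A minimal sketch of the 64-bit mapping as plain instructions; the function name and argument are illustrative, only the constant comes from these checks:

define ptr @shadow_for(ptr %app) {
  ; x86-64 Linux mapping seen in these tests: shadow = app ^ 0x500000000000
  %app.int = ptrtoint ptr %app to i64
  %shadow.int = xor i64 %app.int, 87960930222080
  %shadow = inttoptr i64 %shadow.int to ptr
  ret ptr %shadow
}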
diff --git a/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll b/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll
index ff37605acaddd..3ac6844b3ffe8 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/masked-store-load.ll
@@ -19,7 +19,7 @@ declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
 define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory {
 ; CHECK-LABEL: @Store(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
@@ -30,9 +30,9 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory {
 ;
 ; ADDR-LABEL: @Store(
 ; ADDR-NEXT: entry:
-; ADDR-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; ADDR-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; ADDR-NEXT: [[TMP2:%.*]] = load <4 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; ADDR-NEXT: [[TMP2:%.*]] = load <4 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; ADDR-NEXT: call void @llvm.donothing()
 ; ADDR-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; ADDR-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
@@ -52,8 +52,8 @@ define void @Store(ptr %p, <4 x i64> %v, <4 x i1> %mask) sanitize_memory {
 ;
 ; ORIGINS-LABEL: @Store(
 ; ORIGINS-NEXT: entry:
-; ORIGINS-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGINS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; ORIGINS-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGINS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4
 ; ORIGINS-NEXT: call void @llvm.donothing()
 ; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
@@ -88,7 +88,7 @@ entry:
 define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memory {
 ; CHECK-LABEL: @Load(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
@@ -101,8 +101,8 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo
 ; ADDR-LABEL: @Load(
 ; ADDR-NEXT: entry:
 ; ADDR-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; ADDR-NEXT: [[TMP1:%.*]] = load <4 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; ADDR-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; ADDR-NEXT: [[TMP1:%.*]] = load <4 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; ADDR-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; ADDR-NEXT: call void @llvm.donothing()
 ; ADDR-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; ADDR-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
@@ -123,8 +123,8 @@ define <4 x double> @Load(ptr %p, <4 x double> %v, <4 x i1> %mask) sanitize_memo
 ;
 ; ORIGINS-LABEL: @Load(
 ; ORIGINS-NEXT: entry:
-; ORIGINS-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGINS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; ORIGINS-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGINS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4
 ; ORIGINS-NEXT: call void @llvm.donothing()
 ; ORIGINS-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P:%.*]] to i64
 ; ORIGINS-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
@@ -235,7 +235,7 @@ entry:
 ; FIXME: Provide real implementation.
 define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %passthru) sanitize_memory {
 ; CHECK-LABEL: @Gather(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <16 x ptr> [[PTRS:%.*]] to <16 x i64>
 ; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i64> [[TMP2]], splat (i64 87960930222080)
@@ -246,9 +246,9 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas
 ; CHECK-NEXT: ret <16 x float> [[RET]]
 ;
 ; ADDR-LABEL: @Gather(
-; ADDR-NEXT: [[TMP1:%.*]] = load <16 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; ADDR-NEXT: [[TMP1:%.*]] = load <16 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; ADDR-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr @__msan_param_tls, align 8
-; ADDR-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; ADDR-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; ADDR-NEXT: call void @llvm.donothing()
 ; ADDR-NEXT: [[_MSMASKEDPTRS:%.*]] = select <16 x i1> [[MASK:%.*]], <16 x i64> [[TMP2]], <16 x i64> zeroinitializer
 ; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint <16 x ptr> [[PTRS:%.*]] to <16 x i64>
@@ -270,8 +270,8 @@ define <16 x float> @Gather(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %pas
 ; ADDR-NEXT: ret <16 x float> [[RET]]
 ;
 ; ORIGINS-LABEL: @Gather(
-; ORIGINS-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; ORIGINS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 136) to ptr), align 4
+; ORIGINS-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; ORIGINS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 136), align 4
 ; ORIGINS-NEXT: call void @llvm.donothing()
 ; ORIGINS-NEXT: [[TMP3:%.*]] = ptrtoint <16 x ptr> [[PTRS:%.*]] to <16 x i64>
 ; ORIGINS-NEXT: [[TMP4:%.*]] = xor <16 x i64> [[TMP3]], splat (i64 87960930222080)
@@ -326,8 +326,8 @@ define void @Scatter(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask) sanitize
 ; CHECK-NEXT: ret void
 ;
 ; ADDR-LABEL: @Scatter(
-; ADDR-NEXT: [[TMP1:%.*]] = load <8 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; ADDR-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; ADDR-NEXT: [[TMP1:%.*]] = load <8 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8
+; ADDR-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; ADDR-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; ADDR-NEXT: call void @llvm.donothing()
 ; ADDR-NEXT: [[_MSMASKEDPTRS:%.*]] = select <8 x i1> [[MASK:%.*]], <8 x i64> [[TMP2]], <8 x i64> zeroinitializer
@@ -403,7 +403,7 @@ define void @ScatterNoSanitize(<8 x i32> %value, <8 x ptr> %ptrs, <8 x i1> %mask
 ; FIXME: Provide real implementation.
 define <16 x float> @ExpandLoad(ptr %ptr, <16 x i1> %mask, <16 x float> %passthru) sanitize_memory {
 ; CHECK-LABEL: @ExpandLoad(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; CHECK-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
@@ -415,8 +415,8 @@ define <16 x float> @ExpandLoad(ptr %ptr, <16 x i1> %mask, <16 x float> %passthr
 ;
 ; ADDR-LABEL: @ExpandLoad(
 ; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; ADDR-NEXT: [[TMP2:%.*]] = load <16 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ADDR-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; ADDR-NEXT: [[TMP2:%.*]] = load <16 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ADDR-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; ADDR-NEXT: call void @llvm.donothing()
 ; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; ADDR-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080
@@ -436,8 +436,8 @@ define <16 x float> @ExpandLoad(ptr %ptr, <16 x i1> %mask, <16 x float> %passthr
 ; ADDR-NEXT: ret <16 x float> [[RET]]
 ;
 ; ORIGINS-LABEL: @ExpandLoad(
-; ORIGINS-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; ORIGINS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; ORIGINS-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; ORIGINS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4
 ; ORIGINS-NEXT: call void @llvm.donothing()
 ; ORIGINS-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
 ; ORIGINS-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
@@ -492,8 +492,8 @@ define void @CompressStore(<16 x float> %value, ptr %ptr, <16 x i1> %mask) sanit
 ; CHECK-NEXT: ret void
 ;
 ; ADDR-LABEL: @CompressStore(
-; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; ADDR-NEXT: [[TMP2:%.*]] = load <16 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; ADDR-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; ADDR-NEXT: [[TMP2:%.*]] = load <16 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; ADDR-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; ADDR-NEXT: call void @llvm.donothing()
 ; ADDR-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
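In the origin-tracking configurations (ORIGINS, ORIGIN, CALLS), each parameter shadow slot in @__msan_param_tls has a companion 4-byte entry at the same byte offset in @__msan_param_origin_tls, which is why the rewritten getelementptrs use matching offsets into both globals. A minimal sketch of reading one such pair; the extern declarations and array sizes are illustrative stand-ins for the runtime-provided globals:

@__msan_param_tls = external thread_local global [100 x i64]
@__msan_param_origin_tls = external thread_local global [100 x i32]

define { i32, i32 } @param1_shadow_and_origin() {
  ; shadow and origin for the parameter slot at byte offset 8
  %s = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
  %o = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4
  %r0 = insertvalue { i32, i32 } poison, i32 %s, 0
  %r1 = insertvalue { i32, i32 } %r0, i32 %o, 1
  ret { i32, i32 } %r1
}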
diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll
index b4feb1ec57224..0ad9e4dd32adf 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/msan_basic.ll
@@ -18,7 +18,7 @@ define void @Store(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_memory {
 ; CHECK-LABEL: define void @Store(
 ; CHECK-SAME: ptr captures(none) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
@@ -30,8 +30,8 @@ define void @Store(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_memory {
 ; ORIGIN-LABEL: define void @Store(
 ; ORIGIN-SAME: ptr captures(none) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ORIGIN-NEXT: [[ENTRY:.*:]]
-; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4
 ; ORIGIN-NEXT: call void @llvm.donothing()
 ; ORIGIN-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64
 ; ORIGIN-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080
@@ -53,8 +53,8 @@ define void @Store(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_memory {
 ; CALLS-NEXT: [[ENTRY:.*:]]
 ; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
-; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4
 ; CALLS-NEXT: call void @llvm.donothing()
 ; CALLS-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]])
 ; CALLS-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64
@@ -80,7 +80,7 @@ define void @AlignedStore(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_me
 ; CHECK-LABEL: define void @AlignedStore(
 ; CHECK-SAME: ptr captures(none) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080
@@ -92,8 +92,8 @@ define void @AlignedStore(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_me
 ; ORIGIN-LABEL: define void @AlignedStore(
 ; ORIGIN-SAME: ptr captures(none) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] {
 ; ORIGIN-NEXT: [[ENTRY:.*:]]
-; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
@__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 ; ORIGIN-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 @@ -118,8 +118,8 @@ define void @AlignedStore(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_me ; CALLS-NEXT: [[ENTRY:.*:]] ; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]) ; CALLS-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64 @@ -353,7 +353,7 @@ define void @FuncWithPhi(ptr nocapture %a, ptr %b, ptr nocapture %c) nounwind uw ; CHECK-LABEL: define void @FuncWithPhi( ; CHECK-SAME: ptr captures(none) [[A:%.*]], ptr [[B:%.*]], ptr captures(none) [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 0 @@ -397,8 +397,8 @@ define void @FuncWithPhi(ptr nocapture %a, ptr %b, ptr nocapture %c) nounwind uw ; ORIGIN-LABEL: define void @FuncWithPhi( ; ORIGIN-SAME: ptr captures(none) [[A:%.*]], ptr [[B:%.*]], ptr captures(none) [[C:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[B]] to i64 ; ORIGIN-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 0 @@ -457,10 +457,10 @@ define void @FuncWithPhi(ptr nocapture %a, ptr %b, ptr nocapture %c) nounwind uw ; CALLS-LABEL: define void @FuncWithPhi( ; CALLS-SAME: ptr captures(none) [[A:%.*]], ptr [[B:%.*]], ptr captures(none) [[C:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; CALLS-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; CALLS-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; CALLS-NEXT: [[TMP4:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 ; CALLS-NEXT: call void @llvm.donothing() @@ -770,8 +770,8 @@ define void @SExt(ptr nocapture %a, ptr nocapture %b) nounwind uwtable sanitize_ ; CALLS-LABEL: define void @SExt( ; CALLS-SAME: ptr captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 ; CALLS-NEXT: call void @llvm.donothing() @@ -844,7 +844,7 @@ define void @MemCpy(ptr nocapture %x, ptr nocapture %y) nounwind uwtable sanitiz ; CHECK-LABEL: define void @MemCpy( ; CHECK-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = call ptr @__msan_memcpy(ptr [[X]], ptr [[Y]], i64 10) ; CHECK-NEXT: ret void @@ -852,8 +852,8 @@ define void @MemCpy(ptr nocapture %x, ptr nocapture %y) nounwind uwtable sanitiz ; ORIGIN-LABEL: define void @MemCpy( ; ORIGIN-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X]], ptr [[Y]], i64 10) ; ORIGIN-NEXT: ret void @@ -861,8 +861,8 @@ define void @MemCpy(ptr nocapture %x, ptr nocapture %y) nounwind uwtable sanitiz ; CALLS-LABEL: define void @MemCpy( ; CALLS-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load 
i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X]], ptr [[Y]], i64 10) ; CALLS-NEXT: ret void @@ -911,7 +911,7 @@ define void @MemCpyInline(ptr nocapture %x, ptr nocapture %y) nounwind uwtable s ; CHECK-LABEL: define void @MemCpyInline( ; CHECK-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = call ptr @__msan_memcpy(ptr [[X]], ptr [[Y]], i64 10) ; CHECK-NEXT: ret void @@ -919,8 +919,8 @@ define void @MemCpyInline(ptr nocapture %x, ptr nocapture %y) nounwind uwtable s ; ORIGIN-LABEL: define void @MemCpyInline( ; ORIGIN-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X]], ptr [[Y]], i64 10) ; ORIGIN-NEXT: ret void @@ -928,8 +928,8 @@ define void @MemCpyInline(ptr nocapture %x, ptr nocapture %y) nounwind uwtable s ; CALLS-LABEL: define void @MemCpyInline( ; CALLS-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X]], ptr [[Y]], i64 10) ; CALLS-NEXT: ret void @@ -947,7 +947,7 @@ define void @MemMove(ptr nocapture %x, ptr nocapture %y) nounwind uwtable saniti ; CHECK-LABEL: define void @MemMove( ; CHECK-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; 
CHECK-NEXT: [[TMP1:%.*]] = call ptr @__msan_memmove(ptr [[X]], ptr [[Y]], i64 10) ; CHECK-NEXT: ret void @@ -955,8 +955,8 @@ define void @MemMove(ptr nocapture %x, ptr nocapture %y) nounwind uwtable saniti ; ORIGIN-LABEL: define void @MemMove( ; ORIGIN-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP2:%.*]] = call ptr @__msan_memmove(ptr [[X]], ptr [[Y]], i64 10) ; ORIGIN-NEXT: ret void @@ -964,8 +964,8 @@ define void @MemMove(ptr nocapture %x, ptr nocapture %y) nounwind uwtable saniti ; CALLS-LABEL: define void @MemMove( ; CALLS-SAME: ptr captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP2:%.*]] = call ptr @__msan_memmove(ptr [[X]], ptr [[Y]], i64 10) ; CALLS-NEXT: ret void @@ -1065,9 +1065,9 @@ define i32 @Select(i32 %a, i32 %b, i1 %c) nounwind uwtable readnone sanitize_mem ; CHECK-LABEL: define i32 @Select( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i1 [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[C]], i32 [[TMP1]], i32 [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[B]] @@ -1081,12 +1081,12 @@ define i32 @Select(i32 %a, i32 %b, i1 %c) nounwind uwtable readnone sanitize_mem ; ORIGIN-LABEL: define i32 @Select( ; ORIGIN-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i1 [[C:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i1, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i1, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; 
ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[C]], i32 [[TMP2]], i32 [[TMP4]] ; ORIGIN-NEXT: [[TMP7:%.*]] = xor i32 [[A]], [[B]] @@ -1103,12 +1103,12 @@ define i32 @Select(i32 %a, i32 %b, i1 %c) nounwind uwtable readnone sanitize_mem ; CALLS-LABEL: define i32 @Select( ; CALLS-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i1 [[C:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load i1, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i1, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP6:%.*]] = select i1 [[C]], i32 [[TMP2]], i32 [[TMP4]] ; CALLS-NEXT: [[TMP7:%.*]] = xor i32 [[A]], [[B]] @@ -1135,9 +1135,9 @@ define <8 x i16> @SelectVector(<8 x i16> %a, <8 x i16> %b, <8 x i1> %c) nounwind ; CHECK-LABEL: define <8 x i16> @SelectVector( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[C]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i16> [[A]], [[B]] @@ -1151,12 +1151,12 @@ define <8 x i16> @SelectVector(<8 x i16> %a, <8 x i16> %b, <8 x i1> %c) nounwind ; ORIGIN-LABEL: define <8 x i16> @SelectVector( ; ORIGIN-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], 
<8 x i1> [[C:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; ORIGIN-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; ORIGIN-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP6:%.*]] = select <8 x i1> [[C]], <8 x i16> [[TMP2]], <8 x i16> [[TMP4]] ; ORIGIN-NEXT: [[TMP7:%.*]] = xor <8 x i16> [[A]], [[B]] @@ -1177,12 +1177,12 @@ define <8 x i16> @SelectVector(<8 x i16> %a, <8 x i16> %b, <8 x i1> %c) nounwind ; CALLS-LABEL: define <8 x i16> @SelectVector( ; CALLS-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[C:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; CALLS-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CALLS-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP6:%.*]] = select <8 x i1> [[C]], <8 x i16> [[TMP2]], <8 x i16> [[TMP4]] ; CALLS-NEXT: [[TMP7:%.*]] = xor <8 x i16> [[A]], [[B]] @@ -1213,9 +1213,9 @@ define <8 x i16> @SelectVector2(<8 x i16> %a, <8 x i16> %b, i1 %c) nounwind uwta ; CHECK-LABEL: define <8 x i16> @SelectVector2( ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i1 [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x 
i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[C]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i16> [[A]], [[B]] @@ -1229,12 +1229,12 @@ define <8 x i16> @SelectVector2(<8 x i16> %a, <8 x i16> %b, i1 %c) nounwind uwta ; ORIGIN-LABEL: define <8 x i16> @SelectVector2( ; ORIGIN-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i1 [[C:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i1, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i1, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; ORIGIN-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; ORIGIN-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[C]], <8 x i16> [[TMP2]], <8 x i16> [[TMP4]] ; ORIGIN-NEXT: [[TMP7:%.*]] = xor <8 x i16> [[A]], [[B]] @@ -1251,12 +1251,12 @@ define <8 x i16> @SelectVector2(<8 x i16> %a, <8 x i16> %b, i1 %c) nounwind uwta ; CALLS-LABEL: define <8 x i16> @SelectVector2( ; CALLS-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], i1 [[C:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load i1, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i1, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; CALLS-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CALLS-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP6:%.*]] = select i1 [[C]], 
<8 x i16> [[TMP2]], <8 x i16> [[TMP4]] ; CALLS-NEXT: [[TMP7:%.*]] = xor <8 x i16> [[A]], [[B]] @@ -1280,8 +1280,8 @@ define { i64, i64 } @SelectStruct(i1 zeroext %x, { i64, i64 } %a, { i64, i64 } % ; CHECK-SAME: i1 zeroext [[X:%.*]], { i64, i64 } [[A:%.*]], { i64, i64 } [[B:%.*]]) #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[X]], { i64, i64 } [[TMP1]], { i64, i64 } [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], { i64, i64 } { i64 -1, i64 -1 }, { i64, i64 } [[TMP3]] @@ -1294,10 +1294,10 @@ define { i64, i64 } @SelectStruct(i1 zeroext %x, { i64, i64 } %a, { i64, i64 } % ; ORIGIN-NEXT: [[ENTRY:.*:]] ; ORIGIN-NEXT: [[TMP0:%.*]] = load i1, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; ORIGIN-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 +; ORIGIN-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; ORIGIN-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[X]], { i64, i64 } [[TMP2]], { i64, i64 } [[TMP4]] ; ORIGIN-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], { i64, i64 } { i64 -1, i64 -1 }, { i64, i64 } [[TMP6]] @@ -1313,10 +1313,10 @@ define { i64, i64 } @SelectStruct(i1 zeroext %x, { i64, i64 } %a, { i64, i64 } % ; CALLS-NEXT: [[ENTRY:.*:]] ; CALLS-NEXT: [[TMP0:%.*]] = load i1, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; CALLS-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 +; CALLS-NEXT: 
[[TMP2:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; CALLS-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP6:%.*]] = select i1 [[X]], { i64, i64 } [[TMP2]], { i64, i64 } [[TMP4]] ; CALLS-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], { i64, i64 } { i64 -1, i64 -1 }, { i64, i64 } [[TMP6]] @@ -1337,8 +1337,8 @@ define { ptr, double } @SelectStruct2(i1 zeroext %x, { ptr, double } %a, { ptr, ; CHECK-SAME: i1 zeroext [[X:%.*]], { ptr, double } [[A:%.*]], { ptr, double } [[B:%.*]]) #[[ATTR6]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[X]], { i64, i64 } [[TMP1]], { i64, i64 } [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], { i64, i64 } { i64 -1, i64 -1 }, { i64, i64 } [[TMP3]] @@ -1351,10 +1351,10 @@ define { ptr, double } @SelectStruct2(i1 zeroext %x, { ptr, double } %a, { ptr, ; ORIGIN-NEXT: [[ENTRY:.*:]] ; ORIGIN-NEXT: [[TMP0:%.*]] = load i1, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; ORIGIN-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 +; ORIGIN-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; ORIGIN-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP6:%.*]] = select i1 [[X]], { i64, i64 } [[TMP2]], { i64, i64 } [[TMP4]] ; ORIGIN-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], { i64, i64 } { i64 -1, i64 -1 }, { i64, i64 } [[TMP6]] @@ -1370,10 +1370,10 @@ define { ptr, double } @SelectStruct2(i1 zeroext %x, { ptr, double } %a, { ptr, ; CALLS-NEXT: [[ENTRY:.*:]] ; CALLS-NEXT: [[TMP0:%.*]] = load i1, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; 
CALLS-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; CALLS-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 +; CALLS-NEXT: [[TMP2:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; CALLS-NEXT: [[TMP4:%.*]] = load { i64, i64 }, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP6:%.*]] = select i1 [[X]], { i64, i64 } [[TMP2]], { i64, i64 } [[TMP4]] ; CALLS-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], { i64, i64 } { i64 -1, i64 -1 }, { i64, i64 } [[TMP6]] @@ -1475,7 +1475,7 @@ define i32 @Div(i32 %a, i32 %b) nounwind uwtable readnone sanitize_memory { ; CHECK-LABEL: define i32 @Div( ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP0]], 0 @@ -1491,8 +1491,8 @@ define i32 @Div(i32 %a, i32 %b) nounwind uwtable readnone sanitize_memory { ; ORIGIN-LABEL: define i32 @Div( ; ORIGIN-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 ; ORIGIN-NEXT: call void @llvm.donothing() @@ -1510,8 +1510,8 @@ define i32 @Div(i32 %a, i32 %b) nounwind uwtable readnone sanitize_memory { ; CALLS-LABEL: define i32 @Div( ; CALLS-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP3:%.*]] 
= load i32, ptr @__msan_param_origin_tls, align 4 ; CALLS-NEXT: call void @llvm.donothing() @@ -1533,7 +1533,7 @@ define float @FDiv(float %a, float %b) nounwind uwtable readnone sanitize_memory ; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[C:%.*]] = fdiv float [[A]], [[B]] @@ -1545,8 +1545,8 @@ define float @FDiv(float %a, float %b) nounwind uwtable readnone sanitize_memory ; ORIGIN-NEXT: [[ENTRY:.*:]] ; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP0]], [[TMP2]] ; ORIGIN-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], 0 @@ -1561,8 +1561,8 @@ define float @FDiv(float %a, float %b) nounwind uwtable readnone sanitize_memory ; CALLS-NEXT: [[ENTRY:.*:]] ; CALLS-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP0]], [[TMP2]] ; CALLS-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP2]], 0 @@ -2416,7 +2416,7 @@ define i32 @ShadowLoadAlignmentSmall() nounwind uwtable sanitize_memory { define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory { ; CHECK-LABEL: define i32 @ExtractElement( ; CHECK-SAME: <4 x i32> [[VEC:%.*]], i32 [[IDX:%.*]]) #[[ATTR6]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i32 [[IDX]] @@ -2432,8 +2432,8 @@ define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory { ; ; ORIGIN-LABEL: define i32 @ExtractElement( ; ORIGIN-SAME: <4 x i32> [[VEC:%.*]], i32 [[IDX:%.*]]) #[[ATTR6]] { -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; ORIGIN-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 ; ORIGIN-NEXT: call void @llvm.donothing() @@ -2451,8 +2451,8 @@ define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory { ; ; CALLS-LABEL: define i32 @ExtractElement( ; CALLS-SAME: <4 x i32> [[VEC:%.*]], i32 [[IDX:%.*]]) #[[ATTR6]] { -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; CALLS-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 ; CALLS-NEXT: call void @llvm.donothing() @@ -2470,9 +2470,9 @@ define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory { define <4 x i32> @InsertElement(<4 x i32> %vec, i32 %idx, i32 %x) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @InsertElement( ; CHECK-SAME: <4 x i32> [[VEC:%.*]], i32 [[IDX:%.*]], i32 [[X:%.*]]) #[[ATTR6]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP3]], i32 [[IDX]] ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 @@ -2487,12 +2487,12 @@ define <4 x i32> @InsertElement(<4 x i32> %vec, i32 %idx, i32 %x) sanitize_memor ; ; ORIGIN-LABEL: define <4 x i32> @InsertElement( ; ORIGIN-SAME: <4 x i32> [[VEC:%.*]], i32 [[IDX:%.*]], i32 [[X:%.*]]) #[[ATTR6]] { -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 ; ORIGIN-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
24) to ptr), align 8
-; ORIGIN-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; ORIGIN-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; ORIGIN-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4
 ; ORIGIN-NEXT: call void @llvm.donothing()
 ; ORIGIN-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i32 [[IDX]]
 ; ORIGIN-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0
@@ -2512,12 +2512,12 @@ define <4 x i32> @InsertElement(<4 x i32> %vec, i32 %idx, i32 %x) sanitize_memor
 ;
 ; CALLS-LABEL: define <4 x i32> @InsertElement(
 ; CALLS-SAME: <4 x i32> [[VEC:%.*]], i32 [[IDX:%.*]], i32 [[X:%.*]]) #[[ATTR6]] {
-; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4
 ; CALLS-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CALLS-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
-; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CALLS-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4
+; CALLS-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CALLS-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4
 ; CALLS-NEXT: call void @llvm.donothing()
 ; CALLS-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i32 [[IDX]]
 ; CALLS-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0
@@ -2538,7 +2538,7 @@ define <4 x i32> @ShuffleVector(<4 x i32> %vec, <4 x i32> %vec1) sanitize_memory
 ; CHECK-LABEL: define <4 x i32> @ShuffleVector(
 ; CHECK-SAME: <4 x i32> [[VEC:%.*]], <4 x i32> [[VEC1:%.*]]) #[[ATTR6]] {
 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
 ; CHECK-NEXT: [[VEC2:%.*]] = shufflevector <4 x i32> [[VEC]], <4 x i32> [[VEC1]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2549,8 +2549,8 @@ define <4 x i32> @ShuffleVector(<4 x i32> %vec, <4 x i32> %vec1) sanitize_memory
 ; ORIGIN-SAME: <4 x i32> [[VEC:%.*]], <4 x i32> [[VEC1:%.*]]) #[[ATTR6]] {
 ; ORIGIN-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
-; ORIGIN-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; ORIGIN-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; ORIGIN-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; ORIGIN-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4
 ; ORIGIN-NEXT: call void @llvm.donothing()
 ; ORIGIN-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
 ; ORIGIN-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
@@ -2565,8 +2565,8 @@ define <4 x i32> @ShuffleVector(<4 x i32> %vec, <4 x i32> %vec1) sanitize_memory
 ; CALLS-SAME: <4 x i32> [[VEC:%.*]], <4 x i32> [[VEC1:%.*]]) #[[ATTR6]] {
 ; CALLS-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4
-; CALLS-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CALLS-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4
+; CALLS-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CALLS-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4
 ; CALLS-NEXT: call void @llvm.donothing()
 ; CALLS-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
 ; CALLS-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
@@ -2761,17 +2761,13 @@ define void @VAStart(i32 %x, ...) sanitize_memory {
 ; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr
 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP16]], i8 0, i64 24, i1 false)
 ; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]])
-; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[VA]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 16
-; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[VA]], i64 16
 ; CHECK-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8
 ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64
 ; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
 ; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP23]], ptr align 16 [[TMP2]], i64 176, i1 false)
-; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[VA]] to i64
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP24]], 8
-; CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[VA]], i64 8
 ; CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[TMP26]], align 8
 ; CHECK-NEXT: [[TMP28:%.*]] = ptrtoint ptr [[TMP27]] to i64
 ; CHECK-NEXT: [[TMP29:%.*]] = xor i64 [[TMP28]], 87960930222080
@@ -2832,9 +2828,7 @@ define void @VAStart(i32 %x, ...) sanitize_memory {
 ; ORIGIN-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
 ; ORIGIN-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 24, i1 false)
 ; ORIGIN-NEXT: call void @llvm.va_start.p0(ptr [[VA]])
-; ORIGIN-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[VA]] to i64
-; ORIGIN-NEXT: [[TMP32:%.*]] = add i64 [[TMP31]], 16
-; ORIGIN-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; ORIGIN-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[VA]], i64 16
 ; ORIGIN-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 8
 ; ORIGIN-NEXT: [[TMP35:%.*]] = ptrtoint ptr [[TMP34]] to i64
 ; ORIGIN-NEXT: [[TMP36:%.*]] = xor i64 [[TMP35]], 87960930222080
@@ -2843,9 +2837,7 @@ define void @VAStart(i32 %x, ...)
sanitize_memory { ; ORIGIN-NEXT: [[TMP39:%.*]] = inttoptr i64 [[TMP38]] to ptr ; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP37]], ptr align 16 [[TMP2]], i64 176, i1 false) ; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP39]], ptr align 16 [[TMP4]], i64 176, i1 false) -; ORIGIN-NEXT: [[TMP40:%.*]] = ptrtoint ptr [[VA]] to i64 -; ORIGIN-NEXT: [[TMP41:%.*]] = add i64 [[TMP40]], 8 -; ORIGIN-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr +; ORIGIN-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[VA]], i64 8 ; ORIGIN-NEXT: [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8 ; ORIGIN-NEXT: [[TMP44:%.*]] = ptrtoint ptr [[TMP43]] to i64 ; ORIGIN-NEXT: [[TMP45:%.*]] = xor i64 [[TMP44]], 87960930222080 @@ -2905,9 +2897,7 @@ define void @VAStart(i32 %x, ...) sanitize_memory { ; CALLS-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr ; CALLS-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 24, i1 false) ; CALLS-NEXT: call void @llvm.va_start.p0(ptr [[VA]]) -; CALLS-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[VA]] to i64 -; CALLS-NEXT: [[TMP30:%.*]] = add i64 [[TMP29]], 16 -; CALLS-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr +; CALLS-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[VA]], i64 16 ; CALLS-NEXT: [[TMP32:%.*]] = load ptr, ptr [[TMP31]], align 8 ; CALLS-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64 ; CALLS-NEXT: [[TMP34:%.*]] = xor i64 [[TMP33]], 87960930222080 @@ -2916,9 +2906,7 @@ define void @VAStart(i32 %x, ...) sanitize_memory { ; CALLS-NEXT: [[TMP37:%.*]] = inttoptr i64 [[TMP36]] to ptr ; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP35]], ptr align 16 [[TMP2]], i64 176, i1 false) ; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP37]], ptr align 16 [[TMP4]], i64 176, i1 false) -; CALLS-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[VA]] to i64 -; CALLS-NEXT: [[TMP39:%.*]] = add i64 [[TMP38]], 8 -; CALLS-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CALLS-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[VA]], i64 8 ; CALLS-NEXT: [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8 ; CALLS-NEXT: [[TMP42:%.*]] = ptrtoint ptr [[TMP41]] to i64 ; CALLS-NEXT: [[TMP43:%.*]] = xor i64 [[TMP42]], 87960930222080 @@ -2948,7 +2936,7 @@ define void @VolatileStore(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_m ; CHECK-LABEL: define void @VolatileStore( ; CHECK-SAME: ptr captures(none) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 @@ -2960,8 +2948,8 @@ define void @VolatileStore(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_m ; ORIGIN-LABEL: define void @VolatileStore( ; ORIGIN-SAME: ptr captures(none) [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr 
(i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 ; ORIGIN-NEXT: [[TMP3:%.*]] = xor i64 [[TMP2]], 87960930222080 @@ -2983,8 +2971,8 @@ define void @VolatileStore(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_m ; CALLS-NEXT: [[ENTRY:.*:]] ; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]) ; CALLS-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P]] to i64 @@ -3333,7 +3321,7 @@ define <2 x i64> @ArgumentShadowAlignment(i64 %a, <2 x i64> %b) sanitize_memory ; CHECK-LABEL: define <2 x i64> @ArgumentShadowAlignment( ; CHECK-SAME: i64 [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR6]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[B]] @@ -3341,8 +3329,8 @@ define <2 x i64> @ArgumentShadowAlignment(i64 %a, <2 x i64> %b) sanitize_memory ; ORIGIN-LABEL: define <2 x i64> @ArgumentShadowAlignment( ; ORIGIN-SAME: i64 [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR6]] { ; ORIGIN-NEXT: [[ENTRY:.*:]] -; ORIGIN-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: store <2 x i64> [[TMP0]], ptr @__msan_retval_tls, align 8 ; ORIGIN-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 @@ -3351,8 +3339,8 @@ define <2 x i64> @ArgumentShadowAlignment(i64 %a, <2 x i64> %b) sanitize_memory ; CALLS-LABEL: define <2 x i64> @ArgumentShadowAlignment( ; CALLS-SAME: i64 [[A:%.*]], <2 x i64> [[B:%.*]]) #[[ATTR6]] { ; CALLS-NEXT: [[ENTRY:.*:]] -; CALLS-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: 
store <2 x i64> [[TMP0]], ptr @__msan_retval_tls, align 8 ; CALLS-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4 @@ -3371,7 +3359,7 @@ define { i64, i32 } @make_pair_64_32(i64 %x, i32 %y) sanitize_memory { ; CHECK-SAME: i64 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR6]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { i64, i32 } { i64 -1, i32 -1 }, i64 [[TMP0]], 0 ; CHECK-NEXT: [[A:%.*]] = insertvalue { i64, i32 } undef, i64 [[X]], 0 @@ -3385,8 +3373,8 @@ define { i64, i32 } @make_pair_64_32(i64 %x, i32 %y) sanitize_memory { ; ORIGIN-NEXT: [[ENTRY:.*:]] ; ORIGIN-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP4:%.*]] = insertvalue { i64, i32 } { i64 -1, i32 -1 }, i64 [[TMP0]], 0 ; ORIGIN-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP0]], 0 @@ -3405,8 +3393,8 @@ define { i64, i32 } @make_pair_64_32(i64 %x, i32 %y) sanitize_memory { ; CALLS-NEXT: [[ENTRY:.*:]] ; CALLS-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; CALLS-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; CALLS-NEXT: call void @llvm.donothing() ; CALLS-NEXT: [[TMP4:%.*]] = insertvalue { i64, i32 } { i64 -1, i32 -1 }, i64 [[TMP0]], 0 ; CALLS-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP0]], 0 @@ -3458,22 +3446,22 @@ define void @VAArgStruct(ptr nocapture %s) sanitize_memory { ; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = call ptr @__msan_memcpy(ptr [[AGG_TMP2]], ptr [[S]], i64 16) ; CHECK-NEXT: store i32 -1, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP13]], i64 16, i1 false) -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP13]], i64 16, i1 false) +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080 ; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), ptr align 8 [[TMP16]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), ptr align 8 [[TMP16]], i64 16, i1 false) ; CHECK-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (i32, ...) 
@VAArgStructFn(i32 undef, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], ptr byval([[STRUCT_STRUCTBYVAL]]) align 8 [[AGG_TMP2]]) ; CHECK-NEXT: ret void @@ -3515,48 +3503,48 @@ define void @VAArgStruct(ptr nocapture %s) sanitize_memory { ; ORIGIN-NEXT: [[TMP20:%.*]] = call ptr @__msan_memcpy(ptr [[AGG_TMP2]], ptr [[S]], i64 16) ; ORIGIN-NEXT: store i32 -1, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: store i32 0, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; ORIGIN-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; ORIGIN-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; ORIGIN-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; ORIGIN-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; ORIGIN-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; ORIGIN-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr ; ORIGIN-NEXT: [[TMP24:%.*]] = add i64 [[TMP22]], 17592186044416 ; ORIGIN-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP23]], i64 16, i1 false) -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 40) to ptr), ptr align 4 [[TMP25]], i64 16, i1 false) -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP23]], i64 16, i1 false) +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 40), ptr align 4 
[[TMP25]], i64 16, i1 false) +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; ORIGIN-NEXT: [[TMP26:%.*]] = zext i32 [[TMP13]] to i64 ; ORIGIN-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 32 ; ORIGIN-NEXT: [[TMP28:%.*]] = or i64 [[TMP26]], [[TMP27]] -; ORIGIN-NEXT: store i64 [[TMP28]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP28]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 8), align 8 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; ORIGIN-NEXT: [[TMP29:%.*]] = zext i32 [[TMP19]] to i64 ; ORIGIN-NEXT: [[TMP30:%.*]] = shl i64 [[TMP29]], 32 ; ORIGIN-NEXT: [[TMP31:%.*]] = or i64 [[TMP29]], [[TMP30]] -; ORIGIN-NEXT: store i64 [[TMP31]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP31]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 16), align 8 +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 ; ORIGIN-NEXT: [[TMP32:%.*]] = zext i32 [[TMP13]] to i64 ; ORIGIN-NEXT: [[TMP33:%.*]] = shl i64 [[TMP32]], 32 ; ORIGIN-NEXT: [[TMP34:%.*]] = or i64 [[TMP32]], [[TMP33]] -; ORIGIN-NEXT: store i64 [[TMP34]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 24) to ptr), align 8 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP34]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 24), align 8 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 ; ORIGIN-NEXT: [[TMP35:%.*]] = zext i32 [[TMP19]] to i64 ; ORIGIN-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32 ; ORIGIN-NEXT: [[TMP37:%.*]] = or i64 [[TMP35]], [[TMP36]] -; ORIGIN-NEXT: store i64 [[TMP37]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 32) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP37]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 32), align 8 ; ORIGIN-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; ORIGIN-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 ; ORIGIN-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr ; ORIGIN-NEXT: [[TMP41:%.*]] = add i64 [[TMP39]], 17592186044416 ; ORIGIN-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), ptr align 8 [[TMP40]], i64 16, i1 false) -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 176) to ptr), ptr align 8 [[TMP42]], i64 16, i1 false) +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), ptr align 8 [[TMP40]], i64 16, i1 false) +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 176), ptr align 8 [[TMP42]], i64 16, i1 false) ; ORIGIN-NEXT: store i64 16, ptr 
@__msan_va_arg_overflow_size_tls, align 8 ; ORIGIN-NEXT: call void (i32, ...) @VAArgStructFn(i32 undef, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], ptr byval([[STRUCT_STRUCTBYVAL]]) align 8 [[AGG_TMP2]]) ; ORIGIN-NEXT: ret void @@ -3600,48 +3588,48 @@ define void @VAArgStruct(ptr nocapture %s) sanitize_memory { ; CALLS-NEXT: [[TMP20:%.*]] = call ptr @__msan_memcpy(ptr [[AGG_TMP2]], ptr [[S]], i64 16) ; CALLS-NEXT: store i32 -1, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: store i32 0, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CALLS-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CALLS-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CALLS-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; CALLS-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CALLS-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; CALLS-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr ; CALLS-NEXT: [[TMP24:%.*]] = add i64 [[TMP22]], 17592186044416 ; CALLS-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP23]], i64 16, i1 false) -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 40) to ptr), ptr align 4 [[TMP25]], i64 16, i1 false) -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP23]], i64 16, i1 false) +; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 getelementptr (i8, ptr 
@__msan_param_origin_tls, i64 40), ptr align 4 [[TMP25]], i64 16, i1 false) +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CALLS-NEXT: [[TMP26:%.*]] = zext i32 [[TMP13]] to i64 ; CALLS-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 32 ; CALLS-NEXT: [[TMP28:%.*]] = or i64 [[TMP26]], [[TMP27]] -; CALLS-NEXT: store i64 [[TMP28]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP28]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 8), align 8 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CALLS-NEXT: [[TMP29:%.*]] = zext i32 [[TMP19]] to i64 ; CALLS-NEXT: [[TMP30:%.*]] = shl i64 [[TMP29]], 32 ; CALLS-NEXT: [[TMP31:%.*]] = or i64 [[TMP29]], [[TMP30]] -; CALLS-NEXT: store i64 [[TMP31]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP31]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 16), align 8 +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 ; CALLS-NEXT: [[TMP32:%.*]] = zext i32 [[TMP13]] to i64 ; CALLS-NEXT: [[TMP33:%.*]] = shl i64 [[TMP32]], 32 ; CALLS-NEXT: [[TMP34:%.*]] = or i64 [[TMP32]], [[TMP33]] -; CALLS-NEXT: store i64 [[TMP34]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 24) to ptr), align 8 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP34]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 24), align 8 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 ; CALLS-NEXT: [[TMP35:%.*]] = zext i32 [[TMP19]] to i64 ; CALLS-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32 ; CALLS-NEXT: [[TMP37:%.*]] = or i64 [[TMP35]], [[TMP36]] -; CALLS-NEXT: store i64 [[TMP37]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 32) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP37]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 32), align 8 ; CALLS-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CALLS-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 ; CALLS-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr ; CALLS-NEXT: [[TMP41:%.*]] = add i64 [[TMP39]], 17592186044416 ; CALLS-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 176) to ptr), ptr align 8 [[TMP40]], i64 16, i1 false) -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 176) to ptr), ptr align 8 [[TMP42]], i64 16, i1 false) +; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 176), ptr align 8 [[TMP40]], i64 16, i1 false) +; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 176), ptr align 8 [[TMP42]], i64 16, i1 false) ; CALLS-NEXT: store i64 16, ptr 
@__msan_va_arg_overflow_size_tls, align 8 ; CALLS-NEXT: call void (i32, ...) @VAArgStructFn(i32 undef, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], ptr byval([[STRUCT_STRUCTBYVAL]]) align 8 [[AGG_TMP2]]) ; CALLS-NEXT: ret void @@ -3685,22 +3673,22 @@ define void @VAArgStructNoSSE(ptr nocapture %s) sanitize_memory #0 { ; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = call ptr @__msan_memcpy(ptr [[AGG_TMP2]], ptr [[S]], i64 16) ; CHECK-NEXT: store i32 -1, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 ; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP13]], i64 16, i1 false) -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP13]], i64 16, i1 false) +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 +; CHECK-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 +; CHECK-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP14]], 87960930222080 ; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), ptr align 8 [[TMP16]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), ptr align 8 [[TMP16]], i64 16, i1 false) ; CHECK-NEXT: store i64 16, ptr 
@__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void (i32, ...) @VAArgStructFn(i32 undef, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], ptr byval([[STRUCT_STRUCTBYVAL]]) align 8 [[AGG_TMP2]]) ; CHECK-NEXT: ret void @@ -3742,48 +3730,48 @@ define void @VAArgStructNoSSE(ptr nocapture %s) sanitize_memory #0 { ; ORIGIN-NEXT: [[TMP20:%.*]] = call ptr @__msan_memcpy(ptr [[AGG_TMP2]], ptr [[S]], i64 16) ; ORIGIN-NEXT: store i32 -1, ptr @__msan_param_tls, align 8 ; ORIGIN-NEXT: store i32 0, ptr @__msan_param_origin_tls, align 4 -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; ORIGIN-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; ORIGIN-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; ORIGIN-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; ORIGIN-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; ORIGIN-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; ORIGIN-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; ORIGIN-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr ; ORIGIN-NEXT: [[TMP24:%.*]] = add i64 [[TMP22]], 17592186044416 ; ORIGIN-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP23]], i64 16, i1 false) -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 40) to ptr), ptr align 4 [[TMP25]], i64 16, i1 false) -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP23]], i64 16, i1 false) +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr 
align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 40), ptr align 4 [[TMP25]], i64 16, i1 false) +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; ORIGIN-NEXT: [[TMP26:%.*]] = zext i32 [[TMP13]] to i64 ; ORIGIN-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 32 ; ORIGIN-NEXT: [[TMP28:%.*]] = or i64 [[TMP26]], [[TMP27]] -; ORIGIN-NEXT: store i64 [[TMP28]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP28]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 8), align 8 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; ORIGIN-NEXT: [[TMP29:%.*]] = zext i32 [[TMP19]] to i64 ; ORIGIN-NEXT: [[TMP30:%.*]] = shl i64 [[TMP29]], 32 ; ORIGIN-NEXT: [[TMP31:%.*]] = or i64 [[TMP29]], [[TMP30]] -; ORIGIN-NEXT: store i64 [[TMP31]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 16) to ptr), align 8 -; ORIGIN-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP31]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 16), align 8 +; ORIGIN-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 ; ORIGIN-NEXT: [[TMP32:%.*]] = zext i32 [[TMP13]] to i64 ; ORIGIN-NEXT: [[TMP33:%.*]] = shl i64 [[TMP32]], 32 ; ORIGIN-NEXT: [[TMP34:%.*]] = or i64 [[TMP32]], [[TMP33]] -; ORIGIN-NEXT: store i64 [[TMP34]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 24) to ptr), align 8 -; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP34]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 24), align 8 +; ORIGIN-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 ; ORIGIN-NEXT: [[TMP35:%.*]] = zext i32 [[TMP19]] to i64 ; ORIGIN-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32 ; ORIGIN-NEXT: [[TMP37:%.*]] = or i64 [[TMP35]], [[TMP36]] -; ORIGIN-NEXT: store i64 [[TMP37]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 32) to ptr), align 8 +; ORIGIN-NEXT: store i64 [[TMP37]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 32), align 8 ; ORIGIN-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; ORIGIN-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 ; ORIGIN-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr ; ORIGIN-NEXT: [[TMP41:%.*]] = add i64 [[TMP39]], 17592186044416 ; ORIGIN-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), ptr align 8 [[TMP40]], i64 16, i1 false) -; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 48) to ptr), ptr align 8 [[TMP42]], i64 16, i1 false) +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), ptr align 8 [[TMP40]], i64 16, i1 false) +; ORIGIN-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 48), ptr align 8 [[TMP42]], 
i64 16, i1 false) ; ORIGIN-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; ORIGIN-NEXT: call void (i32, ...) @VAArgStructFn(i32 undef, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], ptr byval([[STRUCT_STRUCTBYVAL]]) align 8 [[AGG_TMP2]]) ; ORIGIN-NEXT: ret void @@ -3827,48 +3815,48 @@ define void @VAArgStructNoSSE(ptr nocapture %s) sanitize_memory #0 { ; CALLS-NEXT: [[TMP20:%.*]] = call ptr @__msan_memcpy(ptr [[AGG_TMP2]], ptr [[S]], i64 16) ; CALLS-NEXT: store i32 -1, ptr @__msan_param_tls, align 8 ; CALLS-NEXT: store i32 0, ptr @__msan_param_origin_tls, align 4 -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4 -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP13]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CALLS-NEXT: store i32 [[TMP19]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4 +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CALLS-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CALLS-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4 +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CALLS-NEXT: store i32 [[TMP13]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CALLS-NEXT: store i32 [[TMP19]], ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4 ; CALLS-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CALLS-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 ; CALLS-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr ; CALLS-NEXT: [[TMP24:%.*]] = add i64 [[TMP22]], 17592186044416 ; CALLS-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), ptr align 8 [[TMP23]], i64 16, i1 false) -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 40) to ptr), ptr align 4 [[TMP25]], i64 16, i1 false) -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 8) to ptr), align 8 +; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_param_tls, i64 40), ptr align 8 [[TMP23]], i64 16, i1 false) +; CALLS-NEXT: call void 
@llvm.memcpy.p0.p0.i64(ptr align 4 getelementptr (i8, ptr @__msan_param_origin_tls, i64 40), ptr align 4 [[TMP25]], i64 16, i1 false) +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 8), align 8 ; CALLS-NEXT: [[TMP26:%.*]] = zext i32 [[TMP13]] to i64 ; CALLS-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 32 ; CALLS-NEXT: [[TMP28:%.*]] = or i64 [[TMP26]], [[TMP27]] -; CALLS-NEXT: store i64 [[TMP28]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 8) to ptr), align 8 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 16) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP28]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 8), align 8 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 16), align 8 ; CALLS-NEXT: [[TMP29:%.*]] = zext i32 [[TMP19]] to i64 ; CALLS-NEXT: [[TMP30:%.*]] = shl i64 [[TMP29]], 32 ; CALLS-NEXT: [[TMP31:%.*]] = or i64 [[TMP29]], [[TMP30]] -; CALLS-NEXT: store i64 [[TMP31]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 16) to ptr), align 8 -; CALLS-NEXT: store i64 [[_MSLD]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 24) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP31]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 16), align 8 +; CALLS-NEXT: store i64 [[_MSLD]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 24), align 8 ; CALLS-NEXT: [[TMP32:%.*]] = zext i32 [[TMP13]] to i64 ; CALLS-NEXT: [[TMP33:%.*]] = shl i64 [[TMP32]], 32 ; CALLS-NEXT: [[TMP34:%.*]] = or i64 [[TMP32]], [[TMP33]] -; CALLS-NEXT: store i64 [[TMP34]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 24) to ptr), align 8 -; CALLS-NEXT: store i64 [[_MSLD2]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 32) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP34]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 24), align 8 +; CALLS-NEXT: store i64 [[_MSLD2]], ptr getelementptr (i8, ptr @__msan_va_arg_tls, i64 32), align 8 ; CALLS-NEXT: [[TMP35:%.*]] = zext i32 [[TMP19]] to i64 ; CALLS-NEXT: [[TMP36:%.*]] = shl i64 [[TMP35]], 32 ; CALLS-NEXT: [[TMP37:%.*]] = or i64 [[TMP35]], [[TMP36]] -; CALLS-NEXT: store i64 [[TMP37]], ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 32) to ptr), align 8 +; CALLS-NEXT: store i64 [[TMP37]], ptr getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 32), align 8 ; CALLS-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[AGG_TMP2]] to i64 ; CALLS-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 ; CALLS-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr ; CALLS-NEXT: [[TMP41:%.*]] = add i64 [[TMP39]], 17592186044416 ; CALLS-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_tls to i64), i64 48) to ptr), ptr align 8 [[TMP40]], i64 16, i1 false) -; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_va_arg_origin_tls to i64), i64 48) to ptr), ptr align 8 [[TMP42]], i64 16, i1 false) +; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_tls, i64 48), ptr align 8 [[TMP40]], i64 16, i1 false) +; CALLS-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 getelementptr (i8, ptr @__msan_va_arg_origin_tls, i64 48), ptr align 8 [[TMP42]], i64 16, i1 
false) ; CALLS-NEXT: store i64 16, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CALLS-NEXT: call void (i32, ...) @VAArgStructFn(i32 undef, i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], i64 [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]], ptr byval([[STRUCT_STRUCTBYVAL]]) align 8 [[AGG_TMP2]]) ; CALLS-NEXT: ret void diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll index 04fdd23aa5a88..846912ebef54a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_debug_info.ll @@ -22,20 +22,20 @@ target triple = "x86_64-unknown-linux-gnu" define void @Store(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @Store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG1:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 17592186044416, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG1]] -; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP6]], align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP2]], ptr [[P]], i32 zeroext [[TMP3]]), !dbg [[DBG1]] -; CHECK-NEXT: store i32 [[X:%.*]], ptr [[P]], align 4, !dbg [[DBG1]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG2:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[P:%.*]] to i64, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], 17592186044416, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG2]] +; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP6]], align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP2]], ptr [[P]], i32 zeroext [[TMP3]]), !dbg [[DBG2]] +; CHECK-NEXT: store i32 [[X:%.*]], ptr [[P]], align 4, !dbg [[DBG2]] ; CHECK-NEXT: ret void ; entry: @@ -46,29 +46,29 @@ entry: define void @LoadAndCmp(ptr nocapture %a) nounwind uwtable sanitize_memory { ; CHECK-LABEL: 
@LoadAndCmp( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A:%.*]], align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr, !dbg [[DBG1]] -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP2]], 0, !dbg [[DBG7:![0-9]+]] -; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[_MSLD]], 0, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], -1, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], [[TMP9]], !dbg [[DBG7]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0, !dbg [[DBG7]] -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]], !dbg [[DBG7]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP2]], 0, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP15:%.*]] = zext i1 [[_MSPROP_ICMP]] to i8, !dbg [[DBG8:![0-9]+]] -; CHECK-NEXT: call void @__msan_maybe_warning_1(i8 zeroext [[TMP15]], i32 zeroext [[TMP8]]), !dbg [[DBG8]] -; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG8]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A:%.*]], align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr, !dbg [[DBG2]] +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP5]], align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP2]], 0, !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[_MSLD]], 0, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], -1, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP13:%.*]] = and i32 [[TMP12]], [[TMP9]], !dbg [[DBG8]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0, !dbg [[DBG8]] +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]], !dbg [[DBG8]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP2]], 0, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP15:%.*]] = zext i1 [[_MSPROP_ICMP]] to i8, !dbg [[DBG9:![0-9]+]] +; CHECK-NEXT: call void @__msan_maybe_warning_1(i8 zeroext [[TMP15]], i32 zeroext [[TMP8]]), 
!dbg [[DBG9]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG9]] ; CHECK: if.then: ; CHECK-NEXT: store i64 0, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: tail call void (...) @foo() #[[ATTR5:[0-9]+]] @@ -92,10 +92,10 @@ declare void @foo(...) define i32 @ReturnInt() nounwind uwtable readnone sanitize_memory { ; CHECK-LABEL: @ReturnInt( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4, !dbg [[DBG1]] -; CHECK-NEXT: ret i32 123, !dbg [[DBG1]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_origin_tls, align 4, !dbg [[DBG2]] +; CHECK-NEXT: ret i32 123, !dbg [[DBG2]] ; entry: ret i32 123, !dbg !10 @@ -104,22 +104,22 @@ entry: define void @CopyRetVal(ptr nocapture %a) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @CopyRetVal( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @ReturnInt() #[[ATTR5]], !dbg [[DBG1]] -; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_retval_origin_tls, align 4, !dbg [[DBG7]] -; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG7]] -; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A:%.*]] to i64, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr, !dbg [[DBG7]] -; CHECK-NEXT: store i32 [[_MSRET]], ptr [[TMP5]], align 4, !dbg [[DBG7]] -; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSRET]], ptr [[A]], i32 zeroext [[TMP2]]), !dbg [[DBG7]] -; CHECK-NEXT: store i32 [[CALL]], ptr [[A]], align 4, !dbg [[DBG7]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @ReturnInt() #[[ATTR5]], !dbg [[DBG2]] +; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_retval_origin_tls, align 4, !dbg [[DBG8]] +; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG8]] +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A:%.*]] to i64, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], 17592186044416, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr, !dbg [[DBG8]] +; CHECK-NEXT: store i32 [[_MSRET]], 
ptr [[TMP5]], align 4, !dbg [[DBG8]] +; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSRET]], ptr [[A]], i32 zeroext [[TMP2]]), !dbg [[DBG8]] +; CHECK-NEXT: store i32 [[CALL]], ptr [[A]], align 4, !dbg [[DBG8]] ; CHECK-NEXT: ret void ; entry: @@ -133,32 +133,32 @@ entry: define void @SExt(ptr nocapture %a, ptr nocapture %b) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @SExt( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[B:%.*]], align 2, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[B]] to i64, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], 17592186044416, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], -4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr, !dbg [[DBG1]] -; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP7]], align 2, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[_MSPROP:%.*]] = sext i16 [[_MSLD]] to i32, !dbg [[DBG7]] -; CHECK-NEXT: [[TMP12:%.*]] = sext i16 [[TMP4]] to i32, !dbg [[DBG7]] -; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP2]], i32 zeroext [[TMP3]]), !dbg [[DBG8]] -; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[A:%.*]] to i64, !dbg [[DBG8]] -; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG8]] -; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr, !dbg [[DBG8]] -; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG8]] -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr, !dbg [[DBG8]] -; CHECK-NEXT: store i32 [[_MSPROP]], ptr [[TMP15]], align 4, !dbg [[DBG8]] -; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSPROP]], ptr [[A]], i32 zeroext [[TMP11]]), !dbg [[DBG8]] -; CHECK-NEXT: store i32 [[TMP12]], ptr [[A]], align 4, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[B:%.*]], align 2, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[B]] to i64, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr, !dbg [[DBG2]] +; 
CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], 17592186044416, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], -4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr, !dbg [[DBG2]] +; CHECK-NEXT: [[_MSLD:%.*]] = load i16, ptr [[TMP7]], align 2, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[_MSPROP:%.*]] = sext i16 [[_MSLD]] to i32, !dbg [[DBG8]] +; CHECK-NEXT: [[TMP12:%.*]] = sext i16 [[TMP4]] to i32, !dbg [[DBG8]] +; CHECK-NEXT: call void @__msan_maybe_warning_8(i64 zeroext [[TMP2]], i32 zeroext [[TMP3]]), !dbg [[DBG9]] +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[A:%.*]] to i64, !dbg [[DBG9]] +; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG9]] +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr, !dbg [[DBG9]] +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG9]] +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr, !dbg [[DBG9]] +; CHECK-NEXT: store i32 [[_MSPROP]], ptr [[TMP15]], align 4, !dbg [[DBG9]] +; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[_MSPROP]], ptr [[A]], i32 zeroext [[TMP11]]), !dbg [[DBG9]] +; CHECK-NEXT: store i32 [[TMP12]], ptr [[A]], align 4, !dbg [[DBG9]] ; CHECK-NEXT: ret void ; entry: @@ -171,8 +171,8 @@ entry: define void @MemSet(ptr nocapture %x) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @MemSet( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_memset(ptr [[X:%.*]], i32 42, i64 10), !dbg [[DBG1]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_memset(ptr [[X:%.*]], i32 42, i64 10), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; entry: @@ -187,10 +187,10 @@ declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind define void @MemCpy(ptr nocapture %x, ptr nocapture %y) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @MemCpy( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X:%.*]], ptr [[Y:%.*]], i64 10), !dbg [[DBG1]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X:%.*]], ptr [[Y:%.*]], i64 10), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; entry: @@ -204,8 +204,8 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounw define void @MemSetInline(ptr nocapture %x) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @MemSetInline( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_memset(ptr [[X:%.*]], i32 42, i64 10), !dbg [[DBG1]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP0:%.*]] = call ptr @__msan_memset(ptr [[X:%.*]], i32 42, i64 10), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; entry: @@ -219,10 +219,10 @@ declare 
void @llvm.memset.inline.p0.i64(ptr nocapture, i8, i64, i1) nounwind define void @MemCpyInline(ptr nocapture %x, ptr nocapture %y) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @MemCpyInline( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X:%.*]], ptr [[Y:%.*]], i64 10), !dbg [[DBG1]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @__msan_memcpy(ptr [[X:%.*]], ptr [[Y:%.*]], i64 10), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; entry: @@ -236,10 +236,10 @@ declare void @llvm.memcpy.inline.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1 define void @MemMove(ptr nocapture %x, ptr nocapture %y) nounwind uwtable sanitize_memory { ; CHECK-LABEL: @MemMove( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP2:%.*]] = call ptr @__msan_memmove(ptr [[X:%.*]], ptr [[Y:%.*]], i64 10), !dbg [[DBG1]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP2:%.*]] = call ptr @__msan_memmove(ptr [[X:%.*]], ptr [[Y:%.*]], i64 10), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; entry: @@ -256,8 +256,8 @@ declare void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr nocapture write define void @atomic_memcpy(ptr nocapture %x, ptr nocapture %y) nounwind { ; CHECK-LABEL: @atomic_memcpy( -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 1 [[X:%.*]], ptr align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG1]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 1 [[X:%.*]], ptr align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 1 %x, ptr align 2 %y, i64 16, i32 1), !dbg !10 @@ -266,8 +266,8 @@ define void @atomic_memcpy(ptr nocapture %x, ptr nocapture %y) nounwind { define void @atomic_memmove(ptr nocapture %x, ptr nocapture %y) nounwind { ; CHECK-LABEL: @atomic_memmove( -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr align 1 [[X:%.*]], ptr align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG1]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: call void 
@llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr align 1 [[X:%.*]], ptr align 2 [[Y:%.*]], i64 16, i32 1), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; call void @llvm.memmove.element.unordered.atomic.p0.p0.i64(ptr align 1 %x, ptr align 2 %y, i64 16, i32 1), !dbg !10 @@ -276,8 +276,8 @@ define void @atomic_memmove(ptr nocapture %x, ptr nocapture %y) nounwind { define void @atomic_memset(ptr nocapture %x) nounwind { ; CHECK-LABEL: @atomic_memset( -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 1 [[X:%.*]], i8 88, i64 16, i32 1), !dbg [[DBG1]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 1 [[X:%.*]], i8 88, i64 16, i32 1), !dbg [[DBG2]] ; CHECK-NEXT: ret void ; call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 1 %x, i8 88, i64 16, i32 1), !dbg !10 @@ -290,21 +290,21 @@ define void @atomic_memset(ptr nocapture %x) nounwind { define i32 @Select(i32 %a, i32 %b, i1 %c) nounwind uwtable readnone sanitize_memory { ; CHECK-LABEL: @Select( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8, !dbg [[DBG1]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4, !dbg [[DBG1]] -; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]] -; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[C:%.*]], i32 [[TMP2]], i32 [[TMP4]], !dbg [[DBG1]] -; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[A:%.*]], [[B:%.*]], !dbg [[DBG1]] -; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP2]], !dbg [[DBG1]] -; CHECK-NEXT: [[TMP9:%.*]] = or i32 [[TMP8]], [[TMP4]], !dbg [[DBG1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], i32 [[TMP9]], i32 [[TMP6]], !dbg [[DBG1]] -; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[C]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG1]] -; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP0]], i32 [[TMP1]], i32 [[TMP10]], !dbg [[DBG1]] -; CHECK-NEXT: [[COND:%.*]] = select i1 [[C]], i32 [[A]], i32 [[B]], !dbg [[DBG1]] +; CHECK-NEXT: [[TMP0:%.*]] = load i1, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8, !dbg [[DBG2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4, !dbg [[DBG2]] +; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[C:%.*]], i32 [[TMP2]], i32 [[TMP4]], !dbg [[DBG2]] +; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[A:%.*]], [[B:%.*]], 
+; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP2]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or i32 [[TMP8]], [[TMP4]], !dbg [[DBG2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[TMP0]], i32 [[TMP9]], i32 [[TMP6]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[C]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[TMP0]], i32 [[TMP1]], i32 [[TMP10]], !dbg [[DBG2]]
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[C]], i32 [[A]], i32 [[B]], !dbg [[DBG2]]
 ; CHECK-NEXT: store i32 [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP11]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret i32 [[COND]]
@@ -320,25 +320,25 @@ entry:
 define <8 x i16> @SelectVector(<8 x i16> %a, <8 x i16> %b, <8 x i1> %c) nounwind uwtable readnone sanitize_memory {
 ; CHECK-LABEL: @SelectVector(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 32) to ptr), align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[C:%.*]], <8 x i16> [[TMP2]], <8 x i16> [[TMP4]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i16> [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i16> [[TMP7]], [[TMP2]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP4]], !dbg [[DBG1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[TMP9]], <8 x i16> [[TMP6]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[C]] to i8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i1> [[TMP0]] to i8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i8 [[TMP12]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP11]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i32 [[TMP1]], i32 [[TMP14]], !dbg [[DBG1]]
-; CHECK-NEXT: [[COND:%.*]] = select <8 x i1> [[C]], <8 x i16> [[A]], <8 x i16> [[B]], !dbg [[DBG1]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 32), align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[C:%.*]], <8 x i16> [[TMP2]], <8 x i16> [[TMP4]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i16> [[A:%.*]], [[B:%.*]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i16> [[TMP7]], [[TMP2]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP4]], !dbg [[DBG2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[TMP9]], <8 x i16> [[TMP6]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[C]] to i8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i1> [[TMP0]] to i8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i8 [[TMP12]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP11]], i32 [[TMP3]], i32 [[TMP5]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i32 [[TMP1]], i32 [[TMP14]], !dbg [[DBG2]]
+; CHECK-NEXT: [[COND:%.*]] = select <8 x i1> [[C]], <8 x i16> [[A]], <8 x i16> [[B]], !dbg [[DBG2]]
 ; CHECK-NEXT: store <8 x i16> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP15]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret <8 x i16> [[COND]]
@@ -354,10 +354,10 @@ entry:
 define ptr @IntToPtr(i64 %x) nounwind uwtable readnone sanitize_memory {
 ; CHECK-LABEL: @IntToPtr(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[X:%.*]] to ptr, !dbg [[DBG1]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[X:%.*]] to ptr, !dbg [[DBG2]]
 ; CHECK-NEXT: store i64 [[TMP0]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP1]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret ptr [[TMP2]]
@@ -374,13 +374,13 @@ entry:
 define i32 @Div(i32 %a, i32 %b) nounwind uwtable readnone sanitize_memory {
 ; CHECK-LABEL: @Div(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: call void @__msan_maybe_warning_4(i32 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG1]]
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[A:%.*]], [[B:%.*]], !dbg [[DBG1]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: call void @__msan_maybe_warning_4(i32 zeroext [[TMP0]], i32 zeroext [[TMP1]]), !dbg [[DBG2]]
+; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[A:%.*]], [[B:%.*]], !dbg [[DBG2]]
 ; CHECK-NEXT: store i32 [[TMP2]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret i32 [[DIV]]
@@ -398,24 +398,24 @@ entry:
 define i32 @ShadowLoadAlignmentLarge() nounwind uwtable sanitize_memory {
 ; CHECK-LABEL: @ShadowLoadAlignmentLarge(
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[Y:%.*]] = alloca i32, align 64, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[Y]] to i64, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 64 [[TMP3]], i8 -1, i64 4, i1 false), !dbg [[DBG1]]
-; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[Y]], i64 4, ptr @[[GLOB0:[0-9]+]], ptr @[[GLOB1:[0-9]+]]), !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP8:%.*]] = load volatile i32, ptr [[Y]], align 64, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[Y]] to i64, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP9]], 87960930222080, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP10]], 17592186044416, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr, !dbg [[DBG7]]
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP11]], align 64, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 64, !dbg [[DBG7]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[Y:%.*]] = alloca i32, align 64, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[Y]] to i64, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP2]], 17592186044416, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 64 [[TMP3]], i8 -1, i64 4, i1 false), !dbg [[DBG2]]
+; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[Y]], i64 4, ptr @[[GLOB0:[0-9]+]], ptr @[[GLOB1:[0-9]+]]), !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP8:%.*]] = load volatile i32, ptr [[Y]], align 64, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[Y]] to i64, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP9]], 17592186044416, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG8]]
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP10]], align 64, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP12]], align 64, !dbg [[DBG8]]
 ; CHECK-NEXT: store i32 [[_MSLD]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP14]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret i32 [[TMP8]]
@@ -429,14 +429,14 @@ define i32 @ShadowLoadAlignmentLarge() nounwind uwtable sanitize_memory {
 define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory {
 ; CHECK-LABEL: @ExtractElement(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 [[IDX:%.*]], !dbg [[DBG1]]
-; CHECK-NEXT: call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG1]]
-; CHECK-NEXT: [[X:%.*]] = extractelement <4 x i32> [[VEC:%.*]], i32 [[IDX]], !dbg [[DBG1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP3]], i32 [[IDX:%.*]], !dbg [[DBG2]]
+; CHECK-NEXT: call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG2]]
+; CHECK-NEXT: [[X:%.*]] = extractelement <4 x i32> [[VEC:%.*]], i32 [[IDX]], !dbg [[DBG2]]
 ; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP4]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret i32 [[X]]
@@ -448,20 +448,20 @@ define i32 @ExtractElement(<4 x i32> %vec, i32 %idx) sanitize_memory {
 define <4 x i32> @InsertElement(<4 x i32> %vec, i32 %idx, i32 %x) sanitize_memory {
 ; CHECK-LABEL: @InsertElement(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 24) to ptr), align 4, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i32 [[IDX:%.*]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP4]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP1]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP2]], i32 [[TMP8]], !dbg [[DBG1]]
-; CHECK-NEXT: call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG1]]
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC:%.*]], i32 [[X:%.*]], i32 [[IDX]], !dbg [[DBG1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 24), align 4, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP5]], i32 [[IDX:%.*]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP5]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP4]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP1]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP2]], i32 [[TMP8]], !dbg [[DBG2]]
+; CHECK-NEXT: call void @__msan_maybe_warning_4(i32 zeroext [[TMP1]], i32 zeroext [[TMP2]]), !dbg [[DBG2]]
+; CHECK-NEXT: [[VEC1:%.*]] = insertelement <4 x i32> [[VEC:%.*]], i32 [[X:%.*]], i32 [[IDX]], !dbg [[DBG2]]
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP10]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret <4 x i32> [[VEC1]]
@@ -473,16 +473,16 @@ define <4 x i32> @InsertElement(<4 x i32> %vec, i32 %idx, i32 %x) sanitize_memor
 define <4 x i32> @ShuffleVector(<4 x i32> %vec, <4 x i32> %vec1) sanitize_memory {
 ; CHECK-LABEL: @ShuffleVector(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 16) to ptr), align 4, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> , !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i128 [[TMP5]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 [[TMP4]], i32 [[TMP2]], !dbg [[DBG1]]
-; CHECK-NEXT: [[VEC2:%.*]] = shufflevector <4 x i32> [[VEC:%.*]], <4 x i32> [[VEC1:%.*]], <4 x i32> , !dbg [[DBG1]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 16), align 4, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> , !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i128 [[TMP5]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 [[TMP4]], i32 [[TMP2]], !dbg [[DBG2]]
+; CHECK-NEXT: [[VEC2:%.*]] = shufflevector <4 x i32> [[VEC:%.*]], <4 x i32> [[VEC1:%.*]], <4 x i32> , !dbg [[DBG2]]
 ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: store i32 [[TMP7]], ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret <4 x i32> [[VEC2]]
@@ -499,74 +499,70 @@ declare void @llvm.va_start(ptr) nounwind
 define void @VAStart(i32 %x, ...) sanitize_memory {
 ; CHECK-LABEL: @VAStart(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 176, [[TMP0]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 8, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 0, i64 [[TMP1]], i1 false), !dbg [[DBG1]]
-; CHECK-NEXT: [[SRCSZ:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800), !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 8 @__msan_va_arg_tls, i64 [[SRCSZ]], i1 false), !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = alloca i8, i64 [[TMP1]], align 8, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 @__msan_va_arg_origin_tls, i64 [[SRCSZ]], i1 false), !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X_ADDR]] to i64, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP7]], 17592186044416, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP9]], -4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP8]], i8 -1, i64 4, i1 false), !dbg [[DBG1]]
-; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[X_ADDR]], i64 4, ptr @[[GLOB2:[0-9]+]], ptr @[[GLOB3:[0-9]+]]), !dbg [[DBG1]]
-; CHECK-NEXT: [[VA:%.*]] = alloca [1 x %struct.__va_list_tag], align 16, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], -4, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr, !dbg [[DBG7]]
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 24, i1 false), !dbg [[DBG7]]
-; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[VA]], i64 24, ptr @[[GLOB4:[0-9]+]], ptr @[[GLOB5:[0-9]+]]), !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[X_ADDR]] to i64, !dbg [[DBG8]]
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080, !dbg [[DBG8]]
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr, !dbg [[DBG8]]
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP21]], 17592186044416, !dbg [[DBG8]]
-; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr, !dbg [[DBG8]]
-; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP22]], align 4, !dbg [[DBG8]]
-; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP4]], ptr [[X_ADDR]], i32 zeroext [[TMP5]]), !dbg [[DBG8]]
-; CHECK-NEXT: store i32 [[X:%.*]], ptr [[X_ADDR]], align 4, !dbg [[DBG8]]
-; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG11:![0-9]+]]
-; CHECK-NEXT: [[TMP27:%.*]] = xor i64 [[TMP26]], 87960930222080, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[TMP27]], 17592186044416, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP28]], i8 0, i64 24, i1 false), !dbg [[DBG11]]
-; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]), !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP31]], 16, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 8, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr [[TMP34]] to i64, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP36:%.*]] = xor i64 [[TMP35]], 87960930222080, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP37:%.*]] = inttoptr i64 [[TMP36]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[TMP36]], 17592186044416, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP39:%.*]] = inttoptr i64 [[TMP38]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP37]], ptr align 16 [[TMP2]], i64 176, i1 false), !dbg [[DBG11]]
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP39]], ptr align 16 [[TMP3]], i64 176, i1 false), !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP41:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP42:%.*]] = add i64 [[TMP41]], 8, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP43:%.*]] = inttoptr i64 [[TMP42]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP44:%.*]] = load ptr, ptr [[TMP43]], align 8, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP45:%.*]] = ptrtoint ptr [[TMP44]] to i64, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP46:%.*]] = xor i64 [[TMP45]], 87960930222080, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP47:%.*]] = inttoptr i64 [[TMP46]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[TMP46]], 17592186044416, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP49:%.*]] = inttoptr i64 [[TMP48]] to ptr, !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[TMP2]], i32 176, !dbg [[DBG11]]
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP47]], ptr align 16 [[TMP50]], i64 [[TMP0]], i1 false), !dbg [[DBG11]]
-; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr [[TMP3]], i32 176, !dbg [[DBG11]]
-; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP49]], ptr align 16 [[TMP51]], i64 [[TMP0]], i1 false), !dbg [[DBG11]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 176, [[TMP0]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i8, i64 [[TMP1]], align 8, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP2]], i8 0, i64 [[TMP1]], i1 false), !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 800), !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP3]], i1 false), !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[TMP1]], align 8, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP4]], ptr align 8 @__msan_va_arg_origin_tls, i64 [[TMP3]], i1 false), !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr @__msan_param_origin_tls, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X_ADDR]] to i64, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP8]], 17592186044416, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], -4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP9]], i8 -1, i64 4, i1 false), !dbg [[DBG2]]
+; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[X_ADDR]], i64 4, ptr @[[GLOB2:[0-9]+]], ptr @[[GLOB3:[0-9]+]]), !dbg [[DBG2]]
+; CHECK-NEXT: [[VA:%.*]] = alloca [1 x %struct.__va_list_tag], align 16, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 87960930222080, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP14]], 17592186044416, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], -4, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr, !dbg [[DBG8]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[TMP15]], i8 -1, i64 24, i1 false), !dbg [[DBG8]]
+; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[VA]], i64 24, ptr @[[GLOB4:[0-9]+]], ptr @[[GLOB5:[0-9]+]]), !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[X_ADDR]] to i64, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP20:%.*]] = xor i64 [[TMP19]], 87960930222080, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP21:%.*]] = inttoptr i64 [[TMP20]] to ptr, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[TMP20]], 17592186044416, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr, !dbg [[DBG9]]
+; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP21]], align 4, !dbg [[DBG9]]
+; CHECK-NEXT: call void @__msan_maybe_store_origin_4(i32 zeroext [[TMP5]], ptr [[X_ADDR]], i32 zeroext [[TMP6]]), !dbg [[DBG9]]
+; CHECK-NEXT: store i32 [[X:%.*]], ptr [[X_ADDR]], align 4, !dbg [[DBG9]]
+; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[VA]] to i64, !dbg [[DBG10:![0-9]+]]
+; CHECK-NEXT: [[TMP25:%.*]] = xor i64 [[TMP24]], 87960930222080, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP26:%.*]] = inttoptr i64 [[TMP25]] to ptr, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[TMP25]], 17592186044416, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr, !dbg [[DBG10]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP26]], i8 0, i64 24, i1 false), !dbg [[DBG10]]
+; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VA]]), !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[VA]], i64 16, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP30:%.*]] = load ptr, ptr [[TMP29]], align 8, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[TMP30]] to i64, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[TMP32]], 17592186044416, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[TMP34]] to ptr, !dbg [[DBG10]]
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP33]], ptr align 16 [[TMP2]], i64 176, i1 false), !dbg [[DBG10]]
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP35]], ptr align 16 [[TMP4]], i64 176, i1 false), !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[VA]], i64 8, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[TMP36]], align 8, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP41:%.*]] = add i64 [[TMP39]], 17592186044416, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP42:%.*]] = inttoptr i64 [[TMP41]] to ptr, !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP2]], i32 176, !dbg [[DBG10]]
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP40]], ptr align 16 [[TMP43]], i64 [[TMP0]], i1 false), !dbg [[DBG10]]
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[TMP4]], i32 176, !dbg [[DBG10]]
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP42]], ptr align 16 [[TMP44]], i64 [[TMP0]], i1 false), !dbg [[DBG10]]
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -582,15 +578,15 @@ entry:
 define i32 @NoSanitizeMemory(i32 %x) uwtable {
 ; CHECK-LABEL: @NoSanitizeMemory(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[X:%.*]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP1:%.*]] = and i32 -1, [[TMP0]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 false, [[TMP2]], !dbg [[DBG1]]
-; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X]], 0, !dbg [[DBG1]]
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG7]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP0:%.*]] = xor i32 [[X:%.*]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 -1, [[TMP0]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 false, [[TMP2]], !dbg [[DBG2]]
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X]], 0, !dbg [[DBG2]]
+; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]], !dbg [[DBG8]]
 ; CHECK: if.then:
-; CHECK-NEXT: tail call void @bar(), !dbg [[DBG8]]
+; CHECK-NEXT: tail call void @bar(), !dbg [[DBG9]]
 ; CHECK-NEXT: br label [[IF_END]]
 ; CHECK: if.end:
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
@@ -615,18 +611,18 @@ declare void @bar()
 define i32 @NoSanitizeMemoryAlloca() {
 ; CHECK-LABEL: @NoSanitizeMemoryAlloca(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[P]] to i64, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG1]]
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP2]], i8 0, i64 4, i1 false), !dbg [[DBG1]]
-; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8, !dbg [[DBG7]]
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG7]]
-; CHECK-NEXT: [[X:%.*]] = call i32 @NoSanitizeMemoryAllocaHelper(ptr [[P]]), !dbg [[DBG7]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[P:%.*]] = alloca i32, align 4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[P]] to i64, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG2]]
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP2]], i8 0, i64 4, i1 false), !dbg [[DBG2]]
+; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8, !dbg [[DBG8]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG8]]
+; CHECK-NEXT: [[X:%.*]] = call i32 @NoSanitizeMemoryAllocaHelper(ptr [[P]]), !dbg [[DBG8]]
 ; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
@@ -647,10 +643,10 @@ declare i32 @NoSanitizeMemoryAllocaHelper(ptr %p)
 define i32 @NoSanitizeMemoryUndef() {
 ; CHECK-LABEL: @NoSanitizeMemoryUndef(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG1]]
-; CHECK-NEXT: [[X:%.*]] = call i32 @NoSanitizeMemoryUndefHelper(i32 undef), !dbg [[DBG1]]
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: store i32 0, ptr @__msan_param_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8, !dbg [[DBG2]]
+; CHECK-NEXT: [[X:%.*]] = call i32 @NoSanitizeMemoryUndefHelper(i32 undef), !dbg [[DBG2]]
 ; CHECK-NEXT: [[_MSRET:%.*]] = load i32, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_retval_origin_tls, align 4
 ; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
@@ -672,21 +668,21 @@ declare void @foo8(ptr nocapture)
 define void @msan() sanitize_memory {
 ; CHECK-LABEL: @msan(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG1]]
-; CHECK-NEXT: [[TEXT:%.*]] = alloca i8, align 1, !dbg [[DBG1]]
-; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[TEXT]]), !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[TEXT]] to i64, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG7]]
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG7]]
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP2]], i8 -1, i64 1, i1 false), !dbg [[DBG7]]
-; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[TEXT]], i64 1, ptr @[[GLOB6:[0-9]+]], ptr @[[GLOB7:[0-9]+]]), !dbg [[DBG7]]
-; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8, !dbg [[DBG8]]
-; CHECK-NEXT: call void @foo8(ptr [[TEXT]]), !dbg [[DBG8]]
-; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[TEXT]]), !dbg
-; CHECK-NEXT: ret void, !dbg
+; CHECK-NEXT: call void @llvm.donothing(), !dbg [[DBG2]]
+; CHECK-NEXT: [[TEXT:%.*]] = alloca i8, align 1, !dbg [[DBG2]]
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[TEXT]]), !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[TEXT]] to i64, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP1:%.*]] = xor i64 [[TMP0]], 87960930222080, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP1]], 17592186044416, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP3]], -4, !dbg [[DBG8]]
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr, !dbg [[DBG8]]
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP2]], i8 -1, i64 1, i1 false), !dbg [[DBG8]]
+; CHECK-NEXT: call void @__msan_set_alloca_origin_with_descr(ptr [[TEXT]], i64 1, ptr @[[GLOB6:[0-9]+]], ptr @[[GLOB7:[0-9]+]]), !dbg [[DBG8]]
+; CHECK-NEXT: store i64 0, ptr @__msan_param_tls, align 8, !dbg [[DBG9]]
+; CHECK-NEXT: call void @foo8(ptr [[TEXT]]), !dbg [[DBG9]]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[TEXT]]), !dbg [[DBG11:![0-9]+]]
+; CHECK-NEXT: ret void, !dbg [[DBG12:![0-9]+]]
 ;
 entry:
 %text = alloca i8, align 1, !dbg !10
diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll
index 946c95b072ea9..13a50c28aa286 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll
@@ -36,7 +36,7 @@ define noundef i32 @LoadedRet() nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSLD]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1:![0-9]+]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT: unreachable
@@ -69,8 +69,8 @@ define void @NormalArg(i32 noundef %a) nounwind uwtable sanitize_memory {
 define void @NormalArgAfterNoUndef(i32 noundef %a, i32 %b) nounwind uwtable sanitize_memory {
 ; CHECK-LABEL: @NormalArgAfterNoUndef(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[P:%.*]] = inttoptr i64 0 to ptr
 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[P]] to i64
@@ -80,7 +80,7 @@ define void @NormalArgAfterNoUndef(i32 noundef %a, i32 %b) nounwind uwtable sani
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP7]], align 4
 ; CHECK-NEXT: br label [[TMP9]]
@@ -106,7 +106,7 @@ define void @PartialArg(i32 %a) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
 ; CHECK-NEXT: store i32 [[TMP1]], ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
 ; CHECK: 8:
 ; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP7]], align 4
 ; CHECK-NEXT: br label [[TMP9]]
@@ -135,7 +135,7 @@ define void @CallNormalArgAfterNoUndef() nounwind uwtable sanitize_memory {
 ; CHECK-LABEL: @CallNormalArgAfterNoUndef(
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[R:%.*]] = call i32 @NormalRet() #[[ATTR0]]
-; CHECK-NEXT: store i32 0, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: store i32 0, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT: call void @NormalArgAfterNoUndef(i32 [[R]], i32 [[R]]) #[[ATTR0]]
 ; CHECK-NEXT: ret void
 ;
@@ -157,7 +157,7 @@ define void @CallWithLoaded() nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP3]], align 4
 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSLD]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF0]]
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
 ; CHECK: 7:
 ; CHECK-NEXT: call void @__msan_warning_with_origin_noreturn(i32 [[TMP6]]) #[[ATTR3]]
 ; CHECK-NEXT: unreachable
diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll
index a7209de32380a..5d63367919d1a 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll
@@ -38,38 +38,32 @@ define void @Store1(ptr nocapture %p, i8 %x) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: [[BB11]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: [[BB5]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT: br label %[[BB12]]
-; CHECK: [[BB12]]:
+; CHECK-NEXT: br label %[[BB6]]
+; CHECK: [[BB6]]:
 ; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[P]])
 ; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0
 ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1
 ; CHECK-NEXT: store i8 [[TMP7]], ptr [[TMP14]], align 1
 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP7]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB18:.*]], !prof [[PROF1]]
-; CHECK: [[BB16]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB10:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]])
 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4
-; CHECK-NEXT: br label %[[BB18]]
-; CHECK: [[BB18]]:
+; CHECK-NEXT: br label %[[BB12]]
+; CHECK: [[BB12]]:
 ; CHECK-NEXT: store i8 [[X]], ptr [[P]], align 1
 ; CHECK-NEXT: ret void
 ;
@@ -90,38 +84,32 @@ define void @Store2(ptr nocapture %p, i16 %x) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
-; CHECK: [[BB11]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB12]]
-; CHECK: [[BB12]]:
+; CHECK-NEXT: br label %[[BB6]]
+; CHECK: [[BB6]]:
 ; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_2(ptr [[P]])
 ; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0
 ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1
 ; CHECK-NEXT: store i16 [[TMP7]], ptr [[TMP14]], align 2
 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP7]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB18:.*]], !prof [[PROF1]]
-; CHECK: [[BB16]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB10:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]])
 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4
-; CHECK-NEXT: br label %[[BB18]]
-; CHECK: [[BB18]]:
+; CHECK-NEXT: br label %[[BB12]]
+; CHECK: [[BB12]]:
 ; CHECK-NEXT: store i16 [[X]], ptr [[P]], align 2
 ; CHECK-NEXT: ret void
 ;
@@ -142,38 +130,32 @@ define void @Store4(ptr nocapture %p, i32 %x) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
-; CHECK: [[BB11]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB12]]
-; CHECK: [[BB12]]:
+; CHECK-NEXT: br label %[[BB6]]
+; CHECK: [[BB6]]:
 ; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_4(ptr [[P]])
 ; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0
 ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1
 ; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP14]], align 4
 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i32 [[TMP7]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB18:.*]], !prof [[PROF1]]
-; CHECK: [[BB16]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB10:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]])
 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4
-; CHECK-NEXT: br label %[[BB18]]
-; CHECK: [[BB18]]:
+; CHECK-NEXT: br label %[[BB12]]
+; CHECK: [[BB12]]:
 ; CHECK-NEXT: store i32 [[X]], ptr [[P]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -194,41 +176,35 @@ define void @Store8(ptr nocapture %p, i64 %x) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
-; CHECK: [[BB11]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB12]]
-; CHECK: [[BB12]]:
+; CHECK-NEXT: br label %[[BB6]]
+; CHECK: [[BB6]]:
 ; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_8(ptr [[P]])
 ; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0
 ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1
 ; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP14]], align 8
 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP7]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB21:.*]], !prof [[PROF1]]
-; CHECK: [[BB16]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB10:.*]], label %[[BB15:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]])
 ; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
 ; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 32
 ; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP18]], [[TMP19]]
 ; CHECK-NEXT: store i64 [[TMP20]], ptr [[TMP15]], align 8
-; CHECK-NEXT: br label %[[BB21]]
-; CHECK: [[BB21]]:
+; CHECK-NEXT: br label %[[BB15]]
+; CHECK: [[BB15]]:
 ; CHECK-NEXT: store i64 [[X]], ptr [[P]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -249,34 +225,28 @@ define void @Store16(ptr nocapture %p, i128 %x) nounwind uwtable sanitize_memory
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
-; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 8
 ; CHECK-NEXT: [[TMP7:%.*]] = load i128, ptr [[_MSARG1]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8
-; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP9]] to ptr
+; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 8
 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[_MSARG_O2]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
-; CHECK: [[BB11]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK: [[BB5]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB12]]
-; CHECK: [[BB12]]:
+; CHECK-NEXT: br label %[[BB6]]
+; CHECK: [[BB6]]:
 ; CHECK-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_n(ptr [[P]], i64 16)
 ; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0
 ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1
 ; CHECK-NEXT: store i128 [[TMP7]], ptr [[TMP14]], align 8
 ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB16:.*]], label %[[BB22:.*]], !prof [[PROF1]]
-; CHECK: [[BB16]]:
+; CHECK-NEXT: br i1 [[_MSCMP3]], label %[[BB10:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB10]]:
 ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @__msan_chain_origin(i32 [[TMP10]])
 ; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
 ; CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 32
@@ -284,8 +254,8 @@ define void @Store16(ptr nocapture %p, i128 %x) nounwind uwtable sanitize_memory
 ; CHECK-NEXT: store i64 [[TMP20]], ptr [[TMP15]], align 8
 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[TMP15]], i32 1
 ; CHECK-NEXT: store i64 [[TMP20]], ptr [[TMP21]], align 8
-; CHECK-NEXT: br label %[[BB22]]
-; CHECK: [[BB22]]:
+; CHECK-NEXT: br label %[[BB16]]
+; CHECK: [[BB16]]:
 ; CHECK-NEXT: store i128 [[X]], ptr [[P]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -308,19 +278,17 @@ define i8 @Load1(ptr nocapture %p) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK: [[BB5]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB6]]
-; CHECK: [[BB6]]:
+; CHECK-NEXT: br label %[[BB4]]
+; CHECK: [[BB4]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[P]], align 1
 ; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_1(ptr [[P]])
 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0
@@ -348,19 +316,17 @@ define i16 @Load2(ptr nocapture %p) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK: [[BB5]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB6]]
-; CHECK: [[BB6]]:
+; CHECK-NEXT: br label %[[BB4]]
+; CHECK: [[BB4]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[P]], align 2
 ; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_2(ptr [[P]])
 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0
@@ -388,19 +354,17 @@ define i32 @Load4(ptr nocapture %p) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK: [[BB5]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB6]]
-; CHECK: [[BB6]]:
+; CHECK-NEXT: br label %[[BB4]]
+; CHECK: [[BB4]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[P]], align 4
 ; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_4(ptr [[P]])
 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0
@@ -428,19 +392,17 @@ define i64 @Load8(ptr nocapture %p) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK: [[BB5]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB6]]
-; CHECK: [[BB6]]:
+; CHECK-NEXT: br label %[[BB4]]
+; CHECK: [[BB4]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[P]], align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_8(ptr [[P]])
 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0
@@ -468,19 +430,17 @@ define i128 @Load16(ptr nocapture %p) nounwind uwtable sanitize_memory {
 ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4
 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5
 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64
-; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr
+; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0
 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64
-; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4
 ; CHECK-NEXT: call void @llvm.donothing()
 ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK: [[BB5]]:
+; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK: [[BB3]]:
 ; CHECK-NEXT: call void @__msan_warning(i32 [[TMP4]]) #[[ATTR8]]
-; CHECK-NEXT: br label %[[BB6]]
-; CHECK: [[BB6]]:
+; CHECK-NEXT: br label %[[BB4]]
+; CHECK: [[BB4]]:
 ; CHECK-NEXT: [[TMP7:%.*]] = load i128, ptr [[P]], align 8
 ; CHECK-NEXT: [[TMP8:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_n(ptr [[P]], i64 16)
 ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP8]], 0
@@ -517,11 +477,9 @@ define dso_local i32 @VarArgFn(ptr %fmt, ...)
local_unnamed_addr sanitize_memory ; CHECK-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 -; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[_MSARG]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 -; CHECK-NEXT: [[_MSARG_O:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSARG_O:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[_MSARG_O]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VA_ARG_OVERFLOW_SIZE]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 48, [[TMP5]] @@ -539,18 +497,14 @@ define dso_local i32 @VarArgFn(ptr %fmt, ...) local_unnamed_addr sanitize_memory ; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { ptr, ptr } [[TMP10]], 1 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 0, i64 24, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr nonnull [[ARGS]]) -; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 16 -; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[ARGS]], i64 16 ; CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 ; CHECK-NEXT: [[TMP17:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[TMP16]]) ; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { ptr, ptr } [[TMP17]], 0 ; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { ptr, ptr } [[TMP17]], 1 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP18]], ptr align 16 [[TMP7]], i64 48, i1 false) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP19]], ptr align 16 [[TMP9]], i64 48, i1 false) -; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[ARGS]] to i64 -; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP20]], 8 -; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[ARGS]], i64 8 ; CHECK-NEXT: [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8 ; CHECK-NEXT: [[TMP24:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[TMP23]]) ; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { ptr, ptr } [[TMP24]], 0 @@ -559,15 +513,11 @@ define dso_local i32 @VarArgFn(ptr %fmt, ...) 
local_unnamed_addr sanitize_memory ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP25]], ptr align 16 [[TMP27]], i64 [[TMP5]], i1 false) ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP9]], i32 48 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[TMP26]], ptr align 16 [[TMP28]], i64 [[TMP5]], i1 false) -; CHECK-NEXT: [[TMP29:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 -; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP29]] to ptr +; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0 ; CHECK-NEXT: store i64 [[TMP2]], ptr [[_MSARG1]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[PARAM_ORIGIN]] to i64 -; CHECK-NEXT: [[_MSARG_O2:%.*]] = inttoptr i64 [[TMP30]] to ptr +; CHECK-NEXT: [[_MSARG_O2:%.*]] = getelementptr i8, ptr [[PARAM_ORIGIN]], i64 0 ; CHECK-NEXT: store i32 [[TMP4]], ptr [[_MSARG_O2]], align 4 -; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP31]], 8 -; CHECK-NEXT: [[_MSARG3:%.*]] = inttoptr i64 [[TMP32]] to ptr +; CHECK-NEXT: [[_MSARG3:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 8 ; CHECK-NEXT: store i64 0, ptr [[_MSARG3]], align 8 ; CHECK-NEXT: store i32 0, ptr [[RETVAL_SHADOW]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @VAListFn(ptr [[FMT]], ptr nonnull [[ARGS]]) @@ -606,25 +556,14 @@ define dso_local void @VarArgCaller() local_unnamed_addr sanitize_memory { ; CHECK-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 ; CHECK-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 -; CHECK-NEXT: [[_MSARG:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[_MSARG:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 0 ; CHECK-NEXT: store i64 0, ptr [[_MSARG]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PARAM_SHADOW]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 8 -; CHECK-NEXT: [[_MSARG1:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[_MSARG1:%.*]] = getelementptr i8, ptr [[PARAM_SHADOW]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[_MSARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[VA_ARG_SHADOW]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 0 -; CHECK-NEXT: [[_MSARG_VA_S:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[VA_ARG_ORIGIN]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 -; CHECK-NEXT: [[_MSARG_VA_O:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[VA_ARG_SHADOW]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 8 -; CHECK-NEXT: [[_MSARG_VA_S2:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VA_ARG_ORIGIN]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 8 -; CHECK-NEXT: [[_MSARG_VA_O3:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: [[_MSARG_VA_S:%.*]] = getelementptr i8, ptr [[VA_ARG_SHADOW]], i64 0 +; CHECK-NEXT: [[_MSARG_VA_O:%.*]] = getelementptr i8, ptr [[VA_ARG_ORIGIN]], i64 0 +; CHECK-NEXT: [[_MSARG_VA_S2:%.*]] = getelementptr i8, ptr [[VA_ARG_SHADOW]], i64 8 +; CHECK-NEXT: [[_MSARG_VA_O3:%.*]] = getelementptr i8, ptr [[VA_ARG_ORIGIN]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[_MSARG_VA_S2]], align 8 ; CHECK-NEXT: store i32 0, ptr 
[[_MSARG_VA_O3]], align 8 ; CHECK-NEXT: store i64 0, ptr [[VA_ARG_OVERFLOW_SIZE]], align 8 diff --git a/llvm/test/Instrumentation/MemorySanitizer/opaque-ptr.ll b/llvm/test/Instrumentation/MemorySanitizer/opaque-ptr.ll index 24276a28fdd70..e88341663fa3a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/opaque-ptr.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/opaque-ptr.ll @@ -8,7 +8,7 @@ define void @test_memcpy(ptr %p, ptr byval(i32) %p2) sanitize_memory { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P2:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP3]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP3]], ptr align 4 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 4, i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call ptr @__msan_memcpy(ptr [[P:%.*]], ptr [[P2]], i64 4) ; CHECK-NEXT: ret void @@ -22,7 +22,7 @@ define void @test_memmove(ptr %p, ptr byval(i32) %p2) sanitize_memory { ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P2:%.*]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = xor i64 [[TMP1]], 87960930222080 ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP3]], ptr align 4 inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP3]], ptr align 4 getelementptr (i8, ptr @__msan_param_tls, i64 8), i64 4, i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP4:%.*]] = call ptr @__msan_memmove(ptr [[P:%.*]], ptr [[P2]], i64 4) ; CHECK-NEXT: ret void diff --git a/llvm/test/Instrumentation/MemorySanitizer/or.ll b/llvm/test/Instrumentation/MemorySanitizer/or.ll index 20993a54187ac..ce33022e46652 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/or.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/or.ll @@ -11,7 +11,7 @@ define i8 @test_or(i8 %a, i8 %b) sanitize_memory { ; CHECK-LABEL: define i8 @test_or( ; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = xor i8 [[A]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = xor i8 [[B]], -1 @@ -32,7 +32,7 @@ define i8 @test_disjoint_or(i8 %a, i8 %b) sanitize_memory { ; CHECK-IMPRECISE-LABEL: define i8 @test_disjoint_or( ; CHECK-IMPRECISE-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-IMPRECISE-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-IMPRECISE-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-IMPRECISE-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-IMPRECISE-NEXT: call void @llvm.donothing() ; CHECK-IMPRECISE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], -1 ; CHECK-IMPRECISE-NEXT: [[TMP4:%.*]] = xor i8 [[B]], -1 @@ -48,7 +48,7 @@ define i8 @test_disjoint_or(i8 %a, i8 %b) sanitize_memory { ; CHECK-PRECISE-LABEL: define i8 
@test_disjoint_or( ; CHECK-PRECISE-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { ; CHECK-PRECISE-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-PRECISE-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-PRECISE-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-PRECISE-NEXT: call void @llvm.donothing() ; CHECK-PRECISE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], -1 ; CHECK-PRECISE-NEXT: [[TMP4:%.*]] = xor i8 [[B]], -1 diff --git a/llvm/test/Instrumentation/MemorySanitizer/overflow.ll b/llvm/test/Instrumentation/MemorySanitizer/overflow.ll index 0cfae0008263f..9c9efcb72def3 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/overflow.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/overflow.ll @@ -8,7 +8,7 @@ define {i64, i1} @test_sadd_with_overflow(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define { i64, i1 } @test_sadd_with_overflow( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 @@ -26,7 +26,7 @@ define {i64, i1} @test_uadd_with_overflow(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define { i64, i1 } @test_uadd_with_overflow( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 @@ -44,7 +44,7 @@ define {i64, i1} @test_smul_with_overflow(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define { i64, i1 } @test_smul_with_overflow( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 @@ -61,7 +61,7 @@ define {i64, i1} @test_umul_with_overflow(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define { i64, i1 } @test_umul_with_overflow( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 @@ -78,7 +78,7 @@ define {i64, i1} @test_ssub_with_overflow(i64 %a, i64 %b) #0 { ; 
CHECK-LABEL: define { i64, i1 } @test_ssub_with_overflow( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 @@ -95,7 +95,7 @@ define {i64, i1} @test_usub_with_overflow(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define { i64, i1 } @test_usub_with_overflow( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0 @@ -113,7 +113,7 @@ define {<4 x i32>, <4 x i1>} @test_sadd_with_overflow_vec(<4 x i32> %a, <4 x i32 ; CHECK-LABEL: define { <4 x i32>, <4 x i1> } @test_sadd_with_overflow_vec( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll b/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll index 6d275b3e2d383..87ff4e6200b69 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/pr32842.ll @@ -13,7 +13,7 @@ define zeroext i1 @_Z1fii(i32 %x, i32 %y) sanitize_memory { ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[X]], -2147483648 ; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP0]], -1 diff --git a/llvm/test/Instrumentation/MemorySanitizer/saturating.ll b/llvm/test/Instrumentation/MemorySanitizer/saturating.ll index dcd8a080144ba..9473523c5f19e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/saturating.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/saturating.ll @@ -8,7 +8,7 @@ define i64 @test_sadd_sat(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define i64 @test_sadd_sat( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A]], i64 [[B]]) @@ -23,7 +23,7 @@ define i64 @test_uadd_sat(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define i64 @test_uadd_sat( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A]], i64 [[B]]) @@ -38,7 +38,7 @@ define i64 @test_ssub_sat(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define i64 @test_ssub_sat( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A]], i64 [[B]]) @@ -53,7 +53,7 @@ define i64 @test_usub_sat(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define i64 @test_usub_sat( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A]], i64 [[B]]) @@ -68,7 +68,7 @@ define i64 @test_sshl_sat(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define i64 @test_sshl_sat( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.sshl.sat.i64(i64 [[A]], i64 [[B]]) @@ -83,7 +83,7 @@ define i64 @test_ushl_sat(i64 %a, i64 %b) #0 { ; CHECK-LABEL: define i64 @test_ushl_sat( ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.ushl.sat.i64(i64 [[A]], i64 [[B]]) @@ -98,7 +98,7 @@ define <4 x i32> @test_sadd_sat_vec(<4 x 
i32> %a, <4 x i32> %b) #0 { ; CHECK-LABEL: define <4 x i32> @test_sadd_sat_vec( ; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) diff --git a/llvm/test/Instrumentation/MemorySanitizer/scmp.ll b/llvm/test/Instrumentation/MemorySanitizer/scmp.ll index 5c94c216106a2..0d4799fbe6f60 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/scmp.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/scmp.ll @@ -10,7 +10,7 @@ define i8 @scmp.8.8(i8 %x, i8 %y) nounwind #0 { ; CHECK-LABEL: define i8 @scmp.8.8( ; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i8 [[_MSPROP]], 0 @@ -26,7 +26,7 @@ define i8 @scmp.8.16(i16 %x, i16 %y) nounwind #0 { ; CHECK-LABEL: define i8 @scmp.8.16( ; CHECK-SAME: i16 [[X:%.*]], i16 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i16 [[_MSPROP]], 0 @@ -43,7 +43,7 @@ define i8 @scmp.8.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i8 @scmp.8.32( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -60,7 +60,7 @@ define i8 @scmp.8.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-LABEL: define i8 @scmp.8.64( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 @@ -77,7 +77,7 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind #0 { ; CHECK-LABEL: define i8 @scmp.8.128( ; CHECK-SAME: 
i128 [[X:%.*]], i128 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i128 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i128 [[_MSPROP]], 0 @@ -94,7 +94,7 @@ define i32 @scmp.32.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i32 @scmp.32.32( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -110,7 +110,7 @@ define i32 @scmp.32.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-LABEL: define i32 @scmp.32.64( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 @@ -127,7 +127,7 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-LABEL: define i64 @scmp.64.64( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 @@ -143,7 +143,7 @@ define i4 @scmp_narrow_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i4 @scmp_narrow_result( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -160,7 +160,7 @@ define i8 @scmp_narrow_op(i62 %x, i62 %y) nounwind #0 { ; CHECK-LABEL: define i8 @scmp_narrow_op( ; CHECK-SAME: i62 [[X:%.*]], i62 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i62, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i62, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i62, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void 
@llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i62 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i62 [[_MSPROP]], 0 @@ -177,7 +177,7 @@ define i141 @scmp_wide_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i141 @scmp_wide_result( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -194,7 +194,7 @@ define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind #0 { ; CHECK-LABEL: define i8 @scmp_wide_op( ; CHECK-SAME: i109 [[X:%.*]], i109 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i109, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i109, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i109, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i109 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i109 [[_MSPROP]], 0 @@ -211,7 +211,7 @@ define i41 @scmp_uncommon_types(i7 %x, i7 %y) nounwind #0 { ; CHECK-LABEL: define i41 @scmp_uncommon_types( ; CHECK-SAME: i7 [[X:%.*]], i7 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i7, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i7, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i7, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i7 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i7 [[_MSPROP]], 0 @@ -228,7 +228,7 @@ define <4 x i32> @scmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @scmp_normal_vectors( ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer @@ -244,7 +244,7 @@ define <4 x i8> @scmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind #0 ; CHECK-LABEL: define <4 x i8> @scmp_narrow_vec_result( ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer @@ -261,7 +261,7 @@ define <4 x i32> 
@scmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @scmp_narrow_vec_op( ; CHECK-SAME: <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i8> [[_MSPROP]], zeroinitializer @@ -278,7 +278,7 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind #0 ; CHECK-LABEL: define <16 x i32> @scmp_wide_vec_result( ; CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer @@ -295,7 +295,7 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind #0 { ; CHECK-LABEL: define <16 x i8> @scmp_wide_vec_op( ; CHECK-SAME: <16 x i64> [[X:%.*]], <16 x i64> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i64> [[_MSPROP]], zeroinitializer @@ -312,7 +312,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind #0 { ; CHECK-LABEL: define <7 x i117> @scmp_uncommon_vectors( ; CHECK-SAME: <7 x i7> [[X:%.*]], <7 x i7> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <7 x i7>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <7 x i7>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <7 x i7>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <7 x i7> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <7 x i7> [[_MSPROP]], zeroinitializer @@ -329,7 +329,7 @@ define <1 x i3> @scmp_scalarize(<1 x i33> %x, <1 x i33> %y) nounwind #0 { ; CHECK-LABEL: define <1 x i3> @scmp_scalarize( ; CHECK-SAME: <1 x i33> [[X:%.*]], <1 x i33> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i33>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i33>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <1 x i33>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i33> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i33> 
[[_MSPROP]], zeroinitializer @@ -346,7 +346,7 @@ define <2 x i8> @scmp_bool_operands(<2 x i1> %x, <2 x i1> %y) nounwind #0 { ; CHECK-LABEL: define <2 x i8> @scmp_bool_operands( ; CHECK-SAME: <2 x i1> [[X:%.*]], <2 x i1> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i1>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i1>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i1>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i1> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i1> [[_MSPROP]], zeroinitializer @@ -363,7 +363,7 @@ define <2 x i16> @scmp_ret_wider_than_operands(<2 x i8> %x, <2 x i8> %y) nounwin ; CHECK-LABEL: define <2 x i16> @scmp_ret_wider_than_operands( ; CHECK-SAME: <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i8> [[_MSPROP]], zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll b/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll index 1b70242dae2b5..3c9d6d8b91b3e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/ucmp.ll @@ -10,7 +10,7 @@ define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind #0 { ; CHECK-LABEL: define i8 @ucmp.8.8( ; CHECK-SAME: i8 [[X:%.*]], i8 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i8 [[_MSPROP]], 0 @@ -26,7 +26,7 @@ define i8 @ucmp.8.16(i16 %x, i16 %y) nounwind #0 { ; CHECK-LABEL: define i8 @ucmp.8.16( ; CHECK-SAME: i16 [[X:%.*]], i16 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i16 [[_MSPROP]], 0 @@ -43,7 +43,7 @@ define i8 @ucmp.8.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i8 @ucmp.8.32( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; 
CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -60,7 +60,7 @@ define i8 @ucmp.8.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-LABEL: define i8 @ucmp.8.64( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 @@ -77,7 +77,7 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind #0 { ; CHECK-LABEL: define i8 @ucmp.8.128( ; CHECK-SAME: i128 [[X:%.*]], i128 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i128, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i128 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i128 [[_MSPROP]], 0 @@ -94,7 +94,7 @@ define i32 @ucmp.32.32(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i32 @ucmp.32.32( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -110,7 +110,7 @@ define i32 @ucmp.32.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-LABEL: define i32 @ucmp.32.64( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 @@ -127,7 +127,7 @@ define i64 @ucmp.64.64(i64 %x, i64 %y) nounwind #0 { ; CHECK-LABEL: define i64 @ucmp.64.64( ; CHECK-SAME: i64 [[X:%.*]], i64 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 @@ -143,7 +143,7 @@ define i4 @ucmp_narrow_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i4 @ucmp_narrow_result( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -160,7 +160,7 @@ define i8 @ucmp_narrow_op(i62 %x, i62 %y) nounwind #0 { ; CHECK-LABEL: define i8 @ucmp_narrow_op( ; CHECK-SAME: i62 [[X:%.*]], i62 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i62, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i62, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i62, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i62 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i62 [[_MSPROP]], 0 @@ -177,7 +177,7 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind #0 { ; CHECK-LABEL: define i141 @ucmp_wide_result( ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], 0 @@ -194,7 +194,7 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind #0 { ; CHECK-LABEL: define i8 @ucmp_wide_op( ; CHECK-SAME: i109 [[X:%.*]], i109 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i109, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i109, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i109, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i109 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i109 [[_MSPROP]], 0 @@ -211,7 +211,7 @@ define i41 @ucmp_uncommon_types(i7 %x, i7 %y) nounwind #0 { ; CHECK-LABEL: define i41 @ucmp_uncommon_types( ; CHECK-SAME: i7 [[X:%.*]], i7 [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i7, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i7, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i7, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or i7 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or i7 [[_MSPROP]], 0 @@ -228,7 +228,7 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @ucmp_normal_vectors( ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], 
zeroinitializer @@ -244,7 +244,7 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind #0 ; CHECK-LABEL: define <4 x i8> @ucmp_narrow_vec_result( ; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer @@ -261,7 +261,7 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind #0 { ; CHECK-LABEL: define <4 x i32> @ucmp_narrow_vec_op( ; CHECK-SAME: <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i8> [[_MSPROP]], zeroinitializer @@ -278,7 +278,7 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind #0 ; CHECK-LABEL: define <16 x i32> @ucmp_wide_vec_result( ; CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], zeroinitializer @@ -295,7 +295,7 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind #0 { ; CHECK-LABEL: define <16 x i8> @ucmp_wide_vec_op( ; CHECK-SAME: <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer @@ -312,7 +312,7 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind # ; CHECK-LABEL: define <17 x i2> @ucmp_uncommon_vectors( ; CHECK-SAME: <17 x i71> [[X:%.*]], <17 x i71> [[Y:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <17 x i71>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <17 x i71>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <17 x i71>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 256), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = 
or <17 x i71> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <17 x i71> [[_MSPROP]], zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll index 5da4c7357b6ad..bfc47dc6bdc2b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fadd.ll @@ -13,7 +13,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) #0 { ; CHECK-LABEL: define float @test_v2f32( ; CHECK-SAME: float [[A0:%.*]], <2 x float> [[A1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -29,7 +29,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: define float @test_v4f32( ; CHECK-SAME: float [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -45,7 +45,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: define float @test_v8f32( ; CHECK-SAME: float [[A0:%.*]], <8 x float> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -61,7 +61,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: define float @test_v16f32( ; CHECK-SAME: float [[A0:%.*]], <16 x float> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -138,7 +138,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v2f64( ; CHECK-SAME: double [[A0:%.*]], <2 x double> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] @@ -154,7 +154,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v4f64( ; CHECK-SAME: double [[A0:%.*]], <4 x double> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] @@ -170,7 +170,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v8f64( ; CHECK-SAME: double [[A0:%.*]], <8 x double> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] @@ -186,7 +186,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v16f64( ; CHECK-SAME: double [[A0:%.*]], <16 x double> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll index 0c1c4edc4367f..db86d55616c62 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector-reduce-fmul.ll @@ -13,7 +13,7 @@ define float @test_v2f32(float %a0, <2 x float> %a1) #0 { ; CHECK-LABEL: define float @test_v2f32( ; CHECK-SAME: float [[A0:%.*]], <2 x float> [[A1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -29,7 +29,7 @@ define float @test_v4f32(float %a0, <4 x float> %a1) #0 { ; CHECK-LABEL: define float 
@test_v4f32( ; CHECK-SAME: float [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -45,7 +45,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) #0 { ; CHECK-LABEL: define float @test_v8f32( ; CHECK-SAME: float [[A0:%.*]], <8 x float> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -61,7 +61,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) #0 { ; CHECK-LABEL: define float @test_v16f32( ; CHECK-SAME: float [[A0:%.*]], <16 x float> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP1]], [[TMP3]] @@ -138,7 +138,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v2f64( ; CHECK-SAME: double [[A0:%.*]], <2 x double> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] @@ -154,7 +154,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v4f64( ; CHECK-SAME: double [[A0:%.*]], <4 x double> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] @@ -170,7 +170,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v8f64( ; CHECK-SAME: double [[A0:%.*]], <8 x double> [[A1:%.*]]) #[[ATTR0]] { ; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] @@ -186,7 +186,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) #0 { ; CHECK-LABEL: define double @test_v16f64( ; CHECK-SAME: double [[A0:%.*]], <16 x double> [[A1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], [[TMP3]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll index d1060fb33e1bc..1146131465883 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -15,7 +15,7 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory ; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer @@ -46,7 +46,7 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> @@ -82,7 +82,7 @@ define <2 x i64> @Test_x86_sse2_psad_bw(<16 x i8> %a, <16 x i8> %b) sanitize_mem ; CHECK-LABEL: define <2 x i64> @Test_x86_sse2_psad_bw( ; CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i8> [[TMP1]], 
[[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> @@ -104,7 +104,7 @@ define <1 x i64> @Test_x86_mmx_psad_bw(<1 x i64> %a, <1 x i64> %b) sanitize_memo ; CHECK-SAME: <1 x i64> [[A:%.*]], <1 x i64> [[B:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to i64 diff --git a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll index 0c0b393667bf0..514abedf8fe1a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vscale.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vscale.ll @@ -435,7 +435,7 @@ define void @fn_param( %a, ptr %b) sanitize_memory { define void @test_param(ptr %a, ptr %b) sanitize_memory { ; CHECK-LABEL: define void @test_param( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[A]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[A]] to i64 @@ -455,8 +455,8 @@ define void @test_param(ptr %a, ptr %b) sanitize_memory { ; ; ORIGIN-LABEL: define void @test_param( ; ORIGIN-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; ORIGIN-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_origin_tls to i64), i64 8) to ptr), align 4 +; ORIGIN-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; ORIGIN-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_origin_tls, i64 8), align 4 ; ORIGIN-NEXT: call void @llvm.donothing() ; ORIGIN-NEXT: [[TMP3:%.*]] = load , ptr [[A]], align 8 ; ORIGIN-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A]] to i64 From e5b8c24cc0d0d2ccf44e0c5e155fdfa4b2cf7720 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 2 Oct 2025 08:27:17 +0100 Subject: [PATCH 471/878] [DAG] Add ComputeNumSignBits(FREEZE(X)) handling (#161507) If X is known never undef/poison then skip the freeze and return ComputeNumSignBits(X) --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5 +++++ llvm/test/CodeGen/AArch64/freeze.ll | 6 ------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 8fc7eabf90ea8..95f53fe0bfdba 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4762,6 +4762,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, case ISD::AssertZext: Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits(); return VTBits-Tmp; + case ISD::FREEZE: + if (isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedElts, + /*PoisonOnly=*/false)) + return ComputeNumSignBits(Op.getOperand(0), 
DemandedElts, Depth + 1); + break; case ISD::MERGE_VALUES: return ComputeNumSignBits(Op.getOperand(Op.getResNo()), DemandedElts, Depth + 1); diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll index fae3bbe2dcfba..fb909fec90434 100644 --- a/llvm/test/CodeGen/AArch64/freeze.ll +++ b/llvm/test/CodeGen/AArch64/freeze.ll @@ -466,15 +466,12 @@ define <8 x i16> @freeze_urhadd(<8 x i16> %a0, <8 x i16> %a1) { ret <8 x i16> %masked } -; TODO: Unnecessary sext_inreg define <8 x i16> @freeze_shadd(<8 x i8> %a0, <8 x i16> %a1) { ; CHECK-LABEL: freeze_shadd: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshr v1.8h, v1.8h, #8 ; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: shl v0.8h, v0.8h, #8 -; CHECK-NEXT: sshr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %x0 = sext <8 x i8> %a0 to <8 x i16> %x1 = ashr <8 x i16> %a1, splat (i16 8) @@ -485,15 +482,12 @@ define <8 x i16> @freeze_shadd(<8 x i8> %a0, <8 x i16> %a1) { ret <8 x i16> %sext } -; TODO: Unnecessary sext_inreg define <8 x i16> @freeze_srhadd(<8 x i8> %a0, <8 x i16> %a1) { ; CHECK-LABEL: freeze_srhadd: ; CHECK: // %bb.0: ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshr v1.8h, v1.8h, #8 ; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: shl v0.8h, v0.8h, #8 -; CHECK-NEXT: sshr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %x0 = sext <8 x i8> %a0 to <8 x i16> %x1 = ashr <8 x i16> %a1, splat (i16 8) From 662f56f4281cd42af98a3d809cc443ddce8ce5dd Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 2 Oct 2025 09:27:27 +0200 Subject: [PATCH 472/878] [flang] handle scalars in getDescriptorWithNewBaseAddress (#161515) Follow up on #161347 to allow scalar fir.box/class reconstruction (at least required for polymorphic types). The assert in genDimInfoFromBox was rejecting scalars while there is no functional reason for that (only assumed-rank are an issue there). --- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 2 +- .../Optimizer/Builder/FIRBuilderTest.cpp | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 5e6e20861fd85..5da27d1713825 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1943,7 +1943,7 @@ void fir::factory::genDimInfoFromBox( return; unsigned rank = fir::getBoxRank(boxType); - assert(rank != 0 && "must be an array of known rank"); + assert(!boxType.isAssumedRank() && "must be an array of known rank"); mlir::Type idxTy = builder.getIndexType(); for (unsigned i = 0; i < rank; ++i) { mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp index fffd4ab5446ca..10a7ddf339133 100644 --- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp +++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp @@ -691,3 +691,40 @@ TEST_F(FIRBuilderTest, getDescriptorWithNewBaseAddress) { auto lbOp = llvm::dyn_cast(origin0.getDefiningOp()); EXPECT_EQ(lbOp.getVal(), inputBox); } + +TEST_F(FIRBuilderTest, getDescriptorWithNewBaseAddress_PolymorphicScalar) { + auto builder = getBuilder(); + auto loc = builder.getUnknownLoc(); + + // Build a polymorphic scalar: fir.class>>. + auto recTy = fir::RecordType::get(builder.getContext(), "poly_rec"); + auto ptrRecTy = fir::PointerType::get(recTy); + auto classTy = fir::ClassType::get(ptrRecTy); + + // Input descriptor is an undefined fir.class value. 
+ mlir::Value inputBox = fir::UndefOp::create(builder, loc, classTy); + + // New base address of the same element type (reference to the record). + auto refRecTy = fir::ReferenceType::get(recTy); + mlir::Value newAddr = fir::UndefOp::create(builder, loc, refRecTy); + + mlir::Value newBox = fir::factory::getDescriptorWithNewBaseAddress( + builder, loc, inputBox, newAddr); + + // Same descriptor type must be preserved. + EXPECT_EQ(newBox.getType(), inputBox.getType()); + + // Must be an embox using the new base address and carrying the original box + // as mold. + ASSERT_TRUE(llvm::isa_and_nonnull(newBox.getDefiningOp())); + auto embox = llvm::dyn_cast(newBox.getDefiningOp()); + EXPECT_EQ(embox.getMemref(), newAddr); + + // Polymorphic scalar should have no shape operand. + mlir::Value shape = embox.getShape(); + EXPECT_TRUE(shape == nullptr); + + // The type descriptor/mold must be the original input box. + mlir::Value tdesc = embox.getSourceBox(); + EXPECT_EQ(tdesc, inputBox); +} From 0b0dcf856abd6435d94d3108fa1386eebccce5d6 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 2 Oct 2025 09:32:30 +0200 Subject: [PATCH 473/878] [libc++] Upgrade Xcode to 26.0 (#160097) --- .github/workflows/libcxx-build-and-test.yaml | 2 +- libcxx/docs/index.rst | 2 +- .../propagation/make_exception_ptr.objc.pass.mm | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 5fe2ffbf58b43..1c07a0adc6e99 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -215,7 +215,7 @@ jobs: - uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0 with: # https://github.com/actions/runner-images/blob/main/images/macos/macos-15-Readme.md - xcode-version: '16.3' + xcode-version: '26.0' - uses: seanmiddleditch/gha-setup-ninja@3b1f8f94a2f8254bd26914c4ab9474d4f0015f67 # v6 - name: Build and test run: | diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index a44c3161534b3..4d5064bfd7f3b 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -133,7 +133,7 @@ velocity, libc++ drops support for older compilers as newer ones are released. Compiler Versions Restrictions Support policy ============ =================== ========================== ===================== Clang 19, 20, 21-git latest two stable releases per `LLVM's release page `_ and the development version -AppleClang 16.4 latest stable release per `Xcode's release page `_ +AppleClang 26.0 latest stable release per `Xcode's release page `_ Open XL 17.1.3 (AIX) latest stable release per `Open XL's documentation page `_ GCC 15 In C++11 or later only latest stable release per `GCC's release page `_ ============ =================== ========================== ===================== diff --git a/libcxx/test/std/language.support/support.exception/propagation/make_exception_ptr.objc.pass.mm b/libcxx/test/std/language.support/support.exception/propagation/make_exception_ptr.objc.pass.mm index 05a6698ea1a59..de383051543be 100644 --- a/libcxx/test/std/language.support/support.exception/propagation/make_exception_ptr.objc.pass.mm +++ b/libcxx/test/std/language.support/support.exception/propagation/make_exception_ptr.objc.pass.mm @@ -17,7 +17,8 @@ // out-of-the-box. 
// REQUIRES: has-fobjc-arc && darwin -// ADDITIONAL_COMPILE_FLAGS: -fobjc-arc +// FIXME: including seems to be currently broken with modules enabled +// ADDITIONAL_COMPILE_FLAGS: -fobjc-arc -fno-modules #include #include From 5f5a84e8509d4f274bcd63e37225c23d97555094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20=C3=81lvarez=20Ayll=C3=B3n?= Date: Thu, 2 Oct 2025 00:59:00 -0700 Subject: [PATCH 474/878] [clang][analyzer] Clear `ObjCMethodCall`'s cache between runs (#161327) `lookupRuntimeDefinition` assumed that a process would handle only one TU. This is not true for unit tests, for instance. Multiple snippets of code get parsed, and their ASTs are unloaded each time. Since the cache relies on pointers as keys, if the same address happens to be reused between runs, the cache would return a stale pointer, potentially causing a segmentation fault. This is not that unlikely if the snippets are similar, which would trigger similar allocation patterns. CPP-4889 --- .../Core/PathSensitive/CallEvent.h | 2 +- clang/lib/StaticAnalyzer/Core/CallEvent.cpp | 31 +++++++------- .../StaticAnalyzer/CallEventTest.cpp | 41 +++++++++++++++++++ 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h index 5dcf03f7a4648..c233ca1af0256 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h @@ -1414,7 +1414,7 @@ class CallEventManager { } public: - CallEventManager(llvm::BumpPtrAllocator &alloc) : Alloc(alloc) {} + CallEventManager(llvm::BumpPtrAllocator &alloc); /// Gets an outside caller given a callee context. CallEventRef<> getCaller(const StackFrameContext *CalleeCtx, diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp index 180056cf68b64..06ba01507fa4f 100644 --- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp +++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp @@ -1254,6 +1254,15 @@ template <> struct DenseMapInfo { }; } // end namespace llvm +// NOTE: This cache is a "global" variable, and it is cleared by +// CallEventManager's constructor so we do not keep old entries when +// loading/unloading ASTs. If we are worried about concurrency, we may need to +// revisit this someday. In terms of memory, this table stays around until clang +// quits, which also may be bad if we need to release memory. +using PrivateMethodCacheTy = + llvm::DenseMap>; +static PrivateMethodCacheTy PrivateMethodCache; + static const ObjCMethodDecl * lookupRuntimeDefinition(const ObjCInterfaceDecl *Interface, Selector LookupSelector, bool InstanceMethod) { @@ -1262,21 +1271,8 @@ lookupRuntimeDefinition(const ObjCInterfaceDecl *Interface, // that repeated queries on the same ObjCIntefaceDecl and Selector // don't incur the same cost. On some test cases, we can see the // same query being issued thousands of times. - // - // NOTE: This cache is essentially a "global" variable, but it - // only gets lazily created when we get here. The value of the - // cache probably comes from it being global across ExprEngines, - // where the same queries may get issued. If we are worried about - // concurrency, or possibly loading/unloading ASTs, etc., we may - // need to revisit this someday. In terms of memory, this table - // stays around until clang quits, which also may be bad if we - // need to release memory.
- using PrivateMethodCache = - llvm::DenseMap>; - - static PrivateMethodCache PMC; std::optional &Val = - PMC[{Interface, LookupSelector, InstanceMethod}]; + PrivateMethodCache[{Interface, LookupSelector, InstanceMethod}]; // Query lookupPrivateMethod() if the cache does not hit. if (!Val) { @@ -1422,6 +1418,13 @@ void ObjCMethodCall::getInitialStackFrameContents( } } +CallEventManager::CallEventManager(llvm::BumpPtrAllocator &alloc) + : Alloc(alloc) { + // Clear the method cache to avoid hits when multiple AST are loaded/unloaded + // within a single process. This can happen with unit tests, for instance. + PrivateMethodCache.clear(); +} + CallEventRef<> CallEventManager::getSimpleCall(const CallExpr *CE, ProgramStateRef State, const LocationContext *LCtx, diff --git a/clang/unittests/StaticAnalyzer/CallEventTest.cpp b/clang/unittests/StaticAnalyzer/CallEventTest.cpp index 8b5289ea7472b..f42689218bb1a 100644 --- a/clang/unittests/StaticAnalyzer/CallEventTest.cpp +++ b/clang/unittests/StaticAnalyzer/CallEventTest.cpp @@ -84,6 +84,47 @@ TEST(CXXDeallocatorCall, SimpleDestructor) { #endif } +TEST(PrivateMethodCache, NeverReturnDanglingPointersWithMultipleASTs) { + // Each iteration will load and unload an AST multiple times. Since the code + // is always the same, we increase the chance of hitting a bug in the private + // method cache, returning a dangling pointer and crashing the process. If the + // cache is properly cleared between runs, the test should pass. + for (int I = 0; I < 100; ++I) { + auto const *Code = R"( + typedef __typeof(sizeof(int)) size_t; + + extern void *malloc(size_t size); + extern void *memcpy(void *dest, const void *src, size_t n); + + @interface SomeMoreData { + char const* _buffer; + int _size; + } + @property(nonatomic, readonly) const char* buffer; + @property(nonatomic) int size; + + - (void)appendData:(SomeMoreData*)other; + + @end + + @implementation SomeMoreData + @synthesize size = _size; + @synthesize buffer = _buffer; + + - (void)appendData:(SomeMoreData*)other { + int const len = (_size + other.size); // implicit self._length + char* d = malloc(sizeof(char) * len); + memcpy(d + 20, other.buffer, len); + } + + @end + )"; + std::string Diags; + EXPECT_TRUE(runCheckerOnCodeWithArgs( + Code, {"-x", "objective-c", "-Wno-objc-root-class"}, Diags)); + } +} + } // namespace } // namespace ento } // namespace clang From 66558d70dc11fd04ce908ac94424ed7c6bd9e35b Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 2 Oct 2025 10:05:26 +0200 Subject: [PATCH 475/878] [libc++] Fix <__algorithm/find.h> when using -flax-vector-conversions=none (#161362) --- libcxx/include/__algorithm/find.h | 2 +- .../test/std/experimental/simd/simd.class/simd_unary.pass.cpp | 3 +++ libcxx/utils/libcxx/test/params.py | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h index 5f32ae8fc9524..72e201a3c693b 100644 --- a/libcxx/include/__algorithm/find.h +++ b/libcxx/include/__algorithm/find.h @@ -69,7 +69,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find_vectorized(_Tp* __first, _Tp* __last, auto __orig_first = __first; - auto __values = static_cast<__simd_vector<_Up, __vec_size>>(__value); // broadcast the value + auto __values = static_cast<__simd_vector<_Tp, __vec_size>>(__value); // broadcast the value while (static_cast(__last - __first) >= __unroll_count * __vec_size) [[__unlikely__]] { __vec __lhs[__unroll_count]; diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp 
b/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp index 056d6f65fc368..2c3751a97cf4e 100644 --- a/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp +++ b/libcxx/test/std/experimental/simd/simd.class/simd_unary.pass.cpp @@ -12,6 +12,9 @@ // Assertion failed: N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type" // XFAIL: target=armv7-unknown-linux-gnueabihf +// FIXME: This should work with -flax-vector-conversions=none +// ADDITIONAL_COMPILE_FLAGS(clang): -flax-vector-conversions=integer + // // // [simd.class] diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py index 6f013a75195a8..c02d6df1c47a4 100644 --- a/libcxx/utils/libcxx/test/params.py +++ b/libcxx/utils/libcxx/test/params.py @@ -75,6 +75,9 @@ # We're not annotating all the APIs, since that's a lot of annotations compared to how many we actually care about "-Wno-nullability-completeness", + + # Technically not a warning flag, but might as well be: + "-flax-vector-conversions=none", ] _allStandards = ["c++03", "c++11", "c++14", "c++17", "c++20", "c++23", "c++26"] From bf847a8b9d54643c457eaaad7f5dc60e6454cd2e Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 2 Oct 2025 10:06:25 +0200 Subject: [PATCH 476/878] [AMDGPU][SDAG] Enable ISD::PTRADD for 64-bit AS by default (#146076) Also removes the command line option to control this feature. There seem to be mainly two kinds of test changes: - Some operands of addition instructions are swapped; that is to be expected since PTRADD is not commutative. - Improvements in code generation, probably because the legacy lowering enabled some transformations that were sometimes harmful. For SWDEV-516125. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +- .../identical-subrange-spill-infloop.ll | 331 ++++++++++-------- .../AMDGPU/infer-addrspace-flat-atomic.ll | 14 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 8 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 16 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 4 +- .../AMDGPU/lower-module-lds-via-table.ll | 16 +- .../match-perm-extract-vector-elt-bug.ll | 22 +- llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 16 +- .../AMDGPU/no-folding-imm-to-inst-with-fi.ll | 75 ++-- .../AMDGPU/preload-implicit-kernargs.ll | 6 +- .../AMDGPU/promote-constOffset-to-imm.ll | 8 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll | 7 +- .../AMDGPU/ptradd-sdag-optimizations.ll | 94 ++--- .../AMDGPU/ptradd-sdag-undef-poison.ll | 6 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 27 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 29 +- 17 files changed, 323 insertions(+), 366 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 16530087444d2..f7265c5fda9dc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -64,14 +64,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG.
-static cl::opt UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -11466,7 +11458,7 @@ static bool isNoUnsignedWrap(SDValue Addr) { bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 9b4539c0bb4de..10d61deed71cc 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,96 +6,134 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v5, s30, 0 -; CHECK-NEXT: v_writelane_b32 v5, s31, 1 -; CHECK-NEXT: v_writelane_b32 v5, s36, 2 -; CHECK-NEXT: v_writelane_b32 v5, s37, 3 -; CHECK-NEXT: v_writelane_b32 v5, s38, 4 -; CHECK-NEXT: v_writelane_b32 v5, s39, 5 -; CHECK-NEXT: v_writelane_b32 v5, s48, 6 -; CHECK-NEXT: v_writelane_b32 v5, s49, 7 -; CHECK-NEXT: v_writelane_b32 v5, s50, 8 -; CHECK-NEXT: v_writelane_b32 v5, s51, 9 -; CHECK-NEXT: v_writelane_b32 v5, s52, 10 -; CHECK-NEXT: v_writelane_b32 v5, s53, 11 -; CHECK-NEXT: v_writelane_b32 v5, s54, 12 -; CHECK-NEXT: v_writelane_b32 v5, s55, 13 -; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v5, s64, 14 -; CHECK-NEXT: s_movk_i32 s4, 0xf0 -; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v5, s65, 15 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_writelane_b32 v5, s66, 16 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s67, 17 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x130 -; CHECK-NEXT: s_mov_b32 s7, s24 -; CHECK-NEXT: v_writelane_b32 v5, s68, 18 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s69, 19 -; CHECK-NEXT: v_writelane_b32 v5, s70, 20 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_writelane_b32 v6, s36, 2 +; CHECK-NEXT: v_writelane_b32 v6, s37, 3 +; CHECK-NEXT: v_writelane_b32 v6, s38, 4 +; CHECK-NEXT: v_writelane_b32 v6, s39, 5 +; CHECK-NEXT: v_writelane_b32 v6, s48, 6 +; CHECK-NEXT: v_writelane_b32 v6, s49, 7 +; CHECK-NEXT: v_writelane_b32 v6, s50, 8 +; CHECK-NEXT: v_writelane_b32 v6, s51, 9 +; CHECK-NEXT: v_writelane_b32 v6, s52, 10 +; CHECK-NEXT: v_writelane_b32 v6, s53, 11 +; CHECK-NEXT: v_writelane_b32 v6, s54, 12 +; CHECK-NEXT: v_writelane_b32 v6, s55, 13 +; CHECK-NEXT: v_writelane_b32 v6, s64, 14 +; CHECK-NEXT: v_writelane_b32 v6, s65, 15 +; CHECK-NEXT: v_writelane_b32 
v6, s66, 16 +; CHECK-NEXT: v_writelane_b32 v6, s67, 17 +; CHECK-NEXT: v_writelane_b32 v6, s68, 18 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: v_writelane_b32 v6, s69, 19 ; CHECK-NEXT: s_mov_b32 s68, 0 -; CHECK-NEXT: v_writelane_b32 v5, s71, 21 +; CHECK-NEXT: s_mov_b32 s69, s4 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[24:31], s[68:69], 0x30 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0xf0 +; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 +; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; CHECK-NEXT: v_writelane_b32 v6, s70, 20 +; CHECK-NEXT: v_writelane_b32 v6, s71, 21 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_writelane_b32 v7, s8, 0 +; CHECK-NEXT: v_writelane_b32 v7, s9, 1 +; CHECK-NEXT: v_writelane_b32 v7, s10, 2 +; CHECK-NEXT: v_writelane_b32 v7, s11, 3 +; CHECK-NEXT: v_writelane_b32 v7, s12, 4 +; CHECK-NEXT: v_writelane_b32 v7, s13, 5 +; CHECK-NEXT: v_writelane_b32 v7, s14, 6 +; CHECK-NEXT: v_writelane_b32 v7, s15, 7 +; CHECK-NEXT: v_writelane_b32 v7, s16, 8 +; CHECK-NEXT: v_writelane_b32 v7, s17, 9 +; CHECK-NEXT: v_writelane_b32 v7, s18, 10 +; CHECK-NEXT: v_writelane_b32 v7, s19, 11 +; CHECK-NEXT: v_writelane_b32 v7, s20, 12 +; CHECK-NEXT: v_writelane_b32 v7, s21, 13 +; CHECK-NEXT: v_writelane_b32 v7, s22, 14 +; CHECK-NEXT: v_writelane_b32 v7, s23, 15 +; CHECK-NEXT: v_writelane_b32 v7, s52, 16 +; CHECK-NEXT: v_writelane_b32 v7, s53, 17 +; CHECK-NEXT: v_writelane_b32 v7, s54, 18 +; CHECK-NEXT: v_writelane_b32 v7, s55, 19 +; CHECK-NEXT: v_writelane_b32 v7, s56, 20 +; CHECK-NEXT: v_writelane_b32 v7, s57, 21 +; CHECK-NEXT: v_writelane_b32 v7, s58, 22 +; CHECK-NEXT: v_writelane_b32 v7, s59, 23 +; CHECK-NEXT: v_writelane_b32 v7, s60, 24 +; CHECK-NEXT: v_writelane_b32 v7, s61, 25 +; CHECK-NEXT: v_writelane_b32 v7, s62, 26 +; CHECK-NEXT: v_writelane_b32 v7, s63, 27 +; CHECK-NEXT: v_writelane_b32 v7, s64, 28 +; CHECK-NEXT: v_writelane_b32 v7, s65, 29 +; CHECK-NEXT: v_writelane_b32 v7, s66, 30 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v7, s67, 31 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s52, v7, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 -; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: s_mov_b32 s6, 48 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v6, s36, 0 -; CHECK-NEXT: v_writelane_b32 v6, s37, 1 -; CHECK-NEXT: v_writelane_b32 v6, s38, 2 -; CHECK-NEXT: v_writelane_b32 v6, s39, 3 -; CHECK-NEXT: v_writelane_b32 v6, s40, 4 -; CHECK-NEXT: v_writelane_b32 v6, s41, 5 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[68:71] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v6, s42, 6 -; CHECK-NEXT: v_writelane_b32 v6, s43, 7 -; CHECK-NEXT: v_writelane_b32 v6, s44, 8 -; CHECK-NEXT: v_writelane_b32 v6, s45, 9 -; CHECK-NEXT: v_writelane_b32 v6, s46, 10 -; CHECK-NEXT: v_writelane_b32 v6, s47, 11 -; CHECK-NEXT: v_writelane_b32 v6, s48, 12 -; CHECK-NEXT: v_writelane_b32 v6, s49, 13 -; CHECK-NEXT: v_writelane_b32 v6, s50, 14 -; CHECK-NEXT: s_movk_i32 s56, 0x1f0 -; CHECK-NEXT: s_movk_i32 
s72, 0x2f0 -; CHECK-NEXT: s_mov_b32 s57, s24 -; CHECK-NEXT: s_mov_b32 s73, s24 -; CHECK-NEXT: v_writelane_b32 v6, s51, 15 -; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0 -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CHECK-NEXT: v_readlane_b32 s53, v7, 1 +; CHECK-NEXT: v_readlane_b32 s54, v7, 2 +; CHECK-NEXT: v_readlane_b32 s55, v7, 3 +; CHECK-NEXT: v_readlane_b32 s56, v7, 4 +; CHECK-NEXT: v_readlane_b32 s57, v7, 5 +; CHECK-NEXT: v_readlane_b32 s58, v7, 6 +; CHECK-NEXT: v_readlane_b32 s59, v7, 7 +; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v5 +; CHECK-NEXT: v_readlane_b32 s60, v7, 8 +; CHECK-NEXT: v_readlane_b32 s61, v7, 9 +; CHECK-NEXT: v_readlane_b32 s62, v7, 10 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s63, v7, 11 +; CHECK-NEXT: v_readlane_b32 s64, v7, 12 +; CHECK-NEXT: v_readlane_b32 s65, v7, 13 +; CHECK-NEXT: v_readlane_b32 s66, v7, 14 +; CHECK-NEXT: v_readlane_b32 s67, v7, 15 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_readlane_b32 s52, v7, 16 +; CHECK-NEXT: v_readlane_b32 s60, v7, 24 +; CHECK-NEXT: v_readlane_b32 s61, v7, 25 +; CHECK-NEXT: v_readlane_b32 s62, v7, 26 +; CHECK-NEXT: v_readlane_b32 s63, v7, 27 +; CHECK-NEXT: v_readlane_b32 s64, v7, 28 +; CHECK-NEXT: v_readlane_b32 s65, v7, 29 +; CHECK-NEXT: v_readlane_b32 s66, v7, 30 +; CHECK-NEXT: v_readlane_b32 s67, v7, 31 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: v_readlane_b32 s53, v7, 17 +; CHECK-NEXT: v_readlane_b32 s54, v7, 18 +; CHECK-NEXT: v_readlane_b32 s55, v7, 19 +; CHECK-NEXT: v_readlane_b32 s56, v7, 20 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v1, v2 +; CHECK-NEXT: v_readlane_b32 s57, v7, 21 +; CHECK-NEXT: v_readlane_b32 s58, v7, 22 +; CHECK-NEXT: v_readlane_b32 s59, v7, 23 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_mov_b32 s69, s68 -; CHECK-NEXT: s_mov_b32 s70, s68 -; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[16:23], s[28:31] dmask:0x1 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[68:71] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 @@ -103,60 +141,75 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[6:7] +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5] -; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[16:17] +; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s44, 0 -; CHECK-NEXT: s_mov_b32 s45, s44 -; CHECK-NEXT: v_mov_b32_e32 v2, s44 -; CHECK-NEXT: v_mov_b32_e32 v3, s45 -; CHECK-NEXT: s_mov_b32 s46, s44 -; CHECK-NEXT: s_mov_b32 s47, s44 -; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[44:47] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s4, v6, 0 -; CHECK-NEXT: v_readlane_b32 s12, v6, 8 -; CHECK-NEXT: v_readlane_b32 s13, v6, 9 -; CHECK-NEXT: v_readlane_b32 s14, v6, 10 -; CHECK-NEXT: v_readlane_b32 s15, v6, 11 -; CHECK-NEXT: v_readlane_b32 s16, v6, 12 -; CHECK-NEXT: v_readlane_b32 s17, v6, 13 -; CHECK-NEXT: v_readlane_b32 s18, v6, 14 -; CHECK-NEXT: v_readlane_b32 s19, v6, 15 -; CHECK-NEXT: v_readlane_b32 s5, v6, 1 -; CHECK-NEXT: v_readlane_b32 s6, v6, 2 -; CHECK-NEXT: v_readlane_b32 s7, v6, 3 -; CHECK-NEXT: v_readlane_b32 s8, v6, 4 -; CHECK-NEXT: v_readlane_b32 s9, v6, 5 -; CHECK-NEXT: image_sample_lz v0, v[2:3], s[12:19], s[24:27] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: v_readlane_b32 s10, v6, 6 -; CHECK-NEXT: v_readlane_b32 s11, v6, 7 +; CHECK-NEXT: s_mov_b32 s16, 0 +; CHECK-NEXT: s_mov_b32 s17, s16 +; CHECK-NEXT: v_mov_b32_e32 v0, s16 +; CHECK-NEXT: v_readlane_b32 s44, v7, 16 +; CHECK-NEXT: v_mov_b32_e32 v1, s17 +; CHECK-NEXT: s_mov_b32 s18, s16 +; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_readlane_b32 s45, v7, 17 +; CHECK-NEXT: v_readlane_b32 s46, v7, 18 +; CHECK-NEXT: v_readlane_b32 s47, v7, 19 +; CHECK-NEXT: v_readlane_b32 s48, v7, 20 +; CHECK-NEXT: v_readlane_b32 s49, v7, 21 +; CHECK-NEXT: v_readlane_b32 s50, v7, 22 +; CHECK-NEXT: v_readlane_b32 s51, v7, 23 +; CHECK-NEXT: v_readlane_b32 s52, v7, 24 +; CHECK-NEXT: v_readlane_b32 s53, v7, 25 +; CHECK-NEXT: v_readlane_b32 s54, v7, 26 +; CHECK-NEXT: v_readlane_b32 s55, v7, 27 +; CHECK-NEXT: v_readlane_b32 s56, v7, 28 +; CHECK-NEXT: v_readlane_b32 s57, v7, 29 +; CHECK-NEXT: v_readlane_b32 s58, v7, 30 +; CHECK-NEXT: v_readlane_b32 s59, v7, 31 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s44, v7, 0 +; CHECK-NEXT: v_readlane_b32 s52, v7, 8 +; CHECK-NEXT: v_readlane_b32 s53, v7, 9 +; CHECK-NEXT: v_readlane_b32 s54, v7, 10 +; CHECK-NEXT: v_readlane_b32 s55, v7, 11 +; CHECK-NEXT: v_readlane_b32 s56, v7, 12 +; CHECK-NEXT: v_readlane_b32 s57, v7, 13 +; CHECK-NEXT: v_readlane_b32 s58, v7, 14 +; CHECK-NEXT: v_readlane_b32 s59, v7, 15 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_readlane_b32 s45, v7, 1 +; CHECK-NEXT: v_readlane_b32 s46, v7, 2 +; CHECK-NEXT: v_readlane_b32 s47, v7, 3 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s48, v7, 4 +; CHECK-NEXT: v_readlane_b32 s49, v7, 5 +; CHECK-NEXT: v_readlane_b32 s50, v7, 6 +; CHECK-NEXT: v_readlane_b32 s51, v7, 7 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[44:47], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[16:19], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[44:47], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[22:23] +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader -; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s12, s8 -; CHECK-NEXT: s_mov_b32 s13, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s12 
-; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s13 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: s_mov_b32 s16, 0 +; CHECK-NEXT: s_mov_b32 s20, s16 +; CHECK-NEXT: s_mov_b32 s21, s16 +; CHECK-NEXT: v_mov_b32_e32 v1, s20 +; CHECK-NEXT: s_mov_b32 s17, s16 +; CHECK-NEXT: s_mov_b32 s18, s16 +; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_mov_b32_e32 v2, s21 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[8:15], s[16:19] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 @@ -171,33 +224,33 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: .LBB0_9: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s71, v5, 21 -; CHECK-NEXT: v_readlane_b32 s70, v5, 20 -; CHECK-NEXT: v_readlane_b32 s69, v5, 19 -; CHECK-NEXT: v_readlane_b32 s68, v5, 18 +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: v_readlane_b32 s71, v6, 21 +; CHECK-NEXT: v_readlane_b32 s70, v6, 20 +; CHECK-NEXT: v_readlane_b32 s69, v6, 19 +; CHECK-NEXT: v_readlane_b32 s68, v6, 18 +; CHECK-NEXT: v_readlane_b32 s67, v6, 17 +; CHECK-NEXT: v_readlane_b32 s66, v6, 16 +; CHECK-NEXT: v_readlane_b32 s65, v6, 15 +; CHECK-NEXT: v_readlane_b32 s64, v6, 14 +; CHECK-NEXT: v_readlane_b32 s55, v6, 13 +; CHECK-NEXT: v_readlane_b32 s54, v6, 12 +; CHECK-NEXT: v_readlane_b32 s53, v6, 11 +; CHECK-NEXT: v_readlane_b32 s52, v6, 10 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s67, v5, 17 -; CHECK-NEXT: v_readlane_b32 s66, v5, 16 -; CHECK-NEXT: v_readlane_b32 s65, v5, 15 -; CHECK-NEXT: v_readlane_b32 s64, v5, 14 -; CHECK-NEXT: v_readlane_b32 s55, v5, 13 -; CHECK-NEXT: v_readlane_b32 s54, v5, 12 -; CHECK-NEXT: v_readlane_b32 s53, v5, 11 -; CHECK-NEXT: v_readlane_b32 s52, v5, 10 -; CHECK-NEXT: v_readlane_b32 s51, v5, 9 -; CHECK-NEXT: v_readlane_b32 s50, v5, 8 -; CHECK-NEXT: v_readlane_b32 s49, v5, 7 -; CHECK-NEXT: v_readlane_b32 s48, v5, 6 -; CHECK-NEXT: v_readlane_b32 s39, v5, 5 -; CHECK-NEXT: v_readlane_b32 s38, v5, 4 -; CHECK-NEXT: v_readlane_b32 s37, v5, 3 -; CHECK-NEXT: v_readlane_b32 s36, v5, 2 -; CHECK-NEXT: v_readlane_b32 s31, v5, 1 -; CHECK-NEXT: v_readlane_b32 s30, v5, 0 +; CHECK-NEXT: v_readlane_b32 s51, v6, 9 +; CHECK-NEXT: v_readlane_b32 s50, v6, 8 +; CHECK-NEXT: v_readlane_b32 s49, v6, 7 +; CHECK-NEXT: v_readlane_b32 s48, v6, 6 +; CHECK-NEXT: v_readlane_b32 s39, v6, 5 +; CHECK-NEXT: v_readlane_b32 s38, v6, 4 +; CHECK-NEXT: v_readlane_b32 s37, v6, 3 +; CHECK-NEXT: v_readlane_b32 s36, v6, 2 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll 
b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 59dfd713ef4fd..bd11b0710fadd 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -11,8 +11,8 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s2, s0 -; CHECK-NEXT: s_addc_u32 s1, s3, s1 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v1, s1 ; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 ; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -69,13 +69,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -7, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm @@ -113,7 +113,7 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: s_add_u32 s4, s0, -8 ; CHECK-NEXT: s_addc_u32 s5, s1, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 1 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 48bf7fbe0a3cb..3eef616ba267d 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -46,8 +46,8 @@ define void @use_extern_normal() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 @@ -70,8 +70,8 @@ define void @use_extern_overalign() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index ea9d5e8a0bc1f..1e6b77ecea85e 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -400,9 +400,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: 
v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
 ; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
 ; GFX12-NEXT: s_wait_alu 0xf1ff
@@ -438,9 +438,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r
 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
-; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
@@ -531,9 +531,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d,
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
 ; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
 ; GFX12-NEXT: s_wait_alu 0xf1ff
@@ -569,9 +569,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d,
 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
-; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
 ; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
 ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index b6f70fa6a9892..12212a0968c96 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -84,8 +84,8 @@ define void @f2() {
 ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index c316f03dde89b..b689e1e51c2a4 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -49,8 +49,8 @@ define void @f0() {
 ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v0, s4
@@ -90,8 +90,8 @@ define void @f1() {
 ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8
 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16
 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v0, s4
@@ -131,8 +131,8 @@ define void @f2() {
 ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12
 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20
 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v2, s4
@@ -172,8 +172,8 @@ define void @f3() {
 ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16
 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24
 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 65b4d37a8d583..93d772fdb7854 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -13,9 +13,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
 ; GFX9-NEXT: s_mul_i32 s14, s14, s4
 ; GFX9-NEXT: s_add_i32 s5, s5, s14
-; GFX9-NEXT: v_add_u32_e32 v0, s5, v0
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, s5, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1]
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4
 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
@@ -37,12 +37,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX10-NEXT: s_load_dword s4, s[8:9], 0x1c
 ; GFX10-NEXT: s_load_dword s5, s[8:9], 0x38
 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff
 ; GFX10-NEXT: s_mul_i32 s14, s14, s4
-; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0
-; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX10-NEXT: v_add3_u32 v2, s5, s14, v0
+; GFX10-NEXT: v_ashrrev_i64 v[4:5], 28, v[1:2]
 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo
 ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
@@ -62,21 +62,19 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c
 ; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38
 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT: s_and_b32 s4, s6, 0xffff
 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT: s_mul_i32 s13, s13, s4
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_add3_u32 v0, s7, s13, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX11-NEXT: v_add3_u32 v1, s7, s13, v1
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1]
 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo
 ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v5, vcc_lo
 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index dd5c247f6ef35..14b0729b37302 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -388,8 +388,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4
 ; CHECK-NEXT: s_cbranch_execz .LBB2_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4
 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
@@ -684,8 +684,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4
 ; CHECK-NEXT: s_cbranch_execz .LBB4_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4
 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
@@ -1411,8 +1411,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4
 ; CHECK-NEXT: s_cbranch_execz .LBB10_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4
 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
@@ -1889,8 +1889,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
 ; CHECK-NEXT: s_and_saveexec_b32 s7, s4
 ; CHECK-NEXT: s_cbranch_execz .LBB15_13
 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4
 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
index 6d0aa1e784530..7e4be65898b65 100644
--- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
@@ -9,92 +9,65 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v
 ; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4
 ; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4
 ; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base
-; CHECK-NEXT: s_movk_i32 s33, 0x70
-; CHECK-NEXT: s_movk_i32 s34, 0x60
-; CHECK-NEXT: s_or_b32 s44, 0x80, s33
-; CHECK-NEXT: s_mov_b32 s45, s35
-; CHECK-NEXT: s_or_b32 s46, 0x80, s34
-; CHECK-NEXT: s_mov_b32 s47, s35
-; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45
-; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47
 ; CHECK-NEXT: s_movk_i32 s34, 0x80
 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35
+; CHECK-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v21, s35
 ; CHECK-NEXT: s_wait_kmcnt 0x0
 ; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41
 ; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
 ; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37
 ; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39
-; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
-; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
-; CHECK-NEXT: s_movk_i32 s20, 0x50
 ; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
 ; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: s_or_b32 s20, 0x80, s20
-; CHECK-NEXT: s_mov_b32 s21, s35
 ; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
 ; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20
+; CHECK-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21
+; CHECK-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
 ; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:112 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:96 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] offset:80 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
 ; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
-; CHECK-NEXT: s_or_b32 s16, 0x80, 64
-; CHECK-NEXT: s_mov_b32 s17, s35
-; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
-; CHECK-NEXT: s_or_b32 s12, 0x80, 48
-; CHECK-NEXT: s_mov_b32 s13, s35
-; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; CHECK-NEXT: s_or_b32 s8, 0x80, 32
-; CHECK-NEXT: s_mov_b32 s9, s35
-; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
-; CHECK-NEXT: s_or_b32 s4, 0x80, 16
-; CHECK-NEXT: s_mov_b32 s5, s35
 ; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16
+; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
 ; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
-; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12
-; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8
-; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4
+; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
 ; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
 ; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
 ; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
 ; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] offset:64 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[4:7] offset:48 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:32 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:16 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:96 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:112 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:64 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:80 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:32 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:48 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:16 scope:SCOPE_SYS
 ; CHECK-NEXT: s_wait_loadcnt 0x0
 ; CHECK-NEXT: s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index f5e136a80b4a8..b717f85e179b3 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -337,8 +337,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
 ; GFX942-NEXT: .p2align 8
 ; GFX942-NEXT: ; %bb.2:
 ; GFX942-NEXT: .LBB8_0:
-; GFX942-NEXT: s_mov_b32 s4, 8
-; GFX942-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2
+; GFX942-NEXT: s_load_dword s0, s[0:1], 0xa
 ; GFX942-NEXT: v_mov_b32_e32 v0, 0
 ; GFX942-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT: v_mov_b32_e32 v1, s0
@@ -353,8 +352,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
 ; GFX90a-NEXT: .p2align 8
 ; GFX90a-NEXT: ; %bb.2:
 ; GFX90a-NEXT: .LBB8_0:
-; GFX90a-NEXT: s_mov_b32 s0, 8
-; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
+; GFX90a-NEXT: s_load_dword s0, s[4:5], 0xa
 ; GFX90a-NEXT: v_mov_b32_e32 v0, 0
 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX90a-NEXT: v_mov_b32_e32 v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 760a298ce8971..85a9aba1a0e51 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -608,8 +608,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
 ; GFX10-NEXT: v_mov_b32_e32 v7, 0x7f
 ; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6
-; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader
@@ -819,8 +819,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
 ; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
index ff90f1f175c3c..40f39a24d7a99 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX6,GFX6_PTRADD %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX6,GFX6_LEGACY %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s
 
 ; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectMUBUF.
 
@@ -34,7 +33,3 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 store i32 %result, ptr addrspace(1) %out
 ret void
 }
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX6_LEGACY: {{.*}}
-; GFX6_PTRADD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 7d3b19e885877..1c986a02e8bd6 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 < %s | FileCheck --check-prefixes=GFX942 %s
 
 ; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
 ; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable
@@ -24,21 +23,13 @@ define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
 }
 
 define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
-; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: global_load_gep_add_reassoc:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
 %add0 = add nuw nsw i64 %voffset, 24
 %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
 %l = load i64, ptr addrspace(1) %gep0, align 8
@@ -221,23 +212,14 @@ define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %
 
 ; Check that offsets are folded into global addresses if possible. For example,
 ; this is relevant when using --amdgpu-lower-module-lds-strategy=table.
 define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
-; GFX942_PTRADD-LABEL: complextype_global_gep:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
-; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
-; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: complextype_global_gep:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_getpc_b64 s[0:1]
-; GFX942_LEGACY-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
-; GFX942_LEGACY-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: complextype_global_gep:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
+; GFX942-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
 %gep0 = getelementptr inbounds %complextype, ptr addrspace(1) @v0, i64 0, i32 1, i64 %offset
 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2
 ret ptr addrspace(1) %gep1
@@ -430,36 +412,20 @@ define ptr @gep_disjoint_or(ptr %base) {
 
 ; Check that AssertAlign nodes between ptradd nodes don't block offset folding,
 ; taken from preload-implicit-kernargs.ll
 define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) {
-; GFX942_PTRADD-LABEL: random_incorrect_offset:
-; GFX942_PTRADD: ; %bb.1:
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_branch .LBB21_0
-; GFX942_PTRADD-NEXT: .p2align 8
-; GFX942_PTRADD-NEXT: ; %bb.2:
-; GFX942_PTRADD-NEXT: .LBB21_0:
-; GFX942_PTRADD-NEXT: s_load_dword s0, s[4:5], 0xa
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0
-; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: random_incorrect_offset:
-; GFX942_LEGACY: ; %bb.1:
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_branch .LBB21_0
-; GFX942_LEGACY-NEXT: .p2align 8
-; GFX942_LEGACY-NEXT: ; %bb.2:
-; GFX942_LEGACY-NEXT: .LBB21_0:
-; GFX942_LEGACY-NEXT: s_mov_b32 s0, 8
-; GFX942_LEGACY-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0
-; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: random_incorrect_offset:
+; GFX942: ; %bb.1:
+; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_branch .LBB21_0
+; GFX942-NEXT: .p2align 8
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: .LBB21_0:
+; GFX942-NEXT: s_load_dword s0, s[4:5], 0xa
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX942-NEXT: s_endpgm
 %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
 %load = load i32, ptr addrspace(4) %gep
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
index 1934ce395e63d..e7c715f0a38bf 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel < %s | FileCheck --check-prefixes=GFX942 %s
 
 ; Tests for undef and poison DAG folds for the ISD::PTRADD SelectionDAG opcode.
 ; If any additions are generated for these tests, the folds don't work.
@@ -44,6 +43,3 @@ define ptr @undef_base(ptr %p, i64 %offset) {
 %gep1 = getelementptr i8, ptr undef, i64 %offset
 ret ptr %gep1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX942_LEGACY: {{.*}}
-; GFX942_PTRADD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
index 9dd25025d4381..f4f5a78f0e2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
@@ -1,14 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12
 
 ; Tests for the ISD::PTRADD SelectionDAG opcode. This only tests 64-bit address
 ; spaces since PTRADD is currently only used for these.
@@ -511,15 +506,3 @@ entry:
 store i32 %val, ptr addrspace(1) %gep.to, align 4
 ret void
 }
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10_LEGACY: {{.*}}
-; GFX10_PTRADD: {{.*}}
-; GFX11_LEGACY: {{.*}}
-; GFX11_PTRADD: {{.*}}
-; GFX12_LEGACY: {{.*}}
-; GFX12_PTRADD: {{.*}}
-; GFX8_LEGACY: {{.*}}
-; GFX8_PTRADD: {{.*}}
-; GFX942_LEGACY: {{.*}}
-; GFX942_PTRADD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 65a99d0d097f9..480eb0dd5fe9c 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -52,11 +52,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII: ; %bb.0:
 ; HAWAII-NEXT: s_add_i32 s12, s12, s17
-; HAWAII-NEXT: s_or_b32 s0, s8, 14
-; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; HAWAII-NEXT: s_add_u32 s0, s8, 14
+; HAWAII-NEXT: s_addc_u32 s1, s9, 0
 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0
-; HAWAII-NEXT: v_mov_b32_e32 v1, s9
+; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HAWAII-NEXT: v_mov_b32_e32 v1, s1
 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
 ; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0
 ; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
@@ -74,25 +75,27 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI: ; %bb.0:
+; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
 ; FIJI-NEXT: s_add_i32 s12, s12, s17
-; FIJI-NEXT: s_or_b32 s0, s8, 14
-; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; FIJI-NEXT: v_mov_b32_e32 v0, s0
-; FIJI-NEXT: v_mov_b32_e32 v1, s9
-; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
-; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0
+; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
 ; FIJI-NEXT: s_mov_b32 m0, -1
 ; FIJI-NEXT: s_waitcnt lgkmcnt(0)
-; FIJI-NEXT: s_and_b32 s3, s1, 0xffff
-; FIJI-NEXT: v_mov_b32_e32 v1, s2
+; FIJI-NEXT: s_and_b32 s4, s1, 0xffff
+; FIJI-NEXT: s_add_u32 s2, s8, 14
+; FIJI-NEXT: s_addc_u32 s3, s9, 0
+; FIJI-NEXT: v_mov_b32_e32 v0, s2
+; FIJI-NEXT: v_mov_b32_e32 v1, s3
+; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
+; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0
 ; FIJI-NEXT: v_mov_b32_e32 v2, s1
 ; FIJI-NEXT: v_mov_b32_e32 v3, s0
+; FIJI-NEXT: s_waitcnt lgkmcnt(0)
+; FIJI-NEXT: v_mov_b32_e32 v1, s2
 ; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
 ; FIJI-NEXT: s_waitcnt vmcnt(0)
 ; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
+; FIJI-NEXT: v_or_b32_e32 v0, s4, v0
 ; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
 ; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
 ; FIJI-NEXT: ds_write_b32 v1, v3

From 6e52e538cd6e7912058f73f244a45aeea153d05c Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Thu, 2 Oct 2025 09:17:36 +0100
Subject: [PATCH 477/878] [LAA] Test different-type-sizes in safe-dep-dist
 (#161244)

The isSafeDependenceDistance routine is guarded by a HasSameSize check
which can be removed, as the test demonstrates.
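As a rough sketch of why the guard looks removable (paraphrased from
memory rather than quoted from the routine): with Dist the constant byte
distance between the two access streams, BTC the loop's backedge-taken
count, and MaxStride the larger byte stride of the two accesses,
isSafeDependenceDistance discharges the dependence roughly when

    |Dist| > BTC * MaxStride

a condition stated purely in terms of byte distances and strides, and so
independent of whether the two accessed types have the same store size.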
---
 .../LoopAccessAnalysis/depend_diff_types.ll | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 023a8c056968f..27a85c7a46084 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -560,3 +560,44 @@ loop:
 exit:
 ret void
 }
+
+; TODO: Relax HasSameSize check in isSafeDependenceDistance.
+define void @different_type_sizes_safe_dep_dist(i16 %n, ptr %p) {
+; CHECK-LABEL: 'different_type_sizes_safe_dep_dist'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: store i32 0, ptr %gep.iv, align 1 ->
+; CHECK-NEXT: store i16 1, ptr %gep.off.iv, align 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %n.pos = icmp sgt i16 %n, 0
+ br i1 %n.pos, label %ph, label %exit
+
+ph:
+ %gep.off = getelementptr i32, ptr %p, i16 %n
+ br label %loop
+
+loop:
+ %iv = phi i16 [ 0, %ph ], [ %iv.next, %loop ]
+ %gep.iv = getelementptr inbounds i32, ptr %p, i16 %iv
+ store i32 0, ptr %gep.iv, align 1
+ %gep.off.iv = getelementptr i32, ptr %gep.off, i16 %iv
+ store i16 1, ptr %gep.off.iv, align 1
+ %iv.next = add i16 %iv, 1
+ %exit.cond = icmp eq i16 %iv.next, %n
+ br i1 %exit.cond, label %exit, label %loop
+
+exit:
+ ret void
+}

From 031fb7414fd6edf20e0cd7f7783666313169a0d2 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Thu, 2 Oct 2025 09:27:48 +0100
Subject: [PATCH 478/878] [AArch64][SME] Preserve `Chain` when selecting
 multi-vector LUTI4s (#161494)

Previously, the `Chain` was dropped, meaning LUTI4 nodes that only
differed in the chain operand would be incorrectly CSE'd.

Fixes: #161420
---
 llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp    | 12 +++++++-----
 llvm/test/CodeGen/AArch64/pr161420.ll              | 13 ++++++++-----
 .../AArch64/sme2-intrinsics-luti4-lane-x4.ll       |  5 ++++-
 llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll |  5 ++++-
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 6a1b06eea4309..177b4b0febcac 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2089,7 +2089,8 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
 if (!ImmToReg(Node->getOperand(2), ZtValue))
 return;
 
- SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
+ SDValue Chain = Node->getOperand(0);
+ SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4), Chain};
 SDLoc DL(Node);
 EVT VT = Node->getValueType(0);
@@ -2110,14 +2111,15 @@ void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
 void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
 unsigned NumOutVecs,
 unsigned Opc) {
- SDValue ZtValue;
- SmallVector Ops;
 if (!ImmToReg(Node->getOperand(2), ZtValue))
 return;
- Ops.push_back(ZtValue);
- Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)}));
+ SDValue Chain = Node->getOperand(0);
+ SDValue Ops[] = {ZtValue,
+ createZMulTuple({Node->getOperand(3), Node->getOperand(4)}),
+ Chain};
+
 SDLoc DL(Node);
 EVT VT = Node->getValueType(0);
diff --git a/llvm/test/CodeGen/AArch64/pr161420.ll b/llvm/test/CodeGen/AArch64/pr161420.ll
index 515a1bf47cc1e..dcdf0ed1e7a35 100644
--- a/llvm/test/CodeGen/AArch64/pr161420.ll
+++ b/llvm/test/CodeGen/AArch64/pr161420.ll
@@ -5,17 +5,20 @@
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
 target triple = "arm64-apple-macosx15.0.0"
 
 ; From: https://github.com/llvm/llvm-project/issues/161420. This test checks that
-; two `luti4` instructions are emitted. FIXME: This is currently broken!
+; two `luti4` instructions are emitted.
 define void @pluto(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) #0 {
 ; CHECK-LABEL: pluto:
 ; CHECK: ; %bb.0: ; %bb
 ; CHECK-NEXT: mov w8, #0 ; =0x0
 ; CHECK-NEXT: ldr zt0, [x1]
-; CHECK-NEXT: ldr z0, [x3]
+; CHECK-NEXT: ldr z4, [x3]
 ; CHECK-NEXT: ptrue pn8.h
-; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0]
-; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0]
-; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z4.h - z7.h }, { z0.h - z3.h }
+; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: luti4 { z16.h - z19.h }, zt0, z4[0]
+; CHECK-NEXT: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z16.h - z19.h }
+; CHECK-NEXT: ldr zt0, [x2]
+; CHECK-NEXT: luti4 { z4.h - z7.h }, zt0, z4[0]
+; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
 ; CHECK-NEXT: ret
 bb:
 tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg1)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll
index cf306e5238018..d48e0cd4d9a92 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll
@@ -49,10 +49,13 @@ define {, , , %x) {
 ; CHECK-LABEL: test_multiple_luti4_zt_i8:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ldr zt0, [x0]
+; CHECK-NEXT: luti4 { z4.s - z7.s }, zt0, z0[1]
+; CHECK-NEXT: // fake_use: $z4 $z4_z5_z6_z7
+; CHECK-NEXT: ldr zt0, [x1]
 ; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1]
 ; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
index 0024b70bd7c8f..c1eff8dd1f8a8 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
@@ -15,12 +15,15 @@ define {, , , %v0, %v1) #0 {
 ; CHECK-LABEL: test_multiple_luti4_zt_i8:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: ldr zt0, [x0]
 ; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
 ; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: luti4 { z4.b - z7.b }, zt0, { z0, z1 }
+; CHECK-NEXT: // fake_use: $z4 $z4_z5_z6_z7
+; CHECK-NEXT: ldr zt0, [x1]
 ; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
 ; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
 ; CHECK-NEXT: ret

From 8aa64edb34ec6b30e1e7d0dbcc86236a6290eb0c Mon Sep 17 00:00:00 2001
From: Michael Klemm
Date: Thu, 2 Oct 2025 10:50:05 +0200
Subject: [PATCH 479/878] [Flang] Add -ffast-real-mod and direct code for MOD
 on REAL types (#160660)

This patch adds direct code-gen support for a faster MOD intrinsic for
REAL types. Flang maintains, and will keep maintaining, a high-precision
implementation of the MOD intrinsic as part of the Fortran runtime. With
the -ffast-real-mod flag, users can opt to avoid calling into the
Fortran runtime and instead trigger code-gen that produces faster code
by avoiding the runtime call, at the expense of potentially risking bit
cancelation by having the compiler use the MOD formula as specified by
ISO Fortran.
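For reference — this is the standard's definition, not new behavior
introduced here — the ISO Fortran MOD formula that the inline expansion
follows is

    MOD(a, p) = a - INT(a / p) * p

i.e. one floating-point divide, a truncating conversion to integer and
back, a multiply, and a subtract (the genFastMod sequence in the diff
below). The possible bit cancelation mentioned above comes from
evaluating this formula directly in floating point instead of using the
runtime's high-precision algorithm.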
---
 clang/include/clang/Driver/Options.td         |  3 +
 clang/lib/Driver/ToolChains/Flang.cpp         |  3 +
 flang/include/flang/Support/LangOptions.def   |  3 +-
 flang/lib/Frontend/CompilerInvocation.cpp     |  3 +
 flang/lib/Frontend/FrontendActions.cpp        |  8 ++
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 38 ++++++++-
 flang/test/Driver/fast-real-mod.f90           |  7 ++
 flang/test/Lower/Intrinsics/fast-real-mod.f90 | 83 +++++++++++++++++++
 8 files changed, 144 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/Driver/fast-real-mod.f90
 create mode 100644 flang/test/Lower/Intrinsics/fast-real-mod.f90

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 096df56d0f183..2ef609831637e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2750,6 +2750,9 @@ def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">
 Group;
 def fassociative_math : Flag<["-"], "fassociative-math">, Visibility<[ClangOption, FlangOption]>, Group;
 def fno_associative_math : Flag<["-"], "fno-associative-math">, Visibility<[ClangOption, FlangOption]>, Group;
+def fno_fast_real_mod : Flag<["-"], "fno-fast-real-mod">,
+ Group, Visibility<[FlangOption, FC1Option]>,
+ HelpText<"Disable optimization of MOD for REAL types in presence of -ffast-math">;
 defm reciprocal_math : BoolFOption<"reciprocal-math", LangOpts<"AllowRecip">, DefaultFalse, PosFlagsetAttr(
+ mlir::StringAttr::get(mod.getContext(),
+ llvm::Twine{"fir.no_fast_real_mod"}),
+ mlir::BoolAttr::get(mod.getContext(), true));
+ }
+
 // Create a parse tree and lower it to FIR
 parseAndLowerTree(ci, lb);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 71d35e37bbe94..de7694ffd468c 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -6989,8 +6989,33 @@ mlir::Value IntrinsicLibrary::genMergeBits(mlir::Type resultType,
 }
 
 // MOD
+static mlir::Value genFastMod(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value a, mlir::Value p) {
+ auto fastmathFlags = mlir::arith::FastMathFlags::contract;
+ auto fastmathAttr =
+ mlir::arith::FastMathFlagsAttr::get(builder.getContext(), fastmathFlags);
+ mlir::Value divResult =
+ mlir::arith::DivFOp::create(builder, loc, a, p, fastmathAttr);
+ mlir::Type intType = builder.getIntegerType(
+ a.getType().getIntOrFloatBitWidth(), /*signed=*/true);
+ mlir::Value intResult = builder.createConvert(loc, intType, divResult);
+ mlir::Value cnvResult = builder.createConvert(loc, a.getType(), intResult);
+ mlir::Value mulResult =
+ mlir::arith::MulFOp::create(builder, loc, cnvResult, p, fastmathAttr);
+ mlir::Value subResult =
+ mlir::arith::SubFOp::create(builder, loc, a, mulResult, fastmathAttr);
+ return subResult;
+}
+
 mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
 llvm::ArrayRef args) {
+ auto mod = builder.getModule();
+ bool dontUseFastRealMod = false;
+ bool canUseApprox = mlir::arith::bitEnumContainsAny(
+ builder.getFastMathFlags(), mlir::arith::FastMathFlags::afn);
+ if (auto attr = mod->getAttrOfType("fir.no_fast_real_mod"))
+ dontUseFastRealMod = attr.getValue();
+
 assert(args.size() == 2);
 if (resultType.isUnsignedInteger()) {
 mlir::Type signlessType = mlir::IntegerType::get(
@@ -7002,9 +7027,16 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
 if (mlir::isa(resultType))
 return mlir::arith::RemSIOp::create(builder, loc, args[0], args[1]);
 
- // Use runtime.
- return builder.createConvert(
- loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+ if (resultType.isFloat() && canUseApprox && !dontUseFastRealMod) {
+ // Treat MOD as an approximate function and code-gen inline code
+ // instead of calling into the Fortran runtime library.
+ return builder.createConvert(loc, resultType,
+ genFastMod(builder, loc, args[0], args[1]));
+ } else {
+ // Use runtime.
+ return builder.createConvert(
+ loc, resultType, fir::runtime::genMod(builder, loc, args[0], args[1]));
+ }
 }
 
 // MODULO
diff --git a/flang/test/Driver/fast-real-mod.f90 b/flang/test/Driver/fast-real-mod.f90
new file mode 100644
index 0000000000000..4ea9b26e64753
--- /dev/null
+++ b/flang/test/Driver/fast-real-mod.f90
@@ -0,0 +1,7 @@
+! RUN: %flang -fno-fast-real-mod -### -c %s 2>&1 | FileCheck %s -check-prefix CHECK-NO-FAST-REAL-MOD
+
+! CHECK-NO-FAST-REAL-MOD: "-fno-fast-real-mod"
+
+program test
+ ! nothing to be done in here
+end program test
diff --git a/flang/test/Lower/Intrinsics/fast-real-mod.f90 b/flang/test/Lower/Intrinsics/fast-real-mod.f90
new file mode 100644
index 0000000000000..f80f7203ad1a2
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/fast-real-mod.f90
@@ -0,0 +1,83 @@
+! RUN: %flang_fc1 -ffast-math -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK%if target=x86_64{{.*}} %{,CHECK-KIND10%}%if flang-supports-f128-math %{,CHECK-KIND16%}
+! RUN: %flang_fc1 -ffast-math -fno-fast-real-mod -emit-mlir -o - %s | FileCheck %s --check-prefixes=CHECK-NFRM%if target=x86_64{{.*}} %{,CHECK-NFRM-KIND10%}%if flang-supports-f128-math %{,CHECK-NFRM-KIND16%}
+
+! TODO: check line that fir.fast_real_mod is not there
+! CHECK-NFRM: module attributes {{{.*}}fir.no_fast_real_mod = true{{.*}}}
+
+! CHECK-LABEL: @_QPmod_real4
+subroutine mod_real4(r, a, p)
+ implicit none
+ real(kind=4) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath : f32
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f32) -> si32
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si32) -> f32
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath : f32
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath : f32
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref
+! CHECK-NFRM: fir.call @_FortranAModReal4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f32, f32, !fir.ref, i32) -> f32
+ r = mod(a, p)
+end subroutine mod_real4
+
+! CHECK-LABEL: @_QPmod_real8
+subroutine mod_real8(r, a, p)
+ implicit none
+ real(kind=8) :: r, a, p
+! CHECK: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath : f64
+! CHECK: %[[CV1:.*]] = fir.convert %[[DIV]] : (f64) -> si64
+! CHECK: %[[CV2:.*]] = fir.convert %[[CV1]] : (si64) -> f64
+! CHECK: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath : f64
+! CHECK: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath : f64
+! CHECK: fir.store %[[SUB]] to %[[R]] : !fir.ref
+! CHECK-NFRM: fir.call @_FortranAModReal8(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f64, f64, !fir.ref, i32) -> f64
+ r = mod(a, p)
+end subroutine mod_real8
+
+! CHECK-LABEL: @_QPmod_real10
+subroutine mod_real10(r, a, p)
+ implicit none
+ integer, parameter :: kind10 = merge(10, 4, selected_real_kind(p=18).eq.10)
+ real(kind=kind10) :: r, a, p
+! CHECK-KIND10: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND10: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND10: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND10: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND10: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND10: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath : f80
+! CHECK-KIND10: %[[CV1:.*]] = fir.convert %[[DIV]] : (f80) -> si80
+! CHECK-KIND10: %[[CV2:.*]] = fir.convert %[[CV1]] : (si80) -> f80
+! CHECK-KIND10: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath : f80
+! CHECK-KIND10: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath : f80
+! CHECK-KIND10: fir.store %[[SUB]] to %[[R]] : !fir.ref
+! CHECK-NFRM-KIND10: fir.call @_FortranAModReal10(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f80, f80, !fir.ref, i32) -> f80
+ r = mod(a, p)
+end subroutine mod_real10
+
+! CHECK-LABEL: @_QPmod_real16
+subroutine mod_real16(r, a, p)
+ implicit none
+ integer, parameter :: kind16 = merge(16, 4, selected_real_kind(p=33).eq.16)
+ real(kind=kind16) :: r, a, p
+! CHECK-KIND16: %[[A:.*]] = fir.declare{{.*}}a"
+! CHECK-KIND16: %[[P:.*]] = fir.declare{{.*}}p"
+! CHECK-KIND16: %[[R:.*]] = fir.declare{{.*}}r"
+! CHECK-KIND16: %[[A_LOAD:.*]] = fir.load %[[A]]
+! CHECK-KIND16: %[[P_LOAD:.*]] = fir.load %[[P]]
+! CHECK-KIND16: %[[DIV:.*]] = arith.divf %[[A_LOAD]], %[[P_LOAD]] fastmath : f128
+! CHECK-KIND16: %[[CV1:.*]] = fir.convert %[[DIV]] : (f128) -> si128
+! CHECK-KIND16: %[[CV2:.*]] = fir.convert %[[CV1]] : (si128) -> f128
+! CHECK-KIND16: %[[MUL:.*]] = arith.mulf %[[CV2]], %[[P_LOAD]] fastmath : f128
+! CHECK-KIND16: %[[SUB:.*]] = arith.subf %[[A_LOAD]], %[[MUL]] fastmath : f128
+! CHECK-KIND16: fir.store %[[SUB]] to %[[R]] : !fir.ref
+! CHECK-NFRM-KIND16: fir.call @_FortranAModReal16(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (f128, f128, !fir.ref, i32) -> f128
+ r = mod(a, p)
+end subroutine mod_real16

From 20e0e80a540223194e06d5e593634f65e1ee0de8 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Thu, 2 Oct 2025 10:13:06 +0100
Subject: [PATCH 480/878] [AArch64] Combine PTEST_FIRST(PTRUE, CONCAT(A, B)) ->
 PTEST_FIRST(PTRUE, A) (#161384)

When the input to ptest_first is a vector concat and the mask is all
active, performPTestFirstCombine returns a ptest_first using the first
operand of the concat, looking through any reinterpret casts. This
allows optimizePTestInstr to later remove the ptest when the first
operand is a flag-setting instruction such as whilelo.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 43 +++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  7 +++
 .../AArch64/get-active-lane-mask-extract.ll   | 20 +--------
 3 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 45f52352d45fd..a1f4734f83562 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27234,6 +27234,21 @@ static bool isLanes1toNKnownZero(SDValue Op) {
 }
 }
 
+// Return true if the vector operation can guarantee that the first lane of its
+// result is active.
+static bool isLane0KnownActive(SDValue Op) {
+ switch (Op.getOpcode()) {
+ default:
+ return false;
+ case AArch64ISD::REINTERPRET_CAST:
+ return isLane0KnownActive(Op->getOperand(0));
+ case ISD::SPLAT_VECTOR:
+ return isOneConstant(Op.getOperand(0));
+ case AArch64ISD::PTRUE:
+ return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all;
+ };
+}
+
 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
 SDValue InsertVec = N->getOperand(0);
@@ -27519,6 +27534,32 @@ static SDValue performMULLCombine(SDNode *N,
 return SDValue();
 }
 
+static SDValue performPTestFirstCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ SDLoc DL(N);
+ auto Mask = N->getOperand(0);
+ auto Pred = N->getOperand(1);
+
+ if (!isLane0KnownActive(Mask))
+ return SDValue();
+
+ if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST)
+ Pred = Pred->getOperand(0);
+
+ if (Pred->getOpcode() == ISD::CONCAT_VECTORS) {
+ Pred = Pred->getOperand(0);
+ Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred);
+ return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask,
+ Pred);
+ }
+
+ return SDValue();
+}
+
 static SDValue performScalarToVectorCombine(SDNode *N,
 TargetLowering::DAGCombinerInfo &DCI,
 SelectionDAG &DAG) {
@@ -27875,6 +27916,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
 case AArch64ISD::UMULL:
 case AArch64ISD::PMULL:
 return performMULLCombine(N, DCI, DAG);
+ case AArch64ISD::PTEST_FIRST:
+ return performPTestFirstCombine(N, DCI, DAG);
 case ISD::INTRINSIC_VOID:
 case ISD::INTRINSIC_W_CHAIN:
 switch (N->getConstantOperandVal(1)) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5a51c812732e6..35b27ea2ec9dd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1503,6 +1503,13 @@ AArch64InstrInfo::canRemovePTestInstr(MachineInstr *PTest, MachineInstr *Mask,
 getElementSizeForOpcode(PredOpcode))
 return PredOpcode;
 
+ // For PTEST_FIRST(PTRUE_ALL, WHILE), the PTEST_FIRST is redundant since
+ // WHILEcc performs an implicit PTEST with an all active mask, setting
+ // the N flag as the PTEST_FIRST would.
+ if (PTest->getOpcode() == AArch64::PTEST_PP_FIRST &&
+ isPTrueOpcode(MaskOpcode) && Mask->getOperand(1).getImm() == 31)
+ return PredOpcode;
+
 return {};
 }
 
diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
index b89f55188b0f2..e2c861b40e706 100644
--- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
+++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
@@ -327,9 +327,6 @@ define void @test_2x8bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) {
 ; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_ptest:
 ; CHECK-SVE2p1-SME2: // %bb.0: // %entry
 ; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.h, p1.h }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: ptrue p2.b
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.b, p0.b, p1.b
-; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b
 ; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB11_2
 ; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
 ; CHECK-SVE2p1-SME2-NEXT: b use
@@ -368,9 +365,6 @@ define void @test_2x8bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n
 ; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_reinterpret_casts:
 ; CHECK-SVE2p1-SME2: // %bb.0: // %entry
 ; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: ptrue p2.h
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.h, p0.h, p1.h
-; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b
 ; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB12_2
 ; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
 ; CHECK-SVE2p1-SME2-NEXT: b use
@@ -413,14 +407,9 @@ define void @test_4x4bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) {
 ; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8
 ; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo
 ; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.s, p3.s }, x8, x1
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p0.h, p1.h
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.h, p2.h, p3.h
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.b, p4.b, p5.b
-; CHECK-SVE2p1-SME2-NEXT: ptrue p5.b
-; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b
 ; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB13_2
 ; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.s, p3.s }, x8, x1
 ; CHECK-SVE2p1-SME2-NEXT: b use
 ; CHECK-SVE2p1-SME2-NEXT: .LBB13_2: // %if.end
 ; CHECK-SVE2p1-SME2-NEXT: ret
@@ -463,21 +452,14 @@ define void @test_4x2bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n
 ; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8
 ; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo
 ; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.d, p1.d }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.d, p3.d }, x8, x1
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.s, p0.s, p1.s
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.s, p2.s, p3.s
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p4.h, p5.h
-; CHECK-SVE2p1-SME2-NEXT: ptrue p5.h
-; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b
 ; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB14_2
 ; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.d, p3.d }, x8, x1
 ; CHECK-SVE2p1-SME2-NEXT: b use
 ; CHECK-SVE2p1-SME2-NEXT: .LBB14_2: // %if.end
 ; CHECK-SVE2p1-SME2-NEXT: ret

From 55803b8af1e4ddde1c0c43a9cd283133205c295d Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Thu, 2 Oct 2025 11:15:33 +0200
Subject: [PATCH 481/878] Reapply "[libc++] Avoid constructing additional
 objects when using map::at" (#160738) (#161485)

This reverts commit b86aaacf28b358b187071bc87075f1faa2d65c4e.

The issue in LLVM has been fixed now.
+template +inline const bool __is_transparently_comparable_v = false; + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___FUNCTIONAL_IS_TRANSPARENT diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h index 7b0ea11db5844..7f315ca851c08 100644 --- a/libcxx/include/__functional/operations.h +++ b/libcxx/include/__functional/operations.h @@ -15,7 +15,9 @@ #include <__functional/unary_function.h> #include <__fwd/functional.h> #include <__type_traits/desugars_to.h> +#include <__type_traits/is_generic_transparent_comparator.h> #include <__type_traits/is_integral.h> +#include <__type_traits/make_transparent.h> #include <__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -377,6 +379,14 @@ struct less { typedef void is_transparent; }; +template +struct __make_transparent > { + using type _LIBCPP_NODEBUG = less<>; +}; + +template <> +inline const bool __is_generic_transparent_comparator_v> = true; + template inline const bool __desugars_to_v<__less_tag, less<>, _Tp, _Up> = true; @@ -466,6 +476,14 @@ struct greater { template inline const bool __desugars_to_v<__greater_tag, greater<>, _Tp, _Up> = true; + +template +struct __make_transparent> { + using type _LIBCPP_NODEBUG = greater<>; +}; + +template <> +inline const bool __is_generic_transparent_comparator_v> = true; #endif // Logical operations diff --git a/libcxx/include/__functional/ranges_operations.h b/libcxx/include/__functional/ranges_operations.h index df95843e7c9af..dc9da061af264 100644 --- a/libcxx/include/__functional/ranges_operations.h +++ b/libcxx/include/__functional/ranges_operations.h @@ -14,6 +14,7 @@ #include <__concepts/totally_ordered.h> #include <__config> #include <__type_traits/desugars_to.h> +#include <__type_traits/is_generic_transparent_comparator.h> #include <__utility/forward.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -108,6 +109,12 @@ inline const bool __desugars_to_v<__less_tag, ranges::less, _Tp, _Up> = true; template inline const bool __desugars_to_v<__greater_tag, ranges::greater, _Tp, _Up> = true; +template <> +inline const bool __is_generic_transparent_comparator_v = true; + +template <> +inline const bool __is_generic_transparent_comparator_v = true; + #endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__tree b/libcxx/include/__tree index 61c910c52c536..ef960d481cb7b 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -34,6 +34,7 @@ #include <__type_traits/is_same.h> #include <__type_traits/is_specialization.h> #include <__type_traits/is_swappable.h> +#include <__type_traits/make_transparent.h> #include <__type_traits/remove_const.h> #include <__utility/forward.h> #include <__utility/lazy_synth_three_way_comparator.h> @@ -1749,7 +1750,8 @@ __tree<_Tp, _Compare, _Allocator>::__find_equal(const _Key& __v) { } __node_base_pointer* __node_ptr = __root_ptr(); - auto __comp = __lazy_synth_three_way_comparator<_Compare, _Key, value_type>(value_comp()); + auto&& __transparent = std::__as_transparent(value_comp()); + auto __comp = __lazy_synth_three_way_comparator<__make_transparent_t<_Compare>, _Key, value_type>(__transparent); while (true) { auto __comp_res = __comp(__v, __nd->__get_value()); diff --git a/libcxx/include/__type_traits/is_generic_transparent_comparator.h b/libcxx/include/__type_traits/is_generic_transparent_comparator.h new file mode 100644 index 0000000000000..fd02c0b0423d1 --- /dev/null +++ b/libcxx/include/__type_traits/is_generic_transparent_comparator.h @@ -0,0 +1,30 @@ 
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___TYPE_TRAITS_IS_GENERIC_TRANSPARENT_COMPARATOR_H
+#define _LIBCPP___TYPE_TRAITS_IS_GENERIC_TRANSPARENT_COMPARATOR_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// This trait returns true if the given _Comparator is known to accept any two types for comparison. This is separate
+// from `__is_transparent_v`, since that only enables overloads of specific functions, but doesn't give any semantic
+// guarantees. This trait guarantees that the comparator simply calls the appropriate comparison functions for any two
+// types.
+
+template <class _Comparator>
+inline const bool __is_generic_transparent_comparator_v = false;
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___TYPE_TRAITS_IS_GENERIC_TRANSPARENT_COMPARATOR_H
diff --git a/libcxx/include/__type_traits/make_transparent.h b/libcxx/include/__type_traits/make_transparent.h
new file mode 100644
index 0000000000000..4d3207a807fa7
--- /dev/null
+++ b/libcxx/include/__type_traits/make_transparent.h
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___TYPE_TRAITS_MAKE_TRANSPARENT_H
+#define _LIBCPP___TYPE_TRAITS_MAKE_TRANSPARENT_H
+
+#include <__config>
+#include <__type_traits/enable_if.h>
+#include <__type_traits/is_empty.h>
+#include <__type_traits/is_same.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// __make_transparent tries to create a transparent comparator from its non-transparent counterpart, e.g. obtain
+// `less<>` from `less<_Tp>`. This is useful in cases where conversions can be avoided (e.g. a string literal to a
+// std::string).
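A rough user-level analogue of the mapping described above (a sketch; `make_transparent` here is an illustrative stand-in for the internal `__make_transparent` defined below):

#include <functional>
#include <string>
#include <type_traits>

// Map a homogeneous comparator to its transparent counterpart:
// less<std::string> becomes less<>, so lookups can compare a string literal
// directly against stored keys instead of materializing a temporary.
template <class Comp>
struct make_transparent { using type = Comp; };

template <class T>
struct make_transparent<std::less<T>> { using type = std::less<>; };

static_assert(std::is_same_v<typename make_transparent<std::less<std::string>>::type,
                             std::less<>>);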
+
+template <class _Comparator>
+struct __make_transparent {
+  using type _LIBCPP_NODEBUG = _Comparator;
+};
+
+template <class _Comparator>
+using __make_transparent_t _LIBCPP_NODEBUG = typename __make_transparent<_Comparator>::type;
+
+template <class _Comparator,
+          __enable_if_t<is_same<_Comparator, __make_transparent_t<_Comparator> >::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _Comparator& __as_transparent(_Comparator& __comp) {
+  return __comp;
+}
+
+template <class _Comparator,
+          __enable_if_t<!is_same<_Comparator, __make_transparent_t<_Comparator> >::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI __make_transparent_t<_Comparator> __as_transparent(_Comparator&) {
+  static_assert(is_empty<_Comparator>::value);
+  return __make_transparent_t<_Comparator>();
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___TYPE_TRAITS_MAKE_TRANSPARENT_H
diff --git a/libcxx/include/map b/libcxx/include/map
index a63dfec910aae..035f913bd3497 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -600,7 +600,10 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred); // C++20
 # include <__ranges/from_range.h>
 # include <__tree>
 # include <__type_traits/container_traits.h>
+# include <__type_traits/desugars_to.h>
 # include <__type_traits/is_allocator.h>
+# include <__type_traits/is_convertible.h>
+# include <__type_traits/make_transparent.h>
 # include <__type_traits/remove_const.h>
 # include <__type_traits/type_identity.h>
 # include <__utility/forward.h>
@@ -666,6 +669,11 @@ public:
 #  endif
 };
 
+template <class _Key, class _MapValueT, class _Compare>
+struct __make_transparent<__map_value_compare<_Key, _MapValueT, _Compare> > {
+  using type _LIBCPP_NODEBUG = __map_value_compare<_Key, _MapValueT, __make_transparent_t<_Compare> >;
+};
+
 #  if _LIBCPP_STD_VER >= 14
 template <class _Key, class _MapValueT, class _Compare>
 struct __lazy_synth_three_way_comparator<__map_value_compare<_Key, _MapValueT, _Compare>, _MapValueT, _MapValueT> {
@@ -1048,6 +1056,24 @@ public:
   _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
 #  endif
 
+  template <class _Arg,
+            __enable_if_t<__is_transparently_comparable_v<_Compare, key_type, __remove_cvref_t<_Arg> >, int> = 0>
+  _LIBCPP_HIDE_FROM_ABI mapped_type& at(_Arg&& __arg) {
+    auto [_, __child] = __tree_.__find_equal(__arg);
+    if (__child == nullptr)
+      std::__throw_out_of_range("map::at: key not found");
+    return static_cast<__node_pointer>(__child)->__get_value().second;
+  }
+
+  template <class _Arg,
+            __enable_if_t<__is_transparently_comparable_v<_Compare, key_type, __remove_cvref_t<_Arg> >, int> = 0>
+  _LIBCPP_HIDE_FROM_ABI const mapped_type& at(_Arg&& __arg) const {
+    auto [_, __child] = __tree_.__find_equal(__arg);
+    if (__child == nullptr)
+      std::__throw_out_of_range("map::at: key not found");
+    return static_cast<__node_pointer>(__child)->__get_value().second;
+  }
+
   _LIBCPP_HIDE_FROM_ABI mapped_type& at(const key_type& __k);
   _LIBCPP_HIDE_FROM_ABI const mapped_type& at(const key_type& __k) const;
 
@@ -1242,11 +1268,15 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __k) { return __tree_.find(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __k) const { return __tree_.find(__k); }
 #  if _LIBCPP_STD_VER >= 14
-  template <class _K2, __enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
+  template <class _K2,
+            __enable_if_t<__is_transparent_v<_Compare, _K2> || __is_transparently_comparable_v<_Compare, key_type, _K2>,
+                          int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator find(const _K2& __k) { return __tree_.find(__k); }
-  template <class _K2, __enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
+  template <class _K2,
+            __enable_if_t<__is_transparent_v<_Compare, _K2> || __is_transparently_comparable_v<_Compare, key_type, _K2>,
+                          int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator find(const _K2& __k) const { return __tree_.find(__k); }
 
@@ -1262,7 +1292,9 @@ public:
 #  if _LIBCPP_STD_VER >= 20
   _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __k) const { return find(__k) != end(); }
-  template <class _K2, __enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
+  template <class _K2,
+            __enable_if_t<__is_transparent_v<_Compare, _K2> || __is_transparently_comparable_v<_Compare, key_type, _K2>,
+                          int> = 0>
   _LIBCPP_HIDE_FROM_ABI bool contains(const _K2& __k) const { return find(__k) != end(); }
 
@@ -1271,12 +1303,16 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __k) { return __tree_.lower_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __k) const { return __tree_.lower_bound(__k); }
 #  if _LIBCPP_STD_VER >= 14
-  template <class _K2, __enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
+  template <class _K2,
+            __enable_if_t<__is_transparent_v<_Compare, _K2> || __is_transparently_comparable_v<_Compare, key_type, _K2>,
+                          int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _K2& __k) { return __tree_.lower_bound(__k); }
 
-  template <class _K2, __enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
+  template <class _K2,
+            __enable_if_t<__is_transparent_v<_Compare, _K2> || __is_transparently_comparable_v<_Compare, key_type, _K2>,
+                          int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _K2& __k) const { return __tree_.lower_bound(__k); }
 
@@ -1285,11 +1321,15 @@ public:
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __k) { return __tree_.upper_bound(__k); }
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __k) const { return __tree_.upper_bound(__k); }
 #  if _LIBCPP_STD_VER >= 14
-  template <class _K2, __enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
+  template <class _K2,
+            __enable_if_t<__is_transparent_v<_Compare, _K2> || __is_transparently_comparable_v<_Compare, key_type, _K2>,
+                          int> = 0>
   _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _K2& __k) { return __tree_.upper_bound(__k); }
-  template <class _K2, __enable_if_t<__is_transparent_v<_Compare, _K2>, int> = 0>
+  template <class _K2,
+            __enable_if_t<__is_transparent_v<_Compare, _K2> || __is_transparently_comparable_v<_Compare, key_type, _K2>,
+                          int> = 0>
   _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _K2& __k) const { return __tree_.upper_bound(__k); }
 
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 93d43f8d7e195..894093b409e11 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -200,6 +200,7 @@ module std_core [system] {
       header "__type_traits/is_fundamental.h"
       export std_core.type_traits.integral_constant
     }
+    module is_generic_transparent_comparator { header "__type_traits/is_generic_transparent_comparator.h" }
    module is_implicit_lifetime {
      header "__type_traits/is_implicit_lifetime.h"
      export std_core.type_traits.integral_constant
@@ -353,6 +354,7 @@ module std_core [system] {
     module make_32_64_or_128_bit { header "__type_traits/make_32_64_or_128_bit.h" }
     module make_const_lvalue_ref { header "__type_traits/make_const_lvalue_ref.h" }
     module make_signed           { header "__type_traits/make_signed.h" }
+    module make_transparent      { header "__type_traits/make_transparent.h" }
     module make_unsigned         { header "__type_traits/make_unsigned.h" }
     module maybe_const           { header "__type_traits/maybe_const.h" }
     module nat                   { header "__type_traits/nat.h" }
diff --git a/libcxx/include/string b/libcxx/include/string
index cfd6861e5c9c2..dc562e0207630 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -600,6 +600,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 # include <__debug_utils/sanitizers.h>
 # include <__format/enable_insertable.h>
 # include <__functional/hash.h>
+# include <__functional/is_transparent.h>
 # include <__functional/unary_function.h>
 # include <__fwd/string.h>
 # include <__iterator/bounded_iter.h>
@@ -628,6 +629,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len );
 # include <__type_traits/is_allocator.h>
 # include <__type_traits/is_array.h>
 # include <__type_traits/is_convertible.h>
+# include <__type_traits/is_generic_transparent_comparator.h>
 # include <__type_traits/is_nothrow_assignable.h>
 # include <__type_traits/is_nothrow_constructible.h>
 # include <__type_traits/is_replaceable.h>
@@ -2567,6 +2569,20 @@ struct __default_three_way_comparator<basic_string<_CharT, _Traits, _Alloc>, basic_string<_CharT, _Traits, _Alloc> > {
 };
 #  endif
 
+template <class _Comparator, class _CharT, class _Traits, class _Alloc>
+inline const bool __is_transparently_comparable_v<_Comparator,
+                                                  basic_string<_CharT, _Traits, _Alloc>,
+                                                  const _CharT*,
+                                                  __enable_if_t<__is_generic_transparent_comparator_v<_Comparator> > > =
+    true;
+
+template <class _Comparator, class _CharT, class _Traits, class _Alloc, size_t _Np>
+inline const bool __is_transparently_comparable_v<_Comparator,
+                                                  basic_string<_CharT, _Traits, _Alloc>,
+                                                  _CharT[_Np],
+                                                  __enable_if_t<__is_generic_transparent_comparator_v<_Comparator> > > =
+    true;
+
 #  if _LIBCPP_STD_VER >= 17
 template <class _CharT,
diff --git a/libcxx/test/benchmarks/containers/associative/map.bench.cpp b/libcxx/test/benchmarks/containers/associative/map.bench.cpp
index bd664dbb56ee7..142229ae64cad 100644
--- a/libcxx/test/benchmarks/containers/associative/map.bench.cpp
+++ b/libcxx/test/benchmarks/containers/associative/map.bench.cpp
@@ -16,6 +16,19 @@
 #include "../../GenerateInput.h"
 #include "benchmark/benchmark.h"
 
+static void BM_map_find_string_literal(benchmark::State& state) {
+  std::map<std::string, int> map;
+  map.emplace("Something very very long to show a long string situation", 1);
+  map.emplace("Something Else", 2);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(map);
+    benchmark::DoNotOptimize(map.find("Something very very long to show a long string situation"));
+  }
+}
+
+BENCHMARK(BM_map_find_string_literal);
+
 template <class K, class V>
 struct support::adapt_operations<std::map<K, V>> {
   using ValueType = typename std::map<K, V>::value_type;
diff --git a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp
index 57adec2d214d4..d670c531910ea 100644
--- a/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp
+++ b/libcxx/test/benchmarks/containers/associative/unordered_map.bench.cpp
@@ -15,6 +15,19 @@
 #include "../../GenerateInput.h"
 #include "benchmark/benchmark.h"
 
+static void BM_map_find_string_literal(benchmark::State& state) {
+  std::unordered_map<std::string, int> map;
+  map.emplace("Something very very long to show a long string situation", 1);
+  map.emplace("Something Else", 2);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(map);
+    benchmark::DoNotOptimize(map.find("Something very very long to show a long string situation"));
+  }
+}
+
+BENCHMARK(BM_map_find_string_literal);
+
 template <class K, class V>
 struct support::adapt_operations<std::unordered_map<K, V>> {
   using ValueType = typename std::unordered_map<K, V>::value_type;

From 04c01ff144a172230c053d73eb15831a4120db81 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Thu, 2 Oct 2025 09:20:19 +0000
Subject: [PATCH 482/878] [gn build] Port 55803b8af1e4

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 42a7940ccd44e..f771099cb4c4a 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1496,6 +1496,7 @@ if (current_toolchain == default_toolchain) {
     "__type_traits/is_floating_point.h",
     "__type_traits/is_function.h",
     "__type_traits/is_fundamental.h",
+    "__type_traits/is_generic_transparent_comparator.h",
    "__type_traits/is_implicit_lifetime.h",
    "__type_traits/is_implicitly_default_constructible.h",
    "__type_traits/is_integral.h",
@@ -1538,6 +1539,7 @@ if (current_toolchain == default_toolchain) {
    "__type_traits/make_32_64_or_128_bit.h",
    "__type_traits/make_const_lvalue_ref.h",
    "__type_traits/make_signed.h",
+    "__type_traits/make_transparent.h",
    "__type_traits/make_unsigned.h",
    "__type_traits/maybe_const.h",
    "__type_traits/nat.h",

From eb803df5029d08321102d59ead4c61d03ddc8a7a Mon Sep 17 00:00:00 2001
From: Ryan Cowan
Date: Thu, 2 Oct 2025 10:30:31 +0100
Subject: [PATCH 483/878] [AArch64][GlobalISel] Add `G_FMODF` instruction
 (#160061)

This commit adds the `G_FMODF` instruction to GMIR and enables its translation,
legalization, and instruction selection in AArch64.
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   4 +
 .../CodeGen/GlobalISel/MachineIRBuilder.h     |   7 +
 llvm/include/llvm/Support/TargetOpcodes.def   |   3 +
 llvm/include/llvm/Target/GenericOpcodes.td    |   7 +
 llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  |   7 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  63 +++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   2 +-
 .../Target/SPIRV/SPIRVInstructionSelector.cpp |  15 +-
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp  |   1 +
 .../AArch64/GlobalISel/legalize-modf.mir      | 206 ++++++++
 .../GlobalISel/legalizer-info-validation.mir  |   4 +
 .../AArch64/GlobalISel/select-modf.mir        | 136 ++++++
 llvm/test/CodeGen/AArch64/llvm.modf.ll        | 459 ++++++++++++------
 .../GlobalISel/legalizer-info-validation.mir  |   7 +-
 .../match-table-cxx.td                        | 132 ++---
 .../GlobalISelEmitter/GlobalISelEmitter.td    |   2 +-
 16 files changed, 826 insertions(+), 229 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 22569aab236af..c0e426c4a8db3 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -300,6 +300,10 @@ class LegalizerHelper {
                                    Type *OpType,
                                    LostDebugLocObserver &LocObserver);
 
+  LegalizeResult emitModfLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+                                 unsigned Size, Type *OpType,
+                                 LostDebugLocObserver &LocObserver);
+
 public:
   /// Return the alignment to use for a stack temporary object with the given
   /// type.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 0b6033b4ba60a..40c7792f7e8a2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -2184,6 +2184,13 @@ class LLVM_ABI MachineIRBuilder {
     return buildInstr(TargetOpcode::G_FSINCOS, {Sin, Cos}, {Src}, Flags);
   }
 
+  /// Build and insert \p Fract, \p Int = G_FMODF \p Src
+  MachineInstrBuilder buildModf(const DstOp &Fract, const DstOp &Int,
+                                const SrcOp &Src,
+                                std::optional<unsigned> Flags = std::nullopt) {
+    return buildInstr(TargetOpcode::G_FMODF, {Fract, Int}, {Src}, Flags);
+  }
+
   /// Build and insert \p Res = G_FCOPYSIGN \p Op0, \p Op1
   MachineInstrBuilder buildFCopysign(const DstOp &Dst, const SrcOp &Src0,
                                      const SrcOp &Src1) {
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
index 7710e2fc2f22b..e55314568d683 100644
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -650,6 +650,9 @@ HANDLE_TARGET_OPCODE(G_FDIV)
 /// Generic FP remainder.
 HANDLE_TARGET_OPCODE(G_FREM)
 
+/// Generic FP modf
+HANDLE_TARGET_OPCODE(G_FMODF)
+
 /// Generic FP exponentiation.
 HANDLE_TARGET_OPCODE(G_FPOW)
 
diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td
index 733d10b1c5f3c..faf77880e4614 100644
--- a/llvm/include/llvm/Target/GenericOpcodes.td
+++ b/llvm/include/llvm/Target/GenericOpcodes.td
@@ -981,6 +981,13 @@ def G_FREM : GenericInstruction {
   let hasSideEffects = false;
 }
 
+/// Generic FP modf
+def G_FMODF : GenericInstruction {
+  let OutOperandList = (outs type0:$dst1, type0:$dst2);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = false;
+}
+
 // Floating point exponentiation.
 def G_FPOW : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 56e13f075aaac..884c3f1692e94 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2362,6 +2362,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
                          MachineInstr::copyFlagsFromInstruction(CI));
     return true;
   }
+  case Intrinsic::modf: {
+    ArrayRef<Register> VRegs = getOrCreateVRegs(CI);
+    MIRBuilder.buildModf(VRegs[0], VRegs[1],
+                         getOrCreateVReg(*CI.getArgOperand(0)),
+                         MachineInstr::copyFlagsFromInstruction(CI));
+    return true;
+  }
   case Intrinsic::sincos: {
     ArrayRef<Register> VRegs = getOrCreateVRegs(CI);
     MIRBuilder.buildFSincos(VRegs[0], VRegs[1],
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 03dfa6f3f243f..cffaf7ce5aa06 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -471,6 +471,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
     RTLIBCASE(TANH_F);
   case TargetOpcode::G_FSINCOS:
     RTLIBCASE(SINCOS_F);
+  case TargetOpcode::G_FMODF:
+    RTLIBCASE(MODF_F);
   case TargetOpcode::G_FLOG10:
     RTLIBCASE(LOG10_F);
   case TargetOpcode::G_FLOG:
@@ -702,6 +704,46 @@ LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall(
   return LegalizerHelper::Legalized;
 }
 
+LegalizerHelper::LegalizeResult
+LegalizerHelper::emitModfLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+                                 unsigned Size, Type *OpType,
+                                 LostDebugLocObserver &LocObserver) {
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  Register DstFrac = MI.getOperand(0).getReg();
+  Register DstInt = MI.getOperand(1).getReg();
+  Register Src = MI.getOperand(2).getReg();
+  LLT DstTy = MRI.getType(DstFrac);
+
+  int MemSize = DstTy.getSizeInBytes();
+  Align Alignment = getStackTemporaryAlignment(DstTy);
+  const DataLayout &DL = MIRBuilder.getDataLayout();
+  unsigned AddrSpace = DL.getAllocaAddrSpace();
+  MachinePointerInfo PtrInfo;
+
+  Register StackPtrInt =
+      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
+          .getReg(0);
+
+  auto &Ctx = MF.getFunction().getContext();
+  auto LibcallResult = createLibcall(
+      MIRBuilder, getRTLibDesc(MI.getOpcode(), Size), {DstFrac, OpType, 0},
+      {{Src, OpType, 0}, {StackPtrInt, PointerType::get(Ctx, AddrSpace), 1}},
+      LocObserver, &MI);
+
+  if (LibcallResult != LegalizeResult::Legalized)
+    return LegalizerHelper::UnableToLegalize;
+
+  MachineMemOperand *LoadMMOInt = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);
+
+  MIRBuilder.buildLoad(DstInt, StackPtrInt, *LoadMMOInt);
+  MI.eraseFromParent();
+
+  return LegalizerHelper::Legalized;
+}
+
 LegalizerHelper::LegalizeResult
 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
@@ -1341,6 +1383,16 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
     }
     return emitSincosLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
   }
+  case TargetOpcode::G_FMODF: {
+    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
+    unsigned Size = LLTy.getSizeInBits();
+    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
+    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
+      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
+      return UnableToLegalize;
UnableToLegalize; + } + return emitModfLibcall(MI, MIRBuilder, Size, HLTy, LocObserver); + } case TargetOpcode::G_LROUND: case TargetOpcode::G_LLROUND: case TargetOpcode::G_INTRINSIC_LRINT: @@ -3333,6 +3385,16 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); Observer.changedInstr(MI); return Legalized; + case TargetOpcode::G_FMODF: { + Observer.changingInstr(MI); + widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT); + + widenScalarDst(MI, WideTy, 1, TargetOpcode::G_FPTRUNC); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), --MIRBuilder.getInsertPt()); + widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_FPOWI: case TargetOpcode::G_FLDEXP: case TargetOpcode::G_STRICT_FLDEXP: { @@ -5472,6 +5534,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_LROUND: case G_LLROUND: case G_INTRINSIC_TRUNC: + case G_FMODF: case G_FCOS: case G_FSIN: case G_FTAN: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 7ee54c5932b15..c197550ee38c7 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -438,7 +438,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_FCOS, G_FSIN, G_FPOW, G_FLOG, G_FLOG2, G_FLOG10, G_FTAN, G_FEXP, G_FEXP2, G_FEXP10, G_FACOS, G_FASIN, G_FATAN, G_FATAN2, G_FCOSH, - G_FSINH, G_FTANH}) + G_FSINH, G_FTANH, G_FMODF}) // We need a call for these, so we always need to scalarize. .scalarize(0) // Regardless of FP16 support, widen 16-bit elements to 32-bits. diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 273edf374bef0..0afec42135337 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -752,6 +752,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::exp, GL::Exp); case TargetOpcode::G_FEXP2: return selectExtInst(ResVReg, ResType, I, CL::exp2, GL::Exp2); + case TargetOpcode::G_FMODF: + return selectModf(ResVReg, ResType, I); case TargetOpcode::G_FLOG: return selectExtInst(ResVReg, ResType, I, CL::log, GL::Log); @@ -3453,9 +3455,6 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_discard: { return selectDiscard(ResVReg, ResType, I); } - case Intrinsic::modf: { - return selectModf(ResVReg, ResType, I); - } default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); @@ -4268,6 +4267,7 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, PtrTyReg, LLT::pointer(storageClassToAddressSpace(SPIRV::StorageClass::Function), GR.getPointerSize())); + // Assign SPIR-V type of the pointer type of the alloca variable to the // new register. GR.assignSPIRVTypeToVReg(PtrType, PtrTyReg, MIRBuilder.getMF()); @@ -4280,10 +4280,7 @@ bool SPIRVInstructionSelector::selectModf(Register ResVReg, .addUse(GR.getSPIRVTypeID(PtrType)) .addImm(static_cast(SPIRV::StorageClass::Function)); Register Variable = AllocaMIB->getOperand(0).getReg(); - // Modf must have 4 operands, the first two are the 2 parts of the result, - // the third is the operand, and the last one is the floating point value. 
-  assert(I.getNumOperands() == 4 &&
-         "Expected 4 operands for modf instruction");
+
   MachineBasicBlock &BB = *I.getParent();
   // Create the OpenCLLIB::modf instruction.
   auto MIB =
       BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
           .addDef(ResVReg)
           .addUse(GR.getSPIRVTypeID(ResType))
           .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::OpenCL_std))
           .addImm(CL::modf)
           .setMIFlags(I.getFlags())
-          .add(I.getOperand(3)) // Floating point value.
-          .addUse(Variable); // Pointer to integral part.
+          .add(I.getOperand(I.getNumExplicitDefs())) // Floating point value.
+          .addUse(Variable); // Pointer to integral part.
   // Assign the integral part stored in the ptr to the second element of the
   // result.
   Register IntegralPartReg = I.getOperand(1).getReg();
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index db85e33a3c6c7..53074ea3b2597 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -300,6 +300,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
   getActionDefinitionsBuilder({G_STRICT_FSQRT,
                                G_FPOW,
                                G_FEXP,
+                               G_FMODF,
                                G_FEXP2,
                                G_FLOG,
                                G_FLOG2,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir
new file mode 100644
index 0000000000000..36ac7ebf007d1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir
@@ -0,0 +1,206 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
+---
+name: test_modf_f16
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test_modf_f16
+    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[COPY]](s16)
+    ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK-NEXT: $s0 = COPY [[FPEXT]](s32)
+    ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0)
+    ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.0)
+    ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[LOAD]](s32)
+    ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY1]](s32)
+    ; CHECK-NEXT: $h0 = COPY [[FPTRUNC1]](s16)
+    ; CHECK-NEXT: $h1 = COPY [[FPTRUNC]](s16)
+    ; CHECK-NEXT: RET_ReallyLR implicit $h0, implicit $h1
+    %0:_(s16) = COPY $h0
+    %1:_(s16), %2:_(s16) = G_FMODF %0
+    $h0 = COPY %1(s16)
+    $h1 = COPY %2(s16)
+    RET_ReallyLR implicit $h0, implicit $h1
+...
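In C terms, the f16 lowering exercised by the test above computes roughly the following (an illustrative sketch of the legalized sequence, assuming the standard modff signature; there is no half-precision libcall, so the value is widened first):

#include <cmath>

// Sketch: G_FMODF on s16 widens the source to f32 (G_FPEXT), calls modff
// with a pointer to a stack slot, reloads the integral part (G_LOAD), and
// truncates both results back to f16 (G_FPTRUNC).
void modf_f16_lowering(float widened, float& frac_out, float& int_out) {
  float integral; // the %stack.0 temporary
  frac_out = modff(widened, &integral); // later G_FPTRUNCed to half
  int_out = integral;                   // later G_FPTRUNCed to half
}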
+--- +name: test_modf_f16_only_use_fractional_part +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_modf_f16_only_use_fractional_part + ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 + ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[COPY]](s16) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $s0 = COPY [[FPEXT]](s32) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0) + ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY1]](s32) + ; CHECK-NEXT: $h0 = COPY [[FPTRUNC]](s16) + ; CHECK-NEXT: RET_ReallyLR implicit $h0 + %0:_(s16) = COPY $h0 + %1:_(s16), %2:_(s16) = G_FMODF %0 + $h0 = COPY %1(s16) + RET_ReallyLR implicit $h0 +... +--- +name: test_modf_v2f16 +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_modf_v2f16 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $s0 = COPY [[FPEXT]](s32) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0) + ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.1) + ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[LOAD]](s32) + ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY1]](s32) + ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $s0 = COPY [[FPEXT1]](s32) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX1]](p0) + ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load (s32) from %stack.0) + ; CHECK-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[LOAD1]](s32) + ; CHECK-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[FPTRUNC1]](s16), [[FPTRUNC3]](s16), [[DEF]](s16), [[DEF]](s16) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[FPTRUNC]](s16), [[FPTRUNC2]](s16), [[DEF]](s16), [[DEF]](s16) + ; CHECK-NEXT: $d0 = COPY [[BUILD_VECTOR]](<4 x s16>) + ; CHECK-NEXT: $d1 = COPY [[BUILD_VECTOR1]](<4 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1 + %1:_(<4 x s16>) = COPY $d0 + %0:_(<2 x s16>), %2:_(<2 x s16>) = G_UNMERGE_VALUES %1(<4 x s16>) + %3:_(<2 x s16>), %4:_(<2 x s16>) = G_FMODF %0 + %5:_(s16), %6:_(s16) = G_UNMERGE_VALUES %3(<2 x s16>) + %7:_(s16) = 
G_IMPLICIT_DEF + %8:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %7(s16) + %9:_(s16), %10:_(s16) = G_UNMERGE_VALUES %4(<2 x s16>) + %11:_(<4 x s16>) = G_BUILD_VECTOR %9(s16), %10(s16), %7(s16), %7(s16) + $d0 = COPY %8(<4 x s16>) + $d1 = COPY %11(<4 x s16>) + RET_ReallyLR implicit $d0, implicit $d1 +... +--- +name: test_modf_v3f32 +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_modf_v3f32 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.2 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $s0 = COPY [[UV]](s32) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0) + ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.2) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $s0 = COPY [[UV1]](s32) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX1]](p0) + ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load (s32) from %stack.1) + ; CHECK-NEXT: [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $s0 = COPY [[UV2]](s32) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX2]](p0) + ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p0) :: (load (s32) from %stack.0) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[DEF]](s32) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[DEF]](s32) + ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: $q1 = COPY [[BUILD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %1:_(<2 x s64>) = COPY $q0 + %2:_(<4 x s32>) = G_BITCAST %1(<2 x s64>) + %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %2(<4 x s32>) + %0:_(<3 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32), %5(s32) + %7:_(<3 x s32>), %8:_(<3 x s32>) = G_FMODF %0 + %9:_(s32), %10:_(s32), %11:_(s32) = G_UNMERGE_VALUES %7(<3 x s32>) + %12:_(s32) = G_IMPLICIT_DEF + %13:_(<4 x s32>) = G_BUILD_VECTOR %9(s32), %10(s32), %11(s32), %12(s32) + %14:_(s32), %15:_(s32), %16:_(s32) = G_UNMERGE_VALUES %8(<3 x s32>) + %17:_(<4 x s32>) = G_BUILD_VECTOR %14(s32), %15(s32), %16(s32), %12(s32) + $q0 = COPY %13(<4 x s32>) + $q1 = COPY %17(<4 x s32>) + RET_ReallyLR implicit 
$q0, implicit $q1 +... +--- +name: test_modf_v2f64 +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_modf_v2f64 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $d0 = COPY [[UV]](s64) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0) + ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s64) from %stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $d0 = COPY [[UV1]](s64) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX1]](p0) + ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX1]](p0) :: (load (s64) from %stack.0) + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY1]](s64), [[COPY2]](s64) + ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64) + ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; CHECK-NEXT: $q1 = COPY [[BUILD_VECTOR1]](<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:_(<2 x s64>) = COPY $q0 + %1:_(<2 x s64>), %2:_(<2 x s64>) = G_FMODF %0 + $q0 = COPY %1(<2 x s64>) + $q1 = COPY %2(<2 x s64>) + RET_ReallyLR implicit $q0, implicit $q1 +... +--- +name: test_modf_fp128 +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_modf_fp128 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $q0 + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $q0 = COPY [[COPY]](s128) + ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0) + ; CHECK-NEXT: BL &modfl, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $q0, implicit $x0, implicit-def $q0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s128) = COPY $q0 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s128) from %stack.0) + ; CHECK-NEXT: $q0 = COPY [[COPY1]](s128) + ; CHECK-NEXT: $q1 = COPY [[LOAD]](s128) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:_(s128) = COPY $q0 + %1:_(s128), %2:_(s128) = G_FMODF %0 + $q0 = COPY %1(s128) + $q1 = COPY %2(s128) + RET_ReallyLR implicit $q0, implicit $q1 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index ba867f4ae0c26..d721b73c2b5ba 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -508,6 +508,10 @@ # DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FMODF (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FPOW (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir new file mode 100644 index 0000000000000..604cb96e38dc3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir @@ -0,0 +1,136 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -run-pass=instruction-select %s -o - | FileCheck %s +--- +name: test_modf_fp128 +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, size: 16, alignment: 16 } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: test_modf_fp128 + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $q0 = COPY [[COPY]] + ; CHECK-NEXT: $x0 = COPY [[ADDXri]] + ; CHECK-NEXT: BL &modfl, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $q0, implicit $x0, implicit-def $q0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q0 + ; CHECK-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0) + ; CHECK-NEXT: $q0 = COPY [[COPY1]] + ; CHECK-NEXT: $q1 = COPY [[LDRQui]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:fpr(s128) = COPY $q0 + %3:gpr(p0) = G_FRAME_INDEX %stack.0 + ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + $q0 = COPY %0(s128) + $x0 = COPY %3(p0) + BL &modfl, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $q0, implicit $x0, implicit-def $q0 + ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + %1:fpr(s128) = COPY $q0 + %2:fpr(s128) = G_LOAD %3(p0) :: (load (s128) from %stack.0) + $q0 = COPY %1(s128) + $q1 = COPY %2(s128) + RET_ReallyLR implicit $q0, implicit $q1 +... 
+--- +name: test_modf_double +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +frameInfo: + maxAlignment: 8 +stack: + - { id: 0, size: 8, alignment: 8 } +machineFunctionInfo: {} +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: test_modf_double + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $d0 = COPY [[COPY]] + ; CHECK-NEXT: $x0 = COPY [[ADDXri]] + ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui %stack.0, 0 :: (load (s64) from %stack.0) + ; CHECK-NEXT: $d0 = COPY [[COPY1]] + ; CHECK-NEXT: $d1 = COPY [[LDRDui]] + ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1 + %0:fpr(s64) = COPY $d0 + %3:gpr(p0) = G_FRAME_INDEX %stack.0 + ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + $d0 = COPY %0(s64) + $x0 = COPY %3(p0) + BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0 + ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + %1:fpr(s64) = COPY $d0 + %2:fpr(s64) = G_LOAD %3(p0) :: (load (s64) from %stack.0) + $d0 = COPY %1(s64) + $d1 = COPY %2(s64) + RET_ReallyLR implicit $d0, implicit $d1 +... +--- +name: test_modf_double_vec +alignment: 4 +legalized: true +regBankSelected: true +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +frameInfo: + maxAlignment: 8 +stack: + - { id: 0, size: 8, alignment: 8 } +machineFunctionInfo: {} +body: | + bb.1: + liveins: $d0 + + ; CHECK-LABEL: name: test_modf_double_vec + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0 + ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: $d0 = COPY [[COPY]] + ; CHECK-NEXT: $x0 = COPY [[ADDXri]] + ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0 + ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d0 + ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui %stack.0, 0 :: (load (s64) from %stack.0) + ; CHECK-NEXT: $d0 = COPY [[COPY1]] + ; CHECK-NEXT: $d1 = COPY [[LDRDui]] + ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1 + %0:fpr(s64) = COPY $d0 + %3:gpr(p0) = G_FRAME_INDEX %stack.0 + ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + $d0 = COPY %0(s64) + $x0 = COPY %3(p0) + BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0 + ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + %1:fpr(s64) = COPY $d0 + %2:fpr(s64) = G_LOAD %3(p0) :: (load (s64) from %stack.0) + $d0 = COPY %1(s64) + $d1 = COPY %2(s64) + RET_ReallyLR implicit $d0, implicit $d1 +... 
diff --git a/llvm/test/CodeGen/AArch64/llvm.modf.ll b/llvm/test/CodeGen/AArch64/llvm.modf.ll index 41fe796daca86..503742fa1c443 100644 --- a/llvm/test/CodeGen/AArch64/llvm.modf.ll +++ b/llvm/test/CodeGen/AArch64/llvm.modf.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK,CHECK-SD %s +; RUN: llc -mtriple=aarch64-gnu-linux -global-isel < %s | FileCheck -check-prefixes=CHECK,CHECK-GI %s define { half, half } @test_modf_f16(half %a) { ; CHECK-LABEL: test_modf_f16: @@ -55,61 +56,95 @@ define half @test_modf_f16_only_use_integral_part(half %a) { } define { <2 x half>, <2 x half> } @test_modf_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_modf_v2f16: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: add x0, sp, #44 -; CHECK-NEXT: fcvt s0, h1 -; CHECK-NEXT: bl modff -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcvt h0, s0 -; CHECK-NEXT: add x0, sp, #40 -; CHECK-NEXT: fcvt s1, h1 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: bl modff -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcvt h2, s0 -; CHECK-NEXT: add x0, sp, #56 -; CHECK-NEXT: mov h1, v1.h[2] -; CHECK-NEXT: fcvt s0, h1 -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov v2.h[1], v1.h[0] -; CHECK-NEXT: str q2, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: bl modff -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: fcvt h2, s0 -; CHECK-NEXT: add x0, sp, #60 -; CHECK-NEXT: mov h1, v1.h[3] -; CHECK-NEXT: fcvt s0, h1 -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.h[2], v2.h[0] -; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: bl modff -; CHECK-NEXT: ldp s2, s1, [sp, #40] -; CHECK-NEXT: fcvt h4, s0 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: fcvt h3, s1 -; CHECK-NEXT: fcvt h1, s2 -; CHECK-NEXT: ldr s2, [sp, #56] -; CHECK-NEXT: mov v0.h[3], v4.h[0] -; CHECK-NEXT: fcvt h2, s2 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: mov v1.h[1], v3.h[0] -; CHECK-NEXT: ldr s3, [sp, #60] -; CHECK-NEXT: mov v1.h[2], v2.h[0] -; CHECK-NEXT: fcvt h2, s3 -; CHECK-NEXT: mov v1.h[3], v2.h[0] -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_modf_v2f16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov h1, v0.h[1] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x0, sp, #44 +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt h0, s0 +; CHECK-SD-NEXT: add x0, sp, #40 +; CHECK-SD-NEXT: fcvt s1, h1 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; 
CHECK-SD-NEXT: fmov s0, s1 +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt h2, s0 +; CHECK-SD-NEXT: add x0, sp, #56 +; CHECK-SD-NEXT: mov h1, v1.h[2] +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v2.h[1], v1.h[0] +; CHECK-SD-NEXT: str q2, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: fcvt h2, s0 +; CHECK-SD-NEXT: add x0, sp, #60 +; CHECK-SD-NEXT: mov h1, v1.h[3] +; CHECK-SD-NEXT: fcvt s0, h1 +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.h[2], v2.h[0] +; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: ldp s2, s1, [sp, #40] +; CHECK-SD-NEXT: fcvt h4, s0 +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-SD-NEXT: fcvt h3, s1 +; CHECK-SD-NEXT: fcvt h1, s2 +; CHECK-SD-NEXT: ldr s2, [sp, #56] +; CHECK-SD-NEXT: mov v0.h[3], v4.h[0] +; CHECK-SD-NEXT: fcvt h2, s2 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: mov v1.h[1], v3.h[0] +; CHECK-SD-NEXT: ldr s3, [sp, #60] +; CHECK-SD-NEXT: mov v1.h[2], v2.h[0] +; CHECK-SD-NEXT: fcvt h2, s3 +; CHECK-SD-NEXT: mov v1.h[3], v2.h[0] +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_modf_v2f16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: str d8, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: str x30, [sp, #56] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset b8, -16 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov h8, v0.h[1] +; CHECK-GI-NEXT: add x0, sp, #40 +; CHECK-GI-NEXT: fcvt s0, h0 +; CHECK-GI-NEXT: bl modff +; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr s1, [sp, #40] +; CHECK-GI-NEXT: add x0, sp, #44 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt h0, s1 +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fcvt s0, h8 +; CHECK-GI-NEXT: bl modff +; CHECK-GI-NEXT: ldr s1, [sp, #44] +; CHECK-GI-NEXT: fcvt h3, s0 +; CHECK-GI-NEXT: ldr x30, [sp, #56] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldr d8, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: fcvt h2, s1 +; CHECK-GI-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: mov v0.h[1], v3.h[0] +; CHECK-GI-NEXT: mov v1.h[1], v2.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret %result = call { <2 x half>, <2 x half> } @llvm.modf.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } @@ -130,80 +165,156 @@ define { float, float } @test_modf_f32(float %a) { } define { <3 x float>, <3 x float> } @test_modf_v3f32(<3 x float> %a) { -; CHECK-LABEL: test_modf_v3f32: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w30, -32 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: mov s0, v0.s[1] -; 
CHECK-NEXT: add x0, sp, #56 -; CHECK-NEXT: add x19, sp, #56 -; CHECK-NEXT: bl modff -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: add x0, sp, #44 -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: bl modff -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: add x0, sp, #60 -; CHECK-NEXT: add x20, sp, #60 -; CHECK-NEXT: mov v0.s[1], v1.s[0] -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: mov s0, v0.s[2] -; CHECK-NEXT: bl modff -; CHECK-NEXT: ldr s1, [sp, #44] -; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload -; CHECK-NEXT: ld1 { v1.s }[1], [x19] -; CHECK-NEXT: mov v2.s[2], v0.s[0] -; CHECK-NEXT: ld1 { v1.s }[2], [x20] -; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: add sp, sp, #80 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_modf_v3f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #80 +; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-SD-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 80 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w20, -16 +; CHECK-SD-NEXT: .cfi_offset w30, -32 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: add x0, sp, #56 +; CHECK-SD-NEXT: add x19, sp, #56 +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x0, sp, #44 +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: add x0, sp, #60 +; CHECK-SD-NEXT: add x20, sp, #60 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov s0, v0.s[2] +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: ldr s1, [sp, #44] +; CHECK-SD-NEXT: ldr q2, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-SD-NEXT: ld1 { v1.s }[1], [x19] +; CHECK-SD-NEXT: mov v2.s[2], v0.s[0] +; CHECK-SD-NEXT: ld1 { v1.s }[2], [x20] +; CHECK-SD-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.16b, v2.16b +; CHECK-SD-NEXT: add sp, sp, #80 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_modf_v3f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -24 +; CHECK-GI-NEXT: .cfi_offset b9, -32 +; CHECK-GI-NEXT: add x0, sp, #68 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: mov s9, v0.s[2] +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: bl modff +; CHECK-GI-NEXT: ldr s1, [sp, 
#68] +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: add x0, sp, #72 +; CHECK-GI-NEXT: stp q0, q1, [sp, #32] // 32-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s8 +; CHECK-GI-NEXT: bl modff +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: add x0, sp, #76 +; CHECK-GI-NEXT: add x19, sp, #76 +; CHECK-GI-NEXT: ldr s0, [sp, #72] +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s9 +; CHECK-GI-NEXT: bl modff +; CHECK-GI-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v2.s[1], v1.s[0] +; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.s[1], v3.s[0] +; CHECK-GI-NEXT: mov v2.s[2], v0.s[0] +; CHECK-GI-NEXT: ld1 { v1.s }[2], [x19] +; CHECK-GI-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ret %result = call { <3 x float>, <3 x float> } @llvm.modf.v3f32(<3 x float> %a) ret { <3 x float>, <3 x float> } %result } define { <2 x float>, <2 x float> } @test_modf_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_modf_v2f32: -; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: add x0, sp, #40 -; CHECK-NEXT: add x19, sp, #40 -; CHECK-NEXT: mov s0, v0.s[1] -; CHECK-NEXT: bl modff -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: add x0, sp, #44 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: bl modff -; CHECK-NEXT: ldr s1, [sp, #44] -; CHECK-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ld1 { v1.s }[1], [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_modf_v2f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x0, sp, #40 +; CHECK-SD-NEXT: add x19, sp, #40 +; CHECK-SD-NEXT: mov s0, v0.s[1] +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x0, sp, #44 +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: bl modff +; CHECK-SD-NEXT: ldr s1, [sp, #44] +; CHECK-SD-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-SD-NEXT: ld1 { v1.s }[1], [x19] +; CHECK-SD-NEXT: 
ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.s[1], v2.s[0] +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_modf_v2f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: str d8, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -32 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: add x0, sp, #40 +; CHECK-GI-NEXT: mov s8, v0.s[1] +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: bl modff +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: add x0, sp, #44 +; CHECK-GI-NEXT: add x19, sp, #44 +; CHECK-GI-NEXT: ldr s0, [sp, #40] +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov s0, s8 +; CHECK-GI-NEXT: bl modff +; CHECK-GI-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-GI-NEXT: ldr d8, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v2.s[1], v0.s[0] +; CHECK-GI-NEXT: ld1 { v1.s }[1], [x19] +; CHECK-GI-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-GI-NEXT: fmov d0, d2 +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret %result = call { <2 x float>, <2 x float> } @llvm.modf.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } @@ -224,32 +335,80 @@ define { double, double } @test_modf_f64(double %a) { } define { <2 x double>, <2 x double> } @test_modf_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_modf_v2f64: +; CHECK-SD-LABEL: test_modf_v2f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov d0, v0.d[1] +; CHECK-SD-NEXT: add x0, sp, #32 +; CHECK-SD-NEXT: add x19, sp, #32 +; CHECK-SD-NEXT: bl modf +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: add x0, sp, #40 +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: bl modf +; CHECK-SD-NEXT: ldr d1, [sp, #40] +; CHECK-SD-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: ld1 { v1.d }[1], [x19] +; CHECK-SD-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v0.d[1], v2.d[0] +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_modf_v2f64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sub sp, sp, #80 +; CHECK-GI-NEXT: str d8, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 80 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: .cfi_offset b8, -32 +; CHECK-GI-NEXT: add x0, sp, #40 +; CHECK-GI-NEXT: mov d8, v0.d[1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
CHECK-GI-NEXT: bl modf +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: add x0, sp, #56 +; CHECK-GI-NEXT: add x19, sp, #56 +; CHECK-GI-NEXT: ldr d0, [sp, #40] +; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: fmov d0, d8 +; CHECK-GI-NEXT: bl modf +; CHECK-GI-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: ldr d8, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: mov v2.d[1], v0.d[0] +; CHECK-GI-NEXT: ld1 { v1.d }[1], [x19] +; CHECK-GI-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v0.16b, v2.16b +; CHECK-GI-NEXT: add sp, sp, #80 +; CHECK-GI-NEXT: ret + %result = call { <2 x double>, <2 x double> } @llvm.modf.v2f64(<2 x double> %a) + ret { <2 x double>, <2 x double> } %result +} + +define { fp128, fp128 } @test_modf_fp128(fp128 %a) { +; CHECK-LABEL: test_modf_fp128: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-NEXT: mov d0, v0.d[1] -; CHECK-NEXT: add x0, sp, #32 -; CHECK-NEXT: add x19, sp, #32 -; CHECK-NEXT: bl modf -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: add x0, sp, #40 -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: bl modf -; CHECK-NEXT: ldr d1, [sp, #40] -; CHECK-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ld1 { v1.d }[1], [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], v2.d[0] -; CHECK-NEXT: add sp, sp, #64 +; CHECK-NEXT: mov x0, sp +; CHECK-NEXT: bl modfl +; CHECK-NEXT: ldr q1, [sp] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 ; CHECK-NEXT: ret - %result = call { <2 x double>, <2 x double> } @llvm.modf.v2f64(<2 x double> %a) - ret { <2 x double>, <2 x double> } %result + %result = call { fp128, fp128 } @llvm.modf.fp128(fp128 %a) + ret { fp128, fp128 } %result } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir index 7204064a07f40..f1d17f9f02b90 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir @@ -505,6 +505,9 @@ # DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FMODF (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FPOW (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK @@ -607,11 +610,11 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FMINIMUMNUM (opcode {{[0-9]+}}): 1 type index, 0 imm indices -# DEBUG-NEXT: .. opcode 219 is aliased to 183 +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FMAXIMUMNUM (opcode {{[0-9]+}}): 1 type index, 0 imm indices -# DEBUG-NEXT: .. opcode 220 is aliased to 183 +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_GET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td index ce4f0108b4843..18960b43ab97d 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-cxx.td @@ -96,71 +96,71 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(99), GIMT_Encode2(210), /*)*//*default:*//*Label 5*/ GIMT_Encode4(520), -// CHECK-NEXT: /* 10 */ /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(454), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /* 182 */ /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(472), GIMT_Encode4(0), -// CHECK-NEXT: /* 190 */ /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(484), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /* 414 */ /*TargetOpcode::G_FNEG*//*Label 3*/ GIMT_Encode4(496), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /* 450 */ /*TargetOpcode::G_FABS*//*Label 
4*/ GIMT_Encode4(508), -// CHECK-NEXT: /* 454 */ // Label 0: @454 -// CHECK-NEXT: /* 454 */ GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(471), // Rule ID 2 // -// CHECK-NEXT: /* 459 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), -// CHECK-NEXT: /* 462 */ // MIs[0] x -// CHECK-NEXT: /* 462 */ // No operand predicates -// CHECK-NEXT: /* 462 */ // MIs[0] y -// CHECK-NEXT: /* 462 */ // No operand predicates -// CHECK-NEXT: /* 462 */ GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_GICombiner0), -// CHECK-NEXT: /* 466 */ GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_GICombiner1), -// CHECK-NEXT: /* 470 */ // Combiner Rule #2: TwoMatchNoApply -// CHECK-NEXT: /* 470 */ GIR_EraseRootFromParent_Done, -// CHECK-NEXT: /* 471 */ // Label 6: @471 -// CHECK-NEXT: /* 471 */ GIM_Reject, -// CHECK-NEXT: /* 472 */ // Label 1: @472 -// CHECK-NEXT: /* 472 */ GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(483), // Rule ID 3 // -// CHECK-NEXT: /* 477 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), -// CHECK-NEXT: /* 480 */ // MIs[0] a -// CHECK-NEXT: /* 480 */ // No operand predicates -// CHECK-NEXT: /* 480 */ // MIs[0] y -// CHECK-NEXT: /* 480 */ // No operand predicates -// CHECK-NEXT: /* 480 */ // Combiner Rule #3: NoMatchTwoApply -// CHECK-NEXT: /* 480 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner2), -// CHECK-NEXT: /* 483 */ // Label 7: @483 -// CHECK-NEXT: /* 483 */ GIM_Reject, -// CHECK-NEXT: /* 484 */ // Label 2: @484 -// CHECK-NEXT: /* 484 */ GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(495), // Rule ID 4 // -// CHECK-NEXT: /* 489 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), -// CHECK-NEXT: /* 492 */ // MIs[0] a -// CHECK-NEXT: /* 492 */ // No operand predicates -// CHECK-NEXT: /* 492 */ // MIs[0] y -// CHECK-NEXT: /* 492 */ // No operand predicates -// CHECK-NEXT: /* 492 */ // Combiner Rule #4: CombineCXXOrder -// CHECK-NEXT: /* 492 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner3), -// CHECK-NEXT: /* 495 */ // Label 8: @495 -// CHECK-NEXT: /* 495 */ GIM_Reject, -// CHECK-NEXT: /* 496 */ // Label 3: @496 -// CHECK-NEXT: /* 496 */ GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(507), // Rule ID 1 // -// CHECK-NEXT: /* 501 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), -// CHECK-NEXT: /* 504 */ // MIs[0] a -// CHECK-NEXT: /* 504 */ // No operand predicates -// CHECK-NEXT: /* 504 */ // MIs[0] b -// CHECK-NEXT: /* 504 */ // No operand predicates -// CHECK-NEXT: /* 504 */ // Combiner Rule #1: TwoMatchTwoApply -// CHECK-NEXT: /* 504 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner1), -// CHECK-NEXT: /* 507 */ // Label 9: @507 -// CHECK-NEXT: /* 507 */ GIM_Reject, -// CHECK-NEXT: /* 508 */ // Label 4: @508 -// CHECK-NEXT: /* 508 */ GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(519), // Rule ID 0 // -// CHECK-NEXT: /* 513 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), -// CHECK-NEXT: /* 516 */ // MIs[0] a -// CHECK-NEXT: /* 516 */ // No operand predicates -// CHECK-NEXT: /* 516 */ // MIs[0] b -// CHECK-NEXT: /* 516 */ // No operand predicates -// CHECK-NEXT: /* 516 */ // Combiner Rule #0: OneMatchOneApply -// CHECK-NEXT: /* 516 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), -// CHECK-NEXT: /* 519 */ // Label 10: @519 -// CHECK-NEXT: /* 519 */ GIM_Reject, -// 
CHECK-NEXT: /* 520 */ // Label 5: @520 -// CHECK-NEXT: /* 520 */ GIM_Reject, -// CHECK-NEXT: /* 521 */ }; // Size: 521 bytes +// CHECK-NEXT: /* 0 */ GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(99), GIMT_Encode2(211), /*)*//*default:*//*Label 5*/ GIMT_Encode4(524), +// CHECK-NEXT: /* 10 */ /*TargetOpcode::G_STORE*//*Label 0*/ GIMT_Encode4(458), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /* 182 */ /*TargetOpcode::G_SEXT*//*Label 1*/ GIMT_Encode4(476), GIMT_Encode4(0), +// CHECK-NEXT: /* 190 */ /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(488), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /* 418 */ /*TargetOpcode::G_FNEG*//*Label 3*/ GIMT_Encode4(500), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /* 454 */ /*TargetOpcode::G_FABS*//*Label 4*/ GIMT_Encode4(512), +// CHECK-NEXT: /* 458 */ // Label 0: @458 +// CHECK-NEXT: /* 458 */ GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(475), // Rule ID 2 // +// CHECK-NEXT: /* 463 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), +// CHECK-NEXT: /* 466 */ // MIs[0] x +// CHECK-NEXT: /* 466 */ // No operand predicates +// CHECK-NEXT: /* 466 */ // MIs[0] y +// CHECK-NEXT: /* 466 */ // No operand predicates +// CHECK-NEXT: /* 466 */ GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_GICombiner0), +// CHECK-NEXT: /* 470 */ GIM_CheckCxxInsnPredicate, /*MI*/0, /*FnId*/GIMT_Encode2(GICXXPred_MI_Predicate_GICombiner1), +// CHECK-NEXT: /* 474 */ // Combiner Rule #2: TwoMatchNoApply +// CHECK-NEXT: /* 474 */ GIR_EraseRootFromParent_Done, +// CHECK-NEXT: /* 475 */ // Label 6: @475 +// CHECK-NEXT: /* 475 */ GIM_Reject, +// CHECK-NEXT: /* 476 */ // Label 1: @476 +// CHECK-NEXT: /* 476 */ GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(487), // Rule ID 3 // +// CHECK-NEXT: /* 481 */ GIM_CheckSimplePredicate, 
GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), +// CHECK-NEXT: /* 484 */ // MIs[0] a +// CHECK-NEXT: /* 484 */ // No operand predicates +// CHECK-NEXT: /* 484 */ // MIs[0] y +// CHECK-NEXT: /* 484 */ // No operand predicates +// CHECK-NEXT: /* 484 */ // Combiner Rule #3: NoMatchTwoApply +// CHECK-NEXT: /* 484 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner2), +// CHECK-NEXT: /* 487 */ // Label 7: @487 +// CHECK-NEXT: /* 487 */ GIM_Reject, +// CHECK-NEXT: /* 488 */ // Label 2: @488 +// CHECK-NEXT: /* 488 */ GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(499), // Rule ID 4 // +// CHECK-NEXT: /* 493 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), +// CHECK-NEXT: /* 496 */ // MIs[0] a +// CHECK-NEXT: /* 496 */ // No operand predicates +// CHECK-NEXT: /* 496 */ // MIs[0] y +// CHECK-NEXT: /* 496 */ // No operand predicates +// CHECK-NEXT: /* 496 */ // Combiner Rule #4: CombineCXXOrder +// CHECK-NEXT: /* 496 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner3), +// CHECK-NEXT: /* 499 */ // Label 8: @499 +// CHECK-NEXT: /* 499 */ GIM_Reject, +// CHECK-NEXT: /* 500 */ // Label 3: @500 +// CHECK-NEXT: /* 500 */ GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(511), // Rule ID 1 // +// CHECK-NEXT: /* 505 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), +// CHECK-NEXT: /* 508 */ // MIs[0] a +// CHECK-NEXT: /* 508 */ // No operand predicates +// CHECK-NEXT: /* 508 */ // MIs[0] b +// CHECK-NEXT: /* 508 */ // No operand predicates +// CHECK-NEXT: /* 508 */ // Combiner Rule #1: TwoMatchTwoApply +// CHECK-NEXT: /* 508 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner1), +// CHECK-NEXT: /* 511 */ // Label 9: @511 +// CHECK-NEXT: /* 511 */ GIM_Reject, +// CHECK-NEXT: /* 512 */ // Label 4: @512 +// CHECK-NEXT: /* 512 */ GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(523), // Rule ID 0 // +// CHECK-NEXT: /* 517 */ GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), +// CHECK-NEXT: /* 520 */ // MIs[0] a +// CHECK-NEXT: /* 520 */ // No operand predicates +// CHECK-NEXT: /* 520 */ // MIs[0] b +// CHECK-NEXT: /* 520 */ // No operand predicates +// CHECK-NEXT: /* 520 */ // Combiner Rule #0: OneMatchOneApply +// CHECK-NEXT: /* 520 */ GIR_DoneWithCustomAction, /*Fn*/GIMT_Encode2(GICXXCustomAction_GICombiner0), +// CHECK-NEXT: /* 523 */ // Label 10: @523 +// CHECK-NEXT: /* 523 */ GIM_Reject, +// CHECK-NEXT: /* 524 */ // Label 5: @524 +// CHECK-NEXT: /* 524 */ GIM_Reject, +// CHECK-NEXT: /* 525 */ }; // Size: 525 bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td index 6be1720a6da23..fdabc53a3ff3b 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td @@ -535,7 +535,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3), // R00O-NEXT: GIM_Reject, // R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]] // R00O-NEXT: GIM_Reject, -// R00O-NEXT: }; // Size: 1898 bytes +// R00O-NEXT: }; // Size: 1902 bytes def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4), [(set GPR32:$dst, From 2cb530868ca1d6e66e950f4b247b0905ee95d7eb Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Thu, 2 Oct 2025 10:45:27 +0100 Subject: [PATCH 484/878] [DebugInfo][InstrRef] Copy instr-ref to replacement instrs in 
 X86FixupSetCCPass (#159777)

...to preserve variable location coverage.

Fixes missing variable location coverage in the #49818 reproducer.
---
 llvm/lib/Target/X86/X86FixupSetCC.cpp         |  6 +++
 .../X86/x86fixupsetcc-debug-instr-num.mir     | 54 +++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 llvm/test/DebugInfo/X86/x86fixupsetcc-debug-instr-num.mir

diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp
index 2de89947c4519..ea93a575ec530 100644
--- a/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -136,6 +136,12 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
           .addReg(ZeroReg)
           .addReg(Reg0)
           .addImm(X86::sub_8bit);
+
+      // Redirect the debug-instr-number to the setcc.
+      if (unsigned InstrNum = ZExt->peekDebugInstrNum())
+        MF.makeDebugValueSubstitution({InstrNum, 0},
+                                      {MI.getDebugInstrNum(), 0});
+
       ToErase.push_back(ZExt);
     }
   }
diff --git a/llvm/test/DebugInfo/X86/x86fixupsetcc-debug-instr-num.mir b/llvm/test/DebugInfo/X86/x86fixupsetcc-debug-instr-num.mir
new file mode 100644
index 0000000000000..b7149f0155ae3
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/x86fixupsetcc-debug-instr-num.mir
@@ -0,0 +1,54 @@
+# RUN: llc %s --run-pass=x86-fixup-setcc -o - | FileCheck %s
+
+## Check that the debug-instr-number transfers from MOVZX32rr8 to the SETCC
+## after the mov is replaced with an INSERT_SUBREG, updating the substitutions
+## table.
+
+# CHECK: debugValueSubstitutions:
+# CHECK: - { srcinst: 1, srcop: 0, dstinst: 2, dstop: 0, subreg: 0 }
+
+# CHECK: %[[#]]:gr8 = SETCCr 15, implicit $eflags, debug-instr-number 2
+# CHECK: INSERT_SUBREG
+# CHECK-NOT: debug-instr-number
+# CHECK-NEXT: DBG_INSTR_REF ![[#]], !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0)
+
+--- |
+  source_filename = "reduced.ll"
+  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  define i32 @main(i32 %call2) {
+  entry:
+    %cmp12 = icmp sgt i32 %call2, 0
+    %conv13 = zext i1 %cmp12 to i32
+      #dbg_value(i32 %conv13, !4, !DIExpression(), !8)
+    ret i32 %conv13
+  }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 22.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !2, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "test.c", directory: "/")
+  !2 = !{}
+  !3 = !{i32 2, !"Debug Info Version", i32 3}
+  !4 = !DILocalVariable(name: "v_3", scope: !5, file: !1, line: 10, type: !7)
+  !5 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !6, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2, keyInstructions: true)
+  !6 = !DISubroutineType(types: !2)
+  !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !8 = !DILocation(line: 0, scope: !5)
+...
+---
+name: main
+body: |
+  bb.0.entry:
+    liveins: $edi
+
+    %0:gr32 = COPY $edi
+    TEST32rr %0, %0, implicit-def $eflags
+    %1:gr8 = SETCCr 15, implicit $eflags
+    %2:gr32 = MOVZX32rr8 killed %1, debug-instr-number 1
+    DBG_INSTR_REF !4, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !8
+    $eax = COPY %2
+    RET 0, $eax
+...
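
The patch above leans on LLVM's instruction-referencing debug info: when a pass erases a
value-producing instruction that carries a debug-instr-number and rebuilds the value in new
instructions, any DBG_INSTR_REF naming the old number goes stale unless a substitution is
recorded. Below is a minimal sketch of that pattern in isolation; the helper name and the
OldMI/NewMI parameters are illustrative only, but peekDebugInstrNum(), getDebugInstrNum(),
and makeDebugValueSubstitution() are the same APIs the patch calls.

```
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Record that operand 0 of OldMI's instruction number is now produced by
// operand 0 of NewMI, so DBG_INSTR_REFs keep resolving after OldMI is erased.
static void transferDebugInstrNum(MachineFunction &MF, MachineInstr &OldMI,
                                  MachineInstr &NewMI) {
  // peekDebugInstrNum() returns 0 if OldMI was never assigned a number
  // (i.e., no DBG_INSTR_REF refers to it), so the common case costs nothing.
  if (unsigned OldNum = OldMI.peekDebugInstrNum())
    // getDebugInstrNum() lazily assigns NewMI a fresh number if needed.
    MF.makeDebugValueSubstitution({OldNum, 0}, {NewMI.getDebugInstrNum(), 0});
}
```
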
From 3c391877c10b5fea24b4c44cb8631a15d4cb809c Mon Sep 17 00:00:00 2001
From: Ruoyu Qiu
Date: Thu, 2 Oct 2025 18:02:55 +0800
Subject: [PATCH 485/878] [ELF] Add overflow check to ELF note iterator
 (#160451)

Add an overflow check to the ELF note iterator to handle a large
`p_filesz` or `sh_size`, avoiding access to invalid memory.

---------

Signed-off-by: Ruoyu Qiu
---
 llvm/include/llvm/Object/ELF.h    |  6 ++-
 llvm/unittests/Object/ELFTest.cpp | 72 +++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index 0b362d389c177..59f63eb6b5bb6 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -407,7 +407,8 @@ class ELFFile {
   Elf_Note_Iterator notes_begin(const Elf_Phdr &Phdr, Error &Err) const {
     assert(Phdr.p_type == ELF::PT_NOTE && "Phdr is not of type PT_NOTE");
     ErrorAsOutParameter ErrAsOutParam(Err);
-    if (Phdr.p_offset + Phdr.p_filesz > getBufSize()) {
+    if (Phdr.p_offset + Phdr.p_filesz > getBufSize() ||
+        Phdr.p_offset + Phdr.p_filesz < Phdr.p_offset) {
       Err = createError("invalid offset (0x" + Twine::utohexstr(Phdr.p_offset) +
                         ") or size (0x" + Twine::utohexstr(Phdr.p_filesz) +
                         ")");
@@ -435,7 +436,8 @@ class ELFFile {
   Elf_Note_Iterator notes_begin(const Elf_Shdr &Shdr, Error &Err) const {
     assert(Shdr.sh_type == ELF::SHT_NOTE && "Shdr is not of type SHT_NOTE");
     ErrorAsOutParameter ErrAsOutParam(Err);
-    if (Shdr.sh_offset + Shdr.sh_size > getBufSize()) {
+    if (Shdr.sh_offset + Shdr.sh_size > getBufSize() ||
+        Shdr.sh_offset + Shdr.sh_size < Shdr.sh_offset) {
       Err = createError("invalid offset (0x" +
                         Twine::utohexstr(Shdr.sh_offset) + ") or size (0x" +
                         Twine::utohexstr(Shdr.sh_size) + ")");
diff --git a/llvm/unittests/Object/ELFTest.cpp b/llvm/unittests/Object/ELFTest.cpp
index faf855c09cfe8..7c68ab5c8985f 100644
--- a/llvm/unittests/Object/ELFTest.cpp
+++ b/llvm/unittests/Object/ELFTest.cpp
@@ -7,6 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Testing/Support/Error.h"
 #include "gtest/gtest.h"
 
@@ -310,3 +314,71 @@ TEST(ELFTest, Hash) {
   // presuming 32-bit long. Thus make sure that extra bit doesn't appear.
EXPECT_EQ(hashSysV("ZZZZZW9p"), 0U); } + +template +static Expected> toBinary(SmallVectorImpl &Storage, + StringRef Yaml) { + raw_svector_ostream OS(Storage); + yaml::Input YIn(Yaml); + if (!yaml::convertYAML(YIn, OS, [](const Twine &Msg) {})) + return createStringError(std::errc::invalid_argument, + "unable to convert YAML"); + return ELFObjectFile::create(MemoryBufferRef(OS.str(), "dummyELF")); +} + +TEST(ELFObjectFileTest, ELFNoteIteratorOverflow) { + using Elf_Shdr_Range = ELFFile::Elf_Shdr_Range; + using Elf_Phdr_Range = ELFFile::Elf_Phdr_Range; + + SmallString<0> Storage; + Expected> ElfOrErr = toBinary(Storage, R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + FileSize: 0xffffffffffffff88 + FirstSec: .note.gnu.build-id + LastSec: .note.gnu.build-id +Sections: + - Name: .note.gnu.build-id + Type: SHT_NOTE + AddressAlign: 0x04 + ShOffset: 0xffffffffffffff88 + Notes: + - Name: "GNU" + Desc: "abb50d82b6bdc861" + Type: 3 +)"); + ASSERT_THAT_EXPECTED(ElfOrErr, Succeeded()); + ELFFile Obj = ElfOrErr.get().getELFFile(); + + auto CheckOverflow = [&](auto &&PhdrOrShdr, uint64_t Offset, uint64_t Size) { + Error Err = Error::success(); + Obj.notes(PhdrOrShdr, Err); + + std::string ErrMessage; + handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { + ErrMessage = EI.message(); + }); + + EXPECT_EQ(ErrMessage, ("invalid offset (0x" + Twine::utohexstr(Offset) + + ") or size (0x" + Twine::utohexstr(Size) + ")") + .str()); + }; + + Expected PhdrsOrErr = Obj.program_headers(); + EXPECT_FALSE(!PhdrsOrErr); + for (Elf_Phdr_Impl P : *PhdrsOrErr) + if (P.p_type == ELF::PT_NOTE) + CheckOverflow(P, P.p_offset, P.p_filesz); + + Expected ShdrsOrErr = Obj.sections(); + EXPECT_FALSE(!ShdrsOrErr); + for (Elf_Shdr_Impl S : *ShdrsOrErr) + if (S.sh_type == ELF::SHT_NOTE) + CheckOverflow(S, S.sh_offset, S.sh_size); +} From 7ccb5c08f0685d4787f12c3224a72f0650c5865e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 2 Oct 2025 12:05:43 +0200 Subject: [PATCH 486/878] Thread Safety Analysis: Optimize LocalVariableMap's canonical reference computation (#161600) We observed slowdowns in auto-generated million+ line C++ source files due to recent fixes to LocalVariableMap. Rather than recompute the canonical underlying non-reference VarDefinition every time we need to perform variable definition intersection at CFG merge points, pre-compute the canonical references once on construction. Ensure to maintain the invariant that if both the direct and canonical reference are being invalidated, both become 0. Reported-by: Bogdan Graur --- clang/lib/Analysis/ThreadSafety.cpp | 36 +++++++++++++++++------------ 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index d19f86a2223d8..a56fdb1abd625 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -419,22 +419,28 @@ class LocalVariableMap { // The expression for this variable, OR const Expr *Exp = nullptr; - // Reference to another VarDefinition - unsigned Ref = 0; + // Direct reference to another VarDefinition + unsigned DirectRef = 0; + + // Reference to underlying canonical non-reference VarDefinition. + unsigned CanonicalRef = 0; // The map with which Exp should be interpreted. 
Context Ctx; bool isReference() const { return !Exp; } + void invalidateRef() { DirectRef = CanonicalRef = 0; } + private: // Create ordinary variable definition VarDefinition(const NamedDecl *D, const Expr *E, Context C) : Dec(D), Exp(E), Ctx(C) {} // Create reference to previous definition - VarDefinition(const NamedDecl *D, unsigned R, Context C) - : Dec(D), Ref(R), Ctx(C) {} + VarDefinition(const NamedDecl *D, unsigned DirectRef, unsigned CanonicalRef, + Context C) + : Dec(D), DirectRef(DirectRef), CanonicalRef(CanonicalRef), Ctx(C) {} }; private: @@ -445,7 +451,7 @@ class LocalVariableMap { public: LocalVariableMap() { // index 0 is a placeholder for undefined variables (aka phi-nodes). - VarDefinitions.push_back(VarDefinition(nullptr, 0u, getEmptyContext())); + VarDefinitions.push_back(VarDefinition(nullptr, 0, 0, getEmptyContext())); } /// Look up a definition, within the given context. @@ -471,7 +477,7 @@ class LocalVariableMap { Ctx = VarDefinitions[i].Ctx; return VarDefinitions[i].Exp; } - i = VarDefinitions[i].Ref; + i = VarDefinitions[i].DirectRef; } return nullptr; } @@ -508,7 +514,7 @@ class LocalVariableMap { void dump() { for (unsigned i = 1, e = VarDefinitions.size(); i < e; ++i) { const Expr *Exp = VarDefinitions[i].Exp; - unsigned Ref = VarDefinitions[i].Ref; + unsigned Ref = VarDefinitions[i].DirectRef; dumpVarDefinitionName(i); llvm::errs() << " = "; @@ -539,9 +545,9 @@ class LocalVariableMap { friend class VarMapBuilder; // Resolve any definition ID down to its non-reference base ID. - unsigned getCanonicalDefinitionID(unsigned ID) { + unsigned getCanonicalDefinitionID(unsigned ID) const { while (ID > 0 && VarDefinitions[ID].isReference()) - ID = VarDefinitions[ID].Ref; + ID = VarDefinitions[ID].CanonicalRef; return ID; } @@ -564,10 +570,11 @@ class LocalVariableMap { } // Add a new reference to an existing definition. - Context addReference(const NamedDecl *D, unsigned i, Context Ctx) { + Context addReference(const NamedDecl *D, unsigned Ref, Context Ctx) { unsigned newID = VarDefinitions.size(); Context NewCtx = ContextFactory.add(Ctx, D, newID); - VarDefinitions.push_back(VarDefinition(D, i, Ctx)); + VarDefinitions.push_back( + VarDefinition(D, Ref, getCanonicalDefinitionID(Ref), Ctx)); return NewCtx; } @@ -769,15 +776,14 @@ void LocalVariableMap::intersectBackEdge(Context C1, Context C2) { const unsigned *I2 = C2.lookup(P.first); if (!I2) { // Variable does not exist at the end of the loop, invalidate. - VDef->Ref = 0; + VDef->invalidateRef(); continue; } // Compare the canonical IDs. This correctly handles chains of references // and determines if the variable is truly loop-invariant. 
- if (getCanonicalDefinitionID(VDef->Ref) != getCanonicalDefinitionID(*I2)) { - VDef->Ref = 0; // Mark this variable as undefined - } + if (VDef->CanonicalRef != getCanonicalDefinitionID(*I2)) + VDef->invalidateRef(); // Mark this variable as undefined } } From 0db784ed3cc9b60b98a6de878ab5a940509a3d22 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 09:46:17 -0700 Subject: [PATCH 487/878] [MLIR] Apply clang-tidy fixes for llvm-else-after-return in TransformOps.cpp (NFC) --- mlir/lib/Dialect/Transform/IR/TransformOps.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 132ed815c354e..3385b2a38afc1 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -616,11 +616,10 @@ DiagnosedSilenceableFailure transform::ApplyConversionPatternsOp::apply( if (diag.succeeded()) { // Tracking failure is the only failure. return trackingFailure; - } else { - diag.attachNote() << "tracking listener also failed: " - << trackingFailure.getMessage(); - (void)trackingFailure.silence(); } + diag.attachNote() << "tracking listener also failed: " + << trackingFailure.getMessage(); + (void)trackingFailure.silence(); } if (!diag.succeeded()) From 2daa2f1538b7d0b4fa874ffc16940460da77767b Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Thu, 2 Oct 2025 12:33:39 +0200 Subject: [PATCH 488/878] [Clang] Fix a crash when using ctad with a template template parameter. (#161488) This fixes the crash reported in #130604. It does not try to improve diagnostics or resolve CWG3003 (this will be explored separately). Fixes #130604 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 9 ++++++--- clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b1ddfa070318b..c6ee1e282a008 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -437,6 +437,7 @@ Bug Fixes to C++ Support - Fix the result of `__builtin_is_implicit_lifetime` for types with a user-provided constructor. (#GH160610) - Correctly deduce return types in ``decltype`` expressions. (#GH160497) (#GH56652) (#GH116319) (#GH161196) - Fixed a crash in the pre-C++23 warning for attributes before a lambda declarator (#GH161070). +- Fix a crash when attempting to deduce a deduction guide from a non deducible template template parameter. 
(#130604) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 3d54d1eb4373a..fe673eac8fcfa 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -1428,10 +1428,13 @@ void Sema::DeclareImplicitDeductionGuides(TemplateDecl *Template, DeclareImplicitDeductionGuidesForTypeAlias(*this, AliasTemplate, Loc); return; } - if (CXXRecordDecl *DefRecord = - cast(Template->getTemplatedDecl())->getDefinition()) { + CXXRecordDecl *DefRecord = + dyn_cast_or_null(Template->getTemplatedDecl()); + if (!DefRecord) + return; + if (const CXXRecordDecl *Definition = DefRecord->getDefinition()) { if (TemplateDecl *DescribedTemplate = - DefRecord->getDescribedClassTemplate()) + Definition->getDescribedClassTemplate()) Template = DescribedTemplate; } diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp index 2f1817d0ca7eb..fd1a5c01233d5 100644 --- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp +++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp @@ -586,3 +586,18 @@ Baz a{}; static_assert(__is_same(decltype(a), A>)); } // namespace GH133132 + +namespace GH130604 { +template struct A { + A(T); +}; + +template class TT = A> using Alias = TT; // #gh130604-alias +template using Alias2 = Alias; + +Alias2 a(42); +// expected-error@-1 {{no viable constructor or deduction guide for deduction of template arguments of 'Alias2'}} +Alias b(42); +// expected-error@-1 {{alias template 'Alias' requires template arguments; argument deduction only allowed for class templates or alias template}} +// expected-note@#gh130604-alias {{template is declared here}} +} From 2165aa4c9dc828f18fe792551ed42799e65e0507 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 2 Oct 2025 20:34:12 +1000 Subject: [PATCH 489/878] [orc-rt] Tidy up some type_traits uses. NFC. --- orc-rt/include/orc-rt/Error.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/orc-rt/include/orc-rt/Error.h b/orc-rt/include/orc-rt/Error.h index fe0754bd1103f..6b43b3388aff4 100644 --- a/orc-rt/include/orc-rt/Error.h +++ b/orc-rt/include/orc-rt/Error.h @@ -114,7 +114,7 @@ class ORC_RT_NODISCARD Error { void setChecked(bool Checked) { ErrPtr = (ErrPtr & ~uintptr_t(1)) | Checked; } template std::unique_ptr takePayload() { - static_assert(std::is_base_of::value, + static_assert(std::is_base_of_v, "ErrT is not an ErrorInfoBase subclass"); std::unique_ptr Tmp(getPtr()); setPtr(nullptr); @@ -292,7 +292,7 @@ template class ORC_RT_NODISCARD Expected { template friend class Expected; - static constexpr bool IsRef = std::is_reference::value; + static constexpr bool IsRef = std::is_reference_v; using wrap = std::reference_wrapper>; using error_type = std::unique_ptr; using storage_type = std::conditional_t; @@ -313,7 +313,7 @@ template class ORC_RT_NODISCARD Expected { /// Create an Expected from a T value. template Expected(OtherT &&Val, - std::enable_if_t::value> * = nullptr) + std::enable_if_t> * = nullptr) : HasError(false), Unchecked(true) { new (getStorage()) storage_type(std::forward(Val)); } @@ -324,9 +324,8 @@ template class ORC_RT_NODISCARD Expected { /// Move construct an Expected value from an Expected, where OtherT /// must be convertible to T. 
template - Expected( - Expected &&Other, - std::enable_if_t::value> * = nullptr) { + Expected(Expected &&Other, + std::enable_if_t> * = nullptr) { moveConstruct(std::move(Other)); } @@ -335,7 +334,7 @@ template class ORC_RT_NODISCARD Expected { template explicit Expected( Expected &&Other, - std::enable_if_t::value> * = nullptr) { + std::enable_if_t> * = nullptr) { moveConstruct(std::move(Other)); } From 7c4f188f27ee7c562c2aa11b2384fd0ef918be94 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 2 Oct 2025 11:53:17 +0100 Subject: [PATCH 490/878] [LV] Support multiplies by constants when forming scaled reductions. (#161092) We can create partial reductions for multiplies with constants, if the constant is small enough to be extended from source to destination type w/o changing the value. This only handles constant on the right side of a multiply, relying on other passes to canonicalize the input. Alive2 Proofs: https://alive2.llvm.org/ce/z/iWRMr6 PR: https://github.com/llvm/llvm-project/pull/161092 --- .../Transforms/Vectorize/LoopVectorize.cpp | 7 ++++ llvm/lib/Transforms/Vectorize/VPlan.cpp | 10 ++++++ llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 4 +++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +++++ .../AArch64/partial-reduce-constant-ops.ll | 34 +++++++++---------- 5 files changed, 46 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e5d6c8118eb55..7fa787bc9befd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7954,6 +7954,13 @@ bool VPRecipeBuilder::getScaledReductions( auto CollectExtInfo = [this, &Exts, &ExtOpTypes, &ExtKinds](SmallVectorImpl &Ops) -> bool { for (const auto &[I, OpI] : enumerate(Ops)) { + auto *CI = dyn_cast(OpI); + if (I > 0 && CI && + canConstantBeExtended(CI, ExtOpTypes[0], ExtKinds[0])) { + ExtOpTypes[I] = ExtOpTypes[0]; + ExtKinds[I] = ExtKinds[0]; + continue; + } Value *ExtOp; if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) return false; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 02eb6375aac41..07b191a787806 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1753,6 +1753,16 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { } #endif +bool llvm::canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, + TTI::PartialReductionExtendKind ExtKind) { + APInt TruncatedVal = CI->getValue().trunc(NarrowType->getScalarSizeInBits()); + unsigned WideSize = CI->getType()->getScalarSizeInBits(); + APInt ExtendedVal = ExtKind == TTI::PR_SignExtend + ? TruncatedVal.sext(WideSize) + : TruncatedVal.zext(WideSize); + return ExtendedVal == CI->getValue(); +} + TargetTransformInfo::OperandValueInfo VPCostContext::getOperandInfo(VPValue *V) const { if (!V->isLiveIn()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index fe59774b7c838..fc1a09e9850f6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -468,6 +468,10 @@ class VPlanPrinter { }; #endif +/// Check if a constant \p CI can be safely treated as having been extended +/// from a narrower type with the given extension kind. 
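+/// For example, i32 63 round-trips through i8 under both zero- and
+/// sign-extension, whereas i32 255 round-trips under zero-extension only
+/// (truncating to i8 and sign-extending back yields -1, not 255).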
+bool canConstantBeExtended(const ConstantInt *CI, Type *NarrowType, + TTI::PartialReductionExtendKind ExtKind); } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 46909a53a9547..67b9244e9dc72 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -340,6 +340,14 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, : Widen->getOperand(1)); ExtAType = GetExtendKind(ExtAR); ExtBType = GetExtendKind(ExtBR); + + if (!ExtBR && Widen->getOperand(1)->isLiveIn()) { + auto *CI = cast(Widen->getOperand(1)->getLiveInIRValue()); + if (canConstantBeExtended(CI, InputTypeA, ExtAType)) { + InputTypeB = InputTypeA; + ExtBType = ExtAType; + } + } }; if (isa(OpR)) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll index 0086f6e61cd36..b033f6051f812 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll @@ -20,22 +20,22 @@ define i32 @red_zext_mul_by_63(ptr %start, ptr %end) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 63) -; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] @@ -48,7 +48,7 @@ define i32 @red_zext_mul_by_63(ptr %start, ptr %end) { ; 
CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; entry: @@ -86,17 +86,17 @@ define i32 @red_zext_mul_by_255(ptr %start, ptr %end) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 255) -; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -218,22 +218,22 @@ define i32 @red_sext_mul_by_63(ptr %start, ptr %end) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 63) -; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] @@ -246,7 +246,7 @@ define i32 @red_sext_mul_by_63(ptr %start, ptr %end) { ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] ; entry: From 6382bb5dda106a4bae0f0a17e88396036969784b Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 2 Oct 2025 11:50:16 +0100 Subject: [PATCH 491/878] [lldb][test] XFAIL TestGlobalSymbolObjCConflict.c on Windows Failing with: ``` error: command failed with exit status: 1 executed command: 'c:\buildbot\as-builder-10\lldb-x86-64\build\bin\filecheck.exe' 'C:\buildbot\as-builder-10\lldb-x86-64\llvm-project\lldb\test\Shell\Expr\TestGlobalSymbolObjCConflict.c' .---command stderr------------ | C:\buildbot\as-builder-10\lldb-x86-64\llvm-project\lldb\test\Shell\Expr\TestGlobalSymbolObjCConflict.c:30:11: error: CHECK: expected string not found in input | // CHECK: (lldb) p OglobalVar | ^ | :1:1: note: scanning from here | (lldb) command source -s 0 'C:/buildbot/as-builder-10/lldb-x86-64/build/tools/lldb\test\Shell\lit-lldb-init-quiet' | ^ | :4:1: note: possible intended match here | (lldb) target create "C:\\buildbot\\as-builder-10\\lldb-x86-64\\build\\tools\\lldb\\test\\Shell\\Expr\\Output\\TestGlobalSymbolObjCConflict.c.tmp.out" | ^ | | Input file: | Check file: C:\buildbot\as-builder-10\lldb-x86-64\llvm-project\lldb\test\Shell\Expr\TestGlobalSymbolObjCConflict.c | | -dump-input=help explains the following input dump. | | Input was: | <<<<<< | 1: (lldb) command source -s 0 'C:/buildbot/as-builder-10/lldb-x86-64/build/tools/lldb\test\Shell\lit-lldb-init-quiet' | check:30'0 X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found | 2: Executing commands in 'C:\buildbot\as-builder-10\lldb-x86-64\build\tools\lldb\test\Shell\lit-lldb-init-quiet'. | check:30'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 3: (lldb) command source -C --silent-run true lit-lldb-init | check:30'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 4: (lldb) target create "C:\\buildbot\\as-builder-10\\lldb-x86-64\\build\\tools\\lldb\\test\\Shell\\Expr\\Output\\TestGlobalSymbolObjCConflict.c.tmp.out" | check:30'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | check:30'1 ? 
possible intended match | 5: Current executable set to 'C:\buildbot\as-builder-10\lldb-x86-64\build\tools\lldb\test\Shell\Expr\Output\TestGlobalSymbolObjCConflict.c.tmp.out' (x86_64). | check:30'0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | 6: (lldb) b 27 | check:30'0 ~~~~~~~~~~~~ | >>>>>> `----------------------------- error: command failed with exit status: 1 ``` We probably need to use LLD here or something. But I don't have a Windows machine to test this on. So XFAILing for now. --- lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c b/lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c index 62c0162863337..8f1bb62874a12 100644 --- a/lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c +++ b/lldb/test/Shell/Expr/TestGlobalSymbolObjCConflict.c @@ -1,3 +1,5 @@ +// XFAIL: target-windows + // Tests that LLDB correctly parses global symbols // starting with 'O'. On some platforms (e.g., Darwin) // C-symbols are prefixed with a '_'. The LLDB Macho-O @@ -9,7 +11,7 @@ // RUN: %clang_host -c -g -fno-common %s -o %t.o // RUN: %clang_host %t.o -o %t.out // RUN: %lldb -b -x %t.out \ -// RUN: -o "b 27" \ +// RUN: -o "b 29" \ // RUN: -o "run" \ // RUN: -o "p OglobalVar" \ // RUN: -o "p Oabc" | FileCheck %s From 2eb63375912b5c6585c1fde2e49860d0d78d4fee Mon Sep 17 00:00:00 2001 From: Lucie Choi Date: Thu, 2 Oct 2025 04:00:56 -0700 Subject: [PATCH 492/878] [SPIR-V] Prevent adding duplicate binding instructions for implicit binding (#161299) Prevent adding duplicate instructions for implicit bindings when they are from the same resource. The fix is to store and check if the binding number is already assigned for each `OrderId`. Resolves https://github.com/llvm/llvm-project/issues/160716 --- .../SPIRV/SPIRVLegalizeImplicitBinding.cpp | 43 ++++++++++++++- .../SPIRV/hlsl-resources/ImplicitBinding.ll | 53 ++++++++++--------- .../UniqueImplicitBindingNumber.ll | 19 +++++++ 3 files changed, 90 insertions(+), 25 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index aea3397ad2fd6..205895e48a379 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -39,6 +39,7 @@ class SPIRVLegalizeImplicitBinding : public ModulePass { void collectBindingInfo(Module &M); uint32_t getAndReserveFirstUnusedBinding(uint32_t DescSet); void replaceImplicitBindingCalls(Module &M); + void verifyUniqueOrderIdPerResource(SmallVectorImpl &Calls); // A map from descriptor set to a bit vector of used binding numbers. std::vector UsedBindings; @@ -94,6 +95,33 @@ void SPIRVLegalizeImplicitBinding::collectBindingInfo(Module &M) { }); } +void SPIRVLegalizeImplicitBinding::verifyUniqueOrderIdPerResource( + SmallVectorImpl &Calls) { + // Check that the order Id is unique per resource. 
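+  // Calls sharing an order ID are expected to be adjacent here (the list is
+  // sorted by order ID), so comparing each call with its predecessor suffices.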
+ for (uint32_t i = 1; i < Calls.size(); ++i) { + const uint32_t OrderIdArgIdx = 0; + const uint32_t DescSetArgIdx = 1; + const uint32_t OrderA = + cast(Calls[i - 1]->getArgOperand(OrderIdArgIdx)) + ->getZExtValue(); + const uint32_t OrderB = + cast(Calls[i]->getArgOperand(OrderIdArgIdx)) + ->getZExtValue(); + if (OrderA == OrderB) { + const uint32_t DescSetA = + cast(Calls[i - 1]->getArgOperand(DescSetArgIdx)) + ->getZExtValue(); + const uint32_t DescSetB = + cast(Calls[i]->getArgOperand(DescSetArgIdx)) + ->getZExtValue(); + if (DescSetA != DescSetB) { + report_fatal_error("Implicit binding calls with the same order ID must " + "have the same descriptor set"); + } + } + } +} + uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( uint32_t DescSet) { if (UsedBindings.size() <= DescSet) { @@ -112,11 +140,23 @@ uint32_t SPIRVLegalizeImplicitBinding::getAndReserveFirstUnusedBinding( } void SPIRVLegalizeImplicitBinding::replaceImplicitBindingCalls(Module &M) { + uint32_t lastOrderId = -1; + uint32_t lastBindingNumber = -1; + for (CallInst *OldCI : ImplicitBindingCalls) { IRBuilder<> Builder(OldCI); + const uint32_t OrderId = + cast(OldCI->getArgOperand(0))->getZExtValue(); const uint32_t DescSet = cast(OldCI->getArgOperand(1))->getZExtValue(); - const uint32_t NewBinding = getAndReserveFirstUnusedBinding(DescSet); + + // Reuse an existing binding for this order ID, if one was already assigned. + // Otherwise, assign a new binding. + const uint32_t NewBinding = (lastOrderId == OrderId) + ? lastBindingNumber + : getAndReserveFirstUnusedBinding(DescSet); + lastOrderId = OrderId; + lastBindingNumber = NewBinding; SmallVector Args; Args.push_back(Builder.getInt32(DescSet)); @@ -142,6 +182,7 @@ bool SPIRVLegalizeImplicitBinding::runOnModule(Module &M) { if (ImplicitBindingCalls.empty()) { return false; } + verifyUniqueOrderIdPerResource(ImplicitBindingCalls); replaceImplicitBindingCalls(M); return true; diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll index cd524980ed275..2964da9058104 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll @@ -32,6 +32,7 @@ ; CHECK-DAG: OpDecorate [[g]] Binding 0 ; CHECK-DAG: OpDecorate [[h]] DescriptorSet 10 ; CHECK-DAG: OpDecorate [[h]] Binding 3 +; CHECK-NOT: OpDecorate [[h]] Binding 4 ; CHECK-DAG: OpDecorate [[i]] DescriptorSet 10 ; CHECK-DAG: OpDecorate [[i]] Binding 2 @@ -44,30 +45,34 @@ entry: %3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.6) %4 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 1, i32 1, i32 0, ptr nonnull @.str.8) %5 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 2, i32 10, i32 1, i32 0, ptr nonnull @.str.10) - %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 1, i32 0, ptr nonnull @.str.12) - %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14) - %8 = tail call noundef align 4 
-  %8 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0)
-  %9 = load i32, ptr addrspace(11) %8, align 4
-  %10 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0)
-  %11 = load i32, ptr addrspace(11) %10, align 4
-  %add.i = add nsw i32 %11, %9
-  %12 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
-  %13 = load i32, ptr addrspace(11) %12, align 4
-  %add4.i = add nsw i32 %add.i, %13
-  %14 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0)
-  %15 = load i32, ptr addrspace(11) %14, align 4
-  %add6.i = add nsw i32 %add4.i, %15
-  %16 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0)
-  %17 = load i32, ptr addrspace(11) %16, align 4
-  %add8.i = add nsw i32 %add6.i, %17
-  %18 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0)
-  %19 = load i32, ptr addrspace(11) %18, align 4
-  %add10.i = add nsw i32 %add8.i, %19
-  %20 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0)
-  %21 = load i32, ptr addrspace(11) %20, align 4
-  %add12.i = add nsw i32 %add10.i, %21
-  %22 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
-  store i32 %add12.i, ptr addrspace(11) %22, align 4
+  %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 0, ptr nonnull @.str.12)
+  %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 1, ptr nonnull @.str.12)
+  %8 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14)
+  %9 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0)
+  %10 = load i32, ptr addrspace(11) %9, align 4
+  %11 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0)
+  %12 = load i32, ptr addrspace(11) %11, align 4
+  %add.i = add nsw i32 %12, %10
+  %13 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
+  %14 = load i32, ptr addrspace(11) %13, align 4
+  %add4.i = add nsw i32 %add.i, %14
+  %15 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0)
+  %16 = load i32, ptr addrspace(11) %15, align 4
+  %add6.i = add nsw i32 %add4.i, %16
+  %17 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0)
+  %18 = load i32, ptr addrspace(11) %17, align 4
+  %add8.i = add nsw i32 %add6.i, %18
+  %19 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0)
+  %20 = load i32, ptr addrspace(11) %19, align 4
+  %add10.i = add nsw i32 %add8.i, %20
+  %21 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0)
+  %22 = load i32, ptr addrspace(11) %21, align 4
+  %add12.i = add nsw i32 %add10.i, %22
+  %23 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %8, i32 0)
+  %24 = load i32, ptr addrspace(11) %23, align 4
+  %add14.i = add nsw i32 %add12.i, %24
+  %25 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
+  store i32 %add14.i, ptr addrspace(11) %25, align 4
   ret void
 }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll
new file mode 100644
index 0000000000000..c968c99e4d58a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; CHECK-ERROR: LLVM ERROR: Implicit binding calls with the same order ID must have the same descriptor set
+
+@.str = private unnamed_addr constant [2 x i8] c"b\00", align 1
+@.str.2 = private unnamed_addr constant [2 x i8] c"c\00", align 1
+
+define void @main() local_unnamed_addr #0 {
+entry:
+  %0 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 0, i32 1, i32 0, ptr nonnull @.str)
+  %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
+  %2 = load i32, ptr addrspace(11) %1, align 4
+  %3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 1, i32 1, i32 0, ptr nonnull @.str.2)
+  %4 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
+  store i32 %2, ptr addrspace(11) %4, align 4
+  ret void
+}
+
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
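A minimal standalone sketch of the allocation scheme the patch above implements, for readers who want to see the reuse rule in isolation. All names here are hypothetical illustrations, not the pass's real API; the sketch only assumes what the commit message states: calls are sorted so that duplicates of an order ID are adjacent, and a repeated order ID must reuse the binding it was already given.

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <vector>

// Hypothetical model of one implicit-binding call: (order ID, descriptor set).
struct ImplicitBinding {
  uint32_t OrderId;
  uint32_t DescSet;
};

// Models getAndReserveFirstUnusedBinding: first free number in the set.
static uint32_t reserveFirstUnused(std::map<uint32_t, std::set<uint32_t>> &Used,
                                   uint32_t DescSet) {
  uint32_t N = 0;
  while (Used[DescSet].count(N))
    ++N;
  Used[DescSet].insert(N);
  return N;
}

int main() {
  // Sorted by order ID; order ID 1 appears twice (same resource, two uses).
  std::vector<ImplicitBinding> Calls = {{0, 0}, {1, 0}, {1, 0}, {2, 0}};
  std::map<uint32_t, std::set<uint32_t>> Used;

  uint32_t LastOrderId = UINT32_MAX, LastBinding = UINT32_MAX;
  for (const ImplicitBinding &IB : Calls) {
    // Same order ID as the previous call: reuse instead of reserving anew.
    uint32_t Binding = (IB.OrderId == LastOrderId)
                           ? LastBinding
                           : reserveFirstUnused(Used, IB.DescSet);
    LastOrderId = IB.OrderId;
    LastBinding = Binding;
    std::cout << "order " << IB.OrderId << " -> binding " << Binding << "\n";
  }
  return 0;
}
```

This prints bindings 0, 1, 1, 2: the repeated order ID no longer consumes a second binding number, which is what the new `CHECK-NOT: OpDecorate [[h]] Binding 4` line above guards against.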
From f3f9e7b928c2fb3828498e328c83bac14a9b46fe Mon Sep 17 00:00:00 2001
From: Henrich Lauko
Date: Thu, 2 Oct 2025 13:02:16 +0200
Subject: [PATCH 493/878] [CIR] Make all opt tests verify roundtrip (#161439)

This mirrors incubator changes from https://github.com/llvm/clangir/pull/1923
---
 clang/test/CIR/IR/alloca.cir             | 2 +-
 clang/test/CIR/IR/array-ctor.cir         | 2 +-
 clang/test/CIR/IR/array-dtor.cir         | 2 +-
 clang/test/CIR/IR/array.cir              | 2 +-
 clang/test/CIR/IR/atomic.cir             | 2 +-
 clang/test/CIR/IR/binassign.cir          | 2 +-
 clang/test/CIR/IR/bitfield_info.cir      | 2 +-
 clang/test/CIR/IR/call.cir               | 2 +-
 clang/test/CIR/IR/cast.cir               | 2 +-
 clang/test/CIR/IR/cmp.cir                | 2 +-
 clang/test/CIR/IR/complex.cir            | 2 +-
 clang/test/CIR/IR/copy.cir               | 2 +-
 clang/test/CIR/IR/func.cir               | 2 +-
 clang/test/CIR/IR/global-init.cir        | 2 +-
 clang/test/CIR/IR/global-var-linkage.cir | 3 +--
 clang/test/CIR/IR/global.cir             | 2 +-
 clang/test/CIR/IR/label.cir              | 2 +-
 clang/test/CIR/IR/module.cir             | 3 +--
 clang/test/CIR/IR/stack-save-restore.cir | 2 +-
 clang/test/CIR/IR/struct.cir             | 2 +-
 clang/test/CIR/IR/switch-flat.cir        | 2 +-
 clang/test/CIR/IR/switch.cir             | 2 +-
 clang/test/CIR/IR/ternary.cir            | 2 +-
 clang/test/CIR/IR/throw.cir              | 2 +-
 clang/test/CIR/IR/unary.cir              | 2 +-
 clang/test/CIR/IR/vector.cir             | 2 +-
 clang/test/CIR/IR/vtable-addrpt.cir      | 2 +-
 clang/test/CIR/IR/vtable-attr.cir        | 2 +-
 clang/test/CIR/IR/vtt-addrpoint.cir      | 2 +-
 29 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/clang/test/CIR/IR/alloca.cir b/clang/test/CIR/IR/alloca.cir
index 4a13c44292b35..d94da815f37a7 100644
--- a/clang/test/CIR/IR/alloca.cir
+++ b/clang/test/CIR/IR/alloca.cir
@@ -1,5 +1,5 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !u64i = !cir.int<u, 64>
 !u8i = !cir.int<u, 8>
diff --git a/clang/test/CIR/IR/array-ctor.cir b/clang/test/CIR/IR/array-ctor.cir
index 2378992bbd9fc..fd2ec7eb93c23 100644
--- a/clang/test/CIR/IR/array-ctor.cir
+++ b/clang/test/CIR/IR/array-ctor.cir
@@ -1,5 +1,5 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !u8i = !cir.int<u, 8>
 !rec_S = !cir.record
diff --git a/clang/test/CIR/IR/array-dtor.cir b/clang/test/CIR/IR/array-dtor.cir
index 6d08d1639f0db..1bb9ff9169a9d 100644
--- a/clang/test/CIR/IR/array-dtor.cir
+++ b/clang/test/CIR/IR/array-dtor.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !u8i = !cir.int<u, 8>
 !rec_S = !cir.record
diff --git a/clang/test/CIR/IR/array.cir b/clang/test/CIR/IR/array.cir
index bba536062d740..ddc6b92b11ee9 100644
--- a/clang/test/CIR/IR/array.cir
+++ b/clang/test/CIR/IR/array.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
diff --git a/clang/test/CIR/IR/atomic.cir b/clang/test/CIR/IR/atomic.cir
index 6ca5af2aac175..85207633a5294 100644
--- a/clang/test/CIR/IR/atomic.cir
+++ b/clang/test/CIR/IR/atomic.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 !u32i = !cir.int<u, 32>
diff --git a/clang/test/CIR/IR/binassign.cir b/clang/test/CIR/IR/binassign.cir
index 6d2c5c8ab6962..02471264d779e 100644
--- a/clang/test/CIR/IR/binassign.cir
+++ b/clang/test/CIR/IR/binassign.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | cir-opt | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 !s8i = !cir.int<s, 8>
diff --git a/clang/test/CIR/IR/bitfield_info.cir b/clang/test/CIR/IR/bitfield_info.cir
index 682e0903fd552..2d743fbfbf595 100644
--- a/clang/test/CIR/IR/bitfield_info.cir
+++ b/clang/test/CIR/IR/bitfield_info.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 !u32i = !cir.int<u, 32>
diff --git a/clang/test/CIR/IR/call.cir b/clang/test/CIR/IR/call.cir
index 9607df7202e0f..59f28be36846f 100644
--- a/clang/test/CIR/IR/call.cir
+++ b/clang/test/CIR/IR/call.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
diff --git a/clang/test/CIR/IR/cast.cir b/clang/test/CIR/IR/cast.cir
index 11b1664871ef7..3f2fca9fc307b 100644
--- a/clang/test/CIR/IR/cast.cir
+++ b/clang/test/CIR/IR/cast.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | cir-opt | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 module {
diff --git a/clang/test/CIR/IR/cmp.cir b/clang/test/CIR/IR/cmp.cir
index fdf538d7eef92..0d473986df1c2 100644
--- a/clang/test/CIR/IR/cmp.cir
+++ b/clang/test/CIR/IR/cmp.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | cir-opt | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 !u32i = !cir.int<u, 32>
diff --git a/clang/test/CIR/IR/complex.cir b/clang/test/CIR/IR/complex.cir
index a73a8654ca274..a7e0c77696d66 100644
--- a/clang/test/CIR/IR/complex.cir
+++ b/clang/test/CIR/IR/complex.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
diff --git a/clang/test/CIR/IR/copy.cir b/clang/test/CIR/IR/copy.cir
index 2cfb25d82b278..f9db29aa0e01f 100644
--- a/clang/test/CIR/IR/copy.cir
+++ b/clang/test/CIR/IR/copy.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 module {
diff --git a/clang/test/CIR/IR/func.cir b/clang/test/CIR/IR/func.cir
index 0e9a92fcf8201..9532859587629 100644
--- a/clang/test/CIR/IR/func.cir
+++ b/clang/test/CIR/IR/func.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 !s64i = !cir.int<s, 64>
diff --git a/clang/test/CIR/IR/global-init.cir b/clang/test/CIR/IR/global-init.cir
index 727c067e25472..2fd25df4e050b 100644
--- a/clang/test/CIR/IR/global-init.cir
+++ b/clang/test/CIR/IR/global-init.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt --verify-roundtrip %s -o - | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !u8i = !cir.int<u, 8>
diff --git a/clang/test/CIR/IR/global-var-linkage.cir b/clang/test/CIR/IR/global-var-linkage.cir
index e1b7de4bb2156..df74e3825e967 100644
--- a/clang/test/CIR/IR/global-var-linkage.cir
+++ b/clang/test/CIR/IR/global-var-linkage.cir
@@ -1,5 +1,4 @@
-// RUN: cir-opt %s -o %t.cir
-// RUN: FileCheck --input-file=%t.cir %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
diff --git a/clang/test/CIR/IR/global.cir b/clang/test/CIR/IR/global.cir
index 28fad6bbf4471..0464db822448e 100644
--- a/clang/test/CIR/IR/global.cir
+++ b/clang/test/CIR/IR/global.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s -o - | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s8i = !cir.int<s, 8>
 !s16i = !cir.int<s, 16>
diff --git a/clang/test/CIR/IR/label.cir b/clang/test/CIR/IR/label.cir
index 2211a4e8da331..1049766e7ce69 100644
--- a/clang/test/CIR/IR/label.cir
+++ b/clang/test/CIR/IR/label.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
diff --git a/clang/test/CIR/IR/module.cir b/clang/test/CIR/IR/module.cir
index 7ce2c0ba21cb0..8c782fdb2dbc6 100644
--- a/clang/test/CIR/IR/module.cir
+++ b/clang/test/CIR/IR/module.cir
@@ -1,5 +1,4 @@
-// RUN: cir-opt %s -split-input-file -o %t.cir
-// RUN: FileCheck --input-file=%t.cir %s
+// RUN: cir-opt %s -split-input-file --verify-roundtrip | FileCheck %s

 // Should parse and print C source language attribute.
 module attributes {cir.lang = #cir.lang<c>} { }
diff --git a/clang/test/CIR/IR/stack-save-restore.cir b/clang/test/CIR/IR/stack-save-restore.cir
index f98889ac1083a..476f2120a079d 100644
--- a/clang/test/CIR/IR/stack-save-restore.cir
+++ b/clang/test/CIR/IR/stack-save-restore.cir
@@ -1,6 +1,6 @@
 // Test the CIR operations can parse and print correctly (roundtrip)

-// RUN: cir-opt %s | cir-opt | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !u8i = !cir.int<u, 8>
diff --git a/clang/test/CIR/IR/struct.cir b/clang/test/CIR/IR/struct.cir
index 33f2e9860c5cb..2e011fba36b26 100644
--- a/clang/test/CIR/IR/struct.cir
+++ b/clang/test/CIR/IR/struct.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !u8i = !cir.int<u, 8>
 !u16i = !cir.int<u, 16>
diff --git a/clang/test/CIR/IR/switch-flat.cir b/clang/test/CIR/IR/switch-flat.cir
index 8c11a74484d39..d39c3e7e81215 100644
--- a/clang/test/CIR/IR/switch-flat.cir
+++ b/clang/test/CIR/IR/switch-flat.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 cir.func @FlatSwitchWithoutDefault(%arg0: !s32i) {
diff --git a/clang/test/CIR/IR/switch.cir b/clang/test/CIR/IR/switch.cir
index 0bdc9c1e7e896..87d45bf1f5219 100644
--- a/clang/test/CIR/IR/switch.cir
+++ b/clang/test/CIR/IR/switch.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 cir.func @s0() {
diff --git a/clang/test/CIR/IR/ternary.cir b/clang/test/CIR/IR/ternary.cir
index e419c7f5af40c..78e1de4eea8f1 100644
--- a/clang/test/CIR/IR/ternary.cir
+++ b/clang/test/CIR/IR/ternary.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | cir-opt | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !u32i = !cir.int<u, 32>
 module {
diff --git a/clang/test/CIR/IR/throw.cir b/clang/test/CIR/IR/throw.cir
index 8b24b481057b1..e7a1bf4f2f283 100644
--- a/clang/test/CIR/IR/throw.cir
+++ b/clang/test/CIR/IR/throw.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
diff --git a/clang/test/CIR/IR/unary.cir b/clang/test/CIR/IR/unary.cir
index ba3bc20d574f5..d01d4eb3c920a 100644
--- a/clang/test/CIR/IR/unary.cir
+++ b/clang/test/CIR/IR/unary.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
 !s64i = !cir.int<s, 64>
diff --git a/clang/test/CIR/IR/vector.cir b/clang/test/CIR/IR/vector.cir
index 6d8e5beffd63f..d274c35099ee5 100644
--- a/clang/test/CIR/IR/vector.cir
+++ b/clang/test/CIR/IR/vector.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !s32i = !cir.int<s, 32>
diff --git a/clang/test/CIR/IR/vtable-addrpt.cir b/clang/test/CIR/IR/vtable-addrpt.cir
index 106e7485fbbcf..7c8fa8d5ebe18 100644
--- a/clang/test/CIR/IR/vtable-addrpt.cir
+++ b/clang/test/CIR/IR/vtable-addrpt.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 // Test the parsing and printing of a constructor that uses a vtable address_point op.
diff --git a/clang/test/CIR/IR/vtable-attr.cir b/clang/test/CIR/IR/vtable-attr.cir
index 3854208ff78cc..70e32969c1985 100644
--- a/clang/test/CIR/IR/vtable-attr.cir
+++ b/clang/test/CIR/IR/vtable-attr.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 !rec_Q = !cir.record
 !rec_S = !cir.record
diff --git a/clang/test/CIR/IR/vtt-addrpoint.cir b/clang/test/CIR/IR/vtt-addrpoint.cir
index 11e5f4da83b50..823ddd2e7dc1d 100644
--- a/clang/test/CIR/IR/vtt-addrpoint.cir
+++ b/clang/test/CIR/IR/vtt-addrpoint.cir
@@ -1,4 +1,4 @@
-// RUN: cir-opt %s | FileCheck %s
+// RUN: cir-opt %s --verify-roundtrip | FileCheck %s

 // Test the parsing and printing of the two forms of vtt.address_point op, as
 // they will appear in constructors.

From 8cf43aebc67b8e88fe54cfd3371fc3777205ec4a Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Thu, 2 Oct 2025 12:24:23 +0100
Subject: [PATCH 494/878] [LLVM][CodeGen][SVE] Remove failure cases when widening vector load/store ops. (#160515)

When unable to widen a vector load/store, we can replace the operation
with a masked variant. Support for extending loads largely came for
free, hence its inclusion, but truncating stores require more work.

Fixes https://github.com/llvm/llvm-project/issues/159995
---
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   50 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |    1 +
 llvm/lib/Target/VE/VEISelLowering.cpp         |    2 +
 .../AArch64/sve-load-store-legalisation.ll    | 2854 +++++++++++++++++
 llvm/test/CodeGen/VE/Vector/vec_divrem.ll     |   56 +-
 5 files changed, 2923 insertions(+), 40 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ff7cd665446cc..87d5453cd98cf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6256,17 +6256,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   // FIXME: Not all targets may support EVL in VP_LOAD. These will have been
   //        removed from the IR by the ExpandVectorPredication pass but we're
   //        reintroducing them here.
-  EVT LdVT = LD->getMemoryVT();
-  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), LdVT);
-  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                    WideVT.getVectorElementCount());
+  EVT VT = LD->getValueType(0);
+  EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  EVT WideMaskVT = getSetCCResultType(WideVT);
+
   if (ExtType == ISD::NON_EXTLOAD &&
       TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WideVT) &&
       TLI.isTypeLegal(WideMaskVT)) {
     SDLoc DL(N);
     SDValue Mask = DAG.getAllOnesConstant(DL, WideMaskVT);
     SDValue EVL = DAG.getElementCount(DL, TLI.getVPExplicitVectorLengthTy(),
-                                      LdVT.getVectorElementCount());
+                                      VT.getVectorElementCount());
     SDValue NewLoad =
         DAG.getLoadVP(LD->getAddressingMode(), ISD::NON_EXTLOAD, WideVT, DL,
                       LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -6303,6 +6303,24 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
     return Result;
   }

+  if (VT.isVector()) {
+    // If all else fails replace the load with a wide masked load.
+    SDLoc DL(N);
+    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+
+    SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
+    SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
+                               DAG.getConstant(0, DL, IdxVT), Len);
+
+    SDValue NewLoad = DAG.getMaskedLoad(
+        WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
+        DAG.getPOISON(WideVT), LD->getMemoryVT(), LD->getMemOperand(),
+        LD->getAddressingMode(), LD->getExtensionType());
+
+    ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1));
+    return NewLoad;
+  }
+
   report_fatal_error("Unable to widen vector load");
 }

@@ -7516,8 +7534,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
   SDValue StVal = ST->getValue();
   EVT StVT = StVal.getValueType();
   EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StVT);
-  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
-                                    WideVT.getVectorElementCount());
+  EVT WideMaskVT = getSetCCResultType(WideVT);

   if (TLI.isOperationLegalOrCustom(ISD::VP_STORE, WideVT) &&
       TLI.isTypeLegal(WideMaskVT)) {
@@ -7540,6 +7557,22 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
     return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
   }

+  if (StVT.isVector()) {
+    // If all else fails replace the store with a wide masked store.
+    SDLoc DL(N);
+    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+
+    SDValue WideStVal = GetWidenedVector(StVal);
+    SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
+    SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
+                               DAG.getConstant(0, DL, IdxVT), Len);
+
+    return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
+                              ST->getOffset(), Mask, ST->getMemoryVT(),
+                              ST->getMemOperand(), ST->getAddressingMode(),
+                              ST->isTruncatingStore());
+  }
+
   report_fatal_error("Unable to widen vector store");
 }

@@ -8298,8 +8331,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
   AAMDNodes AAInfo = LD->getAAInfo();

   if (LdVT.isScalableVector())
-    report_fatal_error("Generating widen scalable extending vector loads is "
-                       "not yet supported");
+    return SDValue();

   EVT EltVT = WidenVT.getVectorElementType();
   EVT LdEltVT = LdVT.getVectorElementType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a1f4734f83562..debe4d9f4aeb8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1537,6 +1537,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::MLOAD, VT, Custom);
+      setOperationAction(ISD::MSTORE, VT, Legal);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::MULHS, VT, Custom);
       setOperationAction(ISD::MULHU, VT, Custom);
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 2cfdc751a55e0..a068138791cb4 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -957,6 +957,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {

 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                          EVT VT) const {
+  if (VT.isVector())
+    return VT.changeVectorElementType(MVT::i1);
   return MVT::i32;
 }
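Before the new AArch64 test, a small scalar model of the fallback may help; this is my illustration with made-up names, not LLVM's API. `GET_ACTIVE_LANE_MASK(0, n)` yields a mask whose lane `i` is active iff `i < n`, so a load or store widened from, say, 3 lanes to 4 only ever touches the first 3 elements' memory. On AArch64 this lowers to the `whilelo`-produced predicates feeding `ld1b`/`st1b` in the tests below.

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

// Scalar stand-in for get_active_lane_mask(0, N) on a W-lane vector:
// lane I is active iff I < N.
template <std::size_t W>
static std::array<bool, W> activeLaneMask(std::size_t N) {
  std::array<bool, W> M{};
  for (std::size_t I = 0; I < W; ++I)
    M[I] = I < N;
  return M;
}

int main() {
  // A 3-element store widened to a 4-lane masked store: the inactive
  // fourth lane is never written, so adjacent memory stays undisturbed.
  uint8_t Mem[4] = {9, 9, 9, 9};
  std::array<uint8_t, 4> Wide = {1, 2, 3, 0}; // lane 3 is padding
  std::array<bool, 4> M = activeLaneMask<4>(3);
  for (std::size_t I = 0; I < 4; ++I)
    if (M[I])
      Mem[I] = Wide[I]; // masked store writes lanes 0..2 only
  for (uint8_t B : Mem)
    std::cout << unsigned(B) << ' '; // prints: 1 2 3 9
  std::cout << '\n';
  return 0;
}
```

The masked load case is symmetric: inactive lanes are never read, which is why the legalizer can widen without risking a fault past the end of the narrow vector.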
diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
new file mode 100644
index 0000000000000..584753bffdbe0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
@@ -0,0 +1,2854 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @sve_load_store_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 1 x i8>, ptr %a
+  store <vscale x 1 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 2 x i8>, ptr %a
+  store <vscale x 2 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x i8>, ptr %a
+  store <vscale x 3 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x i8>, ptr %a
+  store <vscale x 4 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 5 x i8>, ptr %a
+  store <vscale x 5 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 6 x i8>, ptr %a
+  store <vscale x 6 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 7 x i8>, ptr %a
+  store <vscale x 7 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 8 x i8>, ptr %a
+  store <vscale x 8 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 9 x i8>, ptr %a
+  store <vscale x 9 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 10 x i8>, ptr %a
+  store <vscale x 10 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 11 x i8>, ptr %a
+  store <vscale x 11 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 12 x i8>, ptr %a
+  store <vscale x 12 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 13 x i8>, ptr %a
+  store <vscale x 13 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: ld1b { z1.h }, p2/z, [x0]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p2, [x1]
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 14 x i8>, ptr %a
+  store <vscale x 14 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 15 x i8>, ptr %a
+  store <vscale x 15 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 16 x i8>, ptr %a
+  store <vscale x 16 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv17i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv17i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #17 // =0x11
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 17 x i8>, ptr %a
+  store <vscale x 17 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv18i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv18i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, x8]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 18 x i8>, ptr %a
+  store <vscale x 18 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv19i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv19i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #19 // =0x13
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 19 x i8>, ptr %a
+  store <vscale x 19 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv20i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv20i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 20 x i8>, ptr %a
+  store <vscale x 20 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv21i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv21i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #21 // =0x15
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 21 x i8>, ptr %a
+  store <vscale x 21 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv22i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv22i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x8, all, mul #5
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.d }, p1/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p1, [x1, x8]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 22 x i8>, ptr %a
+  store <vscale x 22 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv23i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv23i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #23 // =0x17
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 23 x i8>, ptr %a
+  store <vscale x 23 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv24i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv24i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1b { z0.h }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 24 x i8>, ptr %a
+  store <vscale x 24 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv25i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv25i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #25 // =0x19
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 25 x i8>, ptr %a
+  store <vscale x 25 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv26i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv26i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cnth x8, all, mul #3
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 26 x i8>, ptr %a
+  store <vscale x 26 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv27i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv27i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #27 // =0x1b
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 27 x i8>, ptr %a
+  store <vscale x 27 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv28i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv28i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 28 x i8>, ptr %a
+  store <vscale x 28 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv29i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv29i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #29 // =0x1d
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 29 x i8>, ptr %a
+  store <vscale x 29 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv30i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv30i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cntw x8, all, mul #7
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z2.h }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z2.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p2, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #6, mul vl]
+; CHECK-NEXT: str z3, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 30 x i8>, ptr %a
+  store <vscale x 30 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv31i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv31i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #31 // =0x1f
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 31 x i8>, ptr %a
+  store <vscale x 31 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv32i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 32 x i8>, ptr %a
+  store <vscale x 32 x i8> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 1 x i16>, ptr %a
+  store <vscale x 1 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 2 x i16>, ptr %a
+  store <vscale x 2 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x i16>, ptr %a
+  store <vscale x 3 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x i16>, ptr %a
+  store <vscale x 4 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 5 x i16>, ptr %a
+  store <vscale x 5 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 6 x i16>, ptr %a
+  store <vscale x 6 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 7 x i16>, ptr %a
+  store <vscale x 7 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 8 x i16>, ptr %a
+  store <vscale x 8 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv9i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 9 x i16>, ptr %a
+  store <vscale x 9 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv10i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 10 x i16>, ptr %a
+  store <vscale x 10 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv11i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 11 x i16>, ptr %a
+  store <vscale x 11 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv12i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1h { z0.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 12 x i16>, ptr %a
+  store <vscale x 12 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv13i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 13 x i16>, ptr %a
+  store <vscale x 13 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv14i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 14 x i16>, ptr %a
+  store <vscale x 14 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv15i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 15 x i16>, ptr %a
+  store <vscale x 15 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv16i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 16 x i16>, ptr %a
+  store <vscale x 16 x i16> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 1 x i32>, ptr %a
+  store <vscale x 1 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 2 x i32>, ptr %a
+  store <vscale x 2 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x i32>, ptr %a
+  store <vscale x 3 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x i32>, ptr %a
+  store <vscale x 4 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv5i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 5 x i32>, ptr %a
+  store <vscale x 5 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv6i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1w { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 6 x i32>, ptr %a
+  store <vscale x 6 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv7i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 7 x i32>, ptr %a
+  store <vscale x 7 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv8i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 8 x i32>, ptr %a
+  store <vscale x 8 x i32> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv1i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 1 x i64>, ptr %a
+  store <vscale x 1 x i64> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv2i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 2 x i64>, ptr %a
+  store <vscale x 2 x i64> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv3i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x i64>, ptr %a
+  store <vscale x 3 x i64> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv4i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x i64>, ptr %a
+  store <vscale x 4 x i64> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv1f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 1 x half>, ptr %a
+  store <vscale x 1 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv2f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 2 x half>, ptr %a
+  store <vscale x 2 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv3f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x half>, ptr %a
+  store <vscale x 3 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv4f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x half>, ptr %a
+  store <vscale x 4 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv5f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 5 x half>, ptr %a
+  store <vscale x 5 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv6f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 6 x half>, ptr %a
+  store <vscale x 6 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv7f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 7 x half>, ptr %a
+  store <vscale x 7 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv8f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 8 x half>, ptr %a
+  store <vscale x 8 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv9f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 9 x half>, ptr %a
+  store <vscale x 9 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv10f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 10 x half>, ptr %a
+  store <vscale x 10 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv11f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 11 x half>, ptr %a
+  store <vscale x 11 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv12f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 12 x half>, ptr %a
+  store <vscale x 12 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv13f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 13 x half>, ptr %a
+  store <vscale x 13 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv14f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 14 x half>, ptr %a
+  store <vscale x 14 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv15f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 15 x half>, ptr %a
+  store <vscale x 15 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 16 x half>, ptr %a
+  store <vscale x 16 x half> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv1f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 1 x float>, ptr %a
+  store <vscale x 1 x float> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv2f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 2 x float>, ptr %a
+  store <vscale x 2 x float> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv3f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x float>, ptr %a
+  store <vscale x 3 x float> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv4f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x float>, ptr %a
+  store <vscale x 4 x float> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv5f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 5 x float>, ptr %a
+  store <vscale x 5 x float> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv6f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1w { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 6 x float>, ptr %a
+  store <vscale x 6 x float> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv7f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 7 x float>, ptr %a
+  store <vscale x 7 x float> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv8f32(ptr %a, ptr %b) {
+;
CHECK-LABEL: sve_load_store_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3f64: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv5bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv5bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; 
CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv6bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv6bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1h { z1.s }, p1, [x1] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv7bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv7bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv8bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv9bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv9bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv10bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv10bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv11bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv11bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv12bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv12bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + store %c, ptr %b + ret void +} + +define void @sve_load_store_nxv13bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv13bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo 
p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 13 x bfloat>, ptr %a
+  store <vscale x 13 x bfloat> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv14bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 14 x bfloat>, ptr %a
+  store <vscale x 14 x bfloat> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv15bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 15 x bfloat>, ptr %a
+  store <vscale x 15 x bfloat> %c, ptr %b
+  ret void
+}
+
+define void @sve_load_store_nxv16bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+  %c = load <vscale x 16 x bfloat>, ptr %a
+  store <vscale x 16 x bfloat> %c, ptr %b
+  ret void
+}
+
+define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %c = load <vscale x 1 x i8>, ptr %a
+  %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+  ret <vscale x 1 x i16> %c.sext
+}
+
+define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %c = load <vscale x 2 x i8>, ptr %a
+  %c.sext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+  ret <vscale x 2 x i16> %c.sext
+}
+
+define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x i8>, ptr %a
+  %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+  ret <vscale x 3 x i16> %c.sext
+}
+
+define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x i8>, ptr %a
+  %c.sext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+  ret <vscale x 4 x i16> %c.sext
+}
+
+define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %c = load <vscale x 5 x i8>, ptr %a
+  %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+  ret <vscale x 5 x i16> %c.sext
+}
+
+define <vscale x 6 x i16> 
@sve_sextload_nxv6i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv6i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv7i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv8i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv9i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv10i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv10i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #5 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv11i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv11i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv12i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv12i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntw x8, all, mul #3 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv13i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv13i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv14i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv14i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #7 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv15i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv15i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv16i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv1i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv2i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv3i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv4i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv5i16(ptr %a, ptr %b) { +; CHECK-LABEL: 
sve_sextload_nxv5i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv6i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv6i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv7i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv8i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv1i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv2i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv3i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_sextload_nxv4i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.sext = sext %c to + ret %c.sext +} + +define @sve_zextload_nxv1i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv2i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv3i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv4i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv5i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv5i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv6i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv6i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv7i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv8i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv8i8: +; CHECK: // %bb.0: 
+; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv9i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv10i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv10i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #5 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv11i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv11i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv12i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv12i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntw x8, all, mul #3 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv13i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv13i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv14i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv14i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #7 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv15i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv15i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv16i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv1i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv2i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv3i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv4i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv5i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv5i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv6i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv6i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv7i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv8i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv1i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv2i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load , ptr %a + %c.zext = sext %c to + ret %c.zext +} + +define @sve_zextload_nxv3i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %c = load <vscale x 3 x i32>, ptr %a
+  %c.zext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+  ret <vscale x 3 x i64> %c.zext
+}
+
+define <vscale x 4 x i64> @sve_zextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+  %c = load <vscale x 4 x i32>, ptr %a
+  %c.zext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64>
+  ret <vscale x 4 x i64> %c.zext
+}
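
A note on the predicate setup that recurs in the odd-sized cases above: the legalizer materializes the exact lane count of an <vscale x N x T> type and feeds it to whilelo to build a governing predicate. A minimal C++ model of that arithmetic follows; the helper name and parameters are illustrative only and are not part of any patch in this series.

#include <cstdint>

// rdvl x8, #1 returns the SVE register width in bytes (16 * vscale);
// lsr x8, x8, #4 recovers vscale, and mul x8, x8, x9 scales it by the
// static element count N. whilelo then sets exactly the first
// activeLaneCount(...) lanes of the predicate register.
uint64_t activeLaneCount(uint64_t sveRegisterBytes, uint64_t n) {
  uint64_t vscale = sveRegisterBytes >> 4; // lsr x8, x8, #4
  return vscale * n;                       // mul x8, x8, x9
}
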
diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
index 3bc0aba8d4264..93e2889793ba5 100644
--- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -7,19 +7,22 @@
 define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
 ; CHECK-LABEL: udiv_by_minus_one:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s0, %s0, %s4
-; CHECK-NEXT: srl %s0, %s0, 32
+; CHECK-NEXT: and %s4, %s0, (56)0
 ; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: muls.l %s1, %s1, %s4
-; CHECK-NEXT: srl %s1, %s1, 32
 ; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: muls.l %s2, %s2, %s4
-; CHECK-NEXT: srl %s2, %s2, 32
 ; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: muls.l %s3, %s3, %s4
-; CHECK-NEXT: srl %s3, %s3, 32
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: cmpu.w %s5, %s3, (56)0
+; CHECK-NEXT: or %s3, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s2, (56)0
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s1, (56)0
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (63)0, %s4
 ; CHECK-NEXT: b.l.t (, %s10)
   %r = udiv <4 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1>
   ret <4 x i8> %r
@@ -28,27 +31,18 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
 define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
 ; CHECK-LABEL: urem_by_minus_one:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s5, %s3, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s3, %s3, %s5
-; CHECK-NEXT: muls.l %s5, %s2, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s2, %s2, %s5
-; CHECK-NEXT: muls.l %s5, %s1, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s1, %s1, %s5
-; CHECK-NEXT: muls.l %s4, %s0, %s4
-; CHECK-NEXT: srl %s4, %s4, 32
-; CHECK-NEXT: muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT: subs.w.sx %s0, %s0, %s4
+; CHECK-NEXT: and %s4, %s0, (56)0
+; CHECK-NEXT: and %s5, %s1, (56)0
+; CHECK-NEXT: and %s6, %s2, (56)0
+; CHECK-NEXT: and %s7, %s3, (56)0
+; CHECK-NEXT: cmpu.w %s7, %s7, (56)0
+; CHECK-NEXT: cmov.w.eq %s3, (0)1, %s7
+; CHECK-NEXT: cmpu.w %s6, %s6, (56)0
+; CHECK-NEXT: cmov.w.eq %s2, (0)1, %s6
+; CHECK-NEXT: cmpu.w %s5, %s5, (56)0
+; CHECK-NEXT: cmov.w.eq %s1, (0)1, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (0)1, %s4
 ; CHECK-NEXT: b.l.t (, %s10)
   %r = urem <4 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1>
   ret <4 x i8> %r
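
For readers tracing the new VE sequences: assuming the elided splat operand is the all-ones vector, as the function names indicate, dividing an 8-bit unsigned lane by i8 -1 means dividing by 255, so the quotient is 1 only when the lane equals 255, and the remainder is then 0. A scalar sketch of the fold that the cmpu.w/cmov.w.eq pairs implement; the function names below are illustrative, not LLVM APIs.

#include <cstdint>

// x / 255 for an 8-bit unsigned lane: only 255 / 255 is nonzero.
uint8_t udivByMinusOne(uint8_t x) { return x == 255 ? 1 : 0; }

// x % 255 for an 8-bit unsigned lane: zero when x is 255, otherwise x.
uint8_t uremByMinusOne(uint8_t x) { return x == 255 ? 0 : x; }
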
From 39d0e41a5f3e13b127caea16988d7c69371de6cf Mon Sep 17 00:00:00 2001
From: Corentin Jabot
Date: Thu, 2 Oct 2025 13:27:57 +0200
Subject: [PATCH 495/878] [NFC][Clang] Improve performance of
 `DoMarkVarDeclReferenced` (#161648)

Address post commit feedback from #161231
---
 clang/lib/Sema/SemaExpr.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 06b2529011c74..4d3c7d611f370 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -20107,9 +20107,10 @@ static void DoMarkVarDeclReferenced(
   bool NeededForConstantEvaluation =
       isPotentiallyConstantEvaluatedContext(SemaRef) && UsableInConstantExpr;

-  bool NeedDefinition = OdrUse == OdrUseContext::Used ||
-                        NeededForConstantEvaluation ||
-                        Var->getType()->isUndeducedType();
+  bool NeedDefinition =
+      OdrUse == OdrUseContext::Used || NeededForConstantEvaluation ||
+      (TSK != clang::TSK_Undeclared && !UsableInConstantExpr &&
+       Var->getType()->isUndeducedType());

   assert(!isa<VarTemplatePartialSpecializationDecl>(Var) &&
          "Can't instantiate a partial template specialization.");
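
The rewritten condition above is cheaper because the new guards are simple flag tests that usually short-circuit before the comparatively expensive isUndeducedType() query runs. A boolean model of the logic after the patch; parameter names are illustrative, not Clang's.

// NeedDefinition after the patch: the type query only fires for declared
// template specializations that are not usable in constant expressions.
bool needDefinition(bool Used, bool NeededForConstantEvaluation,
                    bool IsDeclaredSpecialization, bool UsableInConstantExpr,
                    bool IsUndeducedType) {
  return Used || NeededForConstantEvaluation ||
         (IsDeclaredSpecialization && !UsableInConstantExpr &&
          IsUndeducedType);
}
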
AArch64ISD::VSRI : AArch64ISD::VSLI; - SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm); - - return ResultSLI; + return DAG.getNode(Inst, DL, VT, X, Y, Imm); } static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f07d3514d1a99..6ef0a95d7406d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -10176,28 +10176,6 @@ multiclass SIMDScalarLShiftBHSD opc, string asm, (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; } -multiclass SIMDScalarRShiftBHSD opc, string asm> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - //---------------------------------------------------------------------------- // AdvSIMD vector x indexed element //---------------------------------------------------------------------------- From 8dd2846fcf7b0ad254f3768149d28fe87af9b15d Mon Sep 17 00:00:00 2001 From: Marco Borgeaud Date: Thu, 2 Oct 2025 14:13:09 +0200 Subject: [PATCH 497/878] [analyzer] Harden RegionStoreManager::bindArray (#153177) Fixes https://github.com/llvm/llvm-project/issues/147686 by handling symbolic values similarly to bindStruct and handling constant values. The latter is actually more of a workaround: bindArray should not have to deal with such constants. CPP-6688 --- clang/lib/StaticAnalyzer/Core/RegionStore.cpp | 10 +++- clang/test/Analysis/initializer.cpp | 48 +++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index 8e9d6fe59e6ae..af0ef52334bd7 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -2658,14 +2658,20 @@ RegionStoreManager::bindArray(LimitedRegionBindingsConstRef B, return bindAggregate(B, R, V); } - // Handle lazy compound values. + // FIXME Single value constant should have been handled before this call to + // bindArray. This is only a hotfix to not crash. + if (Init.isConstant()) + return bindAggregate(B, R, Init); + if (std::optional LCV = Init.getAs()) { if (std::optional NewB = tryBindSmallArray(B, R, AT, *LCV)) return *NewB; - return bindAggregate(B, R, Init); } + if (isa(Init)) + return bindAggregate(B, R, Init); + if (Init.isUnknown()) return bindAggregate(B, R, UnknownVal()); diff --git a/clang/test/Analysis/initializer.cpp b/clang/test/Analysis/initializer.cpp index 713e121168571..88758f7c3ac1d 100644 --- a/clang/test/Analysis/initializer.cpp +++ b/clang/test/Analysis/initializer.cpp @@ -610,3 +610,51 @@ void top() { consume(parseMatchComponent()); } } // namespace elementwise_copy_small_array_from_post_initializer_of_cctor + +namespace gh147686 { +// The problem reported in https://github.com/llvm/llvm-project/issues/147686 +// is sensitive to the initializer form: using parenthesis to initialize m_ptr +// resulted in crashes when analyzing *m_ptr = '\0'; but using braces is fine. 
+ +struct A { + A() : m_ptr(m_buf) { *m_ptr = '\0'; } // no-crash + A(int overload) : m_ptr{m_buf} { *m_ptr = '\0'; } + A(char src) : m_ptr(m_buf) { *m_ptr = src; } // no-crash + A(char src, int overload) : m_ptr{m_buf} { *m_ptr = src; } + char m_buf[64] = {0}; + char * m_ptr; +}; + +void test1() { + A a; + clang_analyzer_eval(a.m_buf[0] == 0); // expected-warning{{TRUE}} + // FIXME The next eval should result in TRUE. + clang_analyzer_eval(*a.m_ptr == 0); // expected-warning{{UNKNOWN}} +} + +void test2() { + A a(314); + clang_analyzer_eval(a.m_buf[0] == 0); // expected-warning{{TRUE}} + clang_analyzer_eval(*a.m_ptr == 0); // expected-warning{{TRUE}} +} + +void test3() { + A a(0); + clang_analyzer_eval(a.m_buf[0] == 0); // expected-warning{{TRUE}} + clang_analyzer_eval(*a.m_ptr == 0); // expected-warning{{TRUE}} +} + +void test3Bis(char arg) { + A a(arg); + // FIXME This test should behave like test3. + clang_analyzer_eval(a.m_buf[0] == arg); // expected-warning{{FALSE}} // expected-warning{{TRUE}} + clang_analyzer_eval(*a.m_ptr == arg); // expected-warning{{UNKNOWN}} +} + +void test4(char arg) { + A a(arg, 314); + clang_analyzer_eval(a.m_buf[0] == arg); // expected-warning{{TRUE}} + clang_analyzer_eval(*a.m_ptr == arg); // expected-warning{{TRUE}} +} + +} // namespace gh147686 From 3537e8abfa067013f01b53259fb2cc854d587dee Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 2 Oct 2025 21:21:02 +0900 Subject: [PATCH 498/878] RegAllocGreedy: Check if copied lanes are live in trySplitAroundHintReg (#160424) For subregister copies, do a subregister live check instead of checking the main range. Doesn't do much yet, the split analysis still does not track live ranges. --- llvm/lib/CodeGen/RegAllocGreedy.cpp | 20 +++++- llvm/test/CodeGen/SystemZ/fp-cmp-04.ll | 4 +- llvm/test/CodeGen/X86/fshl.ll | 81 ++++++++++++----------- llvm/test/CodeGen/X86/fshr.ll | 90 +++++++++++++------------- 4 files changed, 105 insertions(+), 90 deletions(-) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 8e6cf3e6b51b3..5638f98b8163d 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1406,8 +1406,24 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint, continue; // Check if VirtReg interferes with OtherReg after this COPY instruction. - if (!IsDef && VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot())) - continue; + if (Opnd.readsReg()) { + SlotIndex Index = LIS->getInstructionIndex(Instr).getRegSlot(); + + if (SubReg) { + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg); + if (IsDef) + Mask = ~Mask; + + if (any_of(VirtReg.subranges(), [=](const LiveInterval::SubRange &S) { + return (S.LaneMask & Mask).any() && S.liveAt(Index); + })) { + continue; + } + } else { + if (VirtReg.liveAt(Index)) + continue; + } + } MCRegister OtherPhysReg = OtherReg.isPhysical() ? 
OtherReg.asMCReg() : VRM->getPhys(OtherReg); diff --git a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll index d3d641357ae58..eb7c1b632dba9 100644 --- a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll @@ -235,7 +235,7 @@ define half @f12_half(half %dummy, half %val, ptr %dest) { ; CHECK-NEXT: blah %f0 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT -; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: ltebr %f1, %f0 ; CHECK-NEXT: jl .LBB11_2 ; CHECK-NEXT:# %bb.1: ; CHECK-NEXT: lgdr %r0, %f8 @@ -344,7 +344,7 @@ define half @f15_half(half %val, half %dummy, ptr %dest) { ; CHECK-NEXT: blah %f2 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT -; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: ltebr %f1, %f0 ; CHECK-NEXT: jl .LBB15_2 ; CHECK-NEXT:# %bb.1: ; CHECK-NEXT: lgdr %r0, %f8 diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index ec1b8a3c8d6d9..f998128af95f8 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -335,84 +335,83 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: andl $-16, %esp ; X86-SLOW-NEXT: subl $32, %esp -; X86-SLOW-NEXT: movl 24(%ebp), %esi +; X86-SLOW-NEXT: movl 24(%ebp), %edi ; X86-SLOW-NEXT: movl 28(%ebp), %eax ; X86-SLOW-NEXT: movl 48(%ebp), %edx ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: testb $64, %cl -; X86-SLOW-NEXT: movl 52(%ebp), %edi +; X86-SLOW-NEXT: movl 52(%ebp), %ebx ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: ; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %esi, %edx -; X86-SLOW-NEXT: movl 32(%ebp), %esi -; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl %eax, %edi +; X86-SLOW-NEXT: movl %edi, %edx +; X86-SLOW-NEXT: movl 32(%ebp), %edi +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %ebx ; X86-SLOW-NEXT: movl 36(%ebp), %eax ; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: ; X86-SLOW-NEXT: movl 40(%ebp), %ecx ; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl 44(%ebp), %ecx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: .LBB6_3: -; X86-SLOW-NEXT: movl 56(%ebp), %ebx -; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: jne .LBB6_4 ; X86-SLOW-NEXT: # %bb.5: -; X86-SLOW-NEXT: movl %ecx, %ebx ; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %edi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ecx, %edx -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %edi, %eax +; 
X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: notb %dl +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %eax ; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: movl %ebx, %edi +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: notb %bl -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: orl %esi, %edi +; X86-SLOW-NEXT: orl %eax, %edi ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %edx -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %eax, %edx -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-SLOW-NEXT: movl %ebx, %eax +; X86-SLOW-NEXT: shrl %ebx +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: orl %eax, %ebx ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %esi -; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: orl %eax, %esi -; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %ebx -; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-SLOW-NEXT: shrl %cl, %ebx -; X86-SLOW-NEXT: orl %eax, %ebx ; X86-SLOW-NEXT: movl 8(%ebp), %eax -; X86-SLOW-NEXT: movl %ebx, 12(%eax) -; X86-SLOW-NEXT: movl %esi, 8(%eax) -; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %edi, (%eax) +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %ebx, 8(%eax) +; X86-SLOW-NEXT: movl %edi, 4(%eax) +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SLOW-NEXT: movl %ecx, (%eax) ; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 544ab7fc77374..c307833e488c9 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -322,79 +322,79 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: subl $16, %esp ; X86-SLOW-NEXT: movl 24(%ebp), %edx ; X86-SLOW-NEXT: movl 28(%ebp), %esi -; X86-SLOW-NEXT: movl 48(%ebp), %ebx +; X86-SLOW-NEXT: movl 48(%ebp), %edi ; X86-SLOW-NEXT: movl 56(%ebp), %eax ; X86-SLOW-NEXT: testb $64, %al -; X86-SLOW-NEXT: movl 52(%ebp), %edi +; X86-SLOW-NEXT: movl 52(%ebp), %eax ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %edi ; X86-SLOW-NEXT: movl 32(%ebp), %edx 
-; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: movl 36(%ebp), %esi ; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl 40(%ebp), %eax -; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl 44(%ebp), %eax +; X86-SLOW-NEXT: movl 40(%ebp), %ecx +; X86-SLOW-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 44(%ebp), %ecx ; X86-SLOW-NEXT: .LBB6_3: -; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: testb $32, %cl +; X86-SLOW-NEXT: movl 56(%ebp), %ebx +; X86-SLOW-NEXT: testb $32, %bl ; X86-SLOW-NEXT: je .LBB6_4 ; X86-SLOW-NEXT: # %bb.5: -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %ebx ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_4: ; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %eax, %ebx -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: movl %ecx, %edi +; X86-SLOW-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: movl %ecx, %eax -; X86-SLOW-NEXT: notb %al -; X86-SLOW-NEXT: movl %ebx, %edi -; X86-SLOW-NEXT: addl %ebx, %ebx -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %ebx -; X86-SLOW-NEXT: orl %edx, %ebx -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-SLOW-NEXT: leal (%ebx,%ebx), %edx -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: orl %edi, %edx +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: notb %dl +; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: addl %edi, %edi +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: orl %ebx, %edi +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %ebx -; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: shrl %cl, %eax ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-SLOW-NEXT: leal (%edi,%edi), %ebx -; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %ebx -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-SLOW-NEXT: orl %eax, %ebx ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SLOW-NEXT: leal (%eax,%eax), %edi +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shrl %cl, 
%eax ; X86-SLOW-NEXT: addl %esi, %esi -; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl 8(%ebp), %ecx -; X86-SLOW-NEXT: movl %esi, 12(%ecx) -; X86-SLOW-NEXT: movl %ebx, 8(%ecx) -; X86-SLOW-NEXT: movl %edx, 4(%ecx) -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-SLOW-NEXT: movl %eax, (%ecx) -; X86-SLOW-NEXT: movl %ecx, %eax +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl 8(%ebp), %eax +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: movl %ebx, 4(%eax) +; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SLOW-NEXT: movl %ecx, (%eax) ; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi From 86ba1986a29478681ddc64af7d08fdff390b00e8 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 2 Oct 2025 13:28:47 +0100 Subject: [PATCH 499/878] [lldb][test] Un-XFAIL TestDataFormatterStdUnorderedMap.py for older Clang versions Fixed in https://github.com/llvm/llvm-project/pull/156033 --- .../TestDataFormatterStdUnorderedMap.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py index 1e920faab6397..45f7b5be465c5 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unordered_map-iterator/TestDataFormatterStdUnorderedMap.py @@ -124,11 +124,6 @@ def do_test_ptr(self): self.check_ptr_ptr("ptr5") self.check_ptr_ptr("ptr6") - @expectedFailureAll( - bugnumber="https://github.com/llvm/llvm-project/issues/146040", - compiler="clang", - compiler_version=["<", "21"], - ) @add_test_categories(["libc++"]) def test_ptr_libcxx(self): self.build(dictionary={"USE_LIBCPP": 1}) From 7e6d277d3bd10bacc121962637c3c646866e2ca3 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 2 Oct 2025 13:33:21 +0100 Subject: [PATCH 500/878] [lldb][test] TestStructuredBinding.py: adjust assertion to check for compatible compiler version Requires a compiler with the changes in https://github.com/llvm/llvm-project/pull/122265 --- .../TestStructuredBinding.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py index 5f939ecfbef29..882c91d1ce8c8 100644 --- a/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py +++ b/lldb/test/API/lang/cpp/structured-binding/TestStructuredBinding.py @@ -99,16 +99,21 @@ def test(self): self.expect_expr("ty2", result_value="'z'") self.expect_expr("tz2", result_value="10") - self.expect( - "frame variable", - substrs=[ - "tx1 =", - "ty1 =", - "tz1 =", - "tx2 =", - "ty2 =", - "tz2 =", - "mp1 =", - "mp2 =", - ], - ) + # Older versions of Clang marked structured binding variables + # as artificial, and thus LLDB wouldn't display them. 
+ if self.expectedCompiler(["clang"]) and self.expectedCompilerVersion( + [">=", "22.0"] + ): + self.expect( + "frame variable", + substrs=[ + "tx1 =", + "ty1 =", + "tz1 =", + "tx2 =", + "ty2 =", + "tz2 =", + "mp1 =", + "mp2 =", + ], + ) From db39ef9d566529000a1edcd58108f2df7b323bf5 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 2 Oct 2025 14:34:04 +0200 Subject: [PATCH 501/878] [GVN] Add additional tests for inverted condition propagation (NFC) --- llvm/test/Transforms/GVN/condprop.ll | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/llvm/test/Transforms/GVN/condprop.ll b/llvm/test/Transforms/GVN/condprop.ll index 15ffcbff1e157..eb2a9f1e847c4 100644 --- a/llvm/test/Transforms/GVN/condprop.ll +++ b/llvm/test/Transforms/GVN/condprop.ll @@ -321,6 +321,66 @@ different: ret i1 %cmp3 } +define i1 @test6_phi1(i1 %c, i32 %x, i32 %y) { +; CHECK-LABEL: @test6_phi1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ false, [[BB1]] ], [ true, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i1 [[PHI]] +; CHECK: bb3: +; CHECK-NEXT: ret i1 false +; +entry: + %cmp.not = icmp ne i32 %x, %y + br i1 %c, label %bb1, label %bb2 + +bb1: + %cmp = icmp eq i32 %x, %y + br i1 %cmp, label %bb2, label %bb3 + +bb2: + %phi = phi i1 [ %cmp.not, %bb1 ], [ true, %entry ] + ret i1 %phi + +bb3: + ret i1 false +} + +define i1 @test6_phi2(i1 %c, i32 %x, i32 %y) { +; CHECK-LABEL: @test6_phi2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]] +; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ [[CMP_NOT]], [[BB1]] ], [ true, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i1 [[PHI]] +; CHECK: bb3: +; CHECK-NEXT: ret i1 false +; +entry: + br i1 %c, label %bb1, label %bb2 + +bb1: + %cmp.not = icmp ne i32 %x, %y + %cmp = icmp eq i32 %x, %y + br i1 %cmp, label %bb2, label %bb3 + +bb2: + %phi = phi i1 [ %cmp.not, %bb1 ], [ true, %entry ] + ret i1 %phi + +bb3: + ret i1 false +} + define i1 @test7(i32 %x, i32 %y) { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]] From 9583b399d85cacdfa0a41f798ab44abaa3981bbf Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Thu, 2 Oct 2025 14:35:38 +0200 Subject: [PATCH 502/878] [Clang] Normalize constraints before checking for satisfaction (#141776) In the standard, constraint satisfaction checking is done on the normalized form of a constraint. Clang instead substitutes on the non-normalized form, which causes us to report substitution failures in template arguments or concept ids, which is non-conforming but unavoidable without a parameter mapping This patch normalizes before satisfaction checking. However, we preserve concept-id nodes in the normalized form, solely for diagnostics purposes. 
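As a rough sketch of the kind of code affected (simplified, and the names
are hypothetical), satisfaction is now checked on the normal form, which
only substitutes into the template parameters it actually uses:

    template <class T, class U>
    concept A = __is_same(T, int);

    // U does not appear in the normal form of A, so the second argument
    // of the concept-id below is never substituted when checking B<int>;
    // eagerly substituting `typename T::does_not_exist` used to produce
    // a substitution failure diagnostic here.
    template <class T>
    concept B = A<T, typename T::does_not_exist>;

    static_assert(B<int>);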
This addresses #61811 and related concepts conformance bugs, ideally to make
the remaining implementation of concept template parameters easier.

Fixes #135190
Fixes #61811

Co-authored-by: Younan Zhang
---
 clang/docs/InternalsManual.rst                |   61 +
 clang/docs/ReleaseNotes.rst                   |    4 +
 clang/include/clang/AST/ASTConcept.h          |   33 +-
 clang/include/clang/AST/ASTContext.h          |    1 -
 clang/include/clang/Sema/Sema.h               |  105 +-
 clang/include/clang/Sema/SemaConcept.h        |  434 +++-
 clang/include/clang/Sema/Template.h           |   22 +-
 clang/lib/AST/ASTConcept.cpp                  |   31 +-
 clang/lib/AST/ASTImporter.cpp                 |   12 +-
 clang/lib/Sema/SemaConcept.cpp                | 2012 +++++++++++------
 clang/lib/Sema/SemaDeclCXX.cpp                |   16 +-
 clang/lib/Sema/SemaExprCXX.cpp                |   16 +-
 clang/lib/Sema/SemaOverload.cpp               |    3 +-
 clang/lib/Sema/SemaTemplate.cpp               |   93 +-
 clang/lib/Sema/SemaTemplateDeduction.cpp      |   51 +-
 clang/lib/Sema/SemaTemplateInstantiate.cpp    |  169 +-
 clang/lib/Sema/TreeTransform.h                |   19 +-
 clang/lib/Serialization/ASTReaderDecl.cpp     |    2 +-
 clang/lib/Serialization/ASTReaderStmt.cpp     |   14 +-
 clang/lib/Serialization/ASTWriterStmt.cpp     |   18 +-
 clang/test/AST/ast-dump-concepts.cpp          |   10 +-
 clang/test/AST/ast-dump-ctad-alias.cpp        |   21 +-
 clang/test/CXX/drs/cwg25xx.cpp                |   14 +-
 .../CXX/expr/expr.prim/expr.prim.id/p3.cpp    |    3 +-
 .../expr.prim.req/compound-requirement.cpp    |   14 +-
 .../expr.prim.req/nested-requirement.cpp      |   35 +-
 .../expr.prim.req/simple-requirement.cpp      |    4 +-
 .../expr.prim.req/type-requirement.cpp        |   12 +-
 .../constrant-satisfaction-conversions.cpp    |    5 +-
 .../temp.constr/temp.constr.normal/p1.cpp     |   59 +-
 clang/test/CXX/temp/temp.param/p10-2a.cpp     |   23 +-
 clang/test/SemaCXX/cxx23-assume.cpp           |    9 +-
 clang/test/SemaCXX/cxx2b-deducing-this.cpp    |    8 +-
 clang/test/SemaCXX/cxx2c-fold-exprs.cpp       |  202 +-
 .../SemaCXX/cxx2c-template-template-param.cpp |    4 +-
 .../invalid-requirement-requires-expr.cpp     |    4 +-
 ...overload-resolution-deferred-templates.cpp |    3 +-
 clang/test/SemaCXX/type-traits.cpp            |    4 +-
 clang/test/SemaHLSL/BuiltIns/Buffers.hlsl     |    6 +-
 clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl   |    6 +-
 .../SemaTemplate/concepts-recovery-expr.cpp   |   32 +-
 .../SemaTemplate/concepts-recursive-inst.cpp  |   27 +-
 clang/test/SemaTemplate/concepts.cpp          |   71 +-
 clang/test/SemaTemplate/deduction-guide.cpp   |   15 +-
 .../instantiate-abbreviated-template.cpp      |    1 +
 .../instantiate-expanded-type-constraint.cpp  |    4 +-
 .../instantiate-requires-expr.cpp             |   20 +-
 .../instantiate-template-argument.cpp         |   97 +-
 clang/test/SemaTemplate/pr52970.cpp           |    2 +-
 .../cpp17_iterator_concepts.verify.cpp        |    4 +-
 50 files changed, 2638 insertions(+), 1197 deletions(-)

diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst
index bd742273f4ed5..c677ddfa5ecc1 100644
--- a/clang/docs/InternalsManual.rst
+++ b/clang/docs/InternalsManual.rst
@@ -2859,6 +2859,67 @@ This library is called by the :ref:`Parser library ` during parsing
 to do semantic analysis of the input. For valid programs, Sema builds an AST
 for parsed constructs.
+
+
+Concept Satisfaction Checking and Subsumption
+---------------------------------------------
+
+As per the C++ standard, constraints are `normalized `_
+and the normal form is used both for subsumption and constraint checking.
+Both depend on a parameter mapping that substitutes lazily. In particular,
+we should not substitute in unused arguments.
+
+Clang follows the order of operations prescribed by the standard.
+
+Normalization happens prior to satisfaction and subsumption
+and is handled by ``NormalizedConstraint``.
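+
+As a quick illustrative sketch (the concept and function names here are
+made up for exposition), normalization replaces a concept-id with the
+normal form of the named concept, recursing into conjunctions and
+disjunctions:
+
+.. code-block:: c++
+
+  template <typename T>
+  concept C = __is_same(T, int) || sizeof(T) == 4;
+
+  // The associated constraint C<V> of g normalizes to the disjunction of
+  // the atomic constraints __is_same(T, int) and sizeof(T) == 4, each
+  // carrying the parameter mapping T -> V.
+  template <typename V> requires C<V> void g();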
+
+Clang preserves in the normalized form intermediate concept-ids
+(``ConceptIdConstraint``). This is used for diagnostics only and no
+substitution happens in a ConceptIdConstraint if its expression is satisfied.
+
+The normal form of the associated constraints of a declaration is cached in
+Sema::NormalizationCache such that it is only computed once.
+
+A ``NormalizedConstraint`` is a recursive data structure, where each node
+contains a parameter mapping, represented by the indexes of all parameters
+being used.
+
+Checking satisfaction is done by ``ConstraintSatisfactionChecker``, recursively
+walking ``NormalizedConstraint``. At each level, we substitute the outermost
+level of the template arguments referenced in the parameter mapping of a
+normalized expression (``MultiLevelTemplateArgumentList``).
+
+For the following example,
+
+.. code-block:: c++
+
+  template <typename T>
+  concept A = __is_same(T, int);
+
+  template <typename U>
+  concept B = A<U> && __is_same(U, int);
+
+The normal form of B is
+
+.. code-block:: c++
+
+  __is_same(T, int) /*T->U, innermost level*/
+  && __is_same(U, int) {U->U} /*T->U, outermost level*/
+
+After substitution in the mapping, we substitute into the constraint expression
+using that copy of the ``MultiLevelTemplateArgumentList``, and then evaluate it.
+
+Because this is expensive, it is cached in
+``UnsubstitutedConstraintSatisfactionCache``.
+
+Any error during satisfaction is recorded in ``ConstraintSatisfaction``.
+For nested requirements, ``ConstraintSatisfaction`` is stored (including
+diagnostics) in the AST, which is something we might want to improve.
+
+When an atomic constraint is not satisfied, we try to substitute into any
+enclosing concept-id using the same mechanism described above, for
+diagnostic purposes, and inject that into the ``ConstraintSatisfaction``.
+
 .. _CodeGen:
 
 The CodeGen Library
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c6ee1e282a008..a1e3a0c51d8e1 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -160,6 +160,10 @@ C++23 Feature Support
 
 C++20 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
+- Clang now normalizes constraints before checking whether they are satisfied, as mandated by the standard.
+  As a result, Clang no longer incorrectly diagnoses substitution failures in template arguments only
+  used in concept-ids, and produces better diagnostics for satisfaction failure. (#GH61811) (#GH135190)
+
 C++17 Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/AST/ASTConcept.h b/clang/include/clang/AST/ASTConcept.h
index 72da0059744f2..f362f24ebc72a 100644
--- a/clang/include/clang/AST/ASTConcept.h
+++ b/clang/include/clang/AST/ASTConcept.h
@@ -28,10 +28,20 @@ namespace clang {
 
 class ConceptDecl;
 class TemplateDecl;
+class ConceptReference;
 class Expr;
 class NamedDecl;
 struct PrintingPolicy;
 
+/// Unsatisfied constraint expressions if the template arguments could be
+/// substituted into them, or a diagnostic if substitution resulted in
+/// an invalid expression.
+///
+using ConstraintSubstitutionDiagnostic = std::pair<SourceLocation, StringRef>;
+using UnsatisfiedConstraintRecord =
+    llvm::PointerUnion<const Expr *, const ConceptReference *,
+                       const ConstraintSubstitutionDiagnostic *>;
+
 /// The result of a constraint satisfaction check, containing the necessary
 /// information to diagnose an unsatisfied constraint.
class ConstraintSatisfaction : public llvm::FoldingSetNode { @@ -48,16 +58,13 @@ class ConstraintSatisfaction : public llvm::FoldingSetNode { ArrayRef TemplateArgs) : ConstraintOwner(ConstraintOwner), TemplateArgs(TemplateArgs) {} - using SubstitutionDiagnostic = std::pair; - using Detail = llvm::PointerUnion; - bool IsSatisfied = false; bool ContainsErrors = false; /// \brief The substituted constraint expr, if the template arguments could be /// substituted into them, or a diagnostic if substitution resulted in an /// invalid expression. - llvm::SmallVector Details; + llvm::SmallVector Details; void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &C) { Profile(ID, C, ConstraintOwner, TemplateArgs); @@ -69,19 +76,12 @@ class ConstraintSatisfaction : public llvm::FoldingSetNode { bool HasSubstitutionFailure() { for (const auto &Detail : Details) - if (Detail.dyn_cast()) + if (Detail.dyn_cast()) return true; return false; } }; -/// Pairs of unsatisfied atomic constraint expressions along with the -/// substituted constraint expr, if the template arguments could be -/// substituted into them, or a diagnostic if substitution resulted in -/// an invalid expression. -using UnsatisfiedConstraintRecord = - llvm::PointerUnion *>; - /// \brief The result of a constraint satisfaction check, containing the /// necessary information to diagnose an unsatisfied constraint. /// @@ -101,6 +101,10 @@ struct ASTConstraintSatisfaction final : return getTrailingObjects() + NumRecords; } + ArrayRef records() const { + return {begin(), end()}; + } + ASTConstraintSatisfaction(const ASTContext &C, const ConstraintSatisfaction &Satisfaction); ASTConstraintSatisfaction(const ASTContext &C, @@ -282,6 +286,11 @@ class TypeConstraint { } }; +/// Insertion operator for diagnostics. This allows sending ConceptReferences's +/// into a diagnostic with <<. 
+const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const ConceptReference *C); + } // clang #endif // LLVM_CLANG_AST_ASTCONCEPT_H diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 12351e98e5a2b..78220d4d8ff5b 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -3877,7 +3877,6 @@ typename clang::LazyGenerationalUpdatePtr::ValueType return new (Ctx) LazyData(Source, Value); return Value; } - template <> struct llvm::DenseMapInfo { static FoldingSetNodeID getEmptyKey() { return FoldingSetNodeID{}; } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f53aafdeb4f36..bd3e042868299 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -65,6 +65,7 @@ #include "clang/Sema/Redeclaration.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaBase.h" +#include "clang/Sema/SemaConcept.h" #include "clang/Sema/TypoCorrection.h" #include "clang/Sema/Weak.h" #include "llvm/ADT/APInt.h" @@ -11694,8 +11695,9 @@ class Sema final : public SemaBase { ExprResult CheckConceptTemplateId(const CXXScopeSpec &SS, SourceLocation TemplateKWLoc, const DeclarationNameInfo &ConceptNameInfo, - NamedDecl *FoundDecl, ConceptDecl *NamedConcept, - const TemplateArgumentListInfo *TemplateArgs); + NamedDecl *FoundDecl, TemplateDecl *NamedConcept, + const TemplateArgumentListInfo *TemplateArgs, + bool DoCheckConstraintSatisfaction = true); void diagnoseMissingTemplateArguments(TemplateName Name, SourceLocation Loc); void diagnoseMissingTemplateArguments(const CXXScopeSpec &SS, @@ -12025,6 +12027,13 @@ class Sema final : public SemaBase { bool UpdateArgsWithConversions = true, bool *ConstraintsNotSatisfied = nullptr); + bool CheckTemplateArgumentList( + TemplateDecl *Template, TemplateParameterList *Params, + SourceLocation TemplateLoc, TemplateArgumentListInfo &TemplateArgs, + const DefaultArguments &DefaultArgs, bool PartialTemplateArgs, + CheckTemplateArgumentInfo &CTAI, bool UpdateArgsWithConversions = true, + bool *ConstraintsNotSatisfied = nullptr); + bool CheckTemplateTypeArgument( TemplateTypeParmDecl *Param, TemplateArgumentLoc &Arg, SmallVectorImpl &SugaredConverted, @@ -12783,6 +12792,18 @@ class Sema final : public SemaBase { void MarkUsedTemplateParameters(const Expr *E, bool OnlyDeduced, unsigned Depth, llvm::SmallBitVector &Used); + /// Mark which template parameters are named in a given expression. + /// + /// Unlike MarkUsedTemplateParameters, this excludes parameter that + /// are used but not directly named by an expression - i.e. it excludes + /// any template parameter that denotes the type of a referenced NTTP. + /// + /// \param Used a bit vector whose elements will be set to \c true + /// to indicate when the corresponding template parameter will be + /// deduced. + void MarkUsedTemplateParametersForSubsumptionParameterMapping( + const Expr *E, unsigned Depth, llvm::SmallBitVector &Used); + /// Mark which template parameters can be deduced from a given /// template argument list. 
/// @@ -12799,6 +12820,9 @@ class Sema final : public SemaBase { void MarkUsedTemplateParameters(ArrayRef TemplateArgs, unsigned Depth, llvm::SmallBitVector &Used); + void MarkUsedTemplateParameters(ArrayRef TemplateArgs, + unsigned Depth, llvm::SmallBitVector &Used); + void MarkDeducedTemplateParameters(const FunctionTemplateDecl *FunctionTemplate, llvm::SmallBitVector &Deduced) { @@ -13096,6 +13120,9 @@ class Sema final : public SemaBase { /// Whether we're substituting into constraints. bool InConstraintSubstitution; + /// Whether we're substituting into the parameter mapping of a constraint. + bool InParameterMappingSubstitution; + /// The point of instantiation or synthesis within the source code. SourceLocation PointOfInstantiation; @@ -13359,6 +13386,11 @@ class Sema final : public SemaBase { const MultiLevelTemplateArgumentList &TemplateArgs, TemplateArgumentListInfo &Outputs); + bool SubstTemplateArgumentsInParameterMapping( + ArrayRef Args, SourceLocation BaseLoc, + const MultiLevelTemplateArgumentList &TemplateArgs, + TemplateArgumentListInfo &Out, bool BuildPackExpansionTypes); + /// Retrieve the template argument list(s) that should be used to /// instantiate the definition of the given declaration. /// @@ -13820,6 +13852,12 @@ class Sema final : public SemaBase { CodeSynthesisContexts.back().InConstraintSubstitution; } + bool inParameterMappingSubstitution() const { + return !CodeSynthesisContexts.empty() && + CodeSynthesisContexts.back().InParameterMappingSubstitution && + !inConstraintSubstitution(); + } + using EntityPrinter = llvm::function_ref; /// \brief create a Requirement::SubstitutionDiagnostic with only a @@ -14704,6 +14742,10 @@ class Sema final : public SemaBase { SatisfactionStack.swap(NewSS); } + using ConstrainedDeclOrNestedRequirement = + llvm::PointerUnion; + /// Check whether the given expression is a valid constraint expression. /// A diagnostic is emitted if it is not, false is returned, and /// PossibleNonPrimary will be set to true if the failure might be due to a @@ -14728,44 +14770,12 @@ class Sema final : public SemaBase { /// \returns true if an error occurred and satisfaction could not be checked, /// false otherwise. bool CheckConstraintSatisfaction( - const NamedDecl *Template, + ConstrainedDeclOrNestedRequirement Entity, ArrayRef AssociatedConstraints, const MultiLevelTemplateArgumentList &TemplateArgLists, - SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) { - llvm::SmallVector Converted; - return CheckConstraintSatisfaction(Template, AssociatedConstraints, - Converted, TemplateArgLists, - TemplateIDRange, Satisfaction); - } - - /// \brief Check whether the given list of constraint expressions are - /// satisfied (as if in a 'conjunction') given template arguments. - /// Additionally, takes an empty list of Expressions which is populated with - /// the instantiated versions of the ConstraintExprs. - /// \param Template the template-like entity that triggered the constraints - /// check (either a concept or a constrained entity). - /// \param ConstraintExprs a list of constraint expressions, treated as if - /// they were 'AND'ed together. - /// \param ConvertedConstraints a out parameter that will get populated with - /// the instantiated version of the ConstraintExprs if we successfully checked - /// satisfaction. - /// \param TemplateArgList the multi-level list of template arguments to - /// substitute into the constraint expression. 
This should be relative to the - /// top-level (hence multi-level), since we need to instantiate fully at the - /// time of checking. - /// \param TemplateIDRange The source range of the template id that - /// caused the constraints check. - /// \param Satisfaction if true is returned, will contain details of the - /// satisfaction, with enough information to diagnose an unsatisfied - /// expression. - /// \returns true if an error occurred and satisfaction could not be checked, - /// false otherwise. - bool CheckConstraintSatisfaction( - const NamedDecl *Template, - ArrayRef AssociatedConstraints, - llvm::SmallVectorImpl &ConvertedConstraints, - const MultiLevelTemplateArgumentList &TemplateArgList, - SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction); + SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction, + const ConceptReference *TopLevelConceptId = nullptr, + Expr **ConvertedExpr = nullptr); /// \brief Check whether the given non-dependent constraint expression is /// satisfied. Returns false and updates Satisfaction with the satisfaction @@ -14831,16 +14841,17 @@ class Sema final : public SemaBase { /// \param First whether this is the first time an unsatisfied constraint is /// diagnosed for this error. void DiagnoseUnsatisfiedConstraint(const ConstraintSatisfaction &Satisfaction, + SourceLocation Loc = {}, bool First = true); /// \brief Emit diagnostics explaining why a constraint expression was deemed /// unsatisfied. void - DiagnoseUnsatisfiedConstraint(const ASTConstraintSatisfaction &Satisfaction, + DiagnoseUnsatisfiedConstraint(const ConceptSpecializationExpr *ConstraintExpr, bool First = true); const NormalizedConstraint *getNormalizedAssociatedConstraints( - const NamedDecl *ConstrainedDecl, + ConstrainedDeclOrNestedRequirement Entity, ArrayRef AssociatedConstraints); /// \brief Check whether the given declaration's associated constraints are @@ -14865,6 +14876,15 @@ class Sema final : public SemaBase { const NamedDecl *D1, ArrayRef AC1, const NamedDecl *D2, ArrayRef AC2); + /// Cache the satisfaction of an atomic constraint. + /// The key is based on the unsubstituted expression and the parameter + /// mapping. This lets us not substituting the mapping more than once, + /// which is (very!) expensive. + /// FIXME: this should be private. + llvm::DenseMap + UnsubstitutedConstraintSatisfactionCache; + private: /// Caches pairs of template-like decls whose associated constraints were /// checked for subsumption and whether or not the first's constraints did in @@ -14875,8 +14895,11 @@ class Sema final : public SemaBase { /// constrained declarations). If an error occurred while normalizing the /// associated constraints of the template or concept, nullptr will be cached /// here. - llvm::DenseMap NormalizationCache; + llvm::DenseMap + NormalizationCache; + /// Cache whether the associated constraint of a declaration + /// is satisfied. 
llvm::ContextualFoldingSet SatisfactionCache; diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h index 648a9c51ae6c1..51ca1e16331f5 100644 --- a/clang/include/clang/Sema/SemaConcept.h +++ b/clang/include/clang/Sema/SemaConcept.h @@ -16,130 +16,406 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" +#include "clang/AST/ExprConcepts.h" #include "clang/Basic/SourceLocation.h" +#include "clang/Sema/Ownership.h" #include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include #include namespace clang { class Sema; +class MultiLevelTemplateArgumentList; -enum { ConstraintAlignment = 8 }; +/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is +/// either an atomic constraint, a conjunction of normalized constraints or a +/// disjunction of normalized constraints. +struct NormalizedConstraint { + + enum class ConstraintKind : unsigned char { + Atomic = 0, + ConceptId, + FoldExpanded, + Compound, + }; + + enum CompoundConstraintKind : unsigned char { + CCK_Conjunction, + CCK_Disjunction + }; + enum class FoldOperatorKind : unsigned char { And, Or }; + + using OccurenceList = llvm::SmallBitVector; + +protected: + using ExprOrConcept = + llvm::PointerUnion; + + struct AtomicConstraintBits { + // Kind is the first member of all union members, + // as we rely on their initial common sequence. + LLVM_PREFERRED_TYPE(ConstraintKind) + unsigned Kind : 5; + unsigned Placeholder : 1; + unsigned PackSubstitutionIndex : 26; + // Indexes, IndexesForSubsumption, and Args are part of the common initial + // sequences of constraints that do have a mapping. + + // Indexes of the parameters used in a constraint expression. + OccurenceList Indexes; + // Indexes of the parameters named directly in a constraint expression. + // FIXME: we should try to reduce the size of this struct? + OccurenceList IndexesForSubsumption; + + TemplateArgumentLoc *Args; + TemplateParameterList *ParamList; + ExprOrConcept ConstraintExpr; + const NamedDecl *ConstraintDecl; + }; + + struct FoldExpandedConstraintBits { + LLVM_PREFERRED_TYPE(ConstraintKind) + unsigned Kind : 5; + LLVM_PREFERRED_TYPE(FoldOperatorKind) + unsigned FoldOperator : 1; + unsigned Placeholder : 26; + OccurenceList Indexes; + OccurenceList IndexesForSubsumption; + TemplateArgumentLoc *Args; + TemplateParameterList *ParamList; + const Expr *Pattern; + const NamedDecl *ConstraintDecl; + NormalizedConstraint *Constraint; + }; + + struct ConceptIdBits : AtomicConstraintBits { + NormalizedConstraint *Sub; + + // Only used for parameter mapping. 
+ const ConceptSpecializationExpr *CSE; + }; + + struct CompoundConstraintBits { + LLVM_PREFERRED_TYPE(ConstraintKind) + unsigned Kind : 5; + LLVM_PREFERRED_TYPE(CompoundConstraintKind) + unsigned CCK : 1; + NormalizedConstraint *LHS; + NormalizedConstraint *RHS; + }; + + union { + AtomicConstraintBits Atomic; + FoldExpandedConstraintBits FoldExpanded; + ConceptIdBits ConceptId; + CompoundConstraintBits Compound; + }; + + ~NormalizedConstraint() { + if (getKind() != ConstraintKind::Compound) + Atomic.Indexes.llvm::SmallBitVector::~SmallBitVector(); + } + + NormalizedConstraint(const Expr *ConstraintExpr, + const NamedDecl *ConstraintDecl, + UnsignedOrNone PackIndex) + : Atomic{llvm::to_underlying(ConstraintKind::Atomic), + /*Placeholder=*/0, + PackIndex.toInternalRepresentation(), + /*Indexes=*/{}, + /*IndexesForSubsumption=*/{}, + /*Args=*/nullptr, + /*ParamList=*/nullptr, + ConstraintExpr, + ConstraintDecl} {} + + NormalizedConstraint(const Expr *Pattern, FoldOperatorKind OpKind, + NormalizedConstraint *Constraint, + const NamedDecl *ConstraintDecl) + : FoldExpanded{llvm::to_underlying(ConstraintKind::FoldExpanded), + llvm::to_underlying(OpKind), + /*Placeholder=*/0, + /*Indexes=*/{}, + /*IndexesForSubsumption=*/{}, + /*Args=*/nullptr, + /*ParamList=*/nullptr, + Pattern, + ConstraintDecl, + Constraint} {} + + NormalizedConstraint(const ConceptReference *ConceptId, + const NamedDecl *ConstraintDecl, + NormalizedConstraint *SubConstraint, + const ConceptSpecializationExpr *CSE, + UnsignedOrNone PackIndex) + : ConceptId{{llvm::to_underlying(ConstraintKind::ConceptId), + /*Placeholder=*/0, PackIndex.toInternalRepresentation(), + /*Indexes=*/{}, + /*IndexesForSubsumption=*/{}, + /*Args=*/nullptr, /*ParamList=*/nullptr, ConceptId, + ConstraintDecl}, + SubConstraint, + CSE} {} + + NormalizedConstraint(NormalizedConstraint *LHS, CompoundConstraintKind CCK, + NormalizedConstraint *RHS) + : Compound{llvm::to_underlying(ConstraintKind::Compound), + llvm::to_underlying(CCK), LHS, RHS} {} + + bool hasParameterMapping() const { + // compound constraints do not have a mapping + // and Args is not part of their common initial sequence. 
+ return getKind() != ConstraintKind::Compound && Atomic.Args != nullptr; + } + + const OccurenceList &mappingOccurenceList() const { + assert(hasParameterMapping() && "This constraint has no parameter mapping"); + return Atomic.Indexes; + } + + const OccurenceList &mappingOccurenceListForSubsumption() const { + assert(hasParameterMapping() && "This constraint has no parameter mapping"); + return Atomic.IndexesForSubsumption; + } -struct alignas(ConstraintAlignment) AtomicConstraint { - const Expr *ConstraintExpr; - const NamedDecl *ConstraintDecl; - std::optional> ParameterMapping; + llvm::MutableArrayRef getParameterMapping() const { + return {Atomic.Args, Atomic.Indexes.count()}; + } + + TemplateParameterList *getUsedTemplateParamList() const { + return Atomic.ParamList; + } - AtomicConstraint(const Expr *ConstraintExpr, const NamedDecl *ConstraintDecl) - : ConstraintExpr(ConstraintExpr), ConstraintDecl(ConstraintDecl) {}; + void updateParameterMapping(OccurenceList Indexes, + OccurenceList IndexesForSubsumption, + llvm::MutableArrayRef Args, + TemplateParameterList *ParamList) { + assert(getKind() != ConstraintKind::Compound); + assert(Indexes.count() == Args.size()); + assert(IndexesForSubsumption.size() == Indexes.size()); + assert((Indexes | IndexesForSubsumption) == Indexes); + + Atomic.IndexesForSubsumption = std::move(IndexesForSubsumption); + Atomic.Indexes = std::move(Indexes); + Atomic.Args = Args.data(); + Atomic.ParamList = ParamList; + } bool hasMatchingParameterMapping(ASTContext &C, - const AtomicConstraint &Other) const { - if (!ParameterMapping != !Other.ParameterMapping) + const NormalizedConstraint &Other) const { + assert(getKind() != ConstraintKind::Compound); + + if (hasParameterMapping() != Other.hasParameterMapping()) return false; - if (!ParameterMapping) + if (!hasParameterMapping()) return true; - if (ParameterMapping->size() != Other.ParameterMapping->size()) - return false; - for (unsigned I = 0, S = ParameterMapping->size(); I < S; ++I) { + llvm::ArrayRef ParameterMapping = + getParameterMapping(); + llvm::ArrayRef OtherParameterMapping = + Other.getParameterMapping(); + + const OccurenceList &Indexes = mappingOccurenceListForSubsumption(); + const OccurenceList &OtherIndexes = + Other.mappingOccurenceListForSubsumption(); + + if (ParameterMapping.size() != OtherParameterMapping.size()) + return false; + for (unsigned I = 0, S = ParameterMapping.size(); I < S; ++I) { + if (Indexes[I] != OtherIndexes[I]) + return false; + if (!Indexes[I]) + continue; llvm::FoldingSetNodeID IDA, IDB; - C.getCanonicalTemplateArgument((*ParameterMapping)[I].getArgument()) + C.getCanonicalTemplateArgument(ParameterMapping[I].getArgument()) .Profile(IDA, C); - C.getCanonicalTemplateArgument((*Other.ParameterMapping)[I].getArgument()) + C.getCanonicalTemplateArgument(OtherParameterMapping[I].getArgument()) .Profile(IDB, C); if (IDA != IDB) return false; } return true; } -}; -struct alignas(ConstraintAlignment) NormalizedConstraintPair; -struct alignas(ConstraintAlignment) FoldExpandedConstraint; +public: + ConstraintKind getKind() const { + return static_cast(Atomic.Kind); + } -/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is -/// either an atomic constraint, a conjunction of normalized constraints or a -/// disjunction of normalized constraints. 
-struct NormalizedConstraint { + SourceLocation getBeginLoc() const { + switch (getKind()) { + case ConstraintKind::Atomic: + return cast(Atomic.ConstraintExpr)->getBeginLoc(); + case ConstraintKind::ConceptId: + return cast(Atomic.ConstraintExpr) + ->getBeginLoc(); + case ConstraintKind::Compound: + return Compound.LHS->getBeginLoc(); + case ConstraintKind::FoldExpanded: + return FoldExpanded.Pattern->getBeginLoc(); + } + } + + SourceLocation getEndLoc() const { + switch (getKind()) { + case ConstraintKind::Atomic: + return cast(Atomic.ConstraintExpr)->getEndLoc(); + case ConstraintKind::ConceptId: + return cast(Atomic.ConstraintExpr)->getEndLoc(); + case ConstraintKind::Compound: + return Compound.RHS->getEndLoc(); + case ConstraintKind::FoldExpanded: + return FoldExpanded.Pattern->getEndLoc(); + } + } + + SourceRange getSourceRange() const { return {getBeginLoc(), getEndLoc()}; } + +private: friend class Sema; + static NormalizedConstraint * + fromAssociatedConstraints(Sema &S, const NamedDecl *D, + ArrayRef ACs); + static NormalizedConstraint *fromConstraintExpr(Sema &S, const NamedDecl *D, + const Expr *E, + UnsignedOrNone SubstIndex); +}; + +class CompoundConstraint : public NormalizedConstraint { + using NormalizedConstraint::NormalizedConstraint; - enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction }; +public: + static CompoundConstraint *Create(ASTContext &Ctx, NormalizedConstraint *LHS, + CompoundConstraintKind CCK, + NormalizedConstraint *RHS) { + return new (Ctx) CompoundConstraint(LHS, CCK, RHS); + } - using CompoundConstraint = llvm::PointerIntPair; + static CompoundConstraint *CreateConjunction(ASTContext &Ctx, + NormalizedConstraint *LHS, + NormalizedConstraint *RHS) { + return new (Ctx) CompoundConstraint(LHS, CCK_Conjunction, RHS); + } - llvm::PointerUnion - Constraint; + const NormalizedConstraint &getLHS() const { return *Compound.LHS; } - NormalizedConstraint(AtomicConstraint *C): Constraint{C} { }; - NormalizedConstraint(FoldExpandedConstraint *C) : Constraint{C} {}; + NormalizedConstraint &getLHS() { return *Compound.LHS; } - NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS, - NormalizedConstraint RHS, CompoundConstraintKind Kind); + const NormalizedConstraint &getRHS() const { return *Compound.RHS; } - NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other); - NormalizedConstraint(NormalizedConstraint &&Other): - Constraint(Other.Constraint) { - Other.Constraint = nullptr; + NormalizedConstraint &getRHS() { return *Compound.RHS; } + + CompoundConstraintKind getCompoundKind() const { + return static_cast(Compound.CCK); } - NormalizedConstraint &operator=(const NormalizedConstraint &Other) = delete; - NormalizedConstraint &operator=(NormalizedConstraint &&Other) { - if (&Other != this) { - NormalizedConstraint Temp(std::move(Other)); - std::swap(Constraint, Temp.Constraint); - } - return *this; +}; + +class NormalizedConstraintWithParamMapping : public NormalizedConstraint { +protected: + using NormalizedConstraint::NormalizedConstraint; + +public: + using NormalizedConstraint::getParameterMapping; + using NormalizedConstraint::getUsedTemplateParamList; + using NormalizedConstraint::hasMatchingParameterMapping; + using NormalizedConstraint::hasParameterMapping; + using NormalizedConstraint::mappingOccurenceList; + using NormalizedConstraint::mappingOccurenceListForSubsumption; + using NormalizedConstraint::updateParameterMapping; + + const NamedDecl *getConstraintDecl() const { return Atomic.ConstraintDecl; } + + UnsignedOrNone 
getPackSubstitutionIndex() const { + return UnsignedOrNone::fromInternalRepresentation( + Atomic.PackSubstitutionIndex); } +}; + +class AtomicConstraint : public NormalizedConstraintWithParamMapping { + using NormalizedConstraintWithParamMapping:: + NormalizedConstraintWithParamMapping; - bool isAtomic() const { return llvm::isa(Constraint); } - bool isFoldExpanded() const { - return llvm::isa(Constraint); +public: + static AtomicConstraint *Create(ASTContext &Ctx, const Expr *ConstraintExpr, + const NamedDecl *ConstraintDecl, + UnsignedOrNone PackIndex) { + return new (Ctx) + AtomicConstraint(ConstraintExpr, ConstraintDecl, PackIndex); } - bool isCompound() const { return llvm::isa(Constraint); } - CompoundConstraintKind getCompoundKind() const; + const Expr *getConstraintExpr() const { + return cast(Atomic.ConstraintExpr); + } +}; - NormalizedConstraint &getLHS() const; - NormalizedConstraint &getRHS() const; +class FoldExpandedConstraint : public NormalizedConstraintWithParamMapping { + using NormalizedConstraintWithParamMapping:: + NormalizedConstraintWithParamMapping; - AtomicConstraint *getAtomicConstraint() const; +public: + static FoldExpandedConstraint *Create(ASTContext &Ctx, const Expr *Pattern, + const NamedDecl *ConstraintDecl, + FoldOperatorKind OpKind, + NormalizedConstraint *Constraint) { + return new (Ctx) + FoldExpandedConstraint(Pattern, OpKind, Constraint, ConstraintDecl); + } - FoldExpandedConstraint *getFoldExpandedConstraint() const; + using NormalizedConstraint::hasMatchingParameterMapping; -private: - static std::optional - fromAssociatedConstraints(Sema &S, const NamedDecl *D, - ArrayRef ACs); - static std::optional - fromConstraintExpr(Sema &S, const NamedDecl *D, const Expr *E); -}; + FoldOperatorKind getFoldOperator() const { + return static_cast(FoldExpanded.FoldOperator); + } -struct alignas(ConstraintAlignment) NormalizedConstraintPair { - NormalizedConstraint LHS, RHS; -}; + const Expr *getPattern() const { return FoldExpanded.Pattern; } -struct alignas(ConstraintAlignment) FoldExpandedConstraint { - enum class FoldOperatorKind { And, Or } Kind; - NormalizedConstraint Constraint; - const Expr *Pattern; + const NormalizedConstraint &getNormalizedPattern() const { + return *FoldExpanded.Constraint; + } - FoldExpandedConstraint(FoldOperatorKind K, NormalizedConstraint C, - const Expr *Pattern) - : Kind(K), Constraint(std::move(C)), Pattern(Pattern) {}; + NormalizedConstraint &getNormalizedPattern() { + return *FoldExpanded.Constraint; + } static bool AreCompatibleForSubsumption(const FoldExpandedConstraint &A, const FoldExpandedConstraint &B); }; -const NormalizedConstraint *getNormalizedAssociatedConstraints( - Sema &S, const NamedDecl *ConstrainedDecl, - ArrayRef AssociatedConstraints); +class ConceptIdConstraint : public NormalizedConstraintWithParamMapping { + using NormalizedConstraintWithParamMapping:: + NormalizedConstraintWithParamMapping; + +public: + static ConceptIdConstraint * + Create(ASTContext &Ctx, const ConceptReference *ConceptId, + NormalizedConstraint *SubConstraint, const NamedDecl *ConstraintDecl, + const ConceptSpecializationExpr *CSE, UnsignedOrNone PackIndex) { + return new (Ctx) ConceptIdConstraint(ConceptId, ConstraintDecl, + SubConstraint, CSE, PackIndex); + } + + const ConceptSpecializationExpr *getConceptSpecializationExpr() const { + return ConceptId.CSE; + } + + const ConceptReference *getConceptId() const { + return cast(ConceptId.ConstraintExpr); + } + + const NormalizedConstraint &getNormalizedConstraint() const { + return 
*ConceptId.Sub; + } + + NormalizedConstraint &getNormalizedConstraint() { return *ConceptId.Sub; } +}; + +struct UnsubstitutedConstraintSatisfactionCacheResult { + ExprResult SubstExpr; + ConstraintSatisfaction Satisfaction; +}; /// \brief SubsumptionChecker establishes subsumption /// between two set of constraints. @@ -189,13 +465,13 @@ class SubsumptionChecker { }; struct MappedAtomicConstraint { - AtomicConstraint *Constraint; + const AtomicConstraint *Constraint; Literal ID; }; struct FoldExpendedConstraintKey { FoldExpandedConstraint::FoldOperatorKind Kind; - AtomicConstraint *Constraint; + const AtomicConstraint *Constraint; Literal ID; }; @@ -207,7 +483,7 @@ class SubsumptionChecker { // A map from a literal to a corresponding associated constraint. // We do not have enough bits left for a pointer union here :( - llvm::DenseMap ReverseMap; + llvm::DenseMap ReverseMap; // Fold expanded constraints ask us to recursively establish subsumption. // This caches the result. @@ -234,12 +510,12 @@ class SubsumptionChecker { FormulaType Normalize(const NormalizedConstraint &C); void AddUniqueClauseToFormula(Formula &F, Clause C); - Literal find(AtomicConstraint *); - Literal find(FoldExpandedConstraint *); + Literal find(const AtomicConstraint *); + Literal find(const FoldExpandedConstraint *); uint16_t getNewLiteralId(); }; -} // clang +} // namespace clang #endif // LLVM_CLANG_SEMA_SEMACONCEPT_H diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 115c19d4f1540..60c7d275f1aaf 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -234,21 +234,25 @@ enum class TemplateSubstitutionKind : char { /// Replaces the current 'innermost' level with the provided argument list. /// This is useful for type deduction cases where we need to get the entire /// list from the AST, but then add the deduced innermost list. - void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) { + void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args, + bool Final = false) { assert((!TemplateArgumentLists.empty() || NumRetainedOuterLevels) && "Replacing in an empty list?"); if (!TemplateArgumentLists.empty()) { - assert((TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() || - TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() == - AssociatedDecl) && - "Trying to change incorrect declaration?"); TemplateArgumentLists[0].Args = Args; - } else { - --NumRetainedOuterLevels; - TemplateArgumentLists.push_back( - {{AssociatedDecl, /*Final=*/false}, Args}); + return; } + --NumRetainedOuterLevels; + TemplateArgumentLists.push_back( + {{AssociatedDecl, /*Final=*/Final}, Args}); + } + + void replaceOutermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) { + assert((!TemplateArgumentLists.empty()) && "Replacing in an empty list?"); + TemplateArgumentLists.back().AssociatedDeclAndFinal.setPointer( + AssociatedDecl); + TemplateArgumentLists.back().Args = Args; } /// Add an outermost level that we are not substituting. 
diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h
index 115c19d4f1540..60c7d275f1aaf 100644
--- a/clang/include/clang/Sema/Template.h
+++ b/clang/include/clang/Sema/Template.h
@@ -234,21 +234,25 @@ enum class TemplateSubstitutionKind : char {
   /// Replaces the current 'innermost' level with the provided argument list.
   /// This is useful for type deduction cases where we need to get the entire
   /// list from the AST, but then add the deduced innermost list.
-  void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) {
+  void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args,
+                                         bool Final = false) {
     assert((!TemplateArgumentLists.empty() || NumRetainedOuterLevels) &&
            "Replacing in an empty list?");
 
     if (!TemplateArgumentLists.empty()) {
-      assert((TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() ||
-              TemplateArgumentLists[0].AssociatedDeclAndFinal.getPointer() ==
-                  AssociatedDecl) &&
-             "Trying to change incorrect declaration?");
       TemplateArgumentLists[0].Args = Args;
-    } else {
-      --NumRetainedOuterLevels;
-      TemplateArgumentLists.push_back(
-          {{AssociatedDecl, /*Final=*/false}, Args});
+      return;
     }
+    --NumRetainedOuterLevels;
+    TemplateArgumentLists.push_back(
+        {{AssociatedDecl, /*Final=*/Final}, Args});
+  }
+
+  void replaceOutermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) {
+    assert((!TemplateArgumentLists.empty()) && "Replacing in an empty list?");
+    TemplateArgumentLists.back().AssociatedDeclAndFinal.setPointer(
+        AssociatedDecl);
+    TemplateArgumentLists.back().Args = Args;
   }
 
   /// Add an outermost level that we are not substituting. We have no
diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp
index d658890e076c2..fd12bc4e83827 100644
--- a/clang/lib/AST/ASTConcept.cpp
+++ b/clang/lib/AST/ASTConcept.cpp
@@ -24,13 +24,18 @@ static void
 CreateUnsatisfiedConstraintRecord(const ASTContext &C,
                                   const UnsatisfiedConstraintRecord &Detail,
                                   UnsatisfiedConstraintRecord *TrailingObject) {
-  if (auto *E = dyn_cast<Expr *>(Detail))
+  if (Detail.isNull())
+    new (TrailingObject) UnsatisfiedConstraintRecord(nullptr);
+  else if (const auto *E = llvm::dyn_cast<const Expr *>(Detail))
     new (TrailingObject) UnsatisfiedConstraintRecord(E);
+  else if (const auto *Concept =
+               llvm::dyn_cast<const ConceptReference *>(Detail))
+    new (TrailingObject) UnsatisfiedConstraintRecord(Concept);
   else {
     auto &SubstitutionDiagnostic =
-        *cast<const std::pair<SourceLocation, StringRef> *>(Detail);
+        *cast<const ConstraintSubstitutionDiagnostic *>(Detail);
     StringRef Message = C.backupStr(SubstitutionDiagnostic.second);
-    auto *NewSubstDiag = new (C) std::pair<SourceLocation, StringRef>(
+    auto *NewSubstDiag = new (C) clang::ConstraintSubstitutionDiagnostic(
         SubstitutionDiagnostic.first, Message);
     new (TrailingObject) UnsatisfiedConstraintRecord(NewSubstDiag);
   }
@@ -74,9 +79,10 @@ ASTConstraintSatisfaction *ASTConstraintSatisfaction::Rebuild(
   return new (Mem) ASTConstraintSatisfaction(C, Satisfaction);
 }
 
-void ConstraintSatisfaction::Profile(
-    llvm::FoldingSetNodeID &ID, const ASTContext &C,
-    const NamedDecl *ConstraintOwner, ArrayRef<TemplateArgument> TemplateArgs) {
+void ConstraintSatisfaction::Profile(llvm::FoldingSetNodeID &ID,
+                                     const ASTContext &C,
+                                     const NamedDecl *ConstraintOwner,
+                                     ArrayRef<TemplateArgument> TemplateArgs) {
   ID.AddPointer(ConstraintOwner);
   ID.AddInteger(TemplateArgs.size());
   for (auto &Arg : TemplateArgs)
@@ -116,6 +122,19 @@ void ConceptReference::print(llvm::raw_ostream &OS,
   }
 }
 
+const StreamingDiagnostic &clang::operator<<(const StreamingDiagnostic &DB,
+                                             const ConceptReference *C) {
+  std::string NameStr;
+  llvm::raw_string_ostream OS(NameStr);
+  LangOptions LO;
+  LO.CPlusPlus = true;
+  LO.Bool = true;
+  OS << '\'';
+  C->print(OS, PrintingPolicy(LO));
+  OS << '\'';
+  return DB << NameStr;
+}
+
 concepts::ExprRequirement::ExprRequirement(
     Expr *E, bool IsSimple, SourceLocation NoexceptLoc,
     ReturnTypeRequirement Req, SatisfactionStatus Status,
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 1c8fd83feb7f8..f43fa8c90ad3b 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -1069,22 +1069,22 @@ Error ASTNodeImporter::ImportConstraintSatisfaction(
   ToSat.ContainsErrors = FromSat.ContainsErrors;
   if (!ToSat.IsSatisfied) {
     for (auto Record = FromSat.begin(); Record != FromSat.end(); ++Record) {
-      if (Expr *E = Record->dyn_cast<Expr *>()) {
+      if (const Expr *E = Record->dyn_cast<const Expr *>()) {
         ExpectedExpr ToSecondExpr = import(E);
         if (!ToSecondExpr)
           return ToSecondExpr.takeError();
         ToSat.Details.emplace_back(ToSecondExpr.get());
       } else {
-        auto Pair = Record->dyn_cast<std::pair<SourceLocation, StringRef> *>();
+        auto Pair =
+            Record->dyn_cast<const ConstraintSubstitutionDiagnostic *>();
 
         ExpectedSLoc ToPairFirst = import(Pair->first);
         if (!ToPairFirst)
           return ToPairFirst.takeError();
         StringRef ToPairSecond = ImportASTStringRef(Pair->second);
-        ToSat.Details.emplace_back(
-            new (Importer.getToContext())
-                ConstraintSatisfaction::SubstitutionDiagnostic{
-                    ToPairFirst.get(), ToPairSecond});
+        ToSat.Details.emplace_back(new (Importer.getToContext())
+                                       ConstraintSubstitutionDiagnostic{
+                                           ToPairFirst.get(), ToPairSecond});
       }
     }
   }
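The operator<< added above is what the satisfaction-diagnostic changes later in this patch rely on to stream a ConceptReference straight into a diagnostic. An illustrative use, assuming only that overload (the helper itself is hypothetical; the diagnostic ID is one this patch emits):

    static void noteFailedConceptId(Sema &S, SourceLocation Loc,
                                    const ConceptReference *CR) {
      // CR is printed quoted, e.g. 'Addable<X>', via ConceptReference::print.
      S.Diag(Loc,
             diag::note_concept_specialization_constraint_evaluated_to_false)
          << /*First=*/1 << CR;
    }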
"clang/Sema/SemaConcept.h" #include "TreeTransform.h" +#include "clang/AST/ASTConcept.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/ExprConcepts.h" +#include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/OperatorPrecedence.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Initialization.h" @@ -27,7 +29,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/StringExtras.h" -#include +#include "llvm/Support/SaveAndRestore.h" using namespace clang; using namespace sema; @@ -85,7 +87,7 @@ class LogicalBinOp { OK_Ordinary, Loc, FPOptionsOverride{}); } }; -} +} // namespace bool Sema::CheckConstraintExpression(const Expr *ConstraintExpression, Token NextToken, bool *PossibleNonPrimary, @@ -146,14 +148,14 @@ bool Sema::CheckConstraintExpression(const Expr *ConstraintExpression, if (!Context.hasSameUnqualifiedType(Type, Context.BoolTy)) { Diag(ConstraintExpression->getExprLoc(), - diag::err_non_bool_atomic_constraint) << Type - << ConstraintExpression->getSourceRange(); + diag::err_non_bool_atomic_constraint) + << Type << ConstraintExpression->getSourceRange(); CheckForNonPrimary(); return false; } if (PossibleNonPrimary) - *PossibleNonPrimary = false; + *PossibleNonPrimary = false; return true; } @@ -164,52 +166,315 @@ struct SatisfactionStackRAII { SatisfactionStackRAII(Sema &SemaRef, const NamedDecl *ND, const llvm::FoldingSetNodeID &FSNID) : SemaRef(SemaRef) { - if (ND) { + if (ND) { SemaRef.PushSatisfactionStackEntry(ND, FSNID); Inserted = true; - } + } } ~SatisfactionStackRAII() { - if (Inserted) - SemaRef.PopSatisfactionStackEntry(); + if (Inserted) + SemaRef.PopSatisfactionStackEntry(); } }; } // namespace -static bool -DiagRecursiveConstraintEval(Sema &S, llvm::FoldingSetNodeID &ID, - const NamedDecl *Templ, const Expr *E, - const MultiLevelTemplateArgumentList &MLTAL) { +static bool DiagRecursiveConstraintEval( + Sema &S, llvm::FoldingSetNodeID &ID, const NamedDecl *Templ, const Expr *E, + const MultiLevelTemplateArgumentList *MLTAL = nullptr) { E->Profile(ID, S.Context, /*Canonical=*/true); - for (const auto &List : MLTAL) - for (const auto &TemplateArg : List.Args) - TemplateArg.Profile(ID, S.Context); - - // Note that we have to do this with our own collection, because there are - // times where a constraint-expression check can cause us to need to evaluate - // other constriants that are unrelated, such as when evaluating a recovery - // expression, or when trying to determine the constexpr-ness of special - // members. Otherwise we could just use the - // Sema::InstantiatingTemplate::isAlreadyBeingInstantiated function. + if (MLTAL) { + for (const auto &List : *MLTAL) + for (const auto &TemplateArg : List.Args) + S.Context.getCanonicalTemplateArgument(TemplateArg) + .Profile(ID, S.Context); + } if (S.SatisfactionStackContains(Templ, ID)) { S.Diag(E->getExprLoc(), diag::err_constraint_depends_on_self) << E << E->getSourceRange(); return true; } - return false; } -static ExprResult EvaluateAtomicConstraint( - Sema &S, const Expr *AtomicExpr, const NamedDecl *Template, - SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, - ConstraintSatisfaction &Satisfaction) { +// Figure out the to-translation-unit depth for this function declaration for +// the purpose of seeing if they differ by constraints. This isn't the same as +// getTemplateDepth, because it includes already instantiated parents. 
-static ExprResult EvaluateAtomicConstraint(
-    Sema &S, const Expr *AtomicExpr, const NamedDecl *Template,
-    SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
-    ConstraintSatisfaction &Satisfaction) {
+// Figure out the to-translation-unit depth for this function declaration for
+// the purpose of seeing if they differ by constraints. This isn't the same as
+// getTemplateDepth, because it includes already instantiated parents.
+static unsigned
+CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND,
+                                     bool SkipForSpecialization = false) {
+  MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
+      ND, ND->getLexicalDeclContext(), /*Final=*/false,
+      /*Innermost=*/std::nullopt,
+      /*RelativeToPrimary=*/true,
+      /*Pattern=*/nullptr,
+      /*ForConstraintInstantiation=*/true, SkipForSpecialization);
+  return MLTAL.getNumLevels();
+}
+
+namespace {
+class AdjustConstraintDepth : public TreeTransform<AdjustConstraintDepth> {
+  unsigned TemplateDepth = 0;
+
+public:
+  using inherited = TreeTransform<AdjustConstraintDepth>;
+  AdjustConstraintDepth(Sema &SemaRef, unsigned TemplateDepth)
+      : inherited(SemaRef), TemplateDepth(TemplateDepth) {}
+
+  using inherited::TransformTemplateTypeParmType;
+  QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB,
+                                         TemplateTypeParmTypeLoc TL, bool) {
+    const TemplateTypeParmType *T = TL.getTypePtr();
+
+    TemplateTypeParmDecl *NewTTPDecl = nullptr;
+    if (TemplateTypeParmDecl *OldTTPDecl = T->getDecl())
+      NewTTPDecl = cast_or_null<TemplateTypeParmDecl>(
+          TransformDecl(TL.getNameLoc(), OldTTPDecl));
+
+    QualType Result = getSema().Context.getTemplateTypeParmType(
+        T->getDepth() + TemplateDepth, T->getIndex(), T->isParameterPack(),
+        NewTTPDecl);
+    TemplateTypeParmTypeLoc NewTL = TLB.push<TemplateTypeParmTypeLoc>(Result);
+    NewTL.setNameLoc(TL.getNameLoc());
+    return Result;
+  }
+
+  bool AlreadyTransformed(QualType T) {
+    if (T.isNull())
+      return true;
+
+    if (T->isInstantiationDependentType() || T->isVariablyModifiedType() ||
+        T->containsUnexpandedParameterPack())
+      return false;
+    return true;
+  }
+};
+} // namespace
+
+namespace {
+
+// FIXME: Convert it to DynamicRecursiveASTVisitor
+class HashParameterMapping : public RecursiveASTVisitor<HashParameterMapping> {
+  using inherited = RecursiveASTVisitor<HashParameterMapping>;
+  friend inherited;
+
+  Sema &SemaRef;
+  const MultiLevelTemplateArgumentList &TemplateArgs;
+  llvm::FoldingSetNodeID &ID;
+  llvm::SmallVector<TemplateArgument> UsedTemplateArgs;
+
+  UnsignedOrNone OuterPackSubstIndex;
+
+  TemplateArgument getPackSubstitutedTemplateArgument(TemplateArgument Arg) {
+    assert(*SemaRef.ArgPackSubstIndex < Arg.pack_size());
+    Arg = Arg.pack_begin()[*SemaRef.ArgPackSubstIndex];
+    if (Arg.isPackExpansion())
+      Arg = Arg.getPackExpansionPattern();
+    return Arg;
+  }
+
+  bool shouldVisitTemplateInstantiations() const { return true; }
+
+public:
+  HashParameterMapping(Sema &SemaRef,
+                       const MultiLevelTemplateArgumentList &TemplateArgs,
+                       llvm::FoldingSetNodeID &ID,
+                       UnsignedOrNone OuterPackSubstIndex)
+      : SemaRef(SemaRef), TemplateArgs(TemplateArgs), ID(ID),
+        OuterPackSubstIndex(OuterPackSubstIndex) {}
+
+  bool VisitTemplateTypeParmType(TemplateTypeParmType *T) {
+    // A lambda expression can introduce template parameters that don't have
+    // corresponding template arguments yet.
+    if (T->getDepth() >= TemplateArgs.getNumLevels())
+      return true;
+
+    TemplateArgument Arg = TemplateArgs(T->getDepth(), T->getIndex());
+
+    if (T->isParameterPack() && SemaRef.ArgPackSubstIndex) {
+      assert(Arg.getKind() == TemplateArgument::Pack &&
+             "Missing argument pack");
+
+      Arg = getPackSubstitutedTemplateArgument(Arg);
+    }
+
+    UsedTemplateArgs.push_back(
+        SemaRef.Context.getCanonicalTemplateArgument(Arg));
+    return true;
+  }
+
+  bool VisitDeclRefExpr(DeclRefExpr *E) {
+    NamedDecl *D = E->getDecl();
+    NonTypeTemplateParmDecl *NTTP = dyn_cast<NonTypeTemplateParmDecl>(D);
+    if (!NTTP)
+      return TraverseDecl(D);
+
+    TemplateArgument Arg = TemplateArgs(NTTP->getDepth(), NTTP->getPosition());
+    if (NTTP->isParameterPack() && SemaRef.ArgPackSubstIndex) {
+      assert(Arg.getKind() == TemplateArgument::Pack &&
+             "Missing argument pack");
+      Arg = getPackSubstitutedTemplateArgument(Arg);
+    }
+
+    UsedTemplateArgs.push_back(
+        SemaRef.Context.getCanonicalTemplateArgument(Arg));
+    return true;
+  }
+
+  bool VisitTypedefType(TypedefType *TT) {
+    return inherited::TraverseType(TT->desugar());
+  }
+
+  bool TraverseDecl(Decl *D) {
+    if (auto *VD = dyn_cast<ValueDecl>(D))
+      return TraverseType(VD->getType());
+
+    return inherited::TraverseDecl(D);
+  }
+
+  bool TraverseTypeLoc(TypeLoc TL, bool TraverseQualifier = true) {
+    // We don't care about TypeLocs. So traverse Types instead.
+    return TraverseType(TL.getType(), TraverseQualifier);
+  }
+
+  bool TraverseTagType(const TagType *T, bool TraverseQualifier) {
+    // T's parent can be dependent while T doesn't have any template arguments.
+    // We should have already traversed its qualifier.
+    // FIXME: Add an assert to catch cases where we failed to profile the
+    // concept. assert(!T->isDependentType() && "We missed a case in profiling
+    // concepts!");
+    return true;
+  }
+
+  bool TraverseInjectedClassNameType(InjectedClassNameType *T,
+                                     bool TraverseQualifier) {
+    return TraverseTemplateArguments(T->getTemplateArgs(SemaRef.Context));
+  }
+
+  bool TraverseTemplateArgument(const TemplateArgument &Arg) {
+    if (!Arg.containsUnexpandedParameterPack() || Arg.isPackExpansion()) {
+      // Act as if we are fully expanding this pack, if it is a PackExpansion.
+      Sema::ArgPackSubstIndexRAII _1(SemaRef, std::nullopt);
+      llvm::SaveAndRestore<UnsignedOrNone> _2(OuterPackSubstIndex,
+                                              std::nullopt);
+      return inherited::TraverseTemplateArgument(Arg);
+    }
+
+    Sema::ArgPackSubstIndexRAII _1(SemaRef, OuterPackSubstIndex);
+    return inherited::TraverseTemplateArgument(Arg);
+  }
+
+  void VisitConstraint(const NormalizedConstraintWithParamMapping &Constraint) {
+    if (!Constraint.hasParameterMapping()) {
+      for (const auto &List : TemplateArgs)
+        for (const TemplateArgument &Arg : List.Args)
+          SemaRef.Context.getCanonicalTemplateArgument(Arg).Profile(
+              ID, SemaRef.Context);
+      return;
+    }
+
+    llvm::ArrayRef<TemplateArgumentLoc> Mapping =
+        Constraint.getParameterMapping();
+    for (auto &ArgLoc : Mapping) {
+      TemplateArgument Canonical =
+          SemaRef.Context.getCanonicalTemplateArgument(ArgLoc.getArgument());
+      // We don't want type sugar to affect the profile of the cache key.
+      UsedTemplateArgs.push_back(Canonical);
+      TraverseTemplateArgument(Canonical);
+    }
+
+    for (auto &Used : UsedTemplateArgs) {
+      llvm::FoldingSetNodeID R;
+      Used.Profile(R, SemaRef.Context);
+      ID.AddNodeID(R);
+    }
+  }
+};
+
+class ConstraintSatisfactionChecker {
+  Sema &S;
+  const NamedDecl *Template;
+  SourceLocation TemplateNameLoc;
+  UnsignedOrNone PackSubstitutionIndex;
+
+  ConstraintSatisfaction &Satisfaction;
+
+private:
+  ExprResult
+  EvaluateAtomicConstraint(const Expr *AtomicExpr,
+                           const MultiLevelTemplateArgumentList &MLTAL);
+
+  UnsignedOrNone EvaluateFoldExpandedConstraintSize(
+      const FoldExpandedConstraint &FE,
+      const MultiLevelTemplateArgumentList &MLTAL);
+
+  // XXX: It is SLOW! Use it very carefully.
+  std::optional<MultiLevelTemplateArgumentList> SubstitutionInTemplateArguments(
+      const NormalizedConstraintWithParamMapping &Constraint,
+      MultiLevelTemplateArgumentList MLTAL,
+      llvm::SmallVector<TemplateArgument> &SubstitutedOuterMost);
+
+  ExprResult EvaluateSlow(const AtomicConstraint &Constraint,
+                          const MultiLevelTemplateArgumentList &MLTAL);
+
+  ExprResult Evaluate(const AtomicConstraint &Constraint,
+                      const MultiLevelTemplateArgumentList &MLTAL);
+
+  ExprResult EvaluateSlow(const FoldExpandedConstraint &Constraint,
+                          const MultiLevelTemplateArgumentList &MLTAL);
+
+  ExprResult Evaluate(const FoldExpandedConstraint &Constraint,
+                      const MultiLevelTemplateArgumentList &MLTAL);
+
+  ExprResult EvaluateSlow(const ConceptIdConstraint &Constraint,
+                          const MultiLevelTemplateArgumentList &MLTAL,
+                          unsigned int Size);
+
+  ExprResult Evaluate(const ConceptIdConstraint &Constraint,
+                      const MultiLevelTemplateArgumentList &MLTAL);
+
+  ExprResult Evaluate(const CompoundConstraint &Constraint,
+                      const MultiLevelTemplateArgumentList &MLTAL);
+
+public:
+  ConstraintSatisfactionChecker(Sema &SemaRef, const NamedDecl *Template,
+                                SourceLocation TemplateNameLoc,
+                                UnsignedOrNone PackSubstitutionIndex,
+                                ConstraintSatisfaction &Satisfaction)
+      : S(SemaRef), Template(Template), TemplateNameLoc(TemplateNameLoc),
+        PackSubstitutionIndex(PackSubstitutionIndex),
+        Satisfaction(Satisfaction) {}
+
+  ExprResult Evaluate(const NormalizedConstraint &Constraint,
+                      const MultiLevelTemplateArgumentList &MLTAL);
+};
+
+StringRef allocateStringFromConceptDiagnostic(const Sema &S,
+                                              const PartialDiagnostic Diag) {
+  SmallString<128> DiagString;
+  DiagString = ": ";
+  Diag.EmitToString(S.getDiagnostics(), DiagString);
+  return S.getASTContext().backupStr(DiagString);
+}
+
+} // namespace
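The Evaluate overloads defined below all share one memoization protocol against Sema::UnsubstitutedConstraintSatisfactionCache. Condensed from the code that follows (no new API), the shape of a lookup is:

    llvm::FoldingSetNodeID ID;
    ID.AddPointer(/* constraint identity: expr, pattern, or concept-id */);
    ID.AddInteger(OuterPackSubstIndex.toInternalRepresentation());
    // Mix in only the template arguments the parameter mapping actually uses:
    HashParameterMapping(S, MLTAL, ID, OuterPackSubstIndex).VisitConstraint(C);

    if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID);
        Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) {
      // Replay Iter->second.Satisfaction and return Iter->second.SubstExpr.
    }
    // Otherwise call EvaluateSlow(...), then copy the freshly appended
    // Satisfaction details into an UnsubstitutedConstraintSatisfactionCacheResult
    // and insert it under ID.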
+ExprResult ConstraintSatisfactionChecker::EvaluateAtomicConstraint(
+    const Expr *AtomicExpr, const MultiLevelTemplateArgumentList &MLTAL) {
   EnterExpressionEvaluationContext ConstantEvaluated(
       S, Sema::ExpressionEvaluationContext::ConstantEvaluated,
       Sema::ReuseLambdaContextDecl);
 
+  llvm::FoldingSetNodeID ID;
+  if (Template &&
+      DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, &MLTAL)) {
+    Satisfaction.IsSatisfied = false;
+    Satisfaction.ContainsErrors = true;
+    return ExprEmpty();
+  }
+  SatisfactionStackRAII StackRAII(S, Template, ID);
+
   // Atomic constraint - substitute arguments and check satisfaction.
-  ExprResult SubstitutedExpression;
+  ExprResult SubstitutedExpression = const_cast<Expr *>(AtomicExpr);
   {
     TemplateDeductionInfo Info(TemplateNameLoc);
     Sema::InstantiatingTemplate Inst(
@@ -220,16 +485,6 @@ static ExprResult EvaluateAtomicConstraint(
     if (Inst.isInvalid())
       return ExprError();
 
-    llvm::FoldingSetNodeID ID;
-    if (Template &&
-        DiagRecursiveConstraintEval(S, ID, Template, AtomicExpr, MLTAL)) {
-      Satisfaction.IsSatisfied = false;
-      Satisfaction.ContainsErrors = true;
-      return ExprEmpty();
-    }
-
-    SatisfactionStackRAII StackRAII(S, Template, ID);
-
     // We do not want error diagnostics escaping here.
     Sema::SFINAETrap Trap(S);
     SubstitutedExpression =
@@ -247,21 +502,16 @@ static ExprResult EvaluateAtomicConstraint(
       PartialDiagnosticAt SubstDiag{SourceLocation(),
                                     PartialDiagnostic::NullDiagnostic()};
       Info.takeSFINAEDiagnostic(SubstDiag);
-      // FIXME: Concepts: This is an unfortunate consequence of there
+      // FIXME: This is an unfortunate consequence of there
       // being no serialization code for PartialDiagnostics and the fact
      // that serializing them would likely take a lot more storage than
       // just storing them as strings. We would still like, in the
       // future, to serialize the proper PartialDiagnostic as serializing
       // it as a string defeats the purpose of the diagnostic mechanism.
-      SmallString<128> DiagString;
-      DiagString = ": ";
-      SubstDiag.second.EmitToString(S.getDiagnostics(), DiagString);
-      unsigned MessageSize = DiagString.size();
-      char *Mem = new (S.Context) char[MessageSize];
-      memcpy(Mem, DiagString.c_str(), MessageSize);
       Satisfaction.Details.emplace_back(
-          new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{
-              SubstDiag.first, StringRef(Mem, MessageSize)});
+          new (S.Context) ConstraintSubstitutionDiagnostic{
+              SubstDiag.first,
+              allocateStringFromConceptDiagnostic(S, SubstDiag.second)});
       Satisfaction.IsSatisfied = false;
       return ExprEmpty();
     }
@@ -289,284 +539,525 @@ static ExprResult EvaluateAtomicConstraint(
   return SubstitutedExpression;
 }
 
-static UnsignedOrNone EvaluateFoldExpandedConstraintSize(
-    Sema &S, const CXXFoldExpr *FE, const NamedDecl *Template,
-    SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
-    ConstraintSatisfaction &Satisfaction) {
+std::optional<MultiLevelTemplateArgumentList>
+ConstraintSatisfactionChecker::SubstitutionInTemplateArguments(
+    const NormalizedConstraintWithParamMapping &Constraint,
+    MultiLevelTemplateArgumentList MLTAL,
+    llvm::SmallVector<TemplateArgument> &SubstitutedOuterMost) {
+
+  if (!Constraint.hasParameterMapping())
+    return std::move(MLTAL);
+
+  TemplateDeductionInfo Info(Constraint.getBeginLoc());
+  Sema::InstantiatingTemplate Inst(
+      S, Constraint.getBeginLoc(),
+      Sema::InstantiatingTemplate::ConstraintSubstitution{},
+      // FIXME: improve const-correctness of InstantiatingTemplate
+      const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
+  if (Inst.isInvalid())
+    return std::nullopt;
+
+  Sema::SFINAETrap Trap(S);
+
+  TemplateArgumentListInfo SubstArgs;
+  Sema::ArgPackSubstIndexRAII SubstIndex(
+      S, Constraint.getPackSubstitutionIndex()
+             ? Constraint.getPackSubstitutionIndex()
+             : PackSubstitutionIndex);
+
+  if (S.SubstTemplateArgumentsInParameterMapping(
+          Constraint.getParameterMapping(), Constraint.getBeginLoc(), MLTAL,
+          SubstArgs, /*BuildPackExpansionTypes=*/true)) {
+    Satisfaction.IsSatisfied = false;
+    return std::nullopt;
+  }
+
+  Sema::CheckTemplateArgumentInfo CTAI;
+  auto *TD = const_cast<TemplateDecl *>(
+      cast<TemplateDecl>(Constraint.getConstraintDecl()));
+  if (S.CheckTemplateArgumentList(TD, Constraint.getUsedTemplateParamList(),
+                                  TD->getLocation(), SubstArgs,
+                                  /*DefaultArguments=*/{},
+                                  /*PartialTemplateArgs=*/false, CTAI))
+    return std::nullopt;
+  const NormalizedConstraint::OccurenceList &Used =
+      Constraint.mappingOccurenceList();
+  SubstitutedOuterMost =
+      llvm::to_vector_of<TemplateArgument>(MLTAL.getOutermost());
+  unsigned Offset = 0;
+  for (unsigned I = 0, MappedIndex = 0; I < Used.size(); I++) {
+    TemplateArgument Arg;
+    if (Used[I])
+      Arg = S.Context.getCanonicalTemplateArgument(
+          CTAI.SugaredConverted[MappedIndex++]);
+    if (I < SubstitutedOuterMost.size()) {
+      SubstitutedOuterMost[I] = Arg;
+      Offset = I + 1;
+    } else {
+      SubstitutedOuterMost.push_back(Arg);
+      Offset = SubstitutedOuterMost.size();
+    }
+  }
+  if (Offset < SubstitutedOuterMost.size())
+    SubstitutedOuterMost.erase(SubstitutedOuterMost.begin() + Offset);
+
+  MLTAL.replaceOutermostTemplateArguments(
+      const_cast<NamedDecl *>(Constraint.getConstraintDecl()),
+      SubstitutedOuterMost);
+  return std::move(MLTAL);
+}
+
+ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
+    const AtomicConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL) {
+
+  llvm::SmallVector<TemplateArgument> SubstitutedOuterMost;
+  std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs =
+      SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost);
+  if (!SubstitutedArgs) {
+    Satisfaction.IsSatisfied = false;
+    return ExprEmpty();
+  }
+
+  Sema::ArgPackSubstIndexRAII SubstIndex(S, PackSubstitutionIndex);
+  ExprResult SubstitutedAtomicExpr = EvaluateAtomicConstraint(
+      Constraint.getConstraintExpr(), *SubstitutedArgs);
+
+  if (SubstitutedAtomicExpr.isInvalid())
+    return ExprError();
+
+  if (SubstitutedAtomicExpr.isUnset())
+    // Evaluator has decided satisfaction without yielding an expression.
+    return ExprEmpty();
+
+  // We don't have the ability to evaluate this, since it contains a
+  // RecoveryExpr, so we want to fail overload resolution. Otherwise,
+  // we'd potentially pick up a different overload, and cause confusing
+  // diagnostics. SO, add a failure detail that will cause us to make this
+  // overload set not viable.
+  if (SubstitutedAtomicExpr.get()->containsErrors()) {
+    Satisfaction.IsSatisfied = false;
+    Satisfaction.ContainsErrors = true;
+
+    PartialDiagnostic Msg = S.PDiag(diag::note_constraint_references_error);
+    Satisfaction.Details.emplace_back(
+        new (S.Context) ConstraintSubstitutionDiagnostic{
+            SubstitutedAtomicExpr.get()->getBeginLoc(),
+            allocateStringFromConceptDiagnostic(S, Msg)});
+    return SubstitutedAtomicExpr;
+  }
+
+  if (SubstitutedAtomicExpr.get()->isValueDependent()) {
+    Satisfaction.IsSatisfied = true;
+    Satisfaction.ContainsErrors = false;
+    return SubstitutedAtomicExpr;
+  }
+
+  EnterExpressionEvaluationContext ConstantEvaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+  SmallVector<PartialDiagnosticAt, 2> EvaluationDiags;
+  Expr::EvalResult EvalResult;
+  EvalResult.Diag = &EvaluationDiags;
+  if (!SubstitutedAtomicExpr.get()->EvaluateAsConstantExpr(EvalResult,
+                                                           S.Context) ||
+      !EvaluationDiags.empty()) {
+    // C++2a [temp.constr.atomic]p1
+    // ...E shall be a constant expression of type bool.
+    S.Diag(SubstitutedAtomicExpr.get()->getBeginLoc(),
+           diag::err_non_constant_constraint_expression)
+        << SubstitutedAtomicExpr.get()->getSourceRange();
+    for (const PartialDiagnosticAt &PDiag : EvaluationDiags)
+      S.Diag(PDiag.first, PDiag.second);
+    return ExprError();
+  }
+
+  assert(EvalResult.Val.isInt() &&
+         "evaluating bool expression didn't produce int");
+  Satisfaction.IsSatisfied = EvalResult.Val.getInt().getBoolValue();
+  if (!Satisfaction.IsSatisfied)
+    Satisfaction.Details.emplace_back(SubstitutedAtomicExpr.get());
+
+  return SubstitutedAtomicExpr;
+}
+
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+    const AtomicConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL) {
+
+  unsigned Size = Satisfaction.Details.size();
+  llvm::FoldingSetNodeID ID;
+  UnsignedOrNone OuterPackSubstIndex =
+      Constraint.getPackSubstitutionIndex()
+          ? Constraint.getPackSubstitutionIndex()
+          : PackSubstitutionIndex;
+
+  ID.AddPointer(Constraint.getConstraintExpr());
+  ID.AddInteger(OuterPackSubstIndex.toInternalRepresentation());
+  HashParameterMapping(S, MLTAL, ID, OuterPackSubstIndex)
+      .VisitConstraint(Constraint);
+
+  if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID);
+      Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) {
+
+    auto &Cached = Iter->second.Satisfaction;
+    Satisfaction.ContainsErrors = Cached.ContainsErrors;
+    Satisfaction.IsSatisfied = Cached.IsSatisfied;
+    Satisfaction.Details.insert(Satisfaction.Details.begin() + Size,
+                                Cached.Details.begin(), Cached.Details.end());
+    return Iter->second.SubstExpr;
+  }
+
+  ExprResult E = EvaluateSlow(Constraint, MLTAL);
+
+  UnsubstitutedConstraintSatisfactionCacheResult Cache;
+  Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors;
+  Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied;
+  std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(),
+            std::back_inserter(Cache.Satisfaction.Details));
+  Cache.SubstExpr = E;
+  S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)});
+
+  return E;
+}
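An illustrative user-level view of what EvaluateFoldExpandedConstraintSize (next) computes, with a hypothetical concept name: for a fold-expanded constraint the pattern is checked once per pack element, and NumExpansions is that element count.

    template <class T>
    concept Small = sizeof(T) <= 4;

    template <class... Ts>
      requires (Small<Ts> && ...)   // fold-expanded constraint
    void f(Ts...);

    // f(1, 'c');  => NumExpansions == 2; checks Small<int> && Small<char>.
    // When the parameter mapping involves packs of mismatched sizes, the
    // expansion check fails in a SFINAE-friendly way, which is why the
    // function below installs an SFINAETrap before computing the size.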
+UnsignedOrNone
+ConstraintSatisfactionChecker::EvaluateFoldExpandedConstraintSize(
+    const FoldExpandedConstraint &FE,
+    const MultiLevelTemplateArgumentList &MLTAL) {
+
   // We should ignore errors in the presence of packs of different size.
   Sema::SFINAETrap Trap(S);
 
-  Expr *Pattern = FE->getPattern();
+  Expr *Pattern = const_cast<Expr *>(FE.getPattern());
 
   SmallVector<UnexpandedParameterPack, 2> Unexpanded;
   S.collectUnexpandedParameterPacks(Pattern, Unexpanded);
   assert(!Unexpanded.empty() && "Pack expansion without parameter packs?");
   bool Expand = true;
   bool RetainExpansion = false;
-  UnsignedOrNone NumExpansions = FE->getNumExpansions();
+  UnsignedOrNone NumExpansions(std::nullopt);
   if (S.CheckParameterPacksForExpansion(
-          FE->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL,
-          /*FailOnPackProducingTemplates=*/true, Expand, RetainExpansion,
+          Pattern->getExprLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL,
+          /*FailOnPackProducingTemplates=*/false, Expand, RetainExpansion,
           NumExpansions) ||
       !Expand || RetainExpansion)
     return std::nullopt;
 
   if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) {
-    S.Diag(FE->getEllipsisLoc(),
+    S.Diag(Pattern->getExprLoc(),
            clang::diag::err_fold_expression_limit_exceeded)
         << *NumExpansions << S.getLangOpts().BracketDepth
-        << FE->getSourceRange();
-    S.Diag(FE->getEllipsisLoc(), diag::note_bracket_depth);
+        << Pattern->getSourceRange();
+    S.Diag(Pattern->getExprLoc(), diag::note_bracket_depth);
    return std::nullopt;
   }
   return NumExpansions;
 }
 
-static ExprResult calculateConstraintSatisfaction(
-    Sema &S, const Expr *ConstraintExpr, const NamedDecl *Template,
-    SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
-    ConstraintSatisfaction &Satisfaction);
+ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
+    const FoldExpandedConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL) {
+
+  bool Conjunction = Constraint.getFoldOperator() ==
+                     FoldExpandedConstraint::FoldOperatorKind::And;
+  unsigned EffectiveDetailEndIndex = Satisfaction.Details.size();
+
+  llvm::SmallVector<TemplateArgument> SubstitutedOuterMost;
+  // FIXME: Is PackSubstitutionIndex correct?
+  llvm::SaveAndRestore _(PackSubstitutionIndex, S.ArgPackSubstIndex);
+  std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs =
+      SubstitutionInTemplateArguments(
+          static_cast<const NormalizedConstraintWithParamMapping &>(Constraint),
+          MLTAL, SubstitutedOuterMost);
+  if (!SubstitutedArgs) {
+    Satisfaction.IsSatisfied = false;
+    return ExprError();
+  }
 
-static ExprResult calculateConstraintSatisfaction(
-    Sema &S, const Expr *LHS, OverloadedOperatorKind Op, const Expr *RHS,
-    const NamedDecl *Template, SourceLocation TemplateNameLoc,
-    const MultiLevelTemplateArgumentList &MLTAL,
-    ConstraintSatisfaction &Satisfaction) {
-  size_t EffectiveDetailEndIndex = Satisfaction.Details.size();
+  ExprResult Out;
+  UnsignedOrNone NumExpansions =
+      EvaluateFoldExpandedConstraintSize(Constraint, *SubstitutedArgs);
+  if (!NumExpansions)
+    return ExprEmpty();
 
-  ExprResult LHSRes = calculateConstraintSatisfaction(
-      S, LHS, Template, TemplateNameLoc, MLTAL, Satisfaction);
+  if (*NumExpansions == 0) {
+    Satisfaction.IsSatisfied = Conjunction;
+    return ExprEmpty();
+  }
 
-  if (LHSRes.isInvalid())
-    return ExprError();
+  for (unsigned I = 0; I < *NumExpansions; I++) {
+    Sema::ArgPackSubstIndexRAII SubstIndex(S, I);
+    Satisfaction.IsSatisfied = false;
+    Satisfaction.ContainsErrors = false;
+    ExprResult Expr =
+        ConstraintSatisfactionChecker(S, Template, TemplateNameLoc,
+                                      UnsignedOrNone(I), Satisfaction)
+            .Evaluate(Constraint.getNormalizedPattern(), *SubstitutedArgs);
+    if (Expr.isUsable()) {
+      if (Out.isUnset())
+        Out = Expr;
+      else
+        Out = BinaryOperator::Create(S.Context, Out.get(), Expr.get(),
+                                     Conjunction ? BinaryOperatorKind::BO_LAnd
+                                                 : BinaryOperatorKind::BO_LOr,
+                                     S.Context.BoolTy, VK_PRValue, OK_Ordinary,
+                                     Constraint.getBeginLoc(),
+                                     FPOptionsOverride{});
+    } else {
+      assert(!Satisfaction.IsSatisfied);
+    }
+    if (!Conjunction && Satisfaction.IsSatisfied) {
+      Satisfaction.Details.erase(Satisfaction.Details.begin() +
+                                     EffectiveDetailEndIndex,
+                                 Satisfaction.Details.end());
+      break;
+    }
+    if (Satisfaction.IsSatisfied != Conjunction)
+      return Out;
+  }
 
-  bool IsLHSSatisfied = Satisfaction.IsSatisfied;
-
-  if (Op == clang::OO_PipePipe && IsLHSSatisfied)
-    // [temp.constr.op] p3
-    //    A disjunction is a constraint taking two operands. To determine if
-    //    a disjunction is satisfied, the satisfaction of the first operand
-    //    is checked. If that is satisfied, the disjunction is satisfied.
-    //    Otherwise, the disjunction is satisfied if and only if the second
-    //    operand is satisfied.
-    // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp.
-    return LHSRes;
-
-  if (Op == clang::OO_AmpAmp && !IsLHSSatisfied)
-    // [temp.constr.op] p2
-    //    A conjunction is a constraint taking two operands. To determine if
-    //    a conjunction is satisfied, the satisfaction of the first operand
-    //    is checked. If that is not satisfied, the conjunction is not
-    //    satisfied. Otherwise, the conjunction is satisfied if and only if
-    //    the second operand is satisfied.
-    // LHS is instantiated while RHS is not. Skip creating invalid BinaryOp.
-    return LHSRes;
-
-  ExprResult RHSRes = calculateConstraintSatisfaction(
-      S, RHS, Template, TemplateNameLoc, MLTAL, Satisfaction);
-  if (RHSRes.isInvalid())
-    return ExprError();
-
-  bool IsRHSSatisfied = Satisfaction.IsSatisfied;
-  // Current implementation adds diagnostic information about the falsity
-  // of each false atomic constraint expression when it evaluates them.
-  // When the evaluation results to `false || true`, the information
-  // generated during the evaluation of left-hand side is meaningless
-  // because the whole expression evaluates to true.
-  // The following code removes the irrelevant diagnostic information.
-  // FIXME: We should probably delay the addition of diagnostic information
-  // until we know the entire expression is false.
-  if (Op == clang::OO_PipePipe && IsRHSSatisfied) {
-    auto EffectiveDetailEnd = Satisfaction.Details.begin();
-    std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex);
-    Satisfaction.Details.erase(EffectiveDetailEnd, Satisfaction.Details.end());
-  }
-
-  if (!LHSRes.isUsable() || !RHSRes.isUsable())
-    return ExprEmpty();
+  return Out;
+}
 
-  return BinaryOperator::Create(S.Context, LHSRes.get(), RHSRes.get(),
-                                BinaryOperator::getOverloadedOpcode(Op),
-                                S.Context.BoolTy, VK_PRValue, OK_Ordinary,
-                                LHS->getBeginLoc(), FPOptionsOverride{});
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+    const FoldExpandedConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL) {
+
+  llvm::FoldingSetNodeID ID;
+  ID.AddPointer(Constraint.getPattern());
+  HashParameterMapping(S, MLTAL, ID, std::nullopt).VisitConstraint(Constraint);
+
+  if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID);
+      Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) {
+
+    auto &Cached = Iter->second.Satisfaction;
+    Satisfaction.ContainsErrors = Cached.ContainsErrors;
+    Satisfaction.IsSatisfied = Cached.IsSatisfied;
+    Satisfaction.Details.insert(Satisfaction.Details.end(),
+                                Cached.Details.begin(), Cached.Details.end());
+    return Iter->second.SubstExpr;
+  }
+
+  unsigned Size = Satisfaction.Details.size();
+
+  ExprResult E = EvaluateSlow(Constraint, MLTAL);
+  UnsubstitutedConstraintSatisfactionCacheResult Cache;
+  Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors;
+  Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied;
+  std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(),
+            std::back_inserter(Cache.Satisfaction.Details));
+  Cache.SubstExpr = E;
+  S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)});
+  return E;
+}
 
-static ExprResult calculateConstraintSatisfaction(
-    Sema &S, const CXXFoldExpr *FE, const NamedDecl *Template,
-    SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
-    ConstraintSatisfaction &Satisfaction) {
-  bool Conjunction = FE->getOperator() == BinaryOperatorKind::BO_LAnd;
-  size_t EffectiveDetailEndIndex = Satisfaction.Details.size();
+ExprResult ConstraintSatisfactionChecker::EvaluateSlow(
+    const ConceptIdConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL, unsigned Size) {
+  const ConceptReference *ConceptId = Constraint.getConceptId();
 
-  ExprResult Out;
-  if (FE->isLeftFold() && FE->getInit()) {
-    Out = calculateConstraintSatisfaction(S, FE->getInit(), Template,
-                                          TemplateNameLoc, MLTAL, Satisfaction);
-    if (Out.isInvalid())
-      return ExprError();
+  llvm::SmallVector<TemplateArgument> SubstitutedOuterMost;
+  std::optional<MultiLevelTemplateArgumentList> SubstitutedArgs =
+      SubstitutionInTemplateArguments(Constraint, MLTAL, SubstitutedOuterMost);
 
-    // If the first clause of a conjunction is not satisfied,
-    // or if the first clause of a disjection is satisfied,
-    // we have established satisfaction of the whole constraint
-    // and we should not continue further.
-    if (Conjunction != Satisfaction.IsSatisfied)
-      return Out;
-  }
-  UnsignedOrNone NumExpansions = EvaluateFoldExpandedConstraintSize(
-      S, FE, Template, TemplateNameLoc, MLTAL, Satisfaction);
-  if (!NumExpansions)
+  if (!SubstitutedArgs) {
+    Satisfaction.IsSatisfied = false;
+    // FIXME: diagnostics?
+    return ExprError();
+  }
-  for (unsigned I = 0; I < *NumExpansions; I++) {
-    Sema::ArgPackSubstIndexRAII SubstIndex(S, I);
-    ExprResult Res = calculateConstraintSatisfaction(
-        S, FE->getPattern(), Template, TemplateNameLoc, MLTAL, Satisfaction);
-    if (Res.isInvalid())
-      return ExprError();
-    bool IsRHSSatisfied = Satisfaction.IsSatisfied;
-    if (!Conjunction && IsRHSSatisfied) {
-      auto EffectiveDetailEnd = Satisfaction.Details.begin();
-      std::advance(EffectiveDetailEnd, EffectiveDetailEndIndex);
-      Satisfaction.Details.erase(EffectiveDetailEnd,
-                                 Satisfaction.Details.end());
-    }
-    if (Out.isUnset())
-      Out = Res;
-    else if (!Res.isUnset()) {
-      Out = BinaryOperator::Create(
-          S.Context, Out.get(), Res.get(), FE->getOperator(), S.Context.BoolTy,
-          VK_PRValue, OK_Ordinary, FE->getBeginLoc(), FPOptionsOverride{});
-    }
-    if (Conjunction != IsRHSSatisfied)
-      return Out;
-  }
 
+  Sema::SFINAETrap Trap(S);
+  Sema::ArgPackSubstIndexRAII SubstIndex(
+      S, Constraint.getPackSubstitutionIndex()
+             ? Constraint.getPackSubstitutionIndex()
+             : PackSubstitutionIndex);
+
+  const ASTTemplateArgumentListInfo *Ori =
+      ConceptId->getTemplateArgsAsWritten();
+  TemplateDeductionInfo Info(TemplateNameLoc);
+  Sema::InstantiatingTemplate _(
+      S, TemplateNameLoc, Sema::InstantiatingTemplate::ConstraintSubstitution{},
+      const_cast<NamedDecl *>(Template), Info, Constraint.getSourceRange());
+
+  TemplateArgumentListInfo OutArgs(Ori->LAngleLoc, Ori->RAngleLoc);
+  if (S.SubstTemplateArguments(Ori->arguments(), *SubstitutedArgs, OutArgs) ||
+      Trap.hasErrorOccurred()) {
+    Satisfaction.IsSatisfied = false;
+    if (!Trap.hasErrorOccurred())
-  if (FE->isRightFold() && FE->getInit()) {
-    ExprResult Res = calculateConstraintSatisfaction(
-        S, FE->getInit(), Template, TemplateNameLoc, MLTAL, Satisfaction);
-    if (Out.isInvalid())
+      return ExprError();
 
+    PartialDiagnosticAt SubstDiag{SourceLocation(),
+                                  PartialDiagnostic::NullDiagnostic()};
+    Info.takeSFINAEDiagnostic(SubstDiag);
+    // FIXME: This is an unfortunate consequence of there
+    // being no serialization code for PartialDiagnostics and the fact
+    // that serializing them would likely take a lot more storage than
+    // just storing them as strings. We would still like, in the
+    // future, to serialize the proper PartialDiagnostic as serializing
+    // it as a string defeats the purpose of the diagnostic mechanism.
+    Satisfaction.Details.insert(
+        Satisfaction.Details.begin() + Size,
+        new (S.Context) ConstraintSubstitutionDiagnostic{
+            SubstDiag.first,
+            allocateStringFromConceptDiagnostic(S, SubstDiag.second)});
+    return ExprError();
   }
 
-  if (Out.isUnset()) {
-    Satisfaction.IsSatisfied = Conjunction;
-    Out = S.BuildEmptyCXXFoldExpr(FE->getBeginLoc(), FE->getOperator());
+  CXXScopeSpec SS;
+  SS.Adopt(ConceptId->getNestedNameSpecifierLoc());
+
+  ExprResult SubstitutedConceptId = S.CheckConceptTemplateId(
+      SS, ConceptId->getTemplateKWLoc(), ConceptId->getConceptNameInfo(),
+      ConceptId->getFoundDecl(), ConceptId->getNamedConcept(), &OutArgs,
+      /*DoCheckConstraintSatisfaction=*/false);
+
+  if (SubstitutedConceptId.isInvalid() || Trap.hasErrorOccurred())
+    return ExprError();
+
+  if (Size != Satisfaction.Details.size()) {
+    Satisfaction.Details.insert(
+        Satisfaction.Details.begin() + Size,
+        UnsatisfiedConstraintRecord(
+            SubstitutedConceptId.getAs<ConceptSpecializationExpr>()
+                ->getConceptReference()));
   }
-  return Out;
+  return SubstitutedConceptId;
+}
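EvaluateSlow above substitutes into the concept-id itself only after the normalized constraint is known to be unsatisfied, so the note can name the specialized concept without paying for substitution on the success path. An illustrative user-level effect (hypothetical code; diagnostic wording approximate):

    template <class T>
    concept Addable = requires(T a, T b) { a + b; };

    struct X {};
    static_assert(Addable<X>);
    // error: static assertion failed
    // note: because 'X' does not satisfy 'Addable'; the rebuilt
    // ConceptSpecializationExpr supplies the 'Addable<X>' shown in notes.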
-static ExprResult calculateConstraintSatisfaction(
-    Sema &S, const Expr *ConstraintExpr, const NamedDecl *Template,
-    SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL,
-    ConstraintSatisfaction &Satisfaction) {
-  ConstraintExpr = ConstraintExpr->IgnoreParenImpCasts();
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+    const ConceptIdConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL) {
 
-  if (LogicalBinOp BO = ConstraintExpr)
-    return calculateConstraintSatisfaction(
-        S, BO.getLHS(), BO.getOp(), BO.getRHS(), Template, TemplateNameLoc,
-        MLTAL, Satisfaction);
+  const ConceptReference *ConceptId = Constraint.getConceptId();
 
-  if (auto *C = dyn_cast<ExprWithCleanups>(ConstraintExpr)) {
-    // These aren't evaluated, so we don't care about cleanups, so we can just
-    // evaluate these as if the cleanups didn't exist.
-    return calculateConstraintSatisfaction(
-        S, C->getSubExpr(), Template, TemplateNameLoc, MLTAL, Satisfaction);
-  }
+  UnsignedOrNone OuterPackSubstIndex =
+      Constraint.getPackSubstitutionIndex()
+          ? Constraint.getPackSubstitutionIndex()
+          : PackSubstitutionIndex;
+
+  Sema::InstantiatingTemplate _(S, ConceptId->getBeginLoc(),
+                                Sema::InstantiatingTemplate::ConstraintsCheck{},
+                                ConceptId->getNamedConcept(),
+                                MLTAL.getInnermost(),
+                                Constraint.getSourceRange());
+
+  unsigned Size = Satisfaction.Details.size();
 
-  if (auto *FE = dyn_cast<CXXFoldExpr>(ConstraintExpr);
-      FE && S.getLangOpts().CPlusPlus26 &&
-      (FE->getOperator() == BinaryOperatorKind::BO_LAnd ||
-       FE->getOperator() == BinaryOperatorKind::BO_LOr)) {
-    return calculateConstraintSatisfaction(S, FE, Template, TemplateNameLoc,
-                                           MLTAL, Satisfaction);
+  ExprResult E = Evaluate(Constraint.getNormalizedConstraint(), MLTAL);
+
+  if (!E.isUsable()) {
+    Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, ConceptId);
+    return E;
   }
 
-  // FIXME: We should not treat ConceptSpecializationExpr as atomic constraints.
+  // ConceptIdConstraint is only relevant for diagnostics,
+  // so if the normalized constraint is satisfied, we should not
+  // substitute into the constraint.
+  if (Satisfaction.IsSatisfied)
+    return E;
 
-  // An atomic constraint expression
-  ExprResult SubstitutedAtomicExpr = EvaluateAtomicConstraint(
-      S, ConstraintExpr, Template, TemplateNameLoc, MLTAL, Satisfaction);
+  llvm::FoldingSetNodeID ID;
+  ID.AddPointer(Constraint.getConceptId());
+  ID.AddInteger(OuterPackSubstIndex.toInternalRepresentation());
+  HashParameterMapping(S, MLTAL, ID, OuterPackSubstIndex)
+      .VisitConstraint(Constraint);
+
+  if (auto Iter = S.UnsubstitutedConstraintSatisfactionCache.find(ID);
+      Iter != S.UnsubstitutedConstraintSatisfactionCache.end()) {
+
+    auto &Cached = Iter->second.Satisfaction;
+    Satisfaction.ContainsErrors = Cached.ContainsErrors;
+    Satisfaction.IsSatisfied = Cached.IsSatisfied;
+    Satisfaction.Details.insert(Satisfaction.Details.begin() + Size,
+                                Cached.Details.begin(), Cached.Details.end());
+    return Iter->second.SubstExpr;
+  }
+
+  ExprResult CE = EvaluateSlow(Constraint, MLTAL, Size);
+  if (CE.isInvalid())
+    return E;
+  UnsubstitutedConstraintSatisfactionCacheResult Cache;
+  Cache.Satisfaction.ContainsErrors = Satisfaction.ContainsErrors;
+  Cache.Satisfaction.IsSatisfied = Satisfaction.IsSatisfied;
+  std::copy(Satisfaction.Details.begin() + Size, Satisfaction.Details.end(),
+            std::back_inserter(Cache.Satisfaction.Details));
+  Cache.SubstExpr = CE;
+  S.UnsubstitutedConstraintSatisfactionCache.insert({ID, std::move(Cache)});
+  return CE;
+}
 
-  if (SubstitutedAtomicExpr.isInvalid())
-    return ExprError();
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+    const CompoundConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL) {
 
-  if (!SubstitutedAtomicExpr.isUsable())
-    // Evaluator has decided satisfaction without yielding an expression.
-    return ExprEmpty();
+  unsigned EffectiveDetailEndIndex = Satisfaction.Details.size();
+
+  bool Conjunction =
+      Constraint.getCompoundKind() == NormalizedConstraint::CCK_Conjunction;
+
+  ExprResult LHS = Evaluate(Constraint.getLHS(), MLTAL);
+
+  if (Conjunction && (!Satisfaction.IsSatisfied || Satisfaction.ContainsErrors))
+    return LHS;
+
+  if (!Conjunction && LHS.isUsable() && Satisfaction.IsSatisfied &&
+      !Satisfaction.ContainsErrors)
+    return LHS;
+
+  Satisfaction.ContainsErrors = false;
+  Satisfaction.IsSatisfied = false;
 
-  // We don't have the ability to evaluate this, since it contains a
-  // RecoveryExpr, so we want to fail overload resolution. Otherwise,
-  // we'd potentially pick up a different overload, and cause confusing
-  // diagnostics. SO, add a failure detail that will cause us to make this
-  // overload set not viable.
-  if (SubstitutedAtomicExpr.get()->containsErrors()) {
-    Satisfaction.IsSatisfied = false;
-    Satisfaction.ContainsErrors = true;
+  ExprResult RHS = Evaluate(Constraint.getRHS(), MLTAL);
 
-    PartialDiagnostic Msg = S.PDiag(diag::note_constraint_references_error);
-    SmallString<128> DiagString;
-    DiagString = ": ";
-    Msg.EmitToString(S.getDiagnostics(), DiagString);
-    unsigned MessageSize = DiagString.size();
-    char *Mem = new (S.Context) char[MessageSize];
-    memcpy(Mem, DiagString.c_str(), MessageSize);
-    Satisfaction.Details.emplace_back(
-        new (S.Context) ConstraintSatisfaction::SubstitutionDiagnostic{
-            SubstitutedAtomicExpr.get()->getBeginLoc(),
-            StringRef(Mem, MessageSize)});
-    return SubstitutedAtomicExpr;
-  }
+  if (RHS.isUsable() && Satisfaction.IsSatisfied &&
+      !Satisfaction.ContainsErrors)
+    Satisfaction.Details.erase(Satisfaction.Details.begin() +
+                                   EffectiveDetailEndIndex,
+                               Satisfaction.Details.end());
 
-  EnterExpressionEvaluationContext ConstantEvaluated(
-      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
-  SmallVector<PartialDiagnosticAt, 2> EvaluationDiags;
-  Expr::EvalResult EvalResult;
-  EvalResult.Diag = &EvaluationDiags;
-  if (!SubstitutedAtomicExpr.get()->EvaluateAsConstantExpr(EvalResult,
-                                                           S.Context) ||
-      !EvaluationDiags.empty()) {
-    // C++2a [temp.constr.atomic]p1
-    // ...E shall be a constant expression of type bool.
-    S.Diag(SubstitutedAtomicExpr.get()->getBeginLoc(),
-           diag::err_non_constant_constraint_expression)
-        << SubstitutedAtomicExpr.get()->getSourceRange();
-    for (const PartialDiagnosticAt &PDiag : EvaluationDiags)
-      S.Diag(PDiag.first, PDiag.second);
-    return ExprError();
-  }
+  if (!LHS.isUsable())
+    return RHS;
 
-  assert(EvalResult.Val.isInt() &&
-         "evaluating bool expression didn't produce int");
-  Satisfaction.IsSatisfied = EvalResult.Val.getInt().getBoolValue();
-  if (!Satisfaction.IsSatisfied)
-    Satisfaction.Details.emplace_back(SubstitutedAtomicExpr.get());
+  if (!RHS.isUsable())
+    return LHS;
 
-  return SubstitutedAtomicExpr;
+  return BinaryOperator::Create(S.Context, LHS.get(), RHS.get(),
+                                Conjunction ? BinaryOperatorKind::BO_LAnd
+                                            : BinaryOperatorKind::BO_LOr,
+                                S.Context.BoolTy, VK_PRValue, OK_Ordinary,
+                                Constraint.getBeginLoc(), FPOptionsOverride{});
 }
 
-static ExprResult calculateConstraintSatisfaction(
-    Sema &S, const NamedDecl *Template, SourceLocation TemplateNameLoc,
-    const MultiLevelTemplateArgumentList &MLTAL, const Expr *ConstraintExpr,
-    ConstraintSatisfaction &Satisfaction) {
+ExprResult ConstraintSatisfactionChecker::Evaluate(
+    const NormalizedConstraint &Constraint,
+    const MultiLevelTemplateArgumentList &MLTAL) {
+  switch (Constraint.getKind()) {
+  case NormalizedConstraint::ConstraintKind::Atomic:
+    return Evaluate(static_cast<const AtomicConstraint &>(Constraint), MLTAL);
+
+  case NormalizedConstraint::ConstraintKind::FoldExpanded:
+    return Evaluate(static_cast<const FoldExpandedConstraint &>(Constraint),
+                    MLTAL);
+
+  case NormalizedConstraint::ConstraintKind::ConceptId:
+    return Evaluate(static_cast<const ConceptIdConstraint &>(Constraint),
+                    MLTAL);
 
-  return calculateConstraintSatisfaction(S, ConstraintExpr, Template,
-                                         TemplateNameLoc, MLTAL, Satisfaction);
+  case NormalizedConstraint::ConstraintKind::Compound:
+    return Evaluate(static_cast<const CompoundConstraint &>(Constraint), MLTAL);
+  }
 }
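Evaluate for CompoundConstraint above implements the short-circuit rules of [temp.constr.op]: a conjunction stops at the first unsatisfied operand, a disjunction at the first satisfied one, and detail records gathered for a failed LHS are erased when a later operand satisfies a disjunction. An illustrative user-level consequence (hypothetical concepts):

    template <class T> concept A = true;
    template <class T> concept B = (sizeof(T) > 1);

    template <class T>
      requires (A<T> || B<T>)   // B<T> never evaluated: A<T> already satisfied
    void g(T);

    template <class T>
      requires (A<T> && B<T>)   // B<T> checked only after A<T> succeeds
    void h(T);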
 
 static bool CheckConstraintSatisfaction(
     Sema &S, const NamedDecl *Template,
     ArrayRef<AssociatedConstraint> AssociatedConstraints,
-    llvm::SmallVectorImpl<Expr *> &Converted,
     const MultiLevelTemplateArgumentList &TemplateArgsLists,
-    SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) {
+    SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction,
+    Expr **ConvertedExpr, const ConceptReference *TopLevelConceptId = nullptr) {
+
+  if (ConvertedExpr)
+    *ConvertedExpr = nullptr;
+
   if (AssociatedConstraints.empty()) {
     Satisfaction.IsSatisfied = true;
     return false;
@@ -578,57 +1069,60 @@ static bool CheckConstraintSatisfaction(
     return false;
   }
 
-  ArrayRef<TemplateArgument> TemplateArgs =
-      TemplateArgsLists.getNumSubstitutedLevels() > 0
-          ? TemplateArgsLists.getOutermost()
-          : ArrayRef<TemplateArgument>{};
-  Sema::InstantiatingTemplate Inst(S, TemplateIDRange.getBegin(),
-      Sema::InstantiatingTemplate::ConstraintsCheck{},
-      const_cast<NamedDecl *>(Template), TemplateArgs, TemplateIDRange);
-  if (Inst.isInvalid())
+  llvm::ArrayRef<TemplateArgument> Args;
+  if (TemplateArgsLists.getNumLevels() != 0)
+    Args = TemplateArgsLists.getInnermost();
+
+  std::optional<Sema::InstantiatingTemplate> SynthesisContext;
+  if (!TopLevelConceptId) {
+    SynthesisContext.emplace(S, TemplateIDRange.getBegin(),
+                             Sema::InstantiatingTemplate::ConstraintsCheck{},
+                             const_cast<NamedDecl *>(Template), Args,
+                             TemplateIDRange);
+  }
+
+  const NormalizedConstraint *C =
+      S.getNormalizedAssociatedConstraints(Template, AssociatedConstraints);
+  if (!C) {
+    Satisfaction.IsSatisfied = false;
     return true;
+  }
 
-  for (const AssociatedConstraint &AC : AssociatedConstraints) {
-    if (AC.isNull())
-      return true;
+  if (TopLevelConceptId)
+    C = ConceptIdConstraint::Create(S.getASTContext(), TopLevelConceptId,
+                                    const_cast<NormalizedConstraint *>(C),
+                                    Template, /*CSE=*/nullptr,
+                                    S.ArgPackSubstIndex);
 
-    Sema::ArgPackSubstIndexRAII _(S, AC.ArgPackSubstIndex);
-    ExprResult Res = calculateConstraintSatisfaction(
-        S, Template, TemplateIDRange.getBegin(), TemplateArgsLists,
-        AC.ConstraintExpr, Satisfaction);
-    if (Res.isInvalid())
-      return true;
+  ExprResult Res =
+      ConstraintSatisfactionChecker(S, Template, TemplateIDRange.getBegin(),
+                                    S.ArgPackSubstIndex, Satisfaction)
+          .Evaluate(*C, TemplateArgsLists);
+
+  if (Res.isInvalid())
+    return true;
+
+  if (Res.isUsable() && ConvertedExpr)
+    *ConvertedExpr = Res.get();
 
-    Converted.push_back(Res.get());
-    if (!Satisfaction.IsSatisfied) {
-      // Backfill the 'converted' list with nulls so we can keep the Converted
-      // and unconverted lists in sync.
-      Converted.append(AssociatedConstraints.size() - Converted.size(),
-                       nullptr);
-      // [temp.constr.op] p2
-      // [...] To determine if a conjunction is satisfied, the satisfaction
-      // of the first operand is checked. If that is not satisfied, the
-      // conjunction is not satisfied. [...]
-      return false;
-    }
-  }
   return false;
 }
 
 bool Sema::CheckConstraintSatisfaction(
-    const NamedDecl *Template,
+    ConstrainedDeclOrNestedRequirement Entity,
     ArrayRef<AssociatedConstraint> AssociatedConstraints,
-    llvm::SmallVectorImpl<Expr *> &ConvertedConstraints,
     const MultiLevelTemplateArgumentList &TemplateArgsLists,
-    SourceRange TemplateIDRange, ConstraintSatisfaction &OutSatisfaction) {
+    SourceRange TemplateIDRange, ConstraintSatisfaction &OutSatisfaction,
+    const ConceptReference *TopLevelConceptId, Expr **ConvertedExpr) {
   if (AssociatedConstraints.empty()) {
     OutSatisfaction.IsSatisfied = true;
     return false;
   }
+  const auto *Template = Entity.dyn_cast<const NamedDecl *>();
   if (!Template) {
     return ::CheckConstraintSatisfaction(
-        *this, nullptr, AssociatedConstraints, ConvertedConstraints,
-        TemplateArgsLists, TemplateIDRange, OutSatisfaction);
+        *this, nullptr, AssociatedConstraints, TemplateArgsLists,
+        TemplateIDRange, OutSatisfaction, ConvertedExpr, TopLevelConceptId);
   }
   // Invalid templates could make their way here. Substituting them could result
   // in dependent expressions.
@@ -643,10 +1137,15 @@ bool Sema::CheckConstraintSatisfaction(
   // here.
   llvm::SmallVector<TemplateArgument, 4> FlattenedArgs;
   for (auto List : TemplateArgsLists)
-    llvm::append_range(FlattenedArgs, List.Args);
+    for (const TemplateArgument &Arg : List.Args)
+      FlattenedArgs.emplace_back(Context.getCanonicalTemplateArgument(Arg));
+
+  const NamedDecl *Owner = Template;
+  if (TopLevelConceptId)
+    Owner = TopLevelConceptId->getNamedConcept();
 
   llvm::FoldingSetNodeID ID;
-  ConstraintSatisfaction::Profile(ID, Context, Template, FlattenedArgs);
+  ConstraintSatisfaction::Profile(ID, Context, Owner, FlattenedArgs);
   void *InsertPos;
   if (auto *Cached = SatisfactionCache.FindNodeOrInsertPos(ID, InsertPos)) {
     OutSatisfaction = *Cached;
@@ -654,10 +1153,10 @@ bool Sema::CheckConstraintSatisfaction(
   }
 
   auto Satisfaction =
-      std::make_unique<ConstraintSatisfaction>(Template, FlattenedArgs);
-  if (::CheckConstraintSatisfaction(*this, Template, AssociatedConstraints,
-                                    ConvertedConstraints, TemplateArgsLists,
-                                    TemplateIDRange, *Satisfaction)) {
+      std::make_unique<ConstraintSatisfaction>(Owner, FlattenedArgs);
+  if (::CheckConstraintSatisfaction(
+          *this, Template, AssociatedConstraints, TemplateArgsLists,
+          TemplateIDRange, *Satisfaction, ConvertedExpr, TopLevelConceptId)) {
     OutSatisfaction = *Satisfaction;
     return true;
   }
@@ -688,14 +1187,18 @@ bool Sema::CheckConstraintSatisfaction(
     const ConceptSpecializationExpr *ConstraintExpr,
     ConstraintSatisfaction &Satisfaction) {
 
+  llvm::SmallVector<AssociatedConstraint, 1> Constraints;
+  Constraints.emplace_back(
+      ConstraintExpr->getNamedConcept()->getConstraintExpr());
+
   MultiLevelTemplateArgumentList MLTAL(ConstraintExpr->getNamedConcept(),
                                        ConstraintExpr->getTemplateArguments(),
                                        true);
 
-  return calculateConstraintSatisfaction(
-             *this, ConstraintExpr, ConstraintExpr->getNamedConcept(),
-             ConstraintExpr->getConceptNameLoc(), MLTAL, Satisfaction)
-      .isInvalid();
+  return CheckConstraintSatisfaction(
+      ConstraintExpr->getNamedConcept(), Constraints, MLTAL,
+      ConstraintExpr->getSourceRange(), Satisfaction,
+      ConstraintExpr->getConceptReference());
 }
 
 bool Sema::SetupConstraintScope(
@@ -854,50 +1357,6 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD,
                                     Satisfaction);
 }
 
-
-// Figure out the to-translation-unit depth for this function declaration for
-// the purpose of seeing if they differ by constraints. This isn't the same as
-// getTemplateDepth, because it includes already instantiated parents.
-static unsigned
-CalculateTemplateDepthForConstraints(Sema &S, const NamedDecl *ND,
-                                     bool SkipForSpecialization = false) {
-  MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
-      ND, ND->getLexicalDeclContext(), /*Final=*/false,
-      /*Innermost=*/std::nullopt,
-      /*RelativeToPrimary=*/true,
-      /*Pattern=*/nullptr,
-      /*ForConstraintInstantiation=*/true, SkipForSpecialization);
-  return MLTAL.getNumLevels();
-}
-
-namespace {
-  class AdjustConstraintDepth : public TreeTransform<AdjustConstraintDepth> {
-  unsigned TemplateDepth = 0;
-  public:
-  using inherited = TreeTransform<AdjustConstraintDepth>;
-  AdjustConstraintDepth(Sema &SemaRef, unsigned TemplateDepth)
-      : inherited(SemaRef), TemplateDepth(TemplateDepth) {}
-
-  using inherited::TransformTemplateTypeParmType;
-  QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB,
-                                         TemplateTypeParmTypeLoc TL, bool) {
-    const TemplateTypeParmType *T = TL.getTypePtr();
-
-    TemplateTypeParmDecl *NewTTPDecl = nullptr;
-    if (TemplateTypeParmDecl *OldTTPDecl = T->getDecl())
-      NewTTPDecl = cast_or_null<TemplateTypeParmDecl>(
-          TransformDecl(TL.getNameLoc(), OldTTPDecl));
-
-    QualType Result = getSema().Context.getTemplateTypeParmType(
-        T->getDepth() + TemplateDepth, T->getIndex(), T->isParameterPack(),
-        NewTTPDecl);
-    TemplateTypeParmTypeLoc NewTL = TLB.push<TemplateTypeParmTypeLoc>(Result);
-    NewTL.setNameLoc(TL.getNameLoc());
-    return Result;
-  }
-  };
-} // namespace
-
 static const Expr *SubstituteConstraintExpressionWithoutSatisfaction(
     Sema &S, const Sema::TemplateCompareNewDeclInfo &DeclInfo,
     const Expr *ConstrExpr) {
@@ -1161,73 +1620,61 @@ bool Sema::CheckFunctionTemplateConstraints(
 
 static void diagnoseUnsatisfiedRequirement(Sema &S,
                                            concepts::ExprRequirement *Req,
                                            bool First) {
-  assert(!Req->isSatisfied()
-         && "Diagnose() can only be used on an unsatisfied requirement");
+  assert(!Req->isSatisfied() &&
+         "Diagnose() can only be used on an unsatisfied requirement");
   switch (Req->getSatisfactionStatus()) {
-    case concepts::ExprRequirement::SS_Dependent:
-      llvm_unreachable("Diagnosing a dependent requirement");
-      break;
-    case concepts::ExprRequirement::SS_ExprSubstitutionFailure: {
-      auto *SubstDiag = Req->getExprSubstitutionDiagnostic();
-      if (!SubstDiag->DiagMessage.empty())
-        S.Diag(SubstDiag->DiagLoc,
-               diag::note_expr_requirement_expr_substitution_error)
-            << (int)First << SubstDiag->SubstitutedEntity
-            << SubstDiag->DiagMessage;
-      else
-        S.Diag(SubstDiag->DiagLoc,
-               diag::note_expr_requirement_expr_unknown_substitution_error)
-            << (int)First << SubstDiag->SubstitutedEntity;
-      break;
-    }
-    case concepts::ExprRequirement::SS_NoexceptNotMet:
-      S.Diag(Req->getNoexceptLoc(),
-             diag::note_expr_requirement_noexcept_not_met)
-          << (int)First << Req->getExpr();
-      break;
-    case concepts::ExprRequirement::SS_TypeRequirementSubstitutionFailure: {
-      auto *SubstDiag =
-          Req->getReturnTypeRequirement().getSubstitutionDiagnostic();
-      if (!SubstDiag->DiagMessage.empty())
-        S.Diag(SubstDiag->DiagLoc,
-               diag::note_expr_requirement_type_requirement_substitution_error)
-            << (int)First << SubstDiag->SubstitutedEntity
-            << SubstDiag->DiagMessage;
-      else
-        S.Diag(SubstDiag->DiagLoc,
-               diag::note_expr_requirement_type_requirement_unknown_substitution_error)
-            << (int)First << SubstDiag->SubstitutedEntity;
-      break;
-    }
-    case concepts::ExprRequirement::SS_ConstraintsNotSatisfied: {
-      ConceptSpecializationExpr *ConstraintExpr =
-          Req->getReturnTypeRequirementSubstitutedConstraintExpr();
-      if (ConstraintExpr->getTemplateArgsAsWritten()->NumTemplateArgs == 1) {
-        // A simple case - expr type is the type being constrained and the concept
-        // was not provided arguments.
-        Expr *e = Req->getExpr();
-        S.Diag(e->getBeginLoc(),
-               diag::note_expr_requirement_constraints_not_satisfied_simple)
-            << (int)First << S.Context.getReferenceQualifiedType(e)
-            << ConstraintExpr->getNamedConcept();
-      } else {
-        S.Diag(ConstraintExpr->getBeginLoc(),
-               diag::note_expr_requirement_constraints_not_satisfied)
-            << (int)First << ConstraintExpr;
-      }
-      S.DiagnoseUnsatisfiedConstraint(ConstraintExpr->getSatisfaction());
-      break;
-    }
-    case concepts::ExprRequirement::SS_Satisfied:
-      llvm_unreachable("We checked this above");
+  case concepts::ExprRequirement::SS_Dependent:
+    llvm_unreachable("Diagnosing a dependent requirement");
+    break;
+  case concepts::ExprRequirement::SS_ExprSubstitutionFailure: {
+    auto *SubstDiag = Req->getExprSubstitutionDiagnostic();
+    if (!SubstDiag->DiagMessage.empty())
+      S.Diag(SubstDiag->DiagLoc,
+             diag::note_expr_requirement_expr_substitution_error)
+          << (int)First << SubstDiag->SubstitutedEntity
+          << SubstDiag->DiagMessage;
+    else
+      S.Diag(SubstDiag->DiagLoc,
+             diag::note_expr_requirement_expr_unknown_substitution_error)
+          << (int)First << SubstDiag->SubstitutedEntity;
+    break;
+  }
+  case concepts::ExprRequirement::SS_NoexceptNotMet:
+    S.Diag(Req->getNoexceptLoc(), diag::note_expr_requirement_noexcept_not_met)
+        << (int)First << Req->getExpr();
+    break;
+  case concepts::ExprRequirement::SS_TypeRequirementSubstitutionFailure: {
+    auto *SubstDiag =
+        Req->getReturnTypeRequirement().getSubstitutionDiagnostic();
+    if (!SubstDiag->DiagMessage.empty())
+      S.Diag(SubstDiag->DiagLoc,
+             diag::note_expr_requirement_type_requirement_substitution_error)
+          << (int)First << SubstDiag->SubstitutedEntity
+          << SubstDiag->DiagMessage;
+    else
+      S.Diag(
+          SubstDiag->DiagLoc,
+          diag::
+              note_expr_requirement_type_requirement_unknown_substitution_error)
+          << (int)First << SubstDiag->SubstitutedEntity;
+    break;
+  }
+  case concepts::ExprRequirement::SS_ConstraintsNotSatisfied: {
+    ConceptSpecializationExpr *ConstraintExpr =
+        Req->getReturnTypeRequirementSubstitutedConstraintExpr();
+    S.DiagnoseUnsatisfiedConstraint(ConstraintExpr);
+    break;
+  }
+  case concepts::ExprRequirement::SS_Satisfied:
+    llvm_unreachable("We checked this above");
   }
 }
 
 static void diagnoseUnsatisfiedRequirement(Sema &S,
                                            concepts::TypeRequirement *Req,
                                            bool First) {
-  assert(!Req->isSatisfied()
-         && "Diagnose() can only be used on an unsatisfied requirement");
+  assert(!Req->isSatisfied() &&
+         "Diagnose() can only be used on an unsatisfied requirement");
   switch (Req->getSatisfactionStatus()) {
   case concepts::TypeRequirement::SS_Dependent:
     llvm_unreachable("Diagnosing a dependent requirement");
@@ -1235,9 +1682,9 @@ static void diagnoseUnsatisfiedRequirement(Sema &S,
   case concepts::TypeRequirement::SS_SubstitutionFailure: {
     auto *SubstDiag = Req->getSubstitutionDiagnostic();
     if (!SubstDiag->DiagMessage.empty())
-      S.Diag(SubstDiag->DiagLoc,
-             diag::note_type_requirement_substitution_error) << (int)First
-          << SubstDiag->SubstitutedEntity << SubstDiag->DiagMessage;
+      S.Diag(SubstDiag->DiagLoc, diag::note_type_requirement_substitution_error)
+          << (int)First << SubstDiag->SubstitutedEntity
+          << SubstDiag->DiagMessage;
     else
       S.Diag(SubstDiag->DiagLoc,
              diag::note_type_requirement_unknown_substitution_error)
@@ -1249,31 +1696,53 @@ static void diagnoseUnsatisfiedRequirement(Sema &S,
     return;
   }
 }
-static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
-                                                        Expr *SubstExpr,
-                                                        bool First = true);
+
+static void diagnoseUnsatisfiedConceptIdExpr(Sema &S,
+                                             const ConceptReference *Concept,
+static void diagnoseUnsatisfiedConceptIdExpr(Sema &S,
+                                             const ConceptReference *Concept,
+                                             SourceLocation Loc, bool First) {
+  if (Concept->getTemplateArgsAsWritten()->NumTemplateArgs == 1) {
+    S.Diag(
+        Loc,
+        diag::
+            note_single_arg_concept_specialization_constraint_evaluated_to_false)
+        << (int)First
+        << Concept->getTemplateArgsAsWritten()->arguments()[0].getArgument()
+        << Concept->getNamedConcept();
+  } else {
+    S.Diag(Loc, diag::note_concept_specialization_constraint_evaluated_to_false)
+        << (int)First << Concept;
+  }
+}
+
+static void diagnoseUnsatisfiedConstraintExpr(
+    Sema &S, const UnsatisfiedConstraintRecord &Record, SourceLocation Loc,
+    bool First, concepts::NestedRequirement *Req = nullptr);
+
+static void DiagnoseUnsatisfiedConstraint(
+    Sema &S, ArrayRef<UnsatisfiedConstraintRecord> Records, SourceLocation Loc,
+    bool First = true, concepts::NestedRequirement *Req = nullptr) {
+  for (auto &Record : Records) {
+    diagnoseUnsatisfiedConstraintExpr(S, Record, Loc, First, Req);
+    Loc = {};
+    First = isa<const ConceptReference *>(Record);
+  }
+}
 
 static void diagnoseUnsatisfiedRequirement(Sema &S,
                                            concepts::NestedRequirement *Req,
                                            bool First) {
-  using SubstitutionDiagnostic = std::pair<SourceLocation, StringRef>;
-  for (auto &Record : Req->getConstraintSatisfaction()) {
-    if (auto *SubstDiag = Record.dyn_cast<SubstitutionDiagnostic *>())
-      S.Diag(SubstDiag->first, diag::note_nested_requirement_substitution_error)
-          << (int)First << Req->getInvalidConstraintEntity()
-          << SubstDiag->second;
-    else
-      diagnoseWellFormedUnsatisfiedConstraintExpr(S, Record.dyn_cast<Expr *>(),
-                                                  First);
-    First = false;
-  }
+  DiagnoseUnsatisfiedConstraint(S, Req->getConstraintSatisfaction().records(),
+                                Req->hasInvalidConstraint()
+                                    ? SourceLocation()
+                                    : Req->getConstraintExpr()->getExprLoc(),
+                                First, Req);
 }
 
 static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
-                                                        Expr *SubstExpr,
+                                                        const Expr *SubstExpr,
                                                         bool First) {
   SubstExpr = SubstExpr->IgnoreParenImpCasts();
-  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SubstExpr)) {
+  if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(SubstExpr)) {
     switch (BO->getOpcode()) {
     // These two cases will in practice only be reached when using fold
    // expressions with || and &&, since otherwise the || and && will have been
@@ -1319,7 +1788,7 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
       BO->getRHS()->EvaluateAsInt(SimplifiedRHS, S.Context,
                                   Expr::SE_NoSideEffects,
                                   /*InConstantContext=*/true);
-      if (!SimplifiedLHS.Diag && ! SimplifiedRHS.Diag) {
+      if (!SimplifiedLHS.Diag && !SimplifiedRHS.Diag) {
         S.Diag(SubstExpr->getBeginLoc(),
                diag::note_atomic_constraint_evaluated_to_false_elaborated)
            << (int)First << SubstExpr
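As the comment in this function notes, the `&&`/`||` cases are in practice reached through fold expressions. A small example (assumed, not from the patch) of a fold whose failing operand gets singled out:

```cpp
template <typename... Ts>
concept AllSmall = ((sizeof(Ts) <= 8) && ...); // fold over &&

// Only the operand for char[64] is reported as evaluating to false;
// the satisfied operands are not diagnosed.
static_assert(AllSmall<int, char[64]>);
```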
@@ -1334,22 +1803,6 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
     default:
       break;
     }
-  } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(SubstExpr)) {
-    if (CSE->getTemplateArgsAsWritten()->NumTemplateArgs == 1) {
-      S.Diag(
-          CSE->getSourceRange().getBegin(),
-          diag::
-              note_single_arg_concept_specialization_constraint_evaluated_to_false)
-          << (int)First
-          << CSE->getTemplateArgsAsWritten()->arguments()[0].getArgument()
-          << CSE->getNamedConcept();
-    } else {
-      S.Diag(SubstExpr->getSourceRange().getBegin(),
-             diag::note_concept_specialization_constraint_evaluated_to_false)
-          << (int)First << CSE;
-    }
-    S.DiagnoseUnsatisfiedConstraint(CSE->getSatisfaction());
-    return;
   } else if (auto *RE = dyn_cast<RequiresExpr>(SubstExpr)) {
     // FIXME: RequiresExpr should store dependent diagnostics.
     for (concepts::Requirement *Req : RE->getRequirements())
@@ -1364,6 +1817,10 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
       break;
     }
     return;
+  } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(SubstExpr)) {
+    // Drill down concept ids treated as atomic constraints
+    S.DiagnoseUnsatisfiedConstraint(CSE, First);
+    return;
   } else if (auto *TTE = dyn_cast<TypeTraitExpr>(SubstExpr);
              TTE && TTE->getTrait() == clang::TypeTrait::BTT_IsDeducible) {
     assert(TTE->getNumArgs() == 2);
@@ -1379,216 +1836,332 @@ static void diagnoseWellFormedUnsatisfiedConstraintExpr(Sema &S,
   S.DiagnoseTypeTraitDetails(SubstExpr);
 }
 
-template <typename SubstitutionDiagnostic>
 static void diagnoseUnsatisfiedConstraintExpr(
-    Sema &S, const llvm::PointerUnion<Expr *, SubstitutionDiagnostic *> &Record,
-    bool First = true) {
-  if (auto *Diag = Record.template dyn_cast<SubstitutionDiagnostic *>()) {
-    S.Diag(Diag->first, diag::note_substituted_constraint_expr_is_ill_formed)
-        << Diag->second;
+    Sema &S, const UnsatisfiedConstraintRecord &Record, SourceLocation Loc,
+    bool First, concepts::NestedRequirement *Req) {
+  if (auto *Diag =
+          Record
+              .template dyn_cast<const ConstraintSubstitutionDiagnostic *>()) {
+    if (Req)
+      S.Diag(Diag->first, diag::note_nested_requirement_substitution_error)
+          << (int)First << Req->getInvalidConstraintEntity() << Diag->second;
+    else
+      S.Diag(Diag->first, diag::note_substituted_constraint_expr_is_ill_formed)
+          << Diag->second;
     return;
   }
-
-  diagnoseWellFormedUnsatisfiedConstraintExpr(S, cast<Expr *>(Record), First);
+  if (const auto *Concept = dyn_cast<const ConceptReference *>(Record)) {
+    if (Loc.isInvalid())
+      Loc = Concept->getBeginLoc();
+    diagnoseUnsatisfiedConceptIdExpr(S, Concept, Loc, First);
+    return;
+  }
+  diagnoseWellFormedUnsatisfiedConstraintExpr(
+      S, cast<const Expr *>(Record), First);
 }
 
-void
-Sema::DiagnoseUnsatisfiedConstraint(const ConstraintSatisfaction& Satisfaction,
-                                    bool First) {
+void Sema::DiagnoseUnsatisfiedConstraint(
+    const ConstraintSatisfaction &Satisfaction, SourceLocation Loc,
+    bool First) {
+
   assert(!Satisfaction.IsSatisfied &&
          "Attempted to diagnose a satisfied constraint");
-  for (auto &Record : Satisfaction.Details) {
-    diagnoseUnsatisfiedConstraintExpr(*this, Record, First);
-    First = false;
-  }
+  ::DiagnoseUnsatisfiedConstraint(*this, Satisfaction.Details, Loc, First);
 }
 
 void Sema::DiagnoseUnsatisfiedConstraint(
-    const ASTConstraintSatisfaction &Satisfaction,
-    bool First) {
+    const ConceptSpecializationExpr *ConstraintExpr, bool First) {
+
+  const ASTConstraintSatisfaction &Satisfaction =
+      ConstraintExpr->getSatisfaction();
+
   assert(!Satisfaction.IsSatisfied &&
          "Attempted to diagnose a satisfied constraint");
-  for (auto &Record : Satisfaction) {
-    diagnoseUnsatisfiedConstraintExpr(*this, Record, First);
-    First = false;
-  }
+
+  ::DiagnoseUnsatisfiedConstraint(*this, Satisfaction.records(),
+                                  ConstraintExpr->getBeginLoc(), First);
 }
 
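With the new `S.DiagnoseUnsatisfiedConstraint(CSE, First)` path, concept-ids nested inside a constraint are drilled into recursively. Sketch (hypothetical names):

```cpp
#include <type_traits>

template <typename T>
concept Integral = std::is_integral_v<T>;

template <typename T>
concept Countable = Integral<T> || requires(T t) { t.count(); };

// Notes first report Countable<double>, then drill into Integral<double>
// and the requires-expression, both of which evaluated to false.
static_assert(Countable<double>);
```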
-const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints(
-    const NamedDecl *ConstrainedDecl,
-    ArrayRef<AssociatedConstraint> AssociatedConstraints) {
-  // In case the ConstrainedDecl comes from modules, it is necessary to use
-  // the canonical decl to avoid different atomic constraints with the 'same'
-  // declarations.
-  ConstrainedDecl = cast<NamedDecl>(ConstrainedDecl->getCanonicalDecl());
+namespace {
 
-  auto CacheEntry = NormalizationCache.find(ConstrainedDecl);
-  if (CacheEntry == NormalizationCache.end()) {
-    auto Normalized = NormalizedConstraint::fromAssociatedConstraints(
-        *this, ConstrainedDecl, AssociatedConstraints);
-    CacheEntry =
-        NormalizationCache
-            .try_emplace(ConstrainedDecl,
-                         Normalized
-                             ? new (Context) NormalizedConstraint(
-                                   std::move(*Normalized))
-                             : nullptr)
-            .first;
-  }
-  return CacheEntry->second;
-}
+class SubstituteParameterMappings {
+  Sema &SemaRef;
 
-const NormalizedConstraint *clang::getNormalizedAssociatedConstraints(
-    Sema &S, const NamedDecl *ConstrainedDecl,
-    ArrayRef<AssociatedConstraint> AssociatedConstraints) {
-  return S.getNormalizedAssociatedConstraints(ConstrainedDecl,
-                                              AssociatedConstraints);
-}
+  const MultiLevelTemplateArgumentList *MLTAL;
+  const ASTTemplateArgumentListInfo *ArgsAsWritten;
 
-static bool
-substituteParameterMappings(Sema &S, NormalizedConstraint &N,
-                            ConceptDecl *Concept,
-                            const MultiLevelTemplateArgumentList &MLTAL,
-                            const ASTTemplateArgumentListInfo *ArgsAsWritten) {
+  bool InFoldExpr;
 
-  if (N.isCompound()) {
-    if (substituteParameterMappings(S, N.getLHS(), Concept, MLTAL,
-                                    ArgsAsWritten))
-      return true;
-    return substituteParameterMappings(S, N.getRHS(), Concept, MLTAL,
-                                       ArgsAsWritten);
-  }
+  SubstituteParameterMappings(Sema &SemaRef,
+                              const MultiLevelTemplateArgumentList *MLTAL,
+                              const ASTTemplateArgumentListInfo *ArgsAsWritten,
+                              bool InFoldExpr)
+      : SemaRef(SemaRef), MLTAL(MLTAL), ArgsAsWritten(ArgsAsWritten),
+        InFoldExpr(InFoldExpr) {}
+
+  void buildParameterMapping(NormalizedConstraintWithParamMapping &N);
 
-  if (N.isFoldExpanded()) {
-    Sema::ArgPackSubstIndexRAII _(S, std::nullopt);
-    return substituteParameterMappings(
-        S, N.getFoldExpandedConstraint()->Constraint, Concept, MLTAL,
-        ArgsAsWritten);
+  bool substitute(NormalizedConstraintWithParamMapping &N);
+
+  bool substitute(ConceptIdConstraint &CC);
+
+public:
+  SubstituteParameterMappings(Sema &SemaRef, bool InFoldExpr = false)
+      : SemaRef(SemaRef), MLTAL(nullptr), ArgsAsWritten(nullptr),
+        InFoldExpr(InFoldExpr) {}
+
+  bool substitute(NormalizedConstraint &N);
+};
+
+void SubstituteParameterMappings::buildParameterMapping(
+    NormalizedConstraintWithParamMapping &N) {
+  TemplateParameterList *TemplateParams =
+      cast<TemplateDecl>(N.getConstraintDecl())->getTemplateParameters();
+
+  llvm::SmallBitVector OccurringIndices(TemplateParams->size());
+  llvm::SmallBitVector OccurringIndicesForSubsumption(TemplateParams->size());
+
+  if (N.getKind() == NormalizedConstraint::ConstraintKind::Atomic) {
+    SemaRef.MarkUsedTemplateParameters(
+        static_cast<AtomicConstraint &>(N).getConstraintExpr(),
+        /*OnlyDeduced=*/false,
+        /*Depth=*/0, OccurringIndices);
+
+    SemaRef.MarkUsedTemplateParametersForSubsumptionParameterMapping(
+        static_cast<AtomicConstraint &>(N).getConstraintExpr(),
+        /*Depth=*/0, OccurringIndicesForSubsumption);
+
+  } else if (N.getKind() ==
+             NormalizedConstraint::ConstraintKind::FoldExpanded) {
+    SemaRef.MarkUsedTemplateParameters(
+        static_cast<FoldExpandedConstraint &>(N).getPattern(),
+        /*OnlyDeduced=*/false,
+        /*Depth=*/0, OccurringIndices);
+  } else if (N.getKind() == NormalizedConstraint::ConstraintKind::ConceptId) {
+    auto *Args = static_cast<ConceptIdConstraint &>(N)
+                     .getConceptId()
+                     ->getTemplateArgsAsWritten();
+    if (Args)
+      SemaRef.MarkUsedTemplateParameters(Args->arguments(),
+                                         /*Depth=*/0, OccurringIndices);
+  }
+  TemplateArgumentLoc *TempArgs =
+      new (SemaRef.Context) TemplateArgumentLoc[OccurringIndices.count()];
+  llvm::SmallVector<NamedDecl *, 4> UsedParams;
+  for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I) {
+    SourceLocation Loc = ArgsAsWritten->NumTemplateArgs > I
+                             ? ArgsAsWritten->arguments()[I].getLocation()
+                             : SourceLocation();
+    // FIXME: Investigate why we couldn't always preserve the SourceLoc. We
+    // can't assert Loc.isValid() now.
+    if (OccurringIndices[I]) {
+      NamedDecl *Param = TemplateParams->begin()[I];
+      new (&(TempArgs)[J]) TemplateArgumentLoc(
+          SemaRef.getIdentityTemplateArgumentLoc(Param, Loc));
+      UsedParams.push_back(Param);
+      J++;
+    }
+  }
+  auto *UsedList = TemplateParameterList::Create(
+      SemaRef.Context, TemplateParams->getTemplateLoc(),
+      TemplateParams->getLAngleLoc(), UsedParams,
+      /*RAngleLoc=*/SourceLocation(),
+      /*RequiresClause=*/nullptr);
+  unsigned Size = OccurringIndices.count();
+  N.updateParameterMapping(
+      std::move(OccurringIndices), std::move(OccurringIndicesForSubsumption),
+      MutableArrayRef<TemplateArgumentLoc>{TempArgs, Size}, UsedList);
+}
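For intuition, the mapping built here associates each concept parameter that actually occurs in a constraint with the written argument; parameters that never occur drop out of the mapping. A hedged source-level example (names invented):

```cpp
template <typename T, typename U>
concept BiggerThan = sizeof(T) > sizeof(U); // the atomic uses both T and U

template <typename V>
concept BiggerThanChar = BiggerThan<V, char>; // mapping: T -> V, U -> char

template <typename T, typename U>
concept FirstIsClass = __is_class(T); // U never occurs, so only T ends up
                                      // in OccurringIndices / the mapping
```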
 
-  TemplateParameterList *TemplateParams = Concept->getTemplateParameters();
+bool SubstituteParameterMappings::substitute(
+    NormalizedConstraintWithParamMapping &N) {
+  if (!N.hasParameterMapping())
+    buildParameterMapping(N);
 
-  AtomicConstraint &Atomic = *N.getAtomicConstraint();
-  TemplateArgumentListInfo SubstArgs;
-  if (!Atomic.ParameterMapping) {
-    llvm::SmallBitVector OccurringIndices(TemplateParams->size());
-    S.MarkUsedTemplateParameters(Atomic.ConstraintExpr, /*OnlyDeduced=*/false,
-                                 /*Depth=*/0, OccurringIndices);
-    TemplateArgumentLoc *TempArgs =
-        new (S.Context) TemplateArgumentLoc[OccurringIndices.count()];
-    for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I)
-      if (OccurringIndices[I])
-        new (&(TempArgs)[J++])
-            TemplateArgumentLoc(S.getIdentityTemplateArgumentLoc(
-                TemplateParams->begin()[I],
-                // Here we assume we do not support things like
-                // template<typename A, typename B>
-                // concept C = ...;
-                //
-                // template<typename... Ts> requires C<Ts...>
-                // struct S { };
-                // The above currently yields a diagnostic.
-                // We still might have default arguments for concept parameters.
-                ArgsAsWritten->NumTemplateArgs > I
-                    ? ArgsAsWritten->arguments()[I].getLocation()
-                    : SourceLocation()));
-    Atomic.ParameterMapping.emplace(TempArgs, OccurringIndices.count());
-  }
-  SourceLocation InstLocBegin =
-      ArgsAsWritten->arguments().empty()
-          ? ArgsAsWritten->getLAngleLoc()
-          : ArgsAsWritten->arguments().front().getSourceRange().getBegin();
-  SourceLocation InstLocEnd =
-      ArgsAsWritten->arguments().empty()
-          ? ArgsAsWritten->getRAngleLoc()
-          : ArgsAsWritten->arguments().front().getSourceRange().getEnd();
+  SourceLocation InstLocBegin, InstLocEnd;
+  llvm::ArrayRef<TemplateArgumentLoc> Arguments = ArgsAsWritten->arguments();
+  if (Arguments.empty()) {
+    InstLocBegin = ArgsAsWritten->getLAngleLoc();
+    InstLocEnd = ArgsAsWritten->getRAngleLoc();
+  } else {
+    auto SR = Arguments[0].getSourceRange();
+    InstLocBegin = SR.getBegin();
+    InstLocEnd = SR.getEnd();
+  }
   Sema::InstantiatingTemplate Inst(
-      S, InstLocBegin,
+      SemaRef, InstLocBegin,
       Sema::InstantiatingTemplate::ParameterMappingSubstitution{},
-      const_cast<NamedDecl *>(Atomic.ConstraintDecl),
+      const_cast<NamedDecl *>(N.getConstraintDecl()),
       {InstLocBegin, InstLocEnd});
   if (Inst.isInvalid())
     return true;
-  if (S.SubstTemplateArguments(*Atomic.ParameterMapping, MLTAL, SubstArgs))
+
+  // TransformTemplateArguments is unable to preserve the source location of a
+  // pack. The SourceLocation is necessary for the instantiation location.
+  // FIXME: The BaseLoc will be used as the location of the pack expansion,
+  // which is wrong.
+  TemplateArgumentListInfo SubstArgs;
+  if (SemaRef.SubstTemplateArgumentsInParameterMapping(
+          N.getParameterMapping(), N.getBeginLoc(), *MLTAL, SubstArgs,
+          /*BuildPackExpansionTypes=*/!InFoldExpr))
+    return true;
+  Sema::CheckTemplateArgumentInfo CTAI;
+  auto *TD =
+      const_cast<TemplateDecl *>(cast<TemplateDecl>(N.getConstraintDecl()));
+  if (SemaRef.CheckTemplateArgumentList(TD, N.getUsedTemplateParamList(),
+                                        TD->getLocation(), SubstArgs,
+                                        /*DefaultArguments=*/{},
+                                        /*PartialTemplateArgs=*/false, CTAI))
     return true;
   TemplateArgumentLoc *TempArgs =
-      new (S.Context) TemplateArgumentLoc[SubstArgs.size()];
-  std::copy(SubstArgs.arguments().begin(), SubstArgs.arguments().end(),
-            TempArgs);
-  Atomic.ParameterMapping.emplace(TempArgs, SubstArgs.size());
+      new (SemaRef.Context) TemplateArgumentLoc[CTAI.SugaredConverted.size()];
+
+  for (unsigned I = 0; I < CTAI.SugaredConverted.size(); ++I) {
+    SourceLocation Loc;
+    // If this is an empty pack, we have no corresponding SubstArgs.
+    if (I < SubstArgs.size())
+      Loc = SubstArgs.arguments()[I].getLocation();
+
+    TempArgs[I] = SemaRef.getTrivialTemplateArgumentLoc(
+        CTAI.SugaredConverted[I], QualType(), Loc);
+  }
+
+  MutableArrayRef<TemplateArgumentLoc> Mapping(TempArgs,
+                                               CTAI.SugaredConverted.size());
+  N.updateParameterMapping(N.mappingOccurenceList(),
+                           N.mappingOccurenceListForSubsumption(), Mapping,
+                           N.getUsedTemplateParamList());
   return false;
 }
 
-static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N,
-                                        const ConceptSpecializationExpr *CSE) {
-  MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
-      CSE->getNamedConcept(), CSE->getNamedConcept()->getLexicalDeclContext(),
-      /*Final=*/false, CSE->getTemplateArguments(),
-      /*RelativeToPrimary=*/true,
-      /*Pattern=*/nullptr,
-      /*ForConstraintInstantiation=*/true);
+bool SubstituteParameterMappings::substitute(ConceptIdConstraint &CC) {
+  assert(CC.getConstraintDecl() && MLTAL && ArgsAsWritten);
 
-  return substituteParameterMappings(S, N, CSE->getNamedConcept(), MLTAL,
-                                     CSE->getTemplateArgsAsWritten());
-}
+  if (substitute(static_cast<NormalizedConstraintWithParamMapping &>(CC)))
+    return true;
+
+  auto *CSE = CC.getConceptSpecializationExpr();
+  assert(CSE);
+  assert(!CC.getBeginLoc().isInvalid());
 
-NormalizedConstraint::NormalizedConstraint(ASTContext &C,
-                                           NormalizedConstraint LHS,
-                                           NormalizedConstraint RHS,
-                                           CompoundConstraintKind Kind)
-    : Constraint{CompoundConstraint{
-          new(C) NormalizedConstraintPair{std::move(LHS), std::move(RHS)},
-          Kind}} {}
-
-NormalizedConstraint::NormalizedConstraint(ASTContext &C,
-                                           const NormalizedConstraint &Other) {
-  if (Other.isAtomic()) {
-    Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint());
-  } else if (Other.isFoldExpanded()) {
-    Constraint = new (C) FoldExpandedConstraint(
-        Other.getFoldExpandedConstraint()->Kind,
-        NormalizedConstraint(C, Other.getFoldExpandedConstraint()->Constraint),
-        Other.getFoldExpandedConstraint()->Pattern);
+  SourceLocation InstLocBegin, InstLocEnd;
+  if (llvm::ArrayRef<TemplateArgumentLoc> Arguments = ArgsAsWritten->arguments();
+      Arguments.empty()) {
+    InstLocBegin = ArgsAsWritten->getLAngleLoc();
+    InstLocEnd = ArgsAsWritten->getRAngleLoc();
   } else {
-    Constraint = CompoundConstraint(
-        new (C)
-            NormalizedConstraintPair{NormalizedConstraint(C, Other.getLHS()),
-                                     NormalizedConstraint(C, Other.getRHS())},
-        Other.getCompoundKind());
+    auto SR = Arguments[0].getSourceRange();
+    InstLocBegin = SR.getBegin();
+    InstLocEnd = SR.getEnd();
   }
-}
+  // This is useful for name lookup across modules; see Sema::getLookupModules.
+  Sema::InstantiatingTemplate Inst(
+      SemaRef, InstLocBegin,
+      Sema::InstantiatingTemplate::ParameterMappingSubstitution{},
+      const_cast<NamedDecl *>(CC.getConstraintDecl()),
+      {InstLocBegin, InstLocEnd});
+  if (Inst.isInvalid())
+    return true;
 
-NormalizedConstraint &NormalizedConstraint::getLHS() const {
-  assert(isCompound() && "getLHS called on a non-compound constraint.");
-  return cast<CompoundConstraint>(Constraint).getPointer()->LHS;
+  TemplateArgumentListInfo Out;
+  // TransformTemplateArguments is unable to preserve the source location of a
+  // pack. The SourceLocation is necessary for the instantiation location.
+  // FIXME: The BaseLoc will be used as the location of the pack expansion,
+  // which is wrong.
+  const ASTTemplateArgumentListInfo *ArgsAsWritten =
+      CSE->getTemplateArgsAsWritten();
+  if (SemaRef.SubstTemplateArgumentsInParameterMapping(
+          ArgsAsWritten->arguments(), CC.getBeginLoc(), *MLTAL, Out,
+          /*BuildPackExpansionTypes=*/!InFoldExpr))
+    return true;
+  Sema::CheckTemplateArgumentInfo CTAI;
+  if (SemaRef.CheckTemplateArgumentList(CSE->getNamedConcept(),
+                                        CSE->getConceptNameInfo().getLoc(), Out,
+                                        /*DefaultArgs=*/{},
+                                        /*PartialTemplateArgs=*/false, CTAI,
+                                        /*UpdateArgsWithConversions=*/false))
+    return true;
+  auto TemplateArgs = *MLTAL;
+  TemplateArgs.replaceOutermostTemplateArguments(
+      TemplateArgs.getAssociatedDecl(0).first, CTAI.SugaredConverted);
+  return SubstituteParameterMappings(SemaRef, &TemplateArgs, ArgsAsWritten,
+                                     InFoldExpr)
+      .substitute(CC.getNormalizedConstraint());
 }
 
-NormalizedConstraint &NormalizedConstraint::getRHS() const {
-  assert(isCompound() && "getRHS called on a non-compound constraint.");
-  return cast<CompoundConstraint>(Constraint).getPointer()->RHS;
+bool SubstituteParameterMappings::substitute(NormalizedConstraint &N) {
+  switch (N.getKind()) {
+  case NormalizedConstraint::ConstraintKind::Atomic: {
+    if (!MLTAL) {
+      assert(!ArgsAsWritten);
+      return false;
+    }
+    return substitute(static_cast<NormalizedConstraintWithParamMapping &>(N));
+  }
+  case NormalizedConstraint::ConstraintKind::FoldExpanded: {
+    auto &FE = static_cast<FoldExpandedConstraint &>(N);
+    if (!MLTAL) {
+      llvm::SaveAndRestore _1(InFoldExpr, true);
+      assert(!ArgsAsWritten);
+      return substitute(FE.getNormalizedPattern());
+    }
+    Sema::ArgPackSubstIndexRAII _(SemaRef, std::nullopt);
+    substitute(static_cast<NormalizedConstraintWithParamMapping &>(FE));
+    return SubstituteParameterMappings(SemaRef, /*InFoldExpr=*/true)
+        .substitute(FE.getNormalizedPattern());
+  }
+  case NormalizedConstraint::ConstraintKind::ConceptId: {
+    auto &CC = static_cast<ConceptIdConstraint &>(N);
+    if (MLTAL) {
+      assert(ArgsAsWritten);
+      return substitute(CC);
+    }
+    assert(!ArgsAsWritten);
+    const ConceptSpecializationExpr *CSE = CC.getConceptSpecializationExpr();
+    ConceptDecl *Concept = CSE->getNamedConcept();
+    MultiLevelTemplateArgumentList MLTAL = SemaRef.getTemplateInstantiationArgs(
+        Concept, Concept->getLexicalDeclContext(),
+        /*Final=*/true, CSE->getTemplateArguments(),
+        /*RelativeToPrimary=*/true,
+        /*Pattern=*/nullptr,
+        /*ForConstraintInstantiation=*/true);
+
+    return SubstituteParameterMappings(
+               SemaRef, &MLTAL, CSE->getTemplateArgsAsWritten(), InFoldExpr)
+        .substitute(CC.getNormalizedConstraint());
+  }
+  case NormalizedConstraint::ConstraintKind::Compound: {
+    auto &Compound = static_cast<CompoundConstraint &>(N);
+    if (substitute(Compound.getLHS()))
+      return true;
+    return substitute(Compound.getRHS());
+  }
+  }
 }
 
-std::optional<NormalizedConstraint>
-NormalizedConstraint::fromAssociatedConstraints(
+} // namespace
+
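The `FoldExpanded` case above substitutes the mapping of the fold itself but leaves the pattern's pack alone until evaluation. A source-level example of the shape involved (illustrative only):

```cpp
template <typename T>
concept Addable = requires(T a) { a + a; };

template <typename... Ts>
concept AllAddable = (Addable<Ts> && ...); // fold-expanded constraint

template <typename... Us>
  requires AllAddable<Us...> // Ts -> Us... is substituted without expanding
void sum(Us...);             // the pattern Addable<Ts>
```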
+NormalizedConstraint *NormalizedConstraint::fromAssociatedConstraints(
     Sema &S, const NamedDecl *D, ArrayRef<AssociatedConstraint> ACs) {
   assert(ACs.size() != 0);
-  auto Conjunction = fromConstraintExpr(S, D, ACs[0].ConstraintExpr);
+  auto *Conjunction =
+      fromConstraintExpr(S, D, ACs[0].ConstraintExpr, ACs[0].ArgPackSubstIndex);
   if (!Conjunction)
-    return std::nullopt;
+    return nullptr;
   for (unsigned I = 1; I < ACs.size(); ++I) {
-    auto Next = fromConstraintExpr(S, D, ACs[I].ConstraintExpr);
+    auto *Next = fromConstraintExpr(S, D, ACs[I].ConstraintExpr,
+                                    ACs[I].ArgPackSubstIndex);
     if (!Next)
-      return std::nullopt;
-    *Conjunction = NormalizedConstraint(S.Context, std::move(*Conjunction),
-                                        std::move(*Next), CCK_Conjunction);
+      return nullptr;
+    Conjunction = CompoundConstraint::CreateConjunction(S.getASTContext(),
                                                         Conjunction, Next);
   }
   return Conjunction;
 }
 
-std::optional<NormalizedConstraint>
-NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
-                                         const Expr *E) {
+NormalizedConstraint *NormalizedConstraint::fromConstraintExpr(
+    Sema &S, const NamedDecl *D, const Expr *E, UnsignedOrNone SubstIndex) {
   assert(E != nullptr);
 
   // C++ [temp.constr.normal]p1.1
@@ -1597,23 +2170,29 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
   // [...]
   E = E->IgnoreParenImpCasts();
 
+  llvm::FoldingSetNodeID ID;
+  if (D && DiagRecursiveConstraintEval(S, ID, D, E)) {
+    return nullptr;
+  }
+  SatisfactionStackRAII StackRAII(S, D, ID);
+
   // C++2a [temp.param]p4:
   //     [...] If T is not a pack, then E is E', otherwise E is (E' && ...).
   // Fold expression is considered atomic constraints per current wording.
   // See http://cplusplus.github.io/concepts-ts/ts-active.html#28
 
   if (LogicalBinOp BO = E) {
-    auto LHS = fromConstraintExpr(S, D, BO.getLHS());
+    auto *LHS = fromConstraintExpr(S, D, BO.getLHS(), SubstIndex);
     if (!LHS)
-      return std::nullopt;
-    auto RHS = fromConstraintExpr(S, D, BO.getRHS());
+      return nullptr;
+    auto *RHS = fromConstraintExpr(S, D, BO.getRHS(), SubstIndex);
     if (!RHS)
-      return std::nullopt;
+      return nullptr;
 
-    return NormalizedConstraint(S.Context, std::move(*LHS), std::move(*RHS),
-                                BO.isAnd() ? CCK_Conjunction : CCK_Disjunction);
+    return CompoundConstraint::Create(
        S.Context, LHS, BO.isAnd() ? CCK_Conjunction : CCK_Disjunction, RHS);
   } else if (auto *CSE = dyn_cast<ConceptSpecializationExpr>(E)) {
-    const NormalizedConstraint *SubNF;
+    NormalizedConstraint *SubNF;
     {
       Sema::InstantiatingTemplate Inst(
           S, CSE->getExprLoc(),
@@ -1621,7 +2200,7 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
          // FIXME: improve const-correctness of InstantiatingTemplate
           const_cast<NamedDecl *>(D), CSE->getSourceRange());
       if (Inst.isInvalid())
-        return std::nullopt;
+        return nullptr;
       // C++ [temp.constr.normal]p1.1
       // [...]
       // The normal form of an id-expression of the form C<A1, A2, ..., AN>,
      // where C names a concept, is the normal form of the
      // constraint-expression of C, after substituting A1, A2, ..., AN for C's
      // respective template parameters in the parameter mappings in each atomic
      // constraint. If any such substitution results in an invalid type or
      // expression, the program is ill-formed; no diagnostic is required.
      // [...]
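The wording quoted above is what drives this branch: a concept-id normalizes to the named concept's own normal form plus a parameter mapping. Roughly (names invented):

```cpp
template <typename T>
concept A = __is_class(T); // atomic

template <typename T>
concept B = A<T> && (sizeof(T) > 1); // normal form: A's atomic with T -> T,
                                     // conjoined with a second atomic

template <typename U>
concept C = B<U *>; // normal form: __is_class(T) [T -> U *]
                    //           && (sizeof(T) > 1) [T -> U *]
```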
-      ConceptDecl *CD = CSE->getNamedConcept();
-      SubNF = S.getNormalizedAssociatedConstraints(
-          CD, AssociatedConstraint(CD->getConstraintExpr()));
+
+      // Use canonical declarations to merge ConceptDecls across
+      // different modules.
+      ConceptDecl *CD = CSE->getNamedConcept()->getCanonicalDecl();
+      SubNF = NormalizedConstraint::fromAssociatedConstraints(
+          S, CD, AssociatedConstraint(CD->getConstraintExpr(), SubstIndex));
+
       if (!SubNF)
-        return std::nullopt;
+        return nullptr;
     }
 
-    std::optional<NormalizedConstraint> New;
-    New.emplace(S.Context, *SubNF);
-
-    if (substituteParameterMappings(S, *New, CSE))
-      return std::nullopt;
+    return ConceptIdConstraint::Create(S.getASTContext(),
+                                       CSE->getConceptReference(), SubNF, D,
+                                       CSE, SubstIndex);
 
-    return New;
   } else if (auto *FE = dyn_cast<CXXFoldExpr>(E);
              FE && S.getLangOpts().CPlusPlus26 &&
             (FE->getOperator() == BinaryOperatorKind::BO_LAnd ||
@@ -1658,31 +2238,61 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D,
              FE->getOperator() == BinaryOperatorKind::BO_LOr)) {
     FoldExpandedConstraint::FoldOperatorKind Kind =
        FE->getOperator() == BinaryOperatorKind::BO_LAnd
            ? FoldExpandedConstraint::FoldOperatorKind::And
            : FoldExpandedConstraint::FoldOperatorKind::Or;
 
     if (FE->getInit()) {
-      auto LHS = fromConstraintExpr(S, D, FE->getLHS());
-      auto RHS = fromConstraintExpr(S, D, FE->getRHS());
+      auto *LHS = fromConstraintExpr(S, D, FE->getLHS(), SubstIndex);
+      auto *RHS = fromConstraintExpr(S, D, FE->getRHS(), SubstIndex);
       if (!LHS || !RHS)
-        return std::nullopt;
+        return nullptr;
 
       if (FE->isRightFold())
-        RHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{
-            Kind, std::move(*RHS), FE->getPattern()}};
+        LHS = FoldExpandedConstraint::Create(S.getASTContext(),
+                                             FE->getPattern(), D, Kind, LHS);
       else
-        LHS = NormalizedConstraint{new (S.Context) FoldExpandedConstraint{
-            Kind, std::move(*LHS), FE->getPattern()}};
-
-      return NormalizedConstraint(
-          S.Context, std::move(*LHS), std::move(*RHS),
-          FE->getOperator() == BinaryOperatorKind::BO_LAnd ? CCK_Conjunction
-                                                           : CCK_Disjunction);
+        RHS = FoldExpandedConstraint::Create(S.getASTContext(),
+                                             FE->getPattern(), D, Kind, RHS);
+
+      return CompoundConstraint::Create(
+          S.getASTContext(), LHS,
+          (FE->getOperator() == BinaryOperatorKind::BO_LAnd ? CCK_Conjunction
+                                                            : CCK_Disjunction),
+          RHS);
     }
-    auto Sub = fromConstraintExpr(S, D, FE->getPattern());
+    auto *Sub = fromConstraintExpr(S, D, FE->getPattern(), SubstIndex);
     if (!Sub)
-      return std::nullopt;
-    return NormalizedConstraint{new (S.Context) FoldExpandedConstraint{
-        Kind, std::move(*Sub), FE->getPattern()}};
+      return nullptr;
+    return FoldExpandedConstraint::Create(S.getASTContext(), FE->getPattern(),
+                                          D, Kind, Sub);
+  }
+  return AtomicConstraint::Create(S.getASTContext(), E, D, SubstIndex);
+}
+
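Anything that is neither a concept-id, a `&&`/`||`, nor a C++26 fold falls through to `AtomicConstraint::Create` above. For instance (example assumed):

```cpp
template <typename T>
constexpr bool has_value_member = requires { T::value; };

template <typename T>
concept Truthy = has_value_member<T>; // a variable-template-id: one atomic
                                      // constraint, not normalized further
```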
+const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints(
+    ConstrainedDeclOrNestedRequirement ConstrainedDeclOrNestedReq,
+    ArrayRef<AssociatedConstraint> AssociatedConstraints) {
+  if (!ConstrainedDeclOrNestedReq) {
+    auto *Normalized = NormalizedConstraint::fromAssociatedConstraints(
+        *this, nullptr, AssociatedConstraints);
+    if (!Normalized ||
+        SubstituteParameterMappings(*this).substitute(*Normalized))
+      return nullptr;
+
+    return Normalized;
   }
-  return NormalizedConstraint{new (S.Context) AtomicConstraint(E, D)};
+  // FIXME: ConstrainedDeclOrNestedReq is never a NestedRequirement!
+  const NamedDecl *ND =
+      ConstrainedDeclOrNestedReq.dyn_cast<const NamedDecl *>();
+  auto CacheEntry = NormalizationCache.find(ConstrainedDeclOrNestedReq);
+  if (CacheEntry == NormalizationCache.end()) {
+    auto *Normalized = NormalizedConstraint::fromAssociatedConstraints(
+        *this, ND, AssociatedConstraints);
+    CacheEntry =
+        NormalizationCache.try_emplace(ConstrainedDeclOrNestedReq, Normalized)
+            .first;
+    if (!Normalized ||
+        SubstituteParameterMappings(*this).substitute(*Normalized))
+      return nullptr;
+  }
+  return CacheEntry->second;
 }
 
 bool FoldExpandedConstraint::AreCompatibleForSubsumption(
@@ -1693,8 +2303,10 @@ bool FoldExpandedConstraint::AreCompatibleForSubsumption(
   // if their respective constraints both contain an equivalent unexpanded pack.
 
   llvm::SmallVector<UnexpandedParameterPack, 2> APacks, BPacks;
-  Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(A.Pattern), APacks);
-  Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(B.Pattern), BPacks);
+  Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(A.getPattern()),
+                                        APacks);
+  Sema::collectUnexpandedParameterPacks(const_cast<Expr *>(B.getPattern()),
+                                        BPacks);
 
   for (const UnexpandedParameterPack &APack : APacks) {
     auto ADI = getDepthAndIndex(APack);
@@ -1788,7 +2400,7 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(
     const AtomicConstraint &B) {
   if (!A.hasMatchingParameterMapping(Context, B))
     return false;
-  const Expr *EA = A.ConstraintExpr, *EB = B.ConstraintExpr;
+  const Expr *EA = A.getConstraintExpr(), *EB = B.getConstraintExpr();
   if (EA == EB)
     return true;
 
@@ -1841,24 +2453,6 @@ bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(
   return true;
 }
 
-NormalizedConstraint::CompoundConstraintKind
-NormalizedConstraint::getCompoundKind() const {
-  assert(isCompound() && "getCompoundKind on a non-compound constraint..");
-  return cast<CompoundConstraint>(Constraint).getInt();
-}
-
-AtomicConstraint *NormalizedConstraint::getAtomicConstraint() const {
-  assert(isAtomic() && "getAtomicConstraint called on non-atomic constraint.");
-  return cast<AtomicConstraint *>(Constraint);
-}
-
-FoldExpandedConstraint *
-NormalizedConstraint::getFoldExpandedConstraint() const {
-  assert(isFoldExpanded() &&
-         "getFoldExpandedConstraint called on non-fold-expanded constraint.");
-  return cast<FoldExpandedConstraint *>(Constraint);
-}
-
 //
 //
 // ------------------------ Subsumption -----------------------------------
@@ -1874,8 +2468,8 @@ uint16_t SubsumptionChecker::getNewLiteralId() {
   return NextID++;
 }
 
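A quick reminder of what the literal mapping below feeds into: identical atomic constraints (same expression and, per the code that follows, the same relevant mapping entries) get the same literal, which is what makes subsumption-based partial ordering work. Illustrative example:

```cpp
template <typename T>
concept Shape = requires(T t) { t.area(); };

template <typename T>
concept Circle = Shape<T> && requires(T t) { t.radius(); };

template <Shape T> int measure(T);  // #1
template <Circle T> int measure(T); // #2: Circle subsumes Shape

struct Disc {
  double area();
  double radius();
};
int r = measure(Disc{}); // unambiguously calls #2
```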
-auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal {
-  auto &Elems = AtomicMap[Ori->ConstraintExpr];
+auto SubsumptionChecker::find(const AtomicConstraint *Ori) -> Literal {
+  auto &Elems = AtomicMap[Ori->getConstraintExpr()];
   // C++ [temp.constr.order] p2
   //   - an atomic constraint A subsumes another atomic constraint B
   //     if and only if the A and B are identical [...]
@@ -1891,13 +2485,16 @@ auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal {
   // subsumes another, their literal will be the same
 
   llvm::FoldingSetNodeID ID;
-  const auto &Mapping = Ori->ParameterMapping;
-  ID.AddBoolean(Mapping.has_value());
-  if (Mapping) {
-    for (const TemplateArgumentLoc &TAL : *Mapping) {
-      SemaRef.getASTContext()
-          .getCanonicalTemplateArgument(TAL.getArgument())
-          .Profile(ID, SemaRef.getASTContext());
+  ID.AddBoolean(Ori->hasParameterMapping());
+  if (Ori->hasParameterMapping()) {
+    const auto &Mapping = Ori->getParameterMapping();
+    const NormalizedConstraint::OccurenceList &Indexes =
+        Ori->mappingOccurenceListForSubsumption();
+    for (auto [Idx, TAL] : llvm::enumerate(Mapping)) {
+      if (Indexes[Idx])
+        SemaRef.getASTContext()
+            .getCanonicalTemplateArgument(TAL.getArgument())
+            .Profile(ID, SemaRef.getASTContext());
     }
   }
   auto It = Elems.find(ID);
@@ -1912,11 +2509,11 @@ auto SubsumptionChecker::find(AtomicConstraint *Ori) -> Literal {
   return It->getSecond().ID;
 }
 
-auto SubsumptionChecker::find(FoldExpandedConstraint *Ori) -> Literal {
-  auto &Elems = FoldMap[Ori->Pattern];
+auto SubsumptionChecker::find(const FoldExpandedConstraint *Ori) -> Literal {
+  auto &Elems = FoldMap[Ori->getPattern()];
   FoldExpendedConstraintKey K;
-  K.Kind = Ori->Kind;
+  K.Kind = Ori->getFoldOperator();
   auto It = llvm::find_if(Elems, [&K](const FoldExpendedConstraintKey &Other) {
     return K.Kind == Other.Kind;
   });
@@ -1960,38 +2557,47 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) {
     AddUniqueClauseToFormula(Res, std::move(C));
   };
 
-  if (NC.isAtomic())
-    return {{find(NC.getAtomicConstraint())}};
+  switch (NC.getKind()) {
 
-  if (NC.isFoldExpanded())
-    return {{find(NC.getFoldExpandedConstraint())}};
+  case NormalizedConstraint::ConstraintKind::Atomic:
+    return {{find(&static_cast<const AtomicConstraint &>(NC))}};
 
-  FormulaType Left, Right;
-  SemaRef.runWithSufficientStackSpace(SourceLocation(), [&] {
-    Left = Normalize<FormulaType>(NC.getLHS());
-    Right = Normalize<FormulaType>(NC.getRHS());
-  });
+  case NormalizedConstraint::ConstraintKind::FoldExpanded:
+    return {{find(&static_cast<const FoldExpandedConstraint &>(NC))}};
 
-  if (NC.getCompoundKind() == FormulaType::Kind) {
-    auto SizeLeft = Left.size();
-    Res = std::move(Left);
-    Res.reserve(SizeLeft + Right.size());
-    std::for_each(std::make_move_iterator(Right.begin()),
-                  std::make_move_iterator(Right.end()), Add);
-    return Res;
-  }
+  case NormalizedConstraint::ConstraintKind::ConceptId:
+    return Normalize<FormulaType>(
+        static_cast<const ConceptIdConstraint &>(NC).getNormalizedConstraint());
+
+  case NormalizedConstraint::ConstraintKind::Compound: {
+    const auto &Compound = static_cast<const CompoundConstraint &>(NC);
+    FormulaType Left, Right;
+    SemaRef.runWithSufficientStackSpace(SourceLocation(), [&] {
+      Left = Normalize<FormulaType>(Compound.getLHS());
+      Right = Normalize<FormulaType>(Compound.getRHS());
+    });
+
+    if (Compound.getCompoundKind() == FormulaType::Kind) {
+      Res = std::move(Left);
+      Res.reserve(Left.size() + Right.size());
+      std::for_each(std::make_move_iterator(Right.begin()),
+                    std::make_move_iterator(Right.end()), Add);
+      return Res;
+    }
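The clause distribution below is the usual CNF/DNF product step. A worked instance (concepts invented for the sketch):

```cpp
template <typename T> concept A = __is_class(T);
template <typename T> concept B = sizeof(T) > 1;
template <typename T> concept C = __is_enum(T);

// DNF((A && B) || C) = {A, B}, {C};  CNF(A || C) = {A, C}.
// Every DNF clause of the first shares a literal with every CNF clause of
// the second, so the first constraint subsumes the second:
template <typename T> requires ((A<T> && B<T>) || C<T>) int f(T); // #1
template <typename T> requires (A<T> || C<T>)           int f(T); // #2

struct S { int m; };
int x = f(S{}); // both viable; #1 is more constrained and wins
```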
-  Res.reserve(Left.size() * Right.size());
-  for (const auto &LTransform : Left) {
-    for (const auto &RTransform : Right) {
-      Clause Combined;
-      Combined.reserve(LTransform.size() + RTransform.size());
-      llvm::append_range(Combined, LTransform);
-      llvm::append_range(Combined, RTransform);
-      Add(std::move(Combined));
+    Res.reserve(Left.size() * Right.size());
+    for (const auto &LTransform : Left) {
+      for (const auto &RTransform : Right) {
+        Clause Combined;
+        Combined.reserve(LTransform.size() + RTransform.size());
+        llvm::copy(LTransform, std::back_inserter(Combined));
+        llvm::copy(RTransform, std::back_inserter(Combined));
+        Add(std::move(Combined));
+      }
     }
+    return Res;
+  }
   }
-  return Res;
 }
 
 void SubsumptionChecker::AddUniqueClauseToFormula(Formula &F, Clause C) {
@@ -2006,12 +2612,12 @@ std::optional<bool> SubsumptionChecker::Subsumes(
     const NamedDecl *DP, ArrayRef<AssociatedConstraint> P, const NamedDecl *DQ,
     ArrayRef<AssociatedConstraint> Q) {
   const NormalizedConstraint *PNormalized =
-      getNormalizedAssociatedConstraints(SemaRef, DP, P);
+      SemaRef.getNormalizedAssociatedConstraints(DP, P);
   if (!PNormalized)
     return std::nullopt;
 
   const NormalizedConstraint *QNormalized =
-      getNormalizedAssociatedConstraints(SemaRef, DQ, Q);
+      SemaRef.getNormalizedAssociatedConstraints(DQ, Q);
   if (!QNormalized)
     return std::nullopt;
 
@@ -2061,9 +2667,9 @@ bool SubsumptionChecker::Subsumes(const FoldExpandedConstraint *A,
     //   constraint B if they are compatible for subsumption, have the same
     //   fold-operator, and the constraint of A subsumes that of B.
     bool DoesSubsume =
-        A->Kind == B->Kind &&
+        A->getFoldOperator() == B->getFoldOperator() &&
         FoldExpandedConstraint::AreCompatibleForSubsumption(*A, *B) &&
-        Subsumes(&A->Constraint, &B->Constraint);
+        Subsumes(&A->getNormalizedPattern(), &B->getNormalizedPattern());
     It = FoldSubsumptionCache.try_emplace(std::move(Key), DoesSubsume).first;
   }
   return It->second;
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 1131e1f033b72..c5724d7f3285e 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -17876,13 +17876,15 @@ Decl *Sema::BuildStaticAssertDeclaration(SourceLocation StaticAssertLoc,
         findFailedBooleanCondition(Converted.get());
     if (const auto *ConceptIDExpr =
             dyn_cast_or_null<ConceptSpecializationExpr>(InnerCond)) {
-      // Drill down into concept specialization expressions to see why they
-      // weren't satisfied.
-      Diag(AssertExpr->getBeginLoc(), diag::err_static_assert_failed)
-          << !HasMessage << Msg.str() << AssertExpr->getSourceRange();
-      ConstraintSatisfaction Satisfaction;
-      if (!CheckConstraintSatisfaction(ConceptIDExpr, Satisfaction))
-        DiagnoseUnsatisfiedConstraint(Satisfaction);
+      const ASTConstraintSatisfaction &Satisfaction =
+          ConceptIDExpr->getSatisfaction();
+      if (!Satisfaction.ContainsErrors || Satisfaction.NumRecords) {
+        Diag(AssertExpr->getBeginLoc(), diag::err_static_assert_failed)
+            << !HasMessage << Msg.str() << AssertExpr->getSourceRange();
+        // Drill down into concept specialization expressions to see why they
+        // weren't satisfied.
+        DiagnoseUnsatisfiedConstraint(ConceptIDExpr);
+      }
     } else if (InnerCond && !isa<IntegerLiteral>(InnerCond) &&
                !isa<CXXBoolLiteralExpr>(InnerCond)) {
       Diag(InnerCond->getBeginLoc(),
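User-visible effect of the `static_assert` change: the satisfaction already recorded on the concept-id is reused, so no second satisfaction check runs. Typical shape of the output (illustrative, not verbatim compiler output):

```cpp
#include <concepts>
#include <memory>

static_assert(std::copyable<std::unique_ptr<int>>);
// error: static assertion failed
// note: because 'std::unique_ptr<int>' does not satisfy 'copyable'
// note: because 'copy_constructible<std::unique_ptr<int>>' evaluated to false
// ... (further notes drill into the failing atomic constraints)
```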
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 779ccf5f1e888..ce5527fd9ae59 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -7931,21 +7931,27 @@ Sema::BuildExprRequirement(
       //   be satisfied.
       TemplateParameterList *TPL =
           ReturnTypeRequirement.getTypeConstraintTemplateParameterList();
-      QualType MatchedType =
-          Context.getReferenceQualifiedType(E).getCanonicalType();
+      QualType MatchedType = Context.getReferenceQualifiedType(E);
       llvm::SmallVector<TemplateArgument, 1> Args;
       Args.push_back(TemplateArgument(MatchedType));
 
       auto *Param = cast<TemplateTypeParmDecl>(TPL->getParam(0));
 
-      MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/false);
+      MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/true);
       MLTAL.addOuterRetainedLevels(TPL->getDepth());
       const TypeConstraint *TC = Param->getTypeConstraint();
       assert(TC && "Type Constraint cannot be null here");
       auto *IDC = TC->getImmediatelyDeclaredConstraint();
       assert(IDC && "ImmediatelyDeclaredConstraint can't be null here.");
       ExprResult Constraint = SubstExpr(IDC, MLTAL);
-      if (Constraint.isInvalid()) {
+      bool HasError = Constraint.isInvalid();
+      if (!HasError) {
+        SubstitutedConstraintExpr =
+            cast<ConceptSpecializationExpr>(Constraint.get());
+        if (SubstitutedConstraintExpr->getSatisfaction().ContainsErrors)
+          HasError = true;
+      }
+      if (HasError) {
         return new (Context) concepts::ExprRequirement(
             createSubstDiagAt(IDC->getExprLoc(),
                               [&](llvm::raw_ostream &OS) {
@@ -7954,8 +7960,6 @@ Sema::BuildExprRequirement(
                               }),
             IsSimple, NoexceptLoc, ReturnTypeRequirement);
       }
-      SubstitutedConstraintExpr =
-          cast<ConceptSpecializationExpr>(Constraint.get());
       if (!SubstitutedConstraintExpr->isSatisfied())
         Status = concepts::ExprRequirement::SS_ConstraintsNotSatisfied;
     }
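The reworked `BuildExprRequirement` treats a satisfaction check that `ContainsErrors` like a substitution failure instead of proceeding with an erroneous requirement. The ordinary failing case still looks like this (assumed example):

```cpp
#include <concepts>

template <typename T>
concept HasSize = requires(const T t) {
  { t.size() } -> std::same_as<unsigned long>;
};

struct Bag {
  int size() const; // int, not unsigned long
};

// The substituted std::same_as<int, unsigned long> is stored on the
// requirement and diagnosed as SS_ConstraintsNotSatisfied.
static_assert(HasSize<Bag>);
```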
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index ea5c4265d736d..f741e8283761d 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -12739,7 +12739,8 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
         << (unsigned)FnKindPair.first << (unsigned)ocs_non_template
         << FnDesc /* Ignored */;
     ConstraintSatisfaction Satisfaction;
-    if (S.CheckFunctionConstraints(Fn, Satisfaction))
+    if (S.CheckFunctionConstraints(Fn, Satisfaction, SourceLocation(),
+                                   /*ForOverloadResolution=*/true))
       break;
     S.DiagnoseUnsatisfiedConstraint(Satisfaction);
   }
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 2bf1511c5cfa0..dcf2876af81fc 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -9,6 +9,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "TreeTransform.h"
+#include "clang/AST/ASTConcept.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
@@ -1222,8 +1223,9 @@ static ExprResult formImmediatelyDeclaredConstraint(
   if (auto *CD = dyn_cast<ConceptDecl>(NamedConcept)) {
     ImmediatelyDeclaredConstraint = S.CheckConceptTemplateId(
         SS, /*TemplateKWLoc=*/SourceLocation(), NameInfo,
-        /*FoundDecl=*/FoundDecl ? FoundDecl : NamedConcept, CD,
-        &ConstraintArgs);
+        /*FoundDecl=*/FoundDecl ? FoundDecl : CD, CD, &ConstraintArgs,
+        /*DoCheckConstraintSatisfaction=*/
+        !S.inParameterMappingSubstitution());
   }
   // We have a template template parameter
   else {
@@ -4850,13 +4852,11 @@ void Sema::diagnoseMissingTemplateArguments(const CXXScopeSpec &SS,
   diagnoseMissingTemplateArguments(Name, Loc);
 }
 
-ExprResult
-Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
-                             SourceLocation TemplateKWLoc,
-                             const DeclarationNameInfo &ConceptNameInfo,
-                             NamedDecl *FoundDecl,
-                             ConceptDecl *NamedConcept,
-                             const TemplateArgumentListInfo *TemplateArgs) {
+ExprResult Sema::CheckConceptTemplateId(
+    const CXXScopeSpec &SS, SourceLocation TemplateKWLoc,
+    const DeclarationNameInfo &ConceptNameInfo, NamedDecl *FoundDecl,
+    TemplateDecl *NamedConcept, const TemplateArgumentListInfo *TemplateArgs,
+    bool DoCheckConstraintSatisfaction) {
   assert(NamedConcept && "A concept template id without a template?");
 
   if (NamedConcept->isInvalidDecl())
@@ -4873,33 +4873,48 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
 
   DiagnoseUseOfDecl(NamedConcept, ConceptNameInfo.getLoc());
 
+  // There's a bug with CTAI.CanonicalConverted.
+  // If the template argument contains a DependentDecltypeType that includes a
+  // TypeAliasType, and the same written type had occurred previously in the
+  // source, then the DependentDecltypeType would be canonicalized to that
+  // previous type which would mess up the substitution.
+  // FIXME: Reland https://github.com/llvm/llvm-project/pull/101782 properly!
   auto *CSD = ImplicitConceptSpecializationDecl::Create(
       Context, NamedConcept->getDeclContext(), NamedConcept->getLocation(),
-      CTAI.CanonicalConverted);
+      CTAI.SugaredConverted);
   ConstraintSatisfaction Satisfaction;
   bool AreArgsDependent =
       TemplateSpecializationType::anyDependentTemplateArguments(
-          *TemplateArgs, CTAI.CanonicalConverted);
-  MultiLevelTemplateArgumentList MLTAL(NamedConcept, CTAI.CanonicalConverted,
+          *TemplateArgs, CTAI.SugaredConverted);
+  MultiLevelTemplateArgumentList MLTAL(NamedConcept, CTAI.SugaredConverted,
                                        /*Final=*/false);
-  LocalInstantiationScope Scope(*this);
-
-  EnterExpressionEvaluationContext EECtx{
-      *this, ExpressionEvaluationContext::Unevaluated, CSD};
-
-  if (!AreArgsDependent &&
-      CheckConstraintSatisfaction(
-          NamedConcept, AssociatedConstraint(NamedConcept->getConstraintExpr()),
-          MLTAL,
-          SourceRange(SS.isSet() ? SS.getBeginLoc() : ConceptNameInfo.getLoc(),
-                      TemplateArgs->getRAngleLoc()),
-          Satisfaction))
-    return ExprError();
   auto *CL = ConceptReference::Create(
       Context,
       SS.isSet() ? SS.getWithLocInContext(Context) : NestedNameSpecifierLoc{},
       TemplateKWLoc, ConceptNameInfo, FoundDecl, NamedConcept,
       ASTTemplateArgumentListInfo::Create(Context, *TemplateArgs));
+
+  bool Error = false;
+  if (const auto *Concept = dyn_cast<ConceptDecl>(NamedConcept);
+      Concept && Concept->getConstraintExpr() && !AreArgsDependent &&
+      DoCheckConstraintSatisfaction) {
+
+    LocalInstantiationScope Scope(*this);
+
+    EnterExpressionEvaluationContext EECtx{
+        *this, ExpressionEvaluationContext::Unevaluated, CSD};
+
+    Error = CheckConstraintSatisfaction(
+        NamedConcept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL,
+        SourceRange(SS.isSet() ? SS.getBeginLoc() : ConceptNameInfo.getLoc(),
+                    TemplateArgs->getRAngleLoc()),
+        Satisfaction, CL);
+    Satisfaction.ContainsErrors = Error;
+  }
+
+  if (Error)
+    return ExprError();
+
   return ConceptSpecializationExpr::Create(
       Context, CL, CSD, AreArgsDependent ? nullptr : &Satisfaction);
 }
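`DoCheckConstraintSatisfaction` exists because, while a parameter mapping is being substituted, concept-ids must be rebuilt without being checked; checking happens when the enclosing constraint is evaluated. A rough sketch of a situation where eager checking would be premature (hypothetical names; the exact check points are internal details):

```cpp
template <typename T>
concept Small = sizeof(T) <= 8;

template <typename T>
concept SmallPointee = Small<typename T::element_type>;
// While SmallPointee's mapping is substituted (e.g. for subsumption),
// Small<typename T::element_type> is still dependent; its satisfaction
// can only be computed once SmallPointee<X> is checked for a concrete X.
```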
 
@@ -5217,10 +5232,11 @@ bool Sema::CheckTemplateTypeArgument(
   }
   default: {
     // We allow instantiating a template with template argument packs when
-    // building deduction guides.
+    // building deduction guides or mapping constraint template parameters.
     if (Arg.getKind() == TemplateArgument::Pack &&
-        CodeSynthesisContexts.back().Kind ==
-            Sema::CodeSynthesisContext::BuildingDeductionGuides) {
+        (CodeSynthesisContexts.back().Kind ==
+             Sema::CodeSynthesisContext::BuildingDeductionGuides ||
+         inParameterMappingSubstitution())) {
       SugaredConverted.push_back(Arg);
       CanonicalConverted.push_back(Arg);
       return false;
@@ -5813,6 +5829,20 @@ bool Sema::CheckTemplateArgumentList(
     TemplateArgumentListInfo &TemplateArgs, const DefaultArguments &DefaultArgs,
     bool PartialTemplateArgs, CheckTemplateArgumentInfo &CTAI,
     bool UpdateArgsWithConversions, bool *ConstraintsNotSatisfied) {
+  return CheckTemplateArgumentList(
+      Template, GetTemplateParameterList(Template), TemplateLoc, TemplateArgs,
+      DefaultArgs, PartialTemplateArgs, CTAI, UpdateArgsWithConversions,
+      ConstraintsNotSatisfied);
+}
+
+/// Check that the given template argument list is well-formed
+/// for specializing the given template.
+bool Sema::CheckTemplateArgumentList(
+    TemplateDecl *Template, TemplateParameterList *Params,
+    SourceLocation TemplateLoc, TemplateArgumentListInfo &TemplateArgs,
+    const DefaultArguments &DefaultArgs, bool PartialTemplateArgs,
+    CheckTemplateArgumentInfo &CTAI, bool UpdateArgsWithConversions,
+    bool *ConstraintsNotSatisfied) {
   if (ConstraintsNotSatisfied)
     *ConstraintsNotSatisfied = false;
 
@@ -5822,8 +5852,6 @@ bool Sema::CheckTemplateArgumentList(
   // template.
   TemplateArgumentListInfo NewArgs = TemplateArgs;
 
-  TemplateParameterList *Params = GetTemplateParameterList(Template);
-
   SourceLocation RAngleLoc = NewArgs.getRAngleLoc();
 
   // C++23 [temp.arg.general]p1:
@@ -6163,11 +6191,12 @@ bool Sema::CheckTemplateArgumentList(
       CXXThisScopeRAII Scope(*this, RD, ThisQuals, RD != nullptr);
 
       MultiLevelTemplateArgumentList MLTAL = getTemplateInstantiationArgs(
-          Template, NewContext, /*Final=*/false, CTAI.CanonicalConverted,
+          Template, NewContext, /*Final=*/true, CTAI.SugaredConverted,
          /*RelativeToPrimary=*/true,
          /*Pattern=*/nullptr,
          /*ForConceptInstantiation=*/true);
-      if (EnsureTemplateArgumentListConstraints(
+      if (!isa<ConceptDecl>(Template) &&
+          EnsureTemplateArgumentListConstraints(
               Template, MLTAL,
               SourceRange(TemplateLoc, TemplateArgs.getRAngleLoc()))) {
         if (ConstraintsNotSatisfied)
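Switching from `CanonicalConverted` to `SugaredConverted` in these paths mostly shows up in diagnostics, which can now keep the user's spelling. For instance (output shape assumed, not verbatim):

```cpp
#include <type_traits>

using Name = const char *;

template <typename T>
concept Numeric = std::is_arithmetic_v<T>;

static_assert(Numeric<Name>);
// With sugared arguments preserved, notes can refer to
// 'Name' (aka 'const char *') instead of only the canonical spelling.
```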
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index f6ee7452c2f9a..6bba505ece07d 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -3206,7 +3206,7 @@ CheckDeducedArgumentConstraints(Sema &S, NamedDecl *Template,
   // If we don't need to replace the deduced template arguments,
   // we can add them immediately as the inner-most argument list.
   if (!DeducedArgsNeedReplacement)
-    Innermost = CanonicalDeducedArgs;
+    Innermost = SugaredDeducedArgs;
 
   MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs(
       Template, Template->getDeclContext(), /*Final=*/false, Innermost,
@@ -3218,7 +3218,7 @@ CheckDeducedArgumentConstraints(Sema &S, NamedDecl *Template,
   // not class-scope explicit specialization, so replace with Deduced Args
   // instead of adding to inner-most.
   if (!Innermost)
-    MLTAL.replaceInnermostTemplateArguments(Template, CanonicalDeducedArgs);
+    MLTAL.replaceInnermostTemplateArguments(Template, SugaredDeducedArgs);
 
   if (S.CheckConstraintSatisfaction(Template, AssociatedConstraints, MLTAL,
                                     Info.getLocation(),
@@ -3995,11 +3995,12 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
     if (CheckFunctionTemplateConstraints(
             Info.getLocation(),
             FunctionTemplate->getCanonicalDecl()->getTemplatedDecl(),
-            CTAI.CanonicalConverted, Info.AssociatedConstraintsSatisfaction))
+            CTAI.SugaredConverted, Info.AssociatedConstraintsSatisfaction))
       return TemplateDeductionResult::MiscellaneousDeductionFailure;
 
     if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) {
-      Info.reset(Info.takeSugared(), TemplateArgumentList::CreateCopy(
-                                         Context, CTAI.CanonicalConverted));
+      Info.reset(
+          TemplateArgumentList::CreateCopy(Context, CTAI.SugaredConverted),
+          Info.takeCanonical());
       return TemplateDeductionResult::ConstraintsNotSatisfied;
     }
   }
@@ -5167,8 +5168,8 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type,
           /*DefaultArgs=*/{},
           /*PartialTemplateArgs=*/false, CTAI))
     return true;
-  MultiLevelTemplateArgumentList MLTAL(Concept, CTAI.CanonicalConverted,
-                                       /*Final=*/false);
+  MultiLevelTemplateArgumentList MLTAL(Concept, CTAI.SugaredConverted,
+                                       /*Final=*/true);
   // Build up an EvaluationContext with an ImplicitConceptSpecializationDecl so
   // that the template arguments of the constraint can be preserved. For
   // example:
@@ -5182,7 +5183,7 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type,
       S, Sema::ExpressionEvaluationContext::Unevaluated,
       ImplicitConceptSpecializationDecl::Create(
          S.getASTContext(), Concept->getDeclContext(), Concept->getLocation(),
-          CTAI.CanonicalConverted));
+          CTAI.SugaredConverted));
   if (S.CheckConstraintSatisfaction(
           Concept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL,
           TypeLoc.getLocalSourceRange(), Satisfaction))
@@ -6676,10 +6677,11 @@ namespace {
 struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor {
   llvm::SmallBitVector &Used;
   unsigned Depth;
+  bool VisitDeclRefTypes = true;
 
-  MarkUsedTemplateParameterVisitor(llvm::SmallBitVector &Used,
-                                   unsigned Depth)
-      : Used(Used), Depth(Depth) { }
+  MarkUsedTemplateParameterVisitor(llvm::SmallBitVector &Used, unsigned Depth,
+                                   bool VisitDeclRefTypes = true)
+      : Used(Used), Depth(Depth), VisitDeclRefTypes(VisitDeclRefTypes) {}
 
   bool VisitTemplateTypeParmType(TemplateTypeParmType *T) override {
     if (T->getDepth() == Depth)
@@ -6700,6 +6702,8 @@ struct MarkUsedTemplateParameterVisitor : DynamicRecursiveASTVisitor {
     if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(E->getDecl()))
       if (NTTP->getDepth() == Depth)
         Used[NTTP->getIndex()] = true;
+    if (VisitDeclRefTypes)
+      DynamicRecursiveASTVisitor::TraverseType(E->getType());
     return true;
   }
 
@@ -7043,10 +7047,13 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T,
     break;
 
   case Type::UnaryTransform:
-    if (!OnlyDeduced)
-      MarkUsedTemplateParameters(Ctx,
-                                 cast<UnaryTransformType>(T)->getUnderlyingType(),
-                                 OnlyDeduced, Depth, Used);
+    if (!OnlyDeduced) {
+      auto *UTT = cast<UnaryTransformType>(T);
+      auto Next = UTT->getUnderlyingType();
+      if (Next.isNull())
+        Next = UTT->getBaseType();
+      MarkUsedTemplateParameters(Ctx, Next, OnlyDeduced, Depth, Used);
+    }
     break;
 
   case Type::PackExpansion:
@@ -7146,6 +7153,12 @@ Sema::MarkUsedTemplateParameters(const Expr *E, bool OnlyDeduced,
   ::MarkUsedTemplateParameters(Context, E, OnlyDeduced, Depth, Used);
 }
 
+void Sema::MarkUsedTemplateParametersForSubsumptionParameterMapping(
+    const Expr *E, unsigned Depth, llvm::SmallBitVector &Used) {
+  MarkUsedTemplateParameterVisitor(Used, Depth, /*VisitDeclRefTypes=*/false)
+      .TraverseStmt(const_cast<Expr *>(E));
+}
+
 void Sema::MarkUsedTemplateParameters(const TemplateArgumentList &TemplateArgs,
                                       bool OnlyDeduced,
                                       unsigned Depth,
@@ -7171,6 +7184,14 @@ void Sema::MarkUsedTemplateParameters(ArrayRef<TemplateArgument> TemplateArgs,
                                 /*OnlyDeduced=*/false, Depth, Used);
 }
 
+void Sema::MarkUsedTemplateParameters(
+    ArrayRef<TemplateArgumentLoc> TemplateArgs, unsigned Depth,
+    llvm::SmallBitVector &Used) {
+  for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I)
+    ::MarkUsedTemplateParameters(Context, TemplateArgs[I].getArgument(),
+                                 /*OnlyDeduced=*/false, Depth, Used);
+}
+
 void Sema::MarkDeducedTemplateParameters(
     ASTContext &Ctx, const FunctionTemplateDecl *FunctionTemplate,
     llvm::SmallBitVector &Deduced) {
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index f1c9c5c868159..b996efd441799 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -628,9 +628,14 @@ Sema::InstantiatingTemplate::InstantiatingTemplate(
     Inst.InstantiationRange = InstantiationRange;
     Inst.InConstraintSubstitution =
         Inst.Kind == CodeSynthesisContext::ConstraintSubstitution;
-    if (!SemaRef.CodeSynthesisContexts.empty())
+    Inst.InParameterMappingSubstitution =
+        Inst.Kind == CodeSynthesisContext::ParameterMappingSubstitution;
+    if (!SemaRef.CodeSynthesisContexts.empty()) {
       Inst.InConstraintSubstitution |=
           SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution;
+      Inst.InParameterMappingSubstitution |=
+          SemaRef.CodeSynthesisContexts.back().InParameterMappingSubstitution;
+    }
 
     Invalid = SemaRef.pushCodeSynthesisContext(Inst);
     if (!Invalid) {
@@ -1375,6 +1380,7 @@ std::optional<TemplateDeductionInfo *> Sema::isSFINAEContext() const {
 // Template Instantiation for Types
 //===----------------------------------------------------------------------===/
 namespace {
+
   class TemplateInstantiator : public TreeTransform<TemplateInstantiator> {
     const MultiLevelTemplateArgumentList &TemplateArgs;
     SourceLocation Loc;
@@ -1387,7 +1393,11 @@ namespace {
     // Whether an incomplete substituion should be treated as an error.
     bool BailOutOnIncomplete;
 
-  private:
+    // Whether to rebuild pack expansion types; We don't do that when
+    // rebuilding the parameter mapping of a fold expression appearing
+    // in a constraint expression.
+    bool BuildPackExpansionTypes = true;
+
     // CWG2770: Function parameters should be instantiated when they are
     // needed by a satisfaction check of an atomic constraint or
     // (recursively) by another function parameter.
@@ -1410,6 +1420,17 @@ namespace {
       return EvaluateConstraints;
     }
 
+    inline static struct ForParameterMappingSubstitution_t {
+    } ForParameterMappingSubstitution;
+
+    TemplateInstantiator(ForParameterMappingSubstitution_t, Sema &SemaRef,
+                         SourceLocation Loc,
+                         const MultiLevelTemplateArgumentList &TemplateArgs,
+                         bool BuildPackExpansionTypes)
+        : inherited(SemaRef), TemplateArgs(TemplateArgs), Loc(Loc),
+          BailOutOnIncomplete(false),
+          BuildPackExpansionTypes(BuildPackExpansionTypes) {}
+
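The mapping-substitution instantiator constructed above keeps pack expansions unexpanded (`BuildPackExpansionTypes == false`) so a fold's pattern survives normalization intact. The source-level shape this protects (illustrative only):

```cpp
template <typename T, typename U>
concept SameSize = sizeof(T) == sizeof(U);

template <typename... Ts>
concept AllIntSized = (SameSize<Ts, int> && ...);
// Substituting the mapping {T -> Ts, U -> int} must keep Ts an unexpanded
// pack; it is expanded only when the fold is evaluated against concrete
// arguments, e.g. AllIntSized<unsigned, long>.
```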
    /// Determine whether the given type \p T has already been
    /// transformed.
    ///
@@ -1444,7 +1465,8 @@ namespace {
                                  bool &ShouldExpand, bool &RetainExpansion,
                                  UnsignedOrNone &NumExpansions) {
       if (SemaRef.CurrentInstantiationScope &&
-          SemaRef.inConstraintSubstitution()) {
+          (SemaRef.inConstraintSubstitution() ||
+           SemaRef.inParameterMappingSubstitution())) {
         for (UnexpandedParameterPack ParmPack : Unexpanded) {
           NamedDecl *VD = ParmPack.first.dyn_cast<NamedDecl *>();
           if (auto *PVD = dyn_cast_if_present<ParmVarDecl>(VD);
@@ -1465,10 +1487,10 @@ namespace {
 
     TemplateArgument ForgetPartiallySubstitutedPack() {
       TemplateArgument Result;
-      if (NamedDecl *PartialPack
-            = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){
-        MultiLevelTemplateArgumentList &TemplateArgs
-          = const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
+      if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope
+                                       ->getPartiallySubstitutedPack()) {
+        MultiLevelTemplateArgumentList &TemplateArgs =
+            const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
         unsigned Depth, Index;
         std::tie(Depth, Index) = getDepthAndIndex(PartialPack);
         if (TemplateArgs.hasTemplateArgument(Depth, Index)) {
@@ -1488,10 +1510,10 @@ namespace {
       if (Arg.isNull())
         return;
 
-      if (NamedDecl *PartialPack
-            = SemaRef.CurrentInstantiationScope->getPartiallySubstitutedPack()){
-        MultiLevelTemplateArgumentList &TemplateArgs
-          = const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
+      if (NamedDecl *PartialPack = SemaRef.CurrentInstantiationScope
+                                       ->getPartiallySubstitutedPack()) {
+        MultiLevelTemplateArgumentList &TemplateArgs =
+            const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs);
         unsigned Depth, Index;
         std::tie(Depth, Index) = getDepthAndIndex(PartialPack);
         TemplateArgs.setArgument(Depth, Index, Arg);
@@ -1508,9 +1530,9 @@ namespace {
           std::move(New);
       return Old;
     }
+
     void RememberSubstitution(MultiLevelTemplateArgumentList Old) {
-      const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs) =
-          std::move(Old);
+      const_cast<MultiLevelTemplateArgumentList &>(this->TemplateArgs) = Old;
     }
 
     TemplateArgument
@@ -1691,6 +1713,24 @@ namespace {
       return inherited::TransformTemplateArgument(Input, Output, Uneval);
     }
 
+    // This has to be here to allow its overload.
+    ExprResult RebuildPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc,
+                                    UnsignedOrNone NumExpansions) {
+      return inherited::RebuildPackExpansion(Pattern, EllipsisLoc,
+                                             NumExpansions);
+    }
+
+    TemplateArgumentLoc RebuildPackExpansion(TemplateArgumentLoc Pattern,
+                                             SourceLocation EllipsisLoc,
+                                             UnsignedOrNone NumExpansions) {
+      // We don't rewrite a PackExpansion type when we want to normalize a
+      // CXXFoldExpr constraint. We'll expand it when evaluating the constraint.
+      if (BuildPackExpansionTypes)
+        return inherited::RebuildPackExpansion(Pattern, EllipsisLoc,
+                                               NumExpansions);
+      return Pattern;
+    }
+
     using TreeTransform<TemplateInstantiator>::TransformTemplateSpecializationType;
     QualType
     TransformTemplateSpecializationType(TypeLocBuilder &TLB,
@@ -1961,7 +2001,8 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) {
 
   if (ParmVarDecl *PVD = dyn_cast<ParmVarDecl>(D);
       PVD && SemaRef.CurrentInstantiationScope &&
-      SemaRef.inConstraintSubstitution() &&
+      (SemaRef.inConstraintSubstitution() ||
+       SemaRef.inParameterMappingSubstitution()) &&
       maybeInstantiateFunctionParameterToScope(PVD))
     return nullptr;
 
@@ -2759,18 +2800,30 @@ TemplateInstantiator::TransformExprRequirement(concepts::ExprRequirement *Req) {
 concepts::NestedRequirement *
 TemplateInstantiator::TransformNestedRequirement(
     concepts::NestedRequirement *Req) {
-  if (!Req->isDependent() && !AlwaysRebuild())
-    return Req;
+
+  ASTContext &C = SemaRef.Context;
+
+  Expr *Constraint = Req->getConstraintExpr();
+  ExprResult TransConstraint = Constraint;
+  ConstraintSatisfaction Satisfaction;
+
+  auto NestedReqWithDiag = [&C, this](Expr *E,
+                                      ConstraintSatisfaction Satisfaction) {
+    Satisfaction.IsSatisfied = false;
+    SmallString<128> Entity;
+    llvm::raw_svector_ostream OS(Entity);
+    E->printPretty(OS, nullptr, SemaRef.getPrintingPolicy());
+    return new (C) concepts::NestedRequirement(
+        SemaRef.Context, C.backupStr(Entity), std::move(Satisfaction));
+  };
+
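For orientation, the nested requirements this transform handles look like the following (example assumed); the rewritten path below reuses the satisfaction computed by `CheckConstraintSatisfaction` instead of re-deriving it from a transformed expression:

```cpp
template <typename T>
concept MultiByte = requires {
  requires sizeof(T) > 1; // nested requirement: evaluated, not merely
                          // checked for well-formedness
};

static_assert(MultiByte<char>); // fails; the recorded satisfaction explains
                                // that 'sizeof(char) > 1' evaluated to false
```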
+    assert((!Success || !Trap.hasErrorOccurred()) &&
+           "Substitution failures must be handled "
+           "by CheckConstraintSatisfaction.");
   }

-  ASTContext &C = SemaRef.Context;
-  if (TransConstraint.isUsable() &&
-      TransConstraint.get()->isInstantiationDependent())
-    return new (C) concepts::NestedRequirement(TransConstraint.get());
-  if (TransConstraint.isInvalid() || !TransConstraint.get() ||
-      Satisfaction.HasSubstitutionFailure()) {
-    SmallString<128> Entity;
-    llvm::raw_svector_ostream OS(Entity);
-    Req->getConstraintExpr()->printPretty(OS, nullptr,
-                                          SemaRef.getPrintingPolicy());
-    return new (C) concepts::NestedRequirement(
-        SemaRef.Context, C.backupStr(Entity), Satisfaction);
+
+  if (!Success || Satisfaction.HasSubstitutionFailure())
+    return NestedReqWithDiag(Constraint, Satisfaction);
+
+  // FIXME: const correctness
+  // MLTAL might be dependent.
+  if (!NewConstraint) {
+    if (!Satisfaction.IsSatisfied)
+      return NestedReqWithDiag(Constraint, Satisfaction);
+
+    NewConstraint = Constraint;
   }
-  return new (C)
-      concepts::NestedRequirement(C, TransConstraint.get(), Satisfaction);
+  return new (C) concepts::NestedRequirement(C, NewConstraint, Satisfaction);
 }

 TypeSourceInfo *Sema::SubstType(TypeSourceInfo *T,
@@ -3078,7 +3131,7 @@ bool Sema::SubstTypeConstraint(
   const ASTTemplateArgumentListInfo *TemplArgInfo =
       TC->getTemplateArgsAsWritten();

-  if (!EvaluateConstraints) {
+  if (!EvaluateConstraints && !inParameterMappingSubstitution()) {
     UnsignedOrNone Index = TC->getArgPackSubstIndex();
     if (!Index)
       Index = SemaRef.ArgPackSubstIndex;
@@ -4378,6 +4431,16 @@ bool Sema::SubstTemplateArguments(
   return Instantiator.TransformTemplateArguments(Args.begin(), Args.end(), Out);
 }

+bool Sema::SubstTemplateArgumentsInParameterMapping(
+    ArrayRef<TemplateArgumentLoc> Args, SourceLocation BaseLoc,
+    const MultiLevelTemplateArgumentList &TemplateArgs,
+    TemplateArgumentListInfo &Out, bool BuildPackExpansionTypes) {
+  TemplateInstantiator Instantiator(
+      TemplateInstantiator::ForParameterMappingSubstitution, *this, BaseLoc,
+      TemplateArgs, BuildPackExpansionTypes);
+  return Instantiator.TransformTemplateArguments(Args.begin(), Args.end(), Out);
+}
+
 ExprResult
 Sema::SubstExpr(Expr *E, const MultiLevelTemplateArgumentList &TemplateArgs) {
   if (!E)
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 6967301483361..51b55b82f4208 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -3722,10 +3722,6 @@ class TreeTransform {
                                           ParentContext);
   }

-  /// Build a new Objective-C boxed expression.
-  ///
-  /// By default, performs semantic analysis to build the new expression.
-  /// Subclasses may override this routine to provide different behavior.
   ExprResult RebuildConceptSpecializationExpr(NestedNameSpecifierLoc NNS,
       SourceLocation TemplateKWLoc, DeclarationNameInfo ConceptNameInfo,
       NamedDecl *FoundDecl, ConceptDecl *NamedConcept,
@@ -5110,9 +5106,13 @@ bool TreeTransform<Derived>::TransformTemplateArguments(
       typedef TemplateArgumentLocInventIterator<Derived,
                                                 TemplateArgument::pack_iterator>
         PackLocIterator;
+
+      TemplateArgumentListInfo *PackOutput = &Outputs;
+      TemplateArgumentListInfo New;
+
       if (TransformTemplateArguments(
               PackLocIterator(*this, In.getArgument().pack_begin()),
-              PackLocIterator(*this, In.getArgument().pack_end()), Outputs,
+              PackLocIterator(*this, In.getArgument().pack_end()), *PackOutput,
               Uneval))
         return true;

@@ -5179,7 +5179,6 @@ bool TreeTransform<Derived>::TransformTemplateArguments(
   }

   return false;
-
 }

 // FIXME: Find ways to reduce code duplication for pack expansions.
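A rough sketch of how a normalization caller might drive the new
Sema::SubstTemplateArgumentsInParameterMapping entry point added above; the
caller shape and the names S, Atom, MLTAL, and Mapped are illustrative
assumptions, not part of this patch:

    // Substitute deduced arguments into an atomic constraint's parameter
    // mapping. Passing /*BuildPackExpansionTypes=*/false keeps pack-expansion
    // patterns unexpanded, so a CXXFoldExpr constraint can expand them when
    // the constraint is evaluated.
    TemplateArgumentListInfo Mapped;
    if (S.SubstTemplateArgumentsInParameterMapping(
            Atom.getParameterMapping(), Atom.getBeginLoc(), MLTAL, Mapped,
            /*BuildPackExpansionTypes=*/false))
      return true; // Substitution failure.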
@@ -6247,7 +6246,7 @@ ParmVarDecl *TreeTransform<Derived>::TransformFunctionTypeParam(
                                         /* DefArg */ nullptr);
   newParm->setScopeInfo(OldParm->getFunctionScopeDepth(),
                         OldParm->getFunctionScopeIndex() + indexAdjustment);
-  transformedLocalDecl(OldParm, {newParm});
+  getDerived().transformedLocalDecl(OldParm, {newParm});
   return newParm;
 }

@@ -7082,11 +7081,11 @@ QualType TreeTransform<Derived>::TransformUnaryTransformType(
     TypeLocBuilder &TLB, UnaryTransformTypeLoc TL) {
   QualType Result = TL.getType();
+  TypeSourceInfo *NewBaseTSI = TL.getUnderlyingTInfo();
   if (Result->isDependentType()) {
     const UnaryTransformType *T = TL.getTypePtr();
-    TypeSourceInfo *NewBaseTSI =
-        getDerived().TransformType(TL.getUnderlyingTInfo());
+    NewBaseTSI = getDerived().TransformType(TL.getUnderlyingTInfo());
     if (!NewBaseTSI)
       return QualType();
     QualType NewBase = NewBaseTSI->getType();
@@ -7101,7 +7100,7 @@ QualType TreeTransform<Derived>::TransformUnaryTransformType(
   UnaryTransformTypeLoc NewTL = TLB.push<UnaryTransformTypeLoc>(Result);
   NewTL.setKWLoc(TL.getKWLoc());
   NewTL.setParensRange(TL.getParensRange());
-  NewTL.setUnderlyingTInfo(TL.getUnderlyingTInfo());
+  NewTL.setUnderlyingTInfo(NewBaseTSI);
   return Result;
 }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index cf32d4f56b7c2..5456e73956659 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2424,7 +2424,7 @@ void ASTDeclReader::VisitImplicitConceptSpecializationDecl(
   VisitDecl(D);
   llvm::SmallVector<TemplateArgument, 4> Args;
   for (unsigned I = 0; I < D->NumTemplateArgs; ++I)
-    Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/true));
+    Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/false));
   D->setTemplateArguments(Args);
 }
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 70b898a53fcbd..eef97a8588f0b 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -807,15 +807,19 @@ readConstraintSatisfaction(ASTRecordReader &Record) {
   if (!Satisfaction.IsSatisfied) {
     unsigned NumDetailRecords = Record.readInt();
     for (unsigned i = 0; i != NumDetailRecords; ++i) {
-      if (/* IsDiagnostic */Record.readInt()) {
+      auto Kind = Record.readInt();
+      if (Kind == 0) {
         SourceLocation DiagLocation = Record.readSourceLocation();
         StringRef DiagMessage = C.backupStr(Record.readString());
-        Satisfaction.Details.emplace_back(
-            new (C) ConstraintSatisfaction::SubstitutionDiagnostic(
-                DiagLocation, DiagMessage));
-      } else
+        Satisfaction.Details.emplace_back(new (
+            C) ConstraintSubstitutionDiagnostic(DiagLocation, DiagMessage));
+      } else if (Kind == 1) {
         Satisfaction.Details.emplace_back(Record.readExpr());
+      } else {
+        assert(Kind == 2);
+        Satisfaction.Details.emplace_back(Record.readConceptReference());
+      }
     }
   }
   return Satisfaction;
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index ebda91e3819c3..acf345392aa1a 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -482,14 +482,20 @@ addConstraintSatisfaction(ASTRecordWriter &Record,
   if (!Satisfaction.IsSatisfied) {
     Record.push_back(Satisfaction.NumRecords);
     for (const auto &DetailRecord : Satisfaction) {
-      auto *E = dyn_cast<Expr *>(DetailRecord);
-      Record.push_back(/* IsDiagnostic */ E == nullptr);
-      if (E)
-        Record.AddStmt(E);
-      else {
-        auto *Diag = cast<std::pair<SourceLocation, StringRef> *>(DetailRecord);
+      if (auto *Diag = dyn_cast<const ConstraintSubstitutionDiagnostic *>(
+              DetailRecord)) {
+        Record.push_back(/*Kind=*/0);
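+        // Kind 0 is a substitution diagnostic, serialized as a source
+        // location followed by the message text; Kinds 1 and 2 below carry
+        // an expression and a concept reference, respectively (mirroring
+        // the reader change above).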
         Record.AddSourceLocation(Diag->first);
         Record.AddString(Diag->second);
+        continue;
+      }
+      if (auto *E = dyn_cast<const Expr *>(DetailRecord)) {
+        Record.push_back(/*Kind=*/1);
+        Record.AddStmt(const_cast<Expr *>(E));
+      } else {
+        Record.push_back(/*Kind=*/2);
+        auto *CR = cast<const ConceptReference *>(DetailRecord);
+        Record.AddConceptReference(CR);
       }
     }
   }
diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp
index 84d981d2ab8de..9419dba057a4e 100644
--- a/clang/test/AST/ast-dump-concepts.cpp
+++ b/clang/test/AST/ast-dump-concepts.cpp
@@ -20,8 +20,9 @@ struct Foo {
   // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'binary_concept'
   // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'binary_concept'
   // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} col:9
-  // CHECK-NEXT: | |-TemplateArgument type 'type-parameter-1-0'
-  // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0
+  // CHECK-NEXT: | |-TemplateArgument type 'R'
+  // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0
+  // CHECK-NEXT: | |   `-TemplateTypeParm {{.*}} 'R'
   // CHECK-NEXT: | `-TemplateArgument type 'int'
   // CHECK-NEXT: |   `-BuiltinType {{.*}} 'int'
   // CHECK-NEXT: |-TemplateArgument {{.*}} type 'R'
@@ -35,8 +36,9 @@ struct Foo {
   // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'unary_concept'
   // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} 'bool'
   // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} col:9
-  // CHECK-NEXT: | `-TemplateArgument type 'type-parameter-1-0'
-  // CHECK-NEXT: |   `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0
+  // CHECK-NEXT: | `-TemplateArgument type 'R'
+  // CHECK-NEXT: |   `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0
+  // CHECK-NEXT: |     `-TemplateTypeParm {{.*}} 'R'
   template <unary_concept R>
   Foo(R);
diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp
index 781fb9f28cb8d..9a3adbcb534e8 100644
--- a/clang/test/AST/ast-dump-ctad-alias.cpp
+++ b/clang/test/AST/ast-dump-ctad-alias.cpp
@@ -185,17 +185,18 @@ void foo() {
 // CHECK-NEXT: | |-BinaryOperator {{.*}} 'bool' '&&'
 // CHECK-NEXT: | | |-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'invocable'
 // CHECK-NEXT: | | | |-ImplicitConceptSpecializationDecl {{.*}}
-// CHECK-NEXT: | | | | |-TemplateArgument type 'type-parameter-0-2'
-// CHECK-NEXT: | | | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2
-// CHECK-NEXT: | | | | `-TemplateArgument pack '>'
-// CHECK-NEXT: | | | | `-TemplateArgument type 'GH124715::Packs'
-// CHECK-NEXT: | | | | `-TemplateSpecializationType {{.*}} 'GH124715::Packs' dependent
-// CHECK-NEXT: | | | | |-name: 'GH124715::Packs'
+// CHECK-NEXT: | | | | |-TemplateArgument type 'U'
+// CHECK-NEXT: | | | | | `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2
+// CHECK-NEXT: | | | | | `-TemplateTypeParm {{.*}} 'U'
+// CHECK-NEXT: | | | | `-TemplateArgument pack '>'
+// CHECK-NEXT: | | | | `-TemplateArgument type 'Packs'
+// CHECK-NEXT: | | | | `-TemplateSpecializationType {{.*}} 'Packs' dependent
+// CHECK-NEXT: | | | | |-name: 'Packs':'GH124715::Packs' qualified
 // CHECK-NEXT: | | | | | `-ClassTemplateDecl {{.*}} Packs
-// CHECK-NEXT: | | | | `-TemplateArgument pack ''
-// CHECK-NEXT: | | | | `-TemplateArgument type 'type-parameter-0-1...'
-// CHECK-NEXT: | | | | `-PackExpansionType {{.*}} 'type-parameter-0-1...'
dependent -// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'type-parameter-0-1' dependent contains_unexpanded_pack depth 0 index 1 pack +// CHECK-NEXT: | | | | `-TemplateArgument type 'Ts...' +// CHECK-NEXT: | | | | `-PackExpansionType {{.*}} 'Ts...' dependent +// CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'Ts' dependent contains_unexpanded_pack depth 0 index 1 pack +// CHECK-NEXT: | | | | `-TemplateTypeParm {{.*}} 'Ts' // CHECK-NEXT: | | | |-TemplateArgument {{.*}} type 'U':'type-parameter-0-2' // CHECK-NEXT: | | | | `-TemplateTypeParmType {{.*}} 'U' dependent depth 0 index 2 // CHECK-NEXT: | | | | `-TemplateTypeParm {{.*}} 'U' diff --git a/clang/test/CXX/drs/cwg25xx.cpp b/clang/test/CXX/drs/cwg25xx.cpp index 5c2948f67d0ee..0e0fc735c6843 100644 --- a/clang/test/CXX/drs/cwg25xx.cpp +++ b/clang/test/CXX/drs/cwg25xx.cpp @@ -243,19 +243,20 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07 // since-cxx20-note@#cwg2565-VC {{because 'b' would be invalid: argument may not have 'void' type}} template - concept ErrorRequires = requires (ErrorRequires auto x) { + concept ErrorRequires = requires (ErrorRequires auto x) { // #cwg2565-expr // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} // since-cxx20-note@-2 {{declared here}} // since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}} x; }; static_assert(ErrorRequires); - // since-cxx20-error@-1 {{static assertion failed}} - // since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // since-cxx20-error@-1 {{static assertion failed}} \ + // since-cxx20-note@-1 {{because 'int' does not satisfy 'ErrorRequires'}} \ + // since-cxx20-note@#cwg2565-expr {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} template concept NestedErrorInRequires = requires (T x) { // #cwg2565-NEIR - requires requires (NestedErrorInRequires auto y) { + requires requires (NestedErrorInRequires auto y) { // #cwg2565-NEIR-inner // since-cxx20-error@-1 {{a concept definition cannot refer to itself}} // since-cxx20-note@#cwg2565-NEIR {{declared here}} // since-cxx20-error@-3 {{'auto' not allowed in requires expression parameter}} @@ -263,8 +264,9 @@ namespace cwg2565 { // cwg2565: 16 open 2023-06-07 }; }; static_assert(NestedErrorInRequires); - // since-cxx20-error@-1 {{static assertion failed}} - // since-cxx20-note@-2 {{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} + // since-cxx20-error@-1 {{static assertion failed}} \ + // since-cxx20-note@-1 {{because 'int' does not satisfy 'NestedErrorInRequires'}} \ + // since-cxx20-note-re@#cwg2565-NEIR-inner {{because {{.*}} would be invalid: constraint depends on a previously diagnosed expression}} #endif } // namespace cwg2565 diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp index 28b5d0adcf054..af2fc938fbea2 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.id/p3.cpp @@ -140,7 +140,8 @@ concept C7 = sizeof(T) == 1 || sizeof( ::type) == 1; static_assert(!C6); -static_assert(!C6); // expected-note{{while checking the satisfaction of concept 'C6' requested here}} +static_assert(!C6); +// expected-note@-1 {{while checking the satisfaction of concept 'C6' requested here}} static_assert(C7); static_assert(!C7); // expected-note{{while checking the 
satisfaction of concept 'C7' requested here}} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp index 31587a956b8ab..af2dce81d8a4b 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp @@ -35,14 +35,14 @@ using r2i2 = r2; // expected-error{{constraints not satisfied for class templ using r2i3 = r2; using r2i4 = r2; // expected-error{{constraints not satisfied for class template 'r2' [with T = const D]}} -template requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} +template requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}} struct r3 {}; using r3i1 = r3; using r3i2 = r3; using r3i3 = r3; using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} +using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} // Non-dependent expressions @@ -89,7 +89,7 @@ template concept Large = sizeof(typename remove_reference::type) >= 4; // expected-note@-1{{because 'sizeof(typename remove_reference::type) >= 4' (2 >= 4) evaluated to false}} -template requires requires (T t) { { t } -> Large; } // expected-note{{because 'short &' does not satisfy 'Large':}} +template requires requires (T t) { { t } -> Large; } // expected-note{{because 'short &' does not satisfy 'Large'}} struct r7 {}; using r7i1 = r7; @@ -149,7 +149,7 @@ namespace std_example { template constexpr bool is_same_v = true; template concept same_as = is_same_v; - // expected-note@-1 {{because 'is_same_v' evaluated to false}} + // expected-note@-1 {{because 'is_same_v' evaluated to false}} static_assert(C1); static_assert(C1); @@ -160,7 +160,7 @@ namespace std_example { template concept C2 = requires(T x) { {*x} -> same_as; - // expected-note@-1{{because type constraint 'same_as' was not satisfied:}} + // expected-note@-1{{because 'same_as' evaluated to false}} // expected-note@-2{{because '*x' would be invalid: indirection requires pointer operand ('int' invalid)}} }; @@ -173,9 +173,9 @@ namespace std_example { int operator *() { return 0; } }; static_assert(C2); - template struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'std_example::T2' does not satisfy 'C2'}} + template struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'T2' does not satisfy 'C2'}} using c2c1 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = int]}} - using c2c2 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::T2]}} + using c2c2 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = T2]}} template void g(T t) noexcept(sizeof(T) == 1) {} diff --git 
a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp index 033ae349a02e5..70a96bed05867 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ -43,11 +43,10 @@ namespace std_example { requires sizeof(a) == 4; // OK requires a == 0; // expected-error{{substitution into constraint expression resulted in a non-constant expression}} // expected-note@-1{{while checking the satisfaction of nested requirement requested here}} - // expected-note@-2{{in instantiation of requirement here}} - // expected-note@-3{{while checking the satisfaction of nested requirement requested here}} - // expected-note@-6{{while substituting template arguments into constraint expression here}} - // expected-note@-5{{function parameter 'a' with unknown value cannot be used in a constant expression}} - // expected-note@-8{{declared here}} + // expected-note@-2{{while checking the satisfaction of nested requirement requested here}} + // expected-note@-5{{while substituting template arguments into constraint expression here}} + // expected-note@-4{{function parameter 'a' with unknown value cannot be used in a constant expression}} + // expected-note@-7{{declared here}} }; static_assert(C2); // expected-error{{static assertion failed}} // expected-note@-1{{while checking the satisfaction of concept 'C2' requested here}} @@ -84,31 +83,26 @@ static_assert(Pipes); static_assert(Pipes); static_assert(Amps1); -static_assert(!Amps1); +static_assert(Amps1); static_assert(Amps2); -static_assert(!Amps2); +static_assert(Amps2); template -void foo1() requires requires (T x) { // #foo1 +void foo1() requires requires (T x) { requires - True // #foo1Value + True && True; } {} template void fooPipes() requires Pipes {} -template void fooAmps1() requires Amps1 {} // #fooAmps1 +template void fooAmps1() requires Amps1 {} void foo() { foo1(); - foo1(); // expected-error {{no matching function for call to 'foo1'}} - // expected-note@#foo1Value {{because 'True && True' would be invalid: member reference base type 'int' is not a structure or union}} - // expected-note@#foo1 {{candidate template ignored: constraints not satisfied [with T = int]}} + foo1(); fooPipes(); fooPipes(); fooAmps1(); - fooAmps1(); // expected-error {{no matching function for call to 'fooAmps1'}} - // expected-note@#fooAmps1 {{candidate template ignored: constraints not satisfied [with T = int]}} - // expected-note@#fooAmps1 {{because 'int' does not satisfy 'Amps1'}} - // expected-note@#Amps1 {{because 'True && True && !False' would be invalid: member reference base type 'int' is not a structure or union}} + fooAmps1(); } template @@ -158,15 +152,16 @@ void func() { // expected-note@#bar {{while substituting template arguments into constraint expression here}} // expected-note@#bar {{while checking the satisfaction of nested requirement requested here}} // expected-note@#bar {{candidate template ignored: constraints not satisfied [with T = False]}} - // expected-note@#bar {{because 'X::value' evaluated to false}} + // expected-note@#bar {{because 'X::value' evaluated to false}} bar(); + // expected-error@-1 {{no matching function for call to 'bar'}} \ // expected-note@-1 {{while checking constraint satisfaction for template 'bar' required here}} \ - // expected-note@-1 {{while substituting deduced template arguments into function template 'bar' [with T = int]}} + // expected-note@-1 
{{while substituting deduced template arguments into function template 'bar' [with T = int]}} \ // expected-note@#bar {{in instantiation of static data member}} - // expected-note@#bar {{in instantiation of requirement here}} // expected-note@#bar {{while checking the satisfaction of nested requirement requested here}} // expected-note@#bar {{while substituting template arguments into constraint expression here}} + // expected-note@#bar {{candidate template ignored}} // expected-error@#X_Value {{type 'int' cannot be used prior to '::' because it has no members}} } } diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp index 5199708cd8166..5dcb1880ded48 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp @@ -39,14 +39,14 @@ using r2i4 = r2; // expected-error{{constraints not satisfied for class template requires requires { sizeof(T); } // expected-note@-1{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} -// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} +// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}} struct r3 {}; using r3i1 = r3; using r3i2 = r3; using r3i3 = r3; using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} +using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} template requires requires (T t) { 0; "a"; (void)'a'; } struct r4 {}; diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp index 5433cfb21955d..28dff336d053c 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp @@ -182,14 +182,14 @@ namespace std_example { static_assert(C1 && C2 && C3); template struct C1_check {}; // expected-note@-1 {{because 'int' does not satisfy 'C1'}} - // expected-note@-2 {{because 'std_example::has_type' does not satisfy 'C1'}} + // expected-note@-2 {{because 'has_type' does not satisfy 'C1'}} template struct C2_check {}; - // expected-note@-1 {{because 'std_example::has_inner' does not satisfy 'C2'}} + // expected-note@-1 {{because 'has_inner' does not satisfy 'C2'}} template struct C3_check {}; // expected-note@-1 {{because 'void' does not satisfy 'C3'}} using c1 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = int]}} - using c2 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = std_example::has_type]}} - using c3 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::has_inner]}} + using c2 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = has_type]}} + using c3 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = has_inner]}} using c4 = C3_check; // expected-error{{constraints not satisfied for class template 'C3_check' [with T = void]}} } @@ 
-199,10 +199,10 @@ template concept C = requires { requires requires { T::a; }; }; // expected-note@-1 {{because 'T::a' would be invalid: no member named 'a' in 'PR48656::T1'}} template struct A {}; -// expected-note@-1 {{because 'PR48656::T1' does not satisfy 'C'}} +// expected-note@-1 {{because 'T1' does not satisfy 'C'}} struct T1 {}; -template struct A; // expected-error {{constraints not satisfied for class template 'A' [with $0 = ]}} +template struct A; // expected-error {{constraints not satisfied for class template 'A' [with $0 = ]}} struct T2 { static constexpr bool a = false; }; template struct A; diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp index 59e6a48e48878..6dea0c62fe686 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.atomic/constrant-satisfaction-conversions.cpp @@ -28,9 +28,8 @@ template requires requires { requires S{}; // expected-error@-1{{atomic constraint must be of type 'bool' (found 'S')}} // expected-note@-2{{while checking the satisfaction}} - // expected-note@-3{{in instantiation of requirement}} - // expected-note@-4{{while checking the satisfaction}} - // expected-note@-6{{while substituting template arguments}} + // expected-note@-3{{while checking the satisfaction of nested requirement}} + // expected-note@-5{{while substituting template arguments}} // expected-note@#F3INST{{while checking constraint satisfaction}} // expected-note@#F3INST{{while substituting deduced template arguments into function template 'f3' [with T = int]}} // diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp index 3992835c44402..34c5c5d338bfe 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp @@ -1,21 +1,31 @@ // RUN: %clang_cc1 -std=c++2a -x c++ -verify %s +// RUN: %clang_cc1 -std=c++2c -x c++ -verify %s template concept True = true; -template concept Foo = True; -template concept Bar = Foo; -template requires Bar struct S { }; -template requires Bar && true struct S { }; +template concept Foo = True; // #Foo +template concept Bar = Foo; // #Bar +template requires Bar struct S { }; // #S +template requires Bar && true struct S { }; // #SpecS +// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}} +// expected-error@#Foo 2{{'type name' declared as a pointer to a reference of type 'T &'}} +// expected-note@#SpecS {{while substituting into concept arguments here}} +// expected-note@#S {{while substituting into concept arguments here}} +// expected-note@#Bar 2{{while substituting into concept arguments here}} +// expected-note@#S {{template is declared here}} + + template concept True2 = sizeof(T) >= 0; -template concept Foo2 = True2; -// expected-error@-1{{'type name' declared as a pointer to a reference of type 'type-parameter-0-0 &'}} -template concept Bar2 = Foo2; -// expected-note@-1{{while substituting into concept arguments here; substitution failures not allowed in concept arguments}} -template requires Bar2 struct S2 { }; +template concept Foo2 = True2; // #Foo2 + +template concept Bar2 = Foo2; // #Bar2 +// expected-note@-1 3{{while substituting into concept arguments here; substitution failures not allowed in 
concept arguments}} +template requires Bar2 struct S2 { }; // #SpecS2_1 // expected-note@-1{{template is declared here}} -template requires Bar2 && true struct S2 { }; +template requires Bar2 && true struct S2 { }; // #SpecS2_2 // expected-error@-1{{class template partial specialization is not more specialized than the primary template}} -// expected-note@-2{{while calculating associated constraint of template 'S2' here}} +// expected-error@#Foo2{{'type name' declared as a pointer to a reference of type 'T &'}} + namespace type_pack { template @@ -71,16 +81,31 @@ namespace non_type_pack { namespace PR47174 { // This checks that we don't crash with a failed substitution on the first constrained argument when // performing normalization. -template +template // #S3_Header requires true struct S3; // expected-note {{template is declared here}} template -requires true struct S3; // expected-error {{class template partial specialization is not more specialized than the primary template}} +requires true struct S3; +// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}} +// expected-error@#Foo2 2{{'type name' declared as a pointer to a reference of type 'T &'}} +// expected-note@#SpecS2_1 {{while substituting into concept arguments here}} +// expected-note@#SpecS2_2 {{while substituting into concept arguments here}} +// expected-note@#S3_Header {{while substituting into concept arguments here}} +// expected-note@#Bar2 {{while substituting into concept arguments here}} + // Same as above, for the second position (but this was already working). -template -requires true struct S4; // expected-note {{template is declared here}} +template // #S4_Header +requires true struct S4; // #S4 template -requires true struct S4; // expected-error {{class template partial specialization is not more specialized than the primary template}} +requires true struct S4; // #S4-spec +// expected-error@-1 {{class template partial specialization is not more specialized than the primary template}} +// expected-error@#Foo2 {{'type name' declared as a pointer to a reference of type 'U &'}} +// expected-note@#S4_Header {{while substituting into concept arguments here}} +// expected-note@#S4 {{template is declared here}} +// expected-note@#S4 {{similar constraint expressions not considered equivalent}} +// expected-note@#S4-spec {{similar constraint expression here}} + + struct X { template struct Y { @@ -96,7 +121,7 @@ template requires C1 && C2 void t1() = delete; // expected-note { template void t1(); void t1() { t1(); } // expected-error {{call to deleted function 't1'}} -template requires C1 void t2() {}; // expected-note 2 {{candidate function}} +template requires C1 void t2() {}; // expected-note 2 {{candidate function}} template requires C2 void t2() {}; // expected-note 2 {{candidate function}} template void t2(); // expected-error {{partial ordering for explicit instantiation of 't2' is ambiguous}} void t2() { t2(); } // expected-error {{call to 't2' is ambiguous}} diff --git a/clang/test/CXX/temp/temp.param/p10-2a.cpp b/clang/test/CXX/temp/temp.param/p10-2a.cpp index 4f5fdd3b4809a..c0406f88db5f3 100644 --- a/clang/test/CXX/temp/temp.param/p10-2a.cpp +++ b/clang/test/CXX/temp/temp.param/p10-2a.cpp @@ -86,16 +86,18 @@ using f1 = F; using f2 = F; // expected-error {{constraints not satisfied for alias template 'F' [with T = long]}} template -concept OneOf = (is_same_v || ...); -// expected-note@-1 2{{because 'is_same_v' evaluated to false}} -// expected-note@-2 
2{{and 'is_same_v' evaluated to false}} -// expected-note@-3 {{because 'is_same_v' evaluated to false}} -// expected-note@-4 {{and 'is_same_v' evaluated to false}} -// expected-note@-5 {{and 'is_same_v' evaluated to false}} -// expected-note@-6 3{{because 'is_same_v' evaluated to false}} -// expected-note@-7 3{{and 'is_same_v' evaluated to false}} -// expected-note@-8 2{{because 'is_same_v' evaluated to false}} -// expected-note@-9 2{{and 'is_same_v' evaluated to false}} +concept OneOf = (is_same_v || ...); // #OneOf +// expected-note@#OneOf 2{{because 'is_same_v' evaluated to false}} +// expected-note@#OneOf 2{{and 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v' evaluated to false}} +// expected-note@#OneOf 3{{because 'is_same_v' evaluated to false}} +// expected-note@#OneOf 3{{and 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{because 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v' evaluated to false}} +// expected-note@#OneOf {{and 'is_same_v' evaluated to false}} template T, OneOf U> // expected-note@-1 2{{because 'OneOf' evaluated to false}} @@ -124,6 +126,7 @@ using I = int; using i1 = I<1>; using i2 = I<'a'>; +// FIXME: This crashes with -std=c++2c using i3 = I; // expected-error@-1 {{constraints not satisfied for alias template 'I' [with x = nullptr]}} diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp index 99a82d96d321b..ce862666aa48f 100644 --- a/clang/test/SemaCXX/cxx23-assume.cpp +++ b/clang/test/SemaCXX/cxx23-assume.cpp @@ -127,13 +127,12 @@ struct F { template constexpr int f5() requires C { return 1; } // expected-note {{while checking the satisfaction}} - // expected-note@-1 {{while substituting template arguments}} - // expected-note@-2 {{candidate template ignored}} + // expected-note@-1 {{candidate template ignored}} template -constexpr int f5() requires (!C) { return 2; } // expected-note 4 {{while checking the satisfaction}} - // expected-note@-1 4 {{while substituting template arguments}} - // expected-note@-2 {{candidate template ignored}} +constexpr int f5() requires (!C) { return 2; } // expected-note 4 {{while checking the satisfaction}} \ + // expected-note 4 {{while substituting template arguments}} \ + // expected-note {{candidate template ignored}} static_assert(f5() == 1); static_assert(f5() == 1); // expected-note 3 {{while checking constraint satisfaction}} diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp index 74b3573a0dcaa..6777dc23c44a6 100644 --- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp +++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp @@ -1257,13 +1257,13 @@ void f() { (&A::e)(a, a); // expected-error@-1 {{no matching function for call to 'e'}} \ // expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \ - // expected-note@#tpl-address-e{{because '__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}} (&A::e)(a, 0); (&A::e)(a, a); // expected-error@-1 {{no matching function for call to 'e'}} \ // expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \ - // expected-note@#tpl-address-e{{because 
'__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}} (&A::e)(a, 0); @@ -1273,12 +1273,12 @@ void f() { (&A::f)(a); // expected-error@-1 {{no matching function for call to 'f'}} \ // expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \ - // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}} (&A::f)(a); // expected-error@-1 {{no matching function for call to 'f'}} \ // expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \ - // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}} + // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}} (&A::g)(a); (&A::g)(a, 0); diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp index 4220486d3aed3..137f46ee3dc01 100644 --- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -std=c++2c -verify %s -template concept A = true; -template concept C = A && true; +template concept A = (T(), true); +template concept C = A && true; // #C template concept D = A && __is_same(T, int); @@ -40,13 +40,23 @@ constexpr int i(T...) { return 1; }; // expected-note {{candidate}} static_assert(i(0) == 1); // expected-error {{call to 'i' is ambiguous}} -template requires (A || ... || true) -constexpr int j(T...) { return 0; }; -template requires (C && ... && true) -constexpr int j(T...) { return 1; }; +template requires (A || ... || true) constexpr int j(T...) { return 0; }; // #j1 +template requires (C && ... && true) constexpr int j(T...) { return 1; }; // #j2 static_assert(j(0) == 1); +// expected-error@-1 {{call to 'j' is ambiguous}} +// expected-note@#j1 {{candidate function [with T = ]}} +// expected-note@#j2 {{candidate function [with T = ]}} +// expected-note@#j2 {{imilar constraint expressions not considered equivalent}} +// expected-note@#j1 {{similar constraint expression here}} + + static_assert(j() == 1); +// expected-error@-1 {{call to 'j' is ambiguous}} +// expected-note@#j1 {{candidate function [with T = <>]}} +// expected-note@#j2 {{candidate function [with T = <>]}} +// expected-note@#j2 {{imilar constraint expressions not considered equivalent}} +// expected-note@#j1 {{similar constraint expression here}} @@ -107,7 +117,7 @@ void test() { } namespace substitution { - struct S { +struct S { using type = int; }; @@ -144,51 +154,69 @@ consteval int Or3() requires (C || ... 
|| C) static_assert(And1<>() == 1); static_assert(And1() == 1); static_assert(And1() == 1); +// FIXME: The diagnostics are not so great static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} - // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} + // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} - // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} + // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} - // expected-note@#and1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and1 {{because substituted constraint expression is ill-formed}} + // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} + // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And2() == 2); static_assert(And2() == 2); -static_assert(And2() == 2); +static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} + // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} - // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = S, U = ]}} \ + // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} - // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} + // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} - // expected-note@#and2 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and2 {{because substituted constraint expression is ill-formed}} + // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} + // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(And3() == 3); 
static_assert(And3() == 3); static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} + // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} + // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = S, U = ]}} + // expected-note@#and3 {{because 'typename U::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} + static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} - // expected-note@#and3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#and3 {{because substituted constraint expression is ill-formed}} + // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} + // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(Or1<>() == 1); // expected-error {{no matching function for call to 'Or1'}} @@ -198,25 +226,26 @@ static_assert(Or1() == 1); static_assert(Or1() == 1); static_assert(Or1() == 1); static_assert(Or1() == 1); // expected-error {{no matching function for call to 'Or1'}} - // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} \ - // expected-note@#or1 {{because substituted constraint expression is ill-formed}} - + // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} + // expected-note@#or1 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(Or2() == 2); static_assert(Or2() == 2); static_assert(Or2() == 2); static_assert(Or2() == 2); static_assert(Or2() == 2); // expected-error {{no matching function for call to 'Or2'}} - // expected-note@#or2 {{candidate template ignored: constraints not satisfied}} \ - // expected-note@#or2 {{because substituted constraint expression is ill-formed}} - + // expected-note@#or2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} + // expected-note@#or2 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); // expected-error {{no matching function for call to 'Or3'}} - // expected-note@#or3 {{candidate template ignored: 
constraints not satisfied}} \ - // expected-note@#or3 {{because substituted constraint expression is ill-formed}} + // expected-note@#or3 {{candidate template ignored: constraints not satisfied}} + // expected-note@#or3 {{because 'typename T::type' does not satisfy 'C'}} + // expected-note@#C {{because 'T' does not satisfy 'A'}} } namespace bool_conversion_break { @@ -226,7 +255,7 @@ struct Thingy { static constexpr int compare(const Thingy&) {return 1;} }; template -void f(A *, A *) // expected-note {{candidate template ignored: failed template argument deduction}} +void f(A *, A *) // expected-note {{candidate template ignored: constraints not satisfied}} requires (T::compare(U{}) && ...); // expected-error {{atomic constraint must be of type 'bool' (found 'int')}} void g() { @@ -269,9 +298,7 @@ struct S { static_assert(S::f() == 2); -static_assert(S::g() == 2); // expected-error {{call to 'g' is ambiguous}} - // expected-note@#nested-ambiguous-g1 {{candidate}} - // expected-note@#nested-ambiguous-g2 {{candidate}} +static_assert(S::g() == 2); } @@ -384,3 +411,98 @@ struct LazyLitMatrix, init> { } } + +namespace GH135190 { +template +concept A = __is_same_as(T, int) || __is_same_as(T, double) ; + +template +concept B = A && __is_same_as(T, double); + +template +requires(A && ...) +constexpr int g() { + return 1; +} + +template +requires(B && ...) +constexpr int g() { + return 2; +} + +static_assert(g() == 2); + + +template +concept all_A = (A && ...); + +template +concept all_B = (B && ...); + +template +requires all_A +constexpr int h() { + return 1; +} + +template +requires all_B +constexpr int h() { + return 2; +} + +static_assert(h() == 2); +} + + +namespace parameter_mapping_regressions { + +namespace case1 { +namespace std { +template +constexpr bool is_constructible_v = __is_constructible(_Tp, _Args...); +template +concept constructible_from = is_constructible_v<_Tp, _Args...>; +template +concept default_initializable = true; +template using iterator_t = int; +template +concept view = constructible_from<_Tp, _Tp>; +template + requires(view<_Views> && ...) +class zip_transform_view; +} // namespace std +struct IterDefaultCtrView {}; +template +using Iter = std::iterator_t>; +static_assert( + std::default_initializable>); + +} + +namespace case2 { + +template +constexpr bool False = false; + +template +concept __zip_all_random_access = (False<_Views> && ...); +// expected-note@-1 {{evaluated to false}} + +template +struct zip_view { + void f() requires __zip_all_random_access<_Views...>{}; + // expected-note@-1 {{because 'int' does not satisfy}} +}; + +zip_view test_v; +static_assert(!__zip_all_random_access); + +void test() { + test_v.f(); // expected-error {{invalid reference to function 'f'}} +} + +} + +} diff --git a/clang/test/SemaCXX/cxx2c-template-template-param.cpp b/clang/test/SemaCXX/cxx2c-template-template-param.cpp index ed55a059bb53c..4ad3fd95039cd 100644 --- a/clang/test/SemaCXX/cxx2c-template-template-param.cpp +++ b/clang/test/SemaCXX/cxx2c-template-template-param.cpp @@ -106,7 +106,7 @@ concept BinaryDefaultedFalse = false; template